libaom: Pull from upstream am: f1591c667f am: 8c822a4825 am: 411f7d0f3b am: f44c3dd504

Original change: https://android-review.googlesource.com/c/platform/external/libaom/+/1371331

Change-Id: I5b74ca2a71628acedb9452819bc6a3443793dbcb
diff --git a/Android.bp b/Android.bp
index 3a5ad26..a54136b 100644
--- a/Android.bp
+++ b/Android.bp
@@ -14,6 +14,7 @@
     "libaom/av1/common/x86/jnt_convolve_avx2.c",
     "libaom/av1/common/x86/reconinter_avx2.c",
     "libaom/av1/common/x86/selfguided_avx2.c",
+    "libaom/av1/common/x86/warp_plane_avx2.c",
     "libaom/av1/common/x86/wiener_convolve_avx2.c",
 ]
 
@@ -40,13 +41,13 @@
     "libaom/av1/common/x86/highbd_convolve_2d_sse2.c",
     "libaom/av1/common/x86/jnt_convolve_sse2.c",
     "libaom/av1/common/x86/wiener_convolve_sse2.c",
+    "libaom/av1/common/x86/warp_plane_sse2.c",
 ]
 
 aom_av1_common_intrin_sse4_1 = [
     "libaom/av1/common/cdef_block_sse4.c",
     "libaom/av1/common/x86/av1_convolve_horiz_rs_sse4.c",
     "libaom/av1/common/x86/av1_convolve_scale_sse4.c",
-    "libaom/av1/common/x86/av1_highbd_convolve_sse4.c",
     "libaom/av1/common/x86/av1_txfm_sse4.c",
     "libaom/av1/common/x86/filterintra_sse4.c",
     "libaom/av1/common/x86/highbd_convolve_2d_sse4.c",
@@ -139,6 +140,7 @@
     "libaom/av1/encoder/x86/wedge_utils_avx2.c",
     "libaom/av1/encoder/x86/encodetxb_avx2.c",
     "libaom/av1/encoder/x86/rdopt_avx2.c",
+    "libaom/av1/encoder/x86/temporal_filter_avx2.c",
     "libaom/av1/encoder/x86/pickrst_avx2.c",
 ]
 
@@ -150,6 +152,7 @@
 
 aom_av1_encoder_intrin_neon = [
     "libaom/av1/encoder/arm/neon/quantize_neon.c",
+    "libaom/av1/encoder/arm/neon/av1_error_neon.c",
 ]
 
 aom_av1_encoder_intrin_sse2 = [
@@ -157,6 +160,7 @@
     "libaom/av1/encoder/x86/av1_quantize_sse2.c",
     "libaom/av1/encoder/x86/encodetxb_sse2.c",
     "libaom/av1/encoder/x86/highbd_block_error_intrin_sse2.c",
+    "libaom/av1/encoder/x86/temporal_filter_sse2.c",
     "libaom/av1/encoder/x86/wedge_utils_sse2.c",
 ]
 
@@ -173,7 +177,6 @@
     "libaom/av1/encoder/x86/highbd_fwd_txfm_sse4.c",
     "libaom/av1/encoder/x86/rdopt_sse4.c",
     "libaom/av1/encoder/x86/temporal_filter_sse4.c",
-    "libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c",
     "libaom/av1/encoder/x86/pickrst_sse4.c",
 ]
 
@@ -191,6 +194,8 @@
     "libaom/av1/encoder/av1_multi_thread.c",
     "libaom/av1/encoder/av1_quantize.c",
     "libaom/av1/encoder/bitstream.c",
+    "libaom/av1/encoder/cnn.c",
+    "libaom/av1/encoder/compound_type.c",
     "libaom/av1/encoder/context_tree.c",
     "libaom/av1/encoder/corner_detect.c",
     "libaom/av1/encoder/corner_match.c",
@@ -209,11 +214,13 @@
     "libaom/av1/encoder/hash.c",
     "libaom/av1/encoder/hash_motion.c",
     "libaom/av1/encoder/hybrid_fwd_txfm.c",
+    "libaom/av1/encoder/interp_search.c",
     "libaom/av1/encoder/level.c",
     "libaom/av1/encoder/lookahead.c",
-    "libaom/av1/encoder/mbgraph.c",
     "libaom/av1/encoder/mcomp.c",
     "libaom/av1/encoder/ml.c",
+    "libaom/av1/encoder/motion_search_facade.c",
+    "libaom/av1/encoder/mv_prec.c",
     "libaom/av1/encoder/palette.c",
     "libaom/av1/encoder/partition_strategy.c",
     "libaom/av1/encoder/pass2_strategy.c",
@@ -224,12 +231,16 @@
     "libaom/av1/encoder/ratectrl.c",
     "libaom/av1/encoder/rd.c",
     "libaom/av1/encoder/rdopt.c",
+    "libaom/av1/encoder/nonrd_pickmode.c",
     "libaom/av1/encoder/reconinter_enc.c",
     "libaom/av1/encoder/segmentation.c",
     "libaom/av1/encoder/speed_features.c",
+    "libaom/av1/encoder/svc_layercontext.c",
     "libaom/av1/encoder/temporal_filter.c",
     "libaom/av1/encoder/tokenize.c",
     "libaom/av1/encoder/tpl_model.c",
+    "libaom/av1/encoder/tx_search.c",
+    "libaom/av1/encoder/intra_mode_search.c",
     "libaom/av1/encoder/wedge_utils.c",
     "libaom/av1/encoder/var_based_part.c",
     "libaom/third_party/fastfeat/fast.c",
@@ -334,7 +345,7 @@
 
 aom_dsp_decoder_sources = [
     "libaom/aom_dsp/binary_codes_reader.c",
-    "libaom/aom_dsp/daalaboolreader.c",
+    "libaom/aom_dsp/bitreader.c",
     "libaom/aom_dsp/entdec.c",
     "libaom/aom_dsp/grain_synthesis.c",
 ]
diff --git a/README.android b/README.android
index 668d27f..b81597d 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
 Name: libaom
 URL: https://aomedia.org
-Version: v1.0.0
+Version: v2.0.0
 License: BSD
 License File: libaom/LICENSE
 
-Date: Thursday November 29 2018
-Branch: origin/master
-Commit: 250bc3a61c501344523eec07fec35f304767260b
+Date: Wednesday May 20 2020
+Branch: 2.0.0
+Commit: bb35ba9148543f22ba7d8642e4fbd29ae301f5dc
 
 Description:
 Contains the sources used to compile libaom.
diff --git a/README.version b/README.version
index 95519e8..5f70705 100644
--- a/README.version
+++ b/README.version
@@ -1,15 +1,4 @@
-URL: https://aomedia.googlesource.com/aom/+archive/250bc3a61c501344523eec07fec35f304767260b.tar.gz
-Version: v1.0.0
+URL: https://aomedia.googlesource.com/aom/+archive/bb35ba9148543f22ba7d8642e4fbd29ae301f5dc.tar.gz
+Version: v2.0.0
 Local Modifications:
-  Rename files to avoid object collisions:
-    aom_dsp/x86/highbd_intrapred_sse2.asm
-    aom_dsp/x86/intrapred_sse2.asm
-  7ad847ac6 Seq header shouldn't change in the middle of frame
-Updates to libaom/examples/av1_dec_fuzzer.cc to include the following commits from upstream
-139efd2c8 av1_dec_fuzzer: get thread count from 1st byte of frame header
-adfc4b7f8 av1_dec_fuzzer: Remove fmemopen dependency
-338f1e688 av1_dec_fuzzer: Remove dependency on ivfdec.o and tools_common.o
-690a08a34 av1_dec_fuzzer: Updated fsanitize flags in build script
-c2632bb3f av1_dec_fuzzer: Fix missing field initializer for 'cfg'
-cherry-picks:
-0e07ea54d disable av1_(apply_|)selfguided_restoration_neon
+None
diff --git a/config/arm/config/aom_config.asm b/config/arm/config/aom_config.asm
index 50338c1..ba39eb3 100644
--- a/config/arm/config/aom_config.asm
+++ b/config/arm/config/aom_config.asm
@@ -1,5 +1,5 @@
 ;
-; Copyright (c) 2019, Alliance for Open Media. All rights reserved
+; Copyright (c) 2020, Alliance for Open Media. All rights reserved
 ;
 ; This source code is subject to the terms of the BSD 2 Clause License and
 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -13,12 +13,11 @@
 ARCH_PPC equ 0
 ARCH_X86 equ 0
 ARCH_X86_64 equ 0
-CONFIG_2PASS_PARTITION_SEARCH_LVL_END equ 3
-CONFIG_2PASS_PARTITION_SEARCH_LVL_START equ 1
 CONFIG_ACCOUNTING equ 0
 CONFIG_ANALYZER equ 0
 CONFIG_AV1_DECODER equ 1
 CONFIG_AV1_ENCODER equ 0
+CONFIG_AV1_HIGHBITDEPTH equ 1
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
@@ -30,32 +29,36 @@
 CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
-CONFIG_FILEOPTIONS equ 1
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
 CONFIG_GPROF equ 0
+CONFIG_HTB_TRELLIS equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_INTERNAL_STATS equ 0
 CONFIG_INTER_STATS_ONLY equ 0
 CONFIG_LIBYUV equ 1
-CONFIG_LOWBITDEPTH equ 1
+CONFIG_LPF_MASK equ 0
 CONFIG_MAX_DECODE_PROFILE equ 0
 CONFIG_MISMATCH_DEBUG equ 0
 CONFIG_MULTITHREAD equ 1
+CONFIG_NN_V2 equ 0
 CONFIG_NORMAL_TILE_MODE equ 1
 CONFIG_OS_SUPPORT equ 1
 CONFIG_PIC equ 0
 CONFIG_RD_DEBUG equ 0
+CONFIG_REALTIME_ONLY equ 0
 CONFIG_RUNTIME_CPU_DETECT equ 0
 CONFIG_SHARED equ 0
 CONFIG_SHARP_SETTINGS equ 0
 CONFIG_SIZE_LIMIT equ 1
 CONFIG_SPATIAL_RESAMPLING equ 1
 CONFIG_SPEED_STATS equ 0
-CONFIG_STATIC equ 1
+CONFIG_SUPERRES_IN_RECODE equ 1
+CONFIG_TUNE_VMAF equ 0
 CONFIG_WEBM_IO equ 1
 DECODE_HEIGHT_LIMIT equ 16384
 DECODE_WIDTH_LIMIT equ 16384
+FORCE_HIGHBITDEPTH_DECODING equ 0
 HAVE_AVX equ 0
 HAVE_AVX2 equ 0
 HAVE_DSPR2 equ 0
diff --git a/config/arm/config/aom_config.c b/config/arm/config/aom_config.c
index 8b500ab..2ec4b0d 100644
--- a/config/arm/config/aom_config.c
+++ b/config/arm/config/aom_config.c
@@ -9,5 +9,5 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include "aom/aom_codec.h"
-static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/armv7-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
+static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/armv7-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=0 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
 const char *aom_codec_build_config(void) {return cfg;}
diff --git a/config/arm/config/aom_config.h b/config/arm/config/aom_config.h
index a3b86df..0e09fd2 100644
--- a/config/arm/config/aom_config.h
+++ b/config/arm/config/aom_config.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -15,12 +15,11 @@
 #define ARCH_PPC 0
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 #define CONFIG_ACCOUNTING 0
 #define CONFIG_ANALYZER 0
 #define CONFIG_AV1_DECODER 1
 #define CONFIG_AV1_ENCODER 0
+#define CONFIG_AV1_HIGHBITDEPTH 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
@@ -32,32 +31,36 @@
 #define CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
-#define CONFIG_FILEOPTIONS 1
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
 #define CONFIG_GPROF 0
+#define CONFIG_HTB_TRELLIS 0
 #define CONFIG_INSPECTION 0
 #define CONFIG_INTERNAL_STATS 0
 #define CONFIG_INTER_STATS_ONLY 0
 #define CONFIG_LIBYUV 1
-#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_LPF_MASK 0
 #define CONFIG_MAX_DECODE_PROFILE 0
 #define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_MULTITHREAD 1
+#define CONFIG_NN_V2 0
 #define CONFIG_NORMAL_TILE_MODE 1
 #define CONFIG_OS_SUPPORT 1
 #define CONFIG_PIC 0
 #define CONFIG_RD_DEBUG 0
+#define CONFIG_REALTIME_ONLY 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_SHARED 0
 #define CONFIG_SHARP_SETTINGS 0
 #define CONFIG_SIZE_LIMIT 1
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_SPEED_STATS 0
-#define CONFIG_STATIC 1
+#define CONFIG_SUPERRES_IN_RECODE 1
+#define CONFIG_TUNE_VMAF 0
 #define CONFIG_WEBM_IO 1
 #define DECODE_HEIGHT_LIMIT 16384
 #define DECODE_WIDTH_LIMIT 16384
+#define FORCE_HIGHBITDEPTH_DECODING 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
 #define HAVE_DSPR2 0
diff --git a/config/arm/config/aom_dsp_rtcd.h b/config/arm/config/aom_dsp_rtcd.h
index 0b1a28a..e3fd306 100644
--- a/config/arm/config/aom_dsp_rtcd.h
+++ b/config/arm/config/aom_dsp_rtcd.h
@@ -26,7 +26,7 @@
 void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
 #define aom_blend_a64_hmask aom_blend_a64_hmask_neon
 
-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh);
 #define aom_blend_a64_mask aom_blend_a64_mask_c
 
 void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
@@ -39,7 +39,7 @@
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 #define aom_convolve8_vert aom_convolve8_vert_c
 
-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h);
 #define aom_convolve_copy aom_convolve_copy_c
 
 void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -362,25 +362,25 @@
 void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define aom_h_predictor_8x8 aom_h_predictor_8x8_neon
 
-void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd);
 #define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
 
 void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
 #define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_c
 
-void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd);
 #define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_c
 
 void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
 #define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_c
 
-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
 #define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_c
 
-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
 #define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_c
 
-void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd);
 #define aom_highbd_convolve_copy aom_highbd_convolve_copy_c
 
 void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -691,7 +691,7 @@
 void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
 #define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_c
 
-void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd);
 #define aom_highbd_lpf_horizontal_14_dual aom_highbd_lpf_horizontal_14_dual_c
 
 void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1036,8 +1036,8 @@
 void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
 #define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_c
 
-void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
-void aom_lowbd_blend_a64_d16_mask_neon(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_neon(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
 #define aom_lowbd_blend_a64_d16_mask aom_lowbd_blend_a64_d16_mask_neon
 
 void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/config/arm/config/av1_rtcd.h b/config/arm/config/av1_rtcd.h
index 8ed0faa..8a8b3ce 100644
--- a/config/arm/config/av1_rtcd.h
+++ b/config/arm/config/av1_rtcd.h
@@ -34,27 +34,51 @@
 struct NN_CONFIG;
 typedef struct NN_CONFIG NN_CONFIG;
 
+enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION);
+#if CONFIG_NN_V2
+enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS);
+struct NN_CONFIG_V2;
+typedef struct NN_CONFIG_V2 NN_CONFIG_V2;
+struct FC_LAYER;
+typedef struct FC_LAYER FC_LAYER;
+#endif  // CONFIG_NN_V2
+
+struct CNN_CONFIG;
+typedef struct CNN_CONFIG CNN_CONFIG;
+struct CNN_LAYER_CONFIG;
+typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG;
+struct CNN_THREAD_DATA;
+typedef struct CNN_THREAD_DATA CNN_THREAD_DATA;
+struct CNN_BRANCH_CONFIG;
+typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG;
+struct CNN_MULTI_OUT;
+typedef struct CNN_MULTI_OUT CNN_MULTI_OUT;
+
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+                                   int dst_stride, int alpha_q3, int bd);
+#endif
+
 typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
 
 typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
                                    int dst_stride, int alpha_q3);
 
-typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
-                                   int dst_stride, int alpha_q3, int bd);
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
-#define apply_selfguided_restoration apply_selfguided_restoration_c
+void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void av1_apply_selfguided_restoration_neon(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_neon
 
 void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
 #define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
@@ -66,42 +90,63 @@
 void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 #define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c
 
-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
+#define av1_calc_frame_error av1_calc_frame_error_c
+
+void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+#define av1_cnn_activate av1_cnn_activate_c
+
+void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+#define av1_cnn_add av1_cnn_add_c
+
+void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
+#define av1_cnn_batchnorm av1_cnn_batchnorm_c
+
+void av1_cnn_convolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step);
+#define av1_cnn_convolve av1_cnn_convolve_c
+
+void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+#define av1_cnn_deconvolve av1_cnn_deconvolve_c
+
+void av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+#define av1_cnn_predict av1_cnn_predict_c
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_copy_sr av1_convolve_2d_copy_sr_neon
 
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_scale av1_convolve_2d_scale_c
 
-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_sr av1_convolve_2d_sr_neon
 
 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 #define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
 
-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_x_sr av1_convolve_x_sr_neon
 
-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_neon
 
-void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon
 
-void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_neon
 
-void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_neon
 
-void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_neon
 
 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
@@ -131,13 +176,13 @@
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
 
-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_copy_sr av1_highbd_convolve_2d_copy_sr_c
 
-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c
 
-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_c
 
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
@@ -149,22 +194,22 @@
 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 #define av1_highbd_convolve_horiz_rs av1_highbd_convolve_horiz_rs_c
 
-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_c
 
-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
 
-void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
 
-void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
 
-void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
 
-void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
 
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
@@ -176,25 +221,25 @@
 void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
 
-void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
 
-void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
 
-void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
 
-void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x4 av1_highbd_inv_txfm_add_4x4_c
 
-void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
 
-void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
 
-void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x8 av1_highbd_inv_txfm_add_8x8_c
 
 void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
@@ -206,7 +251,7 @@
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c
 
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
 #define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_c
 
 void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
@@ -277,7 +322,10 @@
 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
                                  int sgr_params_idx, int bit_depth, int highbd);
-#define av1_selfguided_restoration av1_selfguided_restoration_c
+int av1_selfguided_restoration_neon(const uint8_t *dgd8, int width, int height,
+                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                 int sgr_params_idx, int bit_depth, int highbd);
+#define av1_selfguided_restoration av1_selfguided_restoration_neon
 
 void av1_upsample_intra_edge_c(uint8_t *p, int sz);
 #define av1_upsample_intra_edge av1_upsample_intra_edge_c
@@ -293,6 +341,14 @@
 void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
 #define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_neon
 
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+#define cdef_copy_rect8_16bit_to_16bit cdef_copy_rect8_16bit_to_16bit_neon
+
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+#define cdef_copy_rect8_8bit_to_16bit cdef_copy_rect8_8bit_to_16bit_neon
+
 void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
 void cdef_filter_block_neon(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
 #define cdef_filter_block cdef_filter_block_neon
@@ -325,25 +381,17 @@
 cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_neon(TX_SIZE tx_size);
 #define cfl_get_luma_subsampling_444_lbd cfl_get_luma_subsampling_444_lbd_neon
 
-void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-#define copy_rect8_16bit_to_16bit copy_rect8_16bit_to_16bit_neon
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_neon(TX_SIZE tx_size);
+#define cfl_get_predict_hbd_fn cfl_get_predict_hbd_fn_neon
 
-void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-#define copy_rect8_8bit_to_16bit copy_rect8_8bit_to_16bit_neon
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_neon(TX_SIZE tx_size);
+#define cfl_get_predict_lbd_fn cfl_get_predict_lbd_fn_neon
 
-cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
-cfl_predict_hbd_fn get_predict_hbd_fn_neon(TX_SIZE tx_size);
-#define get_predict_hbd_fn get_predict_hbd_fn_neon
-
-cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
-cfl_predict_lbd_fn get_predict_lbd_fn_neon(TX_SIZE tx_size);
-#define get_predict_lbd_fn get_predict_lbd_fn_neon
-
-cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
-cfl_subtract_average_fn get_subtract_average_fn_neon(TX_SIZE tx_size);
-#define get_subtract_average_fn get_subtract_average_fn_neon
+cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn cfl_get_subtract_average_fn_neon(TX_SIZE tx_size);
+#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_neon
 
 void av1_rtcd(void);
 
diff --git a/config/arm64/config/aom_config.asm b/config/arm64/config/aom_config.asm
index 50338c1..ba39eb3 100644
--- a/config/arm64/config/aom_config.asm
+++ b/config/arm64/config/aom_config.asm
@@ -1,5 +1,5 @@
 ;
-; Copyright (c) 2019, Alliance for Open Media. All rights reserved
+; Copyright (c) 2020, Alliance for Open Media. All rights reserved
 ;
 ; This source code is subject to the terms of the BSD 2 Clause License and
 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -13,12 +13,11 @@
 ARCH_PPC equ 0
 ARCH_X86 equ 0
 ARCH_X86_64 equ 0
-CONFIG_2PASS_PARTITION_SEARCH_LVL_END equ 3
-CONFIG_2PASS_PARTITION_SEARCH_LVL_START equ 1
 CONFIG_ACCOUNTING equ 0
 CONFIG_ANALYZER equ 0
 CONFIG_AV1_DECODER equ 1
 CONFIG_AV1_ENCODER equ 0
+CONFIG_AV1_HIGHBITDEPTH equ 1
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
@@ -30,32 +29,36 @@
 CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
-CONFIG_FILEOPTIONS equ 1
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
 CONFIG_GPROF equ 0
+CONFIG_HTB_TRELLIS equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_INTERNAL_STATS equ 0
 CONFIG_INTER_STATS_ONLY equ 0
 CONFIG_LIBYUV equ 1
-CONFIG_LOWBITDEPTH equ 1
+CONFIG_LPF_MASK equ 0
 CONFIG_MAX_DECODE_PROFILE equ 0
 CONFIG_MISMATCH_DEBUG equ 0
 CONFIG_MULTITHREAD equ 1
+CONFIG_NN_V2 equ 0
 CONFIG_NORMAL_TILE_MODE equ 1
 CONFIG_OS_SUPPORT equ 1
 CONFIG_PIC equ 0
 CONFIG_RD_DEBUG equ 0
+CONFIG_REALTIME_ONLY equ 0
 CONFIG_RUNTIME_CPU_DETECT equ 0
 CONFIG_SHARED equ 0
 CONFIG_SHARP_SETTINGS equ 0
 CONFIG_SIZE_LIMIT equ 1
 CONFIG_SPATIAL_RESAMPLING equ 1
 CONFIG_SPEED_STATS equ 0
-CONFIG_STATIC equ 1
+CONFIG_SUPERRES_IN_RECODE equ 1
+CONFIG_TUNE_VMAF equ 0
 CONFIG_WEBM_IO equ 1
 DECODE_HEIGHT_LIMIT equ 16384
 DECODE_WIDTH_LIMIT equ 16384
+FORCE_HIGHBITDEPTH_DECODING equ 0
 HAVE_AVX equ 0
 HAVE_AVX2 equ 0
 HAVE_DSPR2 equ 0
diff --git a/config/arm64/config/aom_config.c b/config/arm64/config/aom_config.c
index 950ccdf..866b415 100644
--- a/config/arm64/config/aom_config.c
+++ b/config/arm64/config/aom_config.c
@@ -9,5 +9,5 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include "aom/aom_codec.h"
-static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/arm64-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
+static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/arm64-linux-gcc.cmake\" -DCONFIG_AV1_ENCODER=0 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
 const char *aom_codec_build_config(void) {return cfg;}
diff --git a/config/arm64/config/aom_config.h b/config/arm64/config/aom_config.h
index a3b86df..0e09fd2 100644
--- a/config/arm64/config/aom_config.h
+++ b/config/arm64/config/aom_config.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -15,12 +15,11 @@
 #define ARCH_PPC 0
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 #define CONFIG_ACCOUNTING 0
 #define CONFIG_ANALYZER 0
 #define CONFIG_AV1_DECODER 1
 #define CONFIG_AV1_ENCODER 0
+#define CONFIG_AV1_HIGHBITDEPTH 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
@@ -32,32 +31,36 @@
 #define CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
-#define CONFIG_FILEOPTIONS 1
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
 #define CONFIG_GPROF 0
+#define CONFIG_HTB_TRELLIS 0
 #define CONFIG_INSPECTION 0
 #define CONFIG_INTERNAL_STATS 0
 #define CONFIG_INTER_STATS_ONLY 0
 #define CONFIG_LIBYUV 1
-#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_LPF_MASK 0
 #define CONFIG_MAX_DECODE_PROFILE 0
 #define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_MULTITHREAD 1
+#define CONFIG_NN_V2 0
 #define CONFIG_NORMAL_TILE_MODE 1
 #define CONFIG_OS_SUPPORT 1
 #define CONFIG_PIC 0
 #define CONFIG_RD_DEBUG 0
+#define CONFIG_REALTIME_ONLY 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_SHARED 0
 #define CONFIG_SHARP_SETTINGS 0
 #define CONFIG_SIZE_LIMIT 1
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_SPEED_STATS 0
-#define CONFIG_STATIC 1
+#define CONFIG_SUPERRES_IN_RECODE 1
+#define CONFIG_TUNE_VMAF 0
 #define CONFIG_WEBM_IO 1
 #define DECODE_HEIGHT_LIMIT 16384
 #define DECODE_WIDTH_LIMIT 16384
+#define FORCE_HIGHBITDEPTH_DECODING 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
 #define HAVE_DSPR2 0
diff --git a/config/arm64/config/aom_dsp_rtcd.h b/config/arm64/config/aom_dsp_rtcd.h
index 0b1a28a..e3fd306 100644
--- a/config/arm64/config/aom_dsp_rtcd.h
+++ b/config/arm64/config/aom_dsp_rtcd.h
@@ -26,7 +26,7 @@
 void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
 #define aom_blend_a64_hmask aom_blend_a64_hmask_neon
 
-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh);
 #define aom_blend_a64_mask aom_blend_a64_mask_c
 
 void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
@@ -39,7 +39,7 @@
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 #define aom_convolve8_vert aom_convolve8_vert_c
 
-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h);
 #define aom_convolve_copy aom_convolve_copy_c
 
 void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -362,25 +362,25 @@
 void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define aom_h_predictor_8x8 aom_h_predictor_8x8_neon
 
-void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd);
 #define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
 
 void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
 #define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_c
 
-void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd);
 #define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_c
 
 void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
 #define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_c
 
-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
 #define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_c
 
-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
 #define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_c
 
-void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd);
 #define aom_highbd_convolve_copy aom_highbd_convolve_copy_c
 
 void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -691,7 +691,7 @@
 void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
 #define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_c
 
-void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd);
 #define aom_highbd_lpf_horizontal_14_dual aom_highbd_lpf_horizontal_14_dual_c
 
 void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1036,8 +1036,8 @@
 void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
 #define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_c
 
-void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
-void aom_lowbd_blend_a64_d16_mask_neon(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_neon(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
 #define aom_lowbd_blend_a64_d16_mask aom_lowbd_blend_a64_d16_mask_neon
 
 void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/config/arm64/config/av1_rtcd.h b/config/arm64/config/av1_rtcd.h
index 8ed0faa..8a8b3ce 100644
--- a/config/arm64/config/av1_rtcd.h
+++ b/config/arm64/config/av1_rtcd.h
@@ -34,27 +34,51 @@
 struct NN_CONFIG;
 typedef struct NN_CONFIG NN_CONFIG;
 
+enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION);
+#if CONFIG_NN_V2
+enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS);
+struct NN_CONFIG_V2;
+typedef struct NN_CONFIG_V2 NN_CONFIG_V2;
+struct FC_LAYER;
+typedef struct FC_LAYER FC_LAYER;
+#endif  // CONFIG_NN_V2
+
+struct CNN_CONFIG;
+typedef struct CNN_CONFIG CNN_CONFIG;
+struct CNN_LAYER_CONFIG;
+typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG;
+struct CNN_THREAD_DATA;
+typedef struct CNN_THREAD_DATA CNN_THREAD_DATA;
+struct CNN_BRANCH_CONFIG;
+typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG;
+struct CNN_MULTI_OUT;
+typedef struct CNN_MULTI_OUT CNN_MULTI_OUT;
+
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+                                   int dst_stride, int alpha_q3, int bd);
+#endif
+
 typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
 
 typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
                                    int dst_stride, int alpha_q3);
 
-typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
-                                   int dst_stride, int alpha_q3, int bd);
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
-#define apply_selfguided_restoration apply_selfguided_restoration_c
+void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void av1_apply_selfguided_restoration_neon(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_neon
 
 void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
 #define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
@@ -66,42 +90,63 @@
 void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 #define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c
 
-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
+#define av1_calc_frame_error av1_calc_frame_error_c
+
+void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+#define av1_cnn_activate av1_cnn_activate_c
+
+void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+#define av1_cnn_add av1_cnn_add_c
+
+void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
+#define av1_cnn_batchnorm av1_cnn_batchnorm_c
+
+void av1_cnn_convolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step);
+#define av1_cnn_convolve av1_cnn_convolve_c
+
+void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+#define av1_cnn_deconvolve av1_cnn_deconvolve_c
+
+void av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+#define av1_cnn_predict av1_cnn_predict_c
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_copy_sr av1_convolve_2d_copy_sr_neon
 
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_scale av1_convolve_2d_scale_c
 
-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_sr av1_convolve_2d_sr_neon
 
 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 #define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
 
-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_x_sr av1_convolve_x_sr_neon
 
-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_neon
 
-void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon
 
-void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_neon
 
-void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_neon
 
-void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_neon
 
 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
@@ -131,13 +176,13 @@
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
 
-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_copy_sr av1_highbd_convolve_2d_copy_sr_c
 
-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c
 
-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_c
 
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
@@ -149,22 +194,22 @@
 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 #define av1_highbd_convolve_horiz_rs av1_highbd_convolve_horiz_rs_c
 
-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_c
 
-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
 
-void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
 
-void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
 
-void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
 
-void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
 
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
@@ -176,25 +221,25 @@
 void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
 
-void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
 
-void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
 
-void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
 
-void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x4 av1_highbd_inv_txfm_add_4x4_c
 
-void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
 
-void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
 
-void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x8 av1_highbd_inv_txfm_add_8x8_c
 
 void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
@@ -206,7 +251,7 @@
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c
 
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
 #define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_c
 
 void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
@@ -277,7 +322,10 @@
 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
                                  int sgr_params_idx, int bit_depth, int highbd);
-#define av1_selfguided_restoration av1_selfguided_restoration_c
+int av1_selfguided_restoration_neon(const uint8_t *dgd8, int width, int height,
+                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                 int sgr_params_idx, int bit_depth, int highbd);
+#define av1_selfguided_restoration av1_selfguided_restoration_neon
 
 void av1_upsample_intra_edge_c(uint8_t *p, int sz);
 #define av1_upsample_intra_edge av1_upsample_intra_edge_c
@@ -293,6 +341,14 @@
 void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
 #define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_neon
 
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+#define cdef_copy_rect8_16bit_to_16bit cdef_copy_rect8_16bit_to_16bit_neon
+
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+#define cdef_copy_rect8_8bit_to_16bit cdef_copy_rect8_8bit_to_16bit_neon
+
 void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
 void cdef_filter_block_neon(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
 #define cdef_filter_block cdef_filter_block_neon
@@ -325,25 +381,17 @@
 cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_neon(TX_SIZE tx_size);
 #define cfl_get_luma_subsampling_444_lbd cfl_get_luma_subsampling_444_lbd_neon
 
-void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-#define copy_rect8_16bit_to_16bit copy_rect8_16bit_to_16bit_neon
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_neon(TX_SIZE tx_size);
+#define cfl_get_predict_hbd_fn cfl_get_predict_hbd_fn_neon
 
-void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-#define copy_rect8_8bit_to_16bit copy_rect8_8bit_to_16bit_neon
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_neon(TX_SIZE tx_size);
+#define cfl_get_predict_lbd_fn cfl_get_predict_lbd_fn_neon
 
-cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
-cfl_predict_hbd_fn get_predict_hbd_fn_neon(TX_SIZE tx_size);
-#define get_predict_hbd_fn get_predict_hbd_fn_neon
-
-cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
-cfl_predict_lbd_fn get_predict_lbd_fn_neon(TX_SIZE tx_size);
-#define get_predict_lbd_fn get_predict_lbd_fn_neon
-
-cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
-cfl_subtract_average_fn get_subtract_average_fn_neon(TX_SIZE tx_size);
-#define get_subtract_average_fn get_subtract_average_fn_neon
+cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn cfl_get_subtract_average_fn_neon(TX_SIZE tx_size);
+#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_neon
 
 void av1_rtcd(void);
 
diff --git a/config/config/aom_version.h b/config/config/aom_version.h
index 2cb85f4..eaf9a0c 100644
--- a/config/config/aom_version.h
+++ b/config/config/aom_version.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,11 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#define VERSION_MAJOR 1
+#define VERSION_MAJOR 2
 #define VERSION_MINOR 0
 #define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED \
   ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.0.0"
-#define VERSION_STRING " v1.0.0"
+#define VERSION_STRING_NOSP "v2.0.0"
+#define VERSION_STRING " v2.0.0"
diff --git a/config/x86/config/aom_config.asm b/config/x86/config/aom_config.asm
index 222e3bf..96831b3 100644
--- a/config/x86/config/aom_config.asm
+++ b/config/x86/config/aom_config.asm
@@ -3,11 +3,11 @@
 %define ARCH_PPC 0
 %define ARCH_X86 1
 %define ARCH_X86_64 0
-%define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 %define CONFIG_ACCOUNTING 0
 %define CONFIG_ANALYZER 0
 %define CONFIG_AV1_DECODER 1
 %define CONFIG_AV1_ENCODER 0
+%define CONFIG_AV1_HIGHBITDEPTH 1
 %define CONFIG_BIG_ENDIAN 0
 %define CONFIG_BITSTREAM_DEBUG 0
 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0
@@ -19,32 +19,36 @@
 %define CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
 %define CONFIG_DIST_8X8 0
 %define CONFIG_ENTROPY_STATS 0
-%define CONFIG_FILEOPTIONS 1
 %define CONFIG_GCC 1
 %define CONFIG_GCOV 0
 %define CONFIG_GPROF 0
+%define CONFIG_HTB_TRELLIS 0
 %define CONFIG_INSPECTION 0
 %define CONFIG_INTERNAL_STATS 0
 %define CONFIG_INTER_STATS_ONLY 0
 %define CONFIG_LIBYUV 1
-%define CONFIG_LOWBITDEPTH 1
+%define CONFIG_LPF_MASK 0
 %define CONFIG_MAX_DECODE_PROFILE 0
 %define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_MULTITHREAD 1
+%define CONFIG_NN_V2 0
 %define CONFIG_NORMAL_TILE_MODE 1
 %define CONFIG_OS_SUPPORT 1
 %define CONFIG_PIC 1
 %define CONFIG_RD_DEBUG 0
+%define CONFIG_REALTIME_ONLY 0
 %define CONFIG_RUNTIME_CPU_DETECT 0
 %define CONFIG_SHARED 0
 %define CONFIG_SHARP_SETTINGS 0
 %define CONFIG_SIZE_LIMIT 1
 %define CONFIG_SPATIAL_RESAMPLING 1
 %define CONFIG_SPEED_STATS 0
-%define CONFIG_STATIC 1
+%define CONFIG_SUPERRES_IN_RECODE 1
+%define CONFIG_TUNE_VMAF 0
 %define CONFIG_WEBM_IO 1
 %define DECODE_HEIGHT_LIMIT 16384
 %define DECODE_WIDTH_LIMIT 16384
+%define FORCE_HIGHBITDEPTH_DECODING 0
 %define HAVE_AVX 0
 %define HAVE_AVX2 0
 %define HAVE_DSPR2 0
diff --git a/config/x86/config/aom_config.c b/config/x86/config/aom_config.c
index 9095ebc..3b08ae0 100644
--- a/config/x86/config/aom_config.c
+++ b/config/x86/config/aom_config.c
@@ -9,5 +9,5 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include "aom/aom_codec.h"
-static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/x86-linux.cmake\" -DCONFIG_AV1_ENCODER=0 -DCONFIG_PIC=1 -DCONFIG_RUNTIME_CPU_DETECT=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
+static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DCMAKE_TOOLCHAIN_FILE=\"../libaom/build/cmake/toolchains/x86-linux.cmake\" -DCONFIG_AV1_ENCODER=0 -DCONFIG_PIC=1 -DCONFIG_RUNTIME_CPU_DETECT=0 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
 const char *aom_codec_build_config(void) {return cfg;}
diff --git a/config/x86/config/aom_config.h b/config/x86/config/aom_config.h
index db2edbd..5fc0902 100644
--- a/config/x86/config/aom_config.h
+++ b/config/x86/config/aom_config.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -15,12 +15,11 @@
 #define ARCH_PPC 0
 #define ARCH_X86 1
 #define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 #define CONFIG_ACCOUNTING 0
 #define CONFIG_ANALYZER 0
 #define CONFIG_AV1_DECODER 1
 #define CONFIG_AV1_ENCODER 0
+#define CONFIG_AV1_HIGHBITDEPTH 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
@@ -32,32 +31,36 @@
 #define CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
-#define CONFIG_FILEOPTIONS 1
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
 #define CONFIG_GPROF 0
+#define CONFIG_HTB_TRELLIS 0
 #define CONFIG_INSPECTION 0
 #define CONFIG_INTERNAL_STATS 0
 #define CONFIG_INTER_STATS_ONLY 0
 #define CONFIG_LIBYUV 1
-#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_LPF_MASK 0
 #define CONFIG_MAX_DECODE_PROFILE 0
 #define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_MULTITHREAD 1
+#define CONFIG_NN_V2 0
 #define CONFIG_NORMAL_TILE_MODE 1
 #define CONFIG_OS_SUPPORT 1
 #define CONFIG_PIC 1
 #define CONFIG_RD_DEBUG 0
+#define CONFIG_REALTIME_ONLY 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_SHARED 0
 #define CONFIG_SHARP_SETTINGS 0
 #define CONFIG_SIZE_LIMIT 1
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_SPEED_STATS 0
-#define CONFIG_STATIC 1
+#define CONFIG_SUPERRES_IN_RECODE 1
+#define CONFIG_TUNE_VMAF 0
 #define CONFIG_WEBM_IO 1
 #define DECODE_HEIGHT_LIMIT 16384
 #define DECODE_WIDTH_LIMIT 16384
+#define FORCE_HIGHBITDEPTH_DECODING 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
 #define HAVE_DSPR2 0
diff --git a/config/x86/config/aom_dsp_rtcd.h b/config/x86/config/aom_dsp_rtcd.h
index f84f313..1e746e3 100644
--- a/config/x86/config/aom_dsp_rtcd.h
+++ b/config/x86/config/aom_dsp_rtcd.h
@@ -25,7 +25,7 @@
 void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
 #define aom_blend_a64_hmask aom_blend_a64_hmask_c
 
-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh);
 #define aom_blend_a64_mask aom_blend_a64_mask_c
 
 void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
@@ -41,8 +41,8 @@
 void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 #define aom_convolve8_vert aom_convolve8_vert_ssse3
 
-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h);
 #define aom_convolve_copy aom_convolve_copy_sse2
 
 void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -440,28 +440,28 @@
 void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define aom_h_predictor_8x8 aom_h_predictor_8x8_sse2
 
-void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd);
 #define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
 
 void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
 #define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_c
 
-void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd);
 #define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_c
 
 void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
 #define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_c
 
-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
 #define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_sse2
 
-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
 #define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_sse2
 
-void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd);
 #define aom_highbd_convolve_copy aom_highbd_convolve_copy_sse2
 
 void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -818,8 +818,8 @@
 void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
 #define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
 
-void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
-void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd);
 #define aom_highbd_lpf_horizontal_14_dual aom_highbd_lpf_horizontal_14_dual_sse2
 
 void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1188,7 +1188,7 @@
 void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
 #define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_sse2
 
-void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
 #define aom_lowbd_blend_a64_d16_mask aom_lowbd_blend_a64_d16_mask_c
 
 void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/config/x86/config/av1_rtcd.h b/config/x86/config/av1_rtcd.h
index f788933..1dce89b 100644
--- a/config/x86/config/av1_rtcd.h
+++ b/config/x86/config/av1_rtcd.h
@@ -34,27 +34,50 @@
 struct NN_CONFIG;
 typedef struct NN_CONFIG NN_CONFIG;
 
+enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION);
+#if CONFIG_NN_V2
+enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS);
+struct NN_CONFIG_V2;
+typedef struct NN_CONFIG_V2 NN_CONFIG_V2;
+struct FC_LAYER;
+typedef struct FC_LAYER FC_LAYER;
+#endif  // CONFIG_NN_V2
+
+struct CNN_CONFIG;
+typedef struct CNN_CONFIG CNN_CONFIG;
+struct CNN_LAYER_CONFIG;
+typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG;
+struct CNN_THREAD_DATA;
+typedef struct CNN_THREAD_DATA CNN_THREAD_DATA;
+struct CNN_BRANCH_CONFIG;
+typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG;
+struct CNN_MULTI_OUT;
+typedef struct CNN_MULTI_OUT CNN_MULTI_OUT;
+
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+                                   int dst_stride, int alpha_q3, int bd);
+#endif
+
 typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
 
 typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
                                    int dst_stride, int alpha_q3);
 
-typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
-                                   int dst_stride, int alpha_q3, int bd);
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
-#define apply_selfguided_restoration apply_selfguided_restoration_c
+void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_c
 
 void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
 #define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
@@ -66,43 +89,65 @@
 void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 #define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_ssse3
 
-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
+int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
+#define av1_calc_frame_error av1_calc_frame_error_sse2
+
+void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+#define av1_cnn_activate av1_cnn_activate_c
+
+void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+#define av1_cnn_add av1_cnn_add_c
+
+void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
+#define av1_cnn_batchnorm av1_cnn_batchnorm_c
+
+void av1_cnn_convolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step);
+#define av1_cnn_convolve av1_cnn_convolve_c
+
+void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+#define av1_cnn_deconvolve av1_cnn_deconvolve_c
+
+void av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+#define av1_cnn_predict av1_cnn_predict_c
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_copy_sr av1_convolve_2d_copy_sr_sse2
 
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_scale av1_convolve_2d_scale_c
 
-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_sr av1_convolve_2d_sr_sse2
 
 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 #define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
 
-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_x_sr av1_convolve_x_sr_sse2
 
-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_sse2
 
-void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_ssse3
 
-void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_sse2
 
-void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_sse2
 
-void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_sse2
 
 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
@@ -132,15 +177,15 @@
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
 
-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_copy_sr av1_highbd_convolve_2d_copy_sr_sse2
 
-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c
 
-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_ssse3
 
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
@@ -152,24 +197,24 @@
 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 #define av1_highbd_convolve_horiz_rs av1_highbd_convolve_horiz_rs_c
 
-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_ssse3
 
-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3
 
-void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
 
-void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
 
-void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
 
-void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
 
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
@@ -181,25 +226,25 @@
 void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
 
-void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
 
-void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
 
-void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
 
-void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x4 av1_highbd_inv_txfm_add_4x4_c
 
-void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
 
-void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
 
-void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x8 av1_highbd_inv_txfm_add_8x8_c
 
 void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
@@ -211,8 +256,8 @@
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c
 
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
-void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
 #define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_ssse3
 
 void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
@@ -297,6 +342,16 @@
 void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
 #define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_sse2
 
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+#define cdef_copy_rect8_16bit_to_16bit cdef_copy_rect8_16bit_to_16bit_ssse3
+
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+#define cdef_copy_rect8_8bit_to_16bit cdef_copy_rect8_8bit_to_16bit_ssse3
+
 void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
 void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
 void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
@@ -331,27 +386,17 @@
 cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
 #define cfl_get_luma_subsampling_444_lbd cfl_get_luma_subsampling_444_lbd_ssse3
 
-void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-#define copy_rect8_16bit_to_16bit copy_rect8_16bit_to_16bit_ssse3
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+#define cfl_get_predict_hbd_fn cfl_get_predict_hbd_fn_ssse3
 
-void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-#define copy_rect8_8bit_to_16bit copy_rect8_8bit_to_16bit_ssse3
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+#define cfl_get_predict_lbd_fn cfl_get_predict_lbd_fn_ssse3
 
-cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
-cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
-#define get_predict_hbd_fn get_predict_hbd_fn_ssse3
-
-cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
-cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
-#define get_predict_lbd_fn get_predict_lbd_fn_ssse3
-
-cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
-cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
-#define get_subtract_average_fn get_subtract_average_fn_sse2
+cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
+#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_sse2
 
 void av1_rtcd(void);
 
diff --git a/config/x86_64/config/aom_config.asm b/config/x86_64/config/aom_config.asm
index 43e7f74..7ad6ca1 100644
--- a/config/x86_64/config/aom_config.asm
+++ b/config/x86_64/config/aom_config.asm
@@ -3,11 +3,11 @@
 %define ARCH_PPC 0
 %define ARCH_X86 0
 %define ARCH_X86_64 1
-%define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 %define CONFIG_ACCOUNTING 0
 %define CONFIG_ANALYZER 0
 %define CONFIG_AV1_DECODER 1
 %define CONFIG_AV1_ENCODER 0
+%define CONFIG_AV1_HIGHBITDEPTH 1
 %define CONFIG_BIG_ENDIAN 0
 %define CONFIG_BITSTREAM_DEBUG 0
 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0
@@ -19,32 +19,36 @@
 %define CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
 %define CONFIG_DIST_8X8 0
 %define CONFIG_ENTROPY_STATS 0
-%define CONFIG_FILEOPTIONS 1
 %define CONFIG_GCC 1
 %define CONFIG_GCOV 0
 %define CONFIG_GPROF 0
+%define CONFIG_HTB_TRELLIS 0
 %define CONFIG_INSPECTION 0
 %define CONFIG_INTERNAL_STATS 0
 %define CONFIG_INTER_STATS_ONLY 0
 %define CONFIG_LIBYUV 1
-%define CONFIG_LOWBITDEPTH 1
+%define CONFIG_LPF_MASK 0
 %define CONFIG_MAX_DECODE_PROFILE 0
 %define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_MULTITHREAD 1
+%define CONFIG_NN_V2 0
 %define CONFIG_NORMAL_TILE_MODE 1
 %define CONFIG_OS_SUPPORT 1
 %define CONFIG_PIC 0
 %define CONFIG_RD_DEBUG 0
+%define CONFIG_REALTIME_ONLY 0
 %define CONFIG_RUNTIME_CPU_DETECT 0
 %define CONFIG_SHARED 0
 %define CONFIG_SHARP_SETTINGS 0
 %define CONFIG_SIZE_LIMIT 1
 %define CONFIG_SPATIAL_RESAMPLING 1
 %define CONFIG_SPEED_STATS 0
-%define CONFIG_STATIC 1
+%define CONFIG_SUPERRES_IN_RECODE 1
+%define CONFIG_TUNE_VMAF 0
 %define CONFIG_WEBM_IO 1
 %define DECODE_HEIGHT_LIMIT 16384
 %define DECODE_WIDTH_LIMIT 16384
+%define FORCE_HIGHBITDEPTH_DECODING 0
 %define HAVE_AVX 0
 %define HAVE_AVX2 0
 %define HAVE_DSPR2 0
diff --git a/config/x86_64/config/aom_config.c b/config/x86_64/config/aom_config.c
index 488df3a..8fa1e0b 100644
--- a/config/x86_64/config/aom_config.c
+++ b/config/x86_64/config/aom_config.c
@@ -9,5 +9,5 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include "aom/aom_codec.h"
-static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DAOM_TARGET_CPU=x86_64 -DCONFIG_AV1_ENCODER=0 -DCONFIG_RUNTIME_CPU_DETECT=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
+static const char* const cfg = "cmake ../libaom -G \"Unix Makefiles\" -DAOM_TARGET_CPU=x86_64 -DCONFIG_AV1_ENCODER=0 -DCONFIG_RUNTIME_CPU_DETECT=0 -DCONFIG_MAX_DECODE_PROFILE=0 -DCONFIG_NORMAL_TILE_MODE=1 -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=16384 -DDECODE_WIDTH_LIMIT=16384 -DENABLE_SSE4_1=0";
 const char *aom_codec_build_config(void) {return cfg;}
diff --git a/config/x86_64/config/aom_config.h b/config/x86_64/config/aom_config.h
index 610e8ca..ba5f73b 100644
--- a/config/x86_64/config/aom_config.h
+++ b/config/x86_64/config/aom_config.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -15,12 +15,11 @@
 #define ARCH_PPC 0
 #define ARCH_X86 0
 #define ARCH_X86_64 1
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 #define CONFIG_ACCOUNTING 0
 #define CONFIG_ANALYZER 0
 #define CONFIG_AV1_DECODER 1
 #define CONFIG_AV1_ENCODER 0
+#define CONFIG_AV1_HIGHBITDEPTH 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
@@ -32,32 +31,36 @@
 #define CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
-#define CONFIG_FILEOPTIONS 1
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
 #define CONFIG_GPROF 0
+#define CONFIG_HTB_TRELLIS 0
 #define CONFIG_INSPECTION 0
 #define CONFIG_INTERNAL_STATS 0
 #define CONFIG_INTER_STATS_ONLY 0
 #define CONFIG_LIBYUV 1
-#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_LPF_MASK 0
 #define CONFIG_MAX_DECODE_PROFILE 0
 #define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_MULTITHREAD 1
+#define CONFIG_NN_V2 0
 #define CONFIG_NORMAL_TILE_MODE 1
 #define CONFIG_OS_SUPPORT 1
 #define CONFIG_PIC 0
 #define CONFIG_RD_DEBUG 0
+#define CONFIG_REALTIME_ONLY 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_SHARED 0
 #define CONFIG_SHARP_SETTINGS 0
 #define CONFIG_SIZE_LIMIT 1
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_SPEED_STATS 0
-#define CONFIG_STATIC 1
+#define CONFIG_SUPERRES_IN_RECODE 1
+#define CONFIG_TUNE_VMAF 0
 #define CONFIG_WEBM_IO 1
 #define DECODE_HEIGHT_LIMIT 16384
 #define DECODE_WIDTH_LIMIT 16384
+#define FORCE_HIGHBITDEPTH_DECODING 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
 #define HAVE_DSPR2 0
diff --git a/config/x86_64/config/aom_dsp_rtcd.h b/config/x86_64/config/aom_dsp_rtcd.h
index f84f313..1e746e3 100644
--- a/config/x86_64/config/aom_dsp_rtcd.h
+++ b/config/x86_64/config/aom_dsp_rtcd.h
@@ -25,7 +25,7 @@
 void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
 #define aom_blend_a64_hmask aom_blend_a64_hmask_c
 
-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh);
 #define aom_blend_a64_mask aom_blend_a64_mask_c
 
 void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
@@ -41,8 +41,8 @@
 void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 #define aom_convolve8_vert aom_convolve8_vert_ssse3
 
-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h);
 #define aom_convolve_copy aom_convolve_copy_sse2
 
 void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -440,28 +440,28 @@
 void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define aom_h_predictor_8x8 aom_h_predictor_8x8_sse2
 
-void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd);
 #define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
 
 void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
 #define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_c
 
-void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd);
 #define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_c
 
 void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
 #define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_c
 
-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
 #define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_sse2
 
-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
 #define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_sse2
 
-void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd);
 #define aom_highbd_convolve_copy aom_highbd_convolve_copy_sse2
 
 void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -818,8 +818,8 @@
 void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
 #define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
 
-void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
-void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd);
 #define aom_highbd_lpf_horizontal_14_dual aom_highbd_lpf_horizontal_14_dual_sse2
 
 void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1188,7 +1188,7 @@
 void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
 #define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_sse2
 
-void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params);
 #define aom_lowbd_blend_a64_d16_mask aom_lowbd_blend_a64_d16_mask_c
 
 void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/config/x86_64/config/av1_rtcd.h b/config/x86_64/config/av1_rtcd.h
index 84673ba..4596ee2 100644
--- a/config/x86_64/config/av1_rtcd.h
+++ b/config/x86_64/config/av1_rtcd.h
@@ -34,27 +34,50 @@
 struct NN_CONFIG;
 typedef struct NN_CONFIG NN_CONFIG;
 
+enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION);
+#if CONFIG_NN_V2
+enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS);
+struct NN_CONFIG_V2;
+typedef struct NN_CONFIG_V2 NN_CONFIG_V2;
+struct FC_LAYER;
+typedef struct FC_LAYER FC_LAYER;
+#endif  // CONFIG_NN_V2
+
+struct CNN_CONFIG;
+typedef struct CNN_CONFIG CNN_CONFIG;
+struct CNN_LAYER_CONFIG;
+typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG;
+struct CNN_THREAD_DATA;
+typedef struct CNN_THREAD_DATA CNN_THREAD_DATA;
+struct CNN_BRANCH_CONFIG;
+typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG;
+struct CNN_MULTI_OUT;
+typedef struct CNN_MULTI_OUT CNN_MULTI_OUT;
+
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+                                   int dst_stride, int alpha_q3, int bd);
+#endif
+
 typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
 
 typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
                                    int dst_stride, int alpha_q3);
 
-typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
-                                   int dst_stride, int alpha_q3, int bd);
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
-#define apply_selfguided_restoration apply_selfguided_restoration_c
+void av1_apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+#define av1_apply_selfguided_restoration av1_apply_selfguided_restoration_c
 
 void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
 #define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
@@ -66,43 +89,65 @@
 void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 #define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_ssse3
 
-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
+int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride);
+#define av1_calc_frame_error av1_calc_frame_error_sse2
+
+void av1_cnn_activate_c( float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation);
+#define av1_cnn_activate av1_cnn_activate_c
+
+void av1_cnn_add_c( float **input, int channels, int width, int height, int stride, const float **add);
+#define av1_cnn_add av1_cnn_add_c
+
+void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std);
+#define av1_cnn_batchnorm av1_cnn_batchnorm_c
+
+void av1_cnn_convolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step);
+#define av1_cnn_convolve av1_cnn_convolve_c
+
+void av1_cnn_deconvolve_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride);
+#define av1_cnn_deconvolve av1_cnn_deconvolve_c
+
+void av1_cnn_predict_c( const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct);
+#define av1_cnn_predict av1_cnn_predict_c
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_copy_sr av1_convolve_2d_copy_sr_sse2
 
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_scale av1_convolve_2d_scale_c
 
-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_sr av1_convolve_2d_sr_sse2
 
 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 #define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
 
-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_x_sr av1_convolve_x_sr_sse2
 
-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_sse2
 
-void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_ssse3
 
-void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_sse2
 
-void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_sse2
 
-void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 #define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_sse2
 
 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
@@ -135,15 +180,15 @@
 void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2
 
-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_copy_sr av1_highbd_convolve_2d_copy_sr_sse2
 
-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c
 
-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_ssse3
 
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
@@ -155,24 +200,24 @@
 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 #define av1_highbd_convolve_horiz_rs av1_highbd_convolve_horiz_rs_c
 
-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_ssse3
 
-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3
 
-void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
 
-void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
 
-void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
 
-void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
 
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
@@ -184,25 +229,25 @@
 void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
 
-void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
 
-void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
 
-void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
 
-void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x4 av1_highbd_inv_txfm_add_4x4_c
 
-void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
 
-void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
 
-void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x8 av1_highbd_inv_txfm_add_8x8_c
 
 void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
@@ -214,8 +259,8 @@
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c
 
-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
-void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd);
 #define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_ssse3
 
 void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
@@ -300,6 +345,16 @@
 void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
 #define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_sse2
 
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+#define cdef_copy_rect8_16bit_to_16bit cdef_copy_rect8_16bit_to_16bit_ssse3
+
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+#define cdef_copy_rect8_8bit_to_16bit cdef_copy_rect8_8bit_to_16bit_ssse3
+
 void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
 void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
 void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift);
@@ -334,27 +389,17 @@
 cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
 #define cfl_get_luma_subsampling_444_lbd cfl_get_luma_subsampling_444_lbd_ssse3
 
-void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
-#define copy_rect8_16bit_to_16bit copy_rect8_16bit_to_16bit_ssse3
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+#define cfl_get_predict_hbd_fn cfl_get_predict_hbd_fn_ssse3
 
-void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
-#define copy_rect8_8bit_to_16bit copy_rect8_8bit_to_16bit_ssse3
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+#define cfl_get_predict_lbd_fn cfl_get_predict_lbd_fn_ssse3
 
-cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
-cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
-#define get_predict_hbd_fn get_predict_hbd_fn_ssse3
-
-cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
-cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
-#define get_predict_lbd_fn get_predict_lbd_fn_ssse3
-
-cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
-cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
-#define get_subtract_average_fn get_subtract_average_fn_sse2
+cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
+#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_sse2
 
 void av1_rtcd(void);
 
diff --git a/libaom/.clang-format b/libaom/.clang-format
index e76a526..a378820 100644
--- a/libaom/.clang-format
+++ b/libaom/.clang-format
@@ -1,7 +1,7 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Google
-# Generated with clang-format 5.0.0
+# Generated with clang-format 7.0.1
 AccessModifierOffset: -1
 AlignAfterOpenBracket: Align
 AlignConsecutiveAssignments: false
@@ -12,7 +12,6 @@
 AllowAllParametersOfDeclarationOnNextLine: true
 AllowShortBlocksOnASingleLine: false
 AllowShortCaseLabelsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
 AllowShortIfStatementsOnASingleLine: true
 AllowShortLoopsOnASingleLine: true
 AlwaysBreakAfterDefinitionReturnType: None
@@ -30,6 +29,7 @@
   AfterObjCDeclaration: false
   AfterStruct:     false
   AfterUnion:      false
+  AfterExternBlock: false
   BeforeCatch:     false
   BeforeElse:      false
   IndentBraces:    false
@@ -39,6 +39,7 @@
 BreakBeforeBinaryOperators: None
 BreakBeforeBraces: Attach
 BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
 BreakConstructorInitializers: BeforeColon
@@ -59,7 +60,10 @@
   - foreach
   - Q_FOREACH
   - BOOST_FOREACH
+IncludeBlocks:   Preserve
 IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
   - Regex:           '^<.*\.h>'
     Priority:        1
   - Regex:           '^<.*'
@@ -68,6 +72,7 @@
     Priority:        3
 IncludeIsMainRegex: '([-_](test|unittest))?$'
 IndentCaseLabels: true
+IndentPPDirectives: None
 IndentWidth:     2
 IndentWrappedFunctionNames: false
 JavaScriptQuotes: Leave
@@ -77,6 +82,7 @@
 MacroBlockEnd:   ''
 MaxEmptyLinesToKeep: 1
 NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
 ObjCBlockIndentWidth: 2
 ObjCSpaceAfterProperty: false
 ObjCSpaceBeforeProtocolList: false
@@ -85,20 +91,53 @@
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
 PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+  - Language:        TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
 ReflowComments:  true
 SortIncludes:    false
 SortUsingDeclarations: true
 SpaceAfterCStyleCast: false
 SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
 SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
 SpaceInEmptyParentheses: false
 SpacesBeforeTrailingComments: 2
 SpacesInAngles:  false
-SpacesInContainerLiterals: true
+SpacesInContainerLiterals: false
 SpacesInCStyleCastParentheses: false
 SpacesInParentheses: false
 SpacesInSquareBrackets: false
diff --git a/libaom/.cmake-format.py b/libaom/.cmake-format.py
index aa7354c..7b0e4f0 100644
--- a/libaom/.cmake-format.py
+++ b/libaom/.cmake-format.py
@@ -1,13 +1,11 @@
-# Generated with cmake-format 0.3.6
+# Generated with cmake-format 0.5.1
 # How wide to allow formatted cmake files
 line_width = 80
 
 # How many spaces to tab for indent
 tab_size = 2
 
-# If arglists are longer than this, break them always. This introduces some
-# interesting effects with complicated 'if' statements. However, we want file
-# lists to look reasonable. Try to strike a balance.
+# If arglists are longer than this, break them always
 max_subargs_per_line = 10
 
 # If true, separate flow control names from their parentheses with a space
@@ -21,10 +19,10 @@
 dangle_parens = False
 
 # What character to use for bulleted lists
-bullet_char = u'*'
+bullet_char = '*'
 
 # What character to use as punctuation after numerals in an enumerated list
-enum_char = u'.'
+enum_char = '.'
 
 # What style line endings to use in the output.
 line_ending = u'unix'
@@ -32,6 +30,9 @@
 # Format command names consistently as 'lower' or 'upper' case
 command_case = u'lower'
 
+# Format keywords consistently as 'lower' or 'upper' case
+keyword_case = u'unchanged'
+
 # Specify structure for custom cmake functions
 additional_commands = {
   "foo": {
@@ -46,3 +47,56 @@
     }
   }
 }
+
+# A list of command names which should always be wrapped
+always_wrap = []
+
+# Specify the order of wrapping algorithms during successive reflow attempts
+algorithm_order = [0, 1, 2, 3, 4]
+
+# If true, the argument lists which are known to be sortable will be sorted
+# lexicographicall
+autosort = False
+
+# enable comment markup parsing and reflow
+enable_markup = True
+
+# If comment markup is enabled, don't reflow the first comment block in
+# eachlistfile. Use this to preserve formatting of your
+# copyright/licensestatements.
+first_comment_is_literal = False
+
+# If comment markup is enabled, don't reflow any comment block which matchesthis
+# (regex) pattern. Default is `None` (disabled).
+literal_comment_pattern = None
+
+# Regular expression to match preformat fences in comments
+# default=r'^\s*([`~]{3}[`~]*)(.*)$'
+fence_pattern = u'^\\s*([`~]{3}[`~]*)(.*)$'
+
+# Regular expression to match rulers in comments
+# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'
+ruler_pattern = u'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'
+
+# If true, emit the unicode byte-order mark (BOM) at the start of the file
+emit_byteorder_mark = False
+
+# If a comment line starts with at least this many consecutive hash characters,
+# then don't lstrip() them off. This allows for lazy hash rulers where the first
+# hash char is not separated by space
+hashruler_min_length = 10
+
+# If true, then insert a space between the first hash char and remaining hash
+# chars in a hash ruler, and normalize it's length to fill the column
+canonicalize_hashrulers = True
+
+# Specify the encoding of the input file. Defaults to utf-8.
+input_encoding = u'utf-8'
+
+# Specify the encoding of the output file. Defaults to utf-8. Note that cmake
+# only claims to support utf-8 so be careful when using anything else
+output_encoding = u'utf-8'
+
+# A dictionary containing any per-command configuration overrides. Currently
+# only `command_case` is supported.
+per_command = {}
diff --git a/libaom/.mailmap b/libaom/.mailmap
index bbe4525..30fae4d 100644
--- a/libaom/.mailmap
+++ b/libaom/.mailmap
@@ -3,32 +3,89 @@
 Aâ„“ex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Andrey Norkin <anorkin@netflix.com>
+Angie Chiang <angiebird@google.com>
+Arild Fuldseth <arilfuld@cisco.com> <arild.fuldseth@gmail.com>
+Arild Fuldseth <arilfuld@cisco.com> <arilfuld@cisco.com>
+Bohan Li <bohanli@google.com>
+Changjun Yang <changjun.yang@intel.com>
+Chi Yo Tsai <chiyotsai@google.com>
+Chi Yo Tsai <chiyotsai@google.com> <chiyotsai@dhcp-100-106-128-213.corp.google.com>
+Chm <chm@rock-chips.com>
+Damon Shen <yjshen@google.com>
+Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
 Deb Mukherjee <debargha@google.com>
+Elliott Karpilovsky <elliottk@google.com>
+Emil Keyder <emilkeyder@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
-Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
+Frederic Barbier <frederic.barbier@allegrodvt.com> <fbarbier.contact@gmail.com>
+Fyodor Kyslov <kyslov@google.com>
+Grant Hsu <grant.hsu@cidana.com> <grant.hsu@gmail.com>
+Guillaume Martres <smarter@ubuntu.com>
+Guillaume Martres <smarter@ubuntu.com> <gmartres@google.com>
+Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
+Guillaume Martres <smarter@ubuntu.com> <gmartres@mozilla.com>
 Hangyu Kuang <hkuang@google.com>
 Hui Su <huisu@google.com>
+Iole Moccagatta <iole.moccagatta@gmail.com>
 Jacky Chen <jackychen@google.com>
+James Zern <jzern@google.com> <jzern@google.cOm>
+Jean-Marc Valin <jmvalin@jmvalin.ca> <jmvalin@mozilla.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
+Logan Goldberg <logangw@google.com>
+Luc Trudeau <luc@trud.ca>
+Luc Trudeau <luc@trud.ca> <ltrudeau@mozilla.com>
 Marco Paniconi <marpan@google.com>
 Marco Paniconi <marpan@google.com> <marpan@chromium.org>
+Michael Bebenita <mbebenita@gmail.com> <mbebenita@mozilla.com>
+Michael Horowitz <mhoro@webrtc.org> <mhoro@google.com>
+Mingliang Chen <mlchen@google.com>
+Monty Montgomery <cmontgomery@mozilla.com>
+Nathan E. Egge <negge@mozilla.com>
+Nathan E. Egge <negge@mozilla.com> <negge@dgql.org>
 Pascal Massimino <pascal.massimino@gmail.com>
+Pascal Massimino <pascal.massimino@gmail.com> <skal@google.com>
 Paul Wilkins <paulwilkins@google.com>
+Peng Bin <binpengsmail@gmail.com>
+Peng Bin <binpengsmail@gmail.com> <pengbin@kingsoft.com>
+Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
+Remya Prakasan <remya.prakasan@ittiam.com>
+Roger Zhou <youzhou@microsoft.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
+Ryan Lei <ryan.z.lei@intel.com>
+Ryan Lei <ryan.z.lei@intel.com> <ryan.lei@intel.com>
+Ryan Lei <ryan.z.lei@intel.com> <zlei3@ZLEI3-DESK.amr.corp.intel.com>
+Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
+Sai Deng <sdeng@google.com>
 Sami Pietilä <samipietila@google.com>
 Sarah Parker <sarahparker@google.com>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
+Thomas Davies Thomas <thdavies@cisco.com>
+Timothy B. Terriberry <tterribe@xiph.org>
+Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
 Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
+Tristan Matthews <tmatth@videolan.org> <le.businessman@gmail.com>
+Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
+Wei-Ting Lin <weitinglin@google.com>
+Wei-Ting Lin <weitinglin@google.com> <weitingco@gmail.com>
+Wenyao Liu <wenyao.liu@cidana.com>
+Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
+Yaowu Xu <yaowu@google.com> <Yaowu Xu>
+Yaowu Xu <yaowu@google.com> <yaowu.google.com>
+Zhipin Deng <zhipin.deng@intel.com>
+Zoe Liu <zoeliu@gmail.com> <zoeliu@google.com>
diff --git a/libaom/AUTHORS b/libaom/AUTHORS
index 95c3c8b..f61026f 100644
--- a/libaom/AUTHORS
+++ b/libaom/AUTHORS
@@ -1,13 +1,16 @@
 # This file is automatically generated from the git commit history
 # by tools/gen_authors.sh.
 
+Aamir Anis <aanis@google.com>
 Aaron Watry <awatry@gmail.com>
+Aasaipriya <aasaipriya.c@ittiam.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
-Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
-Aâ„“ex Converse <aconverse@google.com>
 Ahmad Sharif <asharif@google.com>
+Akshata Jadhav <akshata.jadhav@ittiam.com>
+Alexander Bokov <alexanderbokov@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
+Aâ„“ex Converse <aconverse@google.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@@ -16,39 +19,82 @@
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
 Andrew Russell <anrussell@google.com>
+Andrey Norkin <anorkin@netflix.com>
 Angie Chiang <angiebird@google.com>
+Aniket Dhok <aniket.dhok@ittiam.com>
+Ankur Saxena <ankurs@nvidia.com>
+Arild Fuldseth <arilfuld@cisco.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
+Bohan Li <bohanli@google.com>
+Brennan Shacklett <bshacklett@mozilla.com>
 Brion Vibber <bvibber@wikimedia.org>
-changjun.yang <changjun.yang@intel.com>
+Bruno Berthier <bruno.berthier@allegrodvt.com>
+Changjun Yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
-chm <chm@rock-chips.com>
+Cheng Chen <chengchen@google.com>
+Cherma Rajan A <cherma.rajan@ittiam.com>
+Chi Yo Tsai <chiyotsai@google.com>
+Chm <chm@rock-chips.com>
 Christian Duvivier <cduvivier@google.com>
+Cyril Concolato <cconcolato@netflix.com>
+Dake He <dkhe@google.com>
+Damon Shen <yjshen@google.com>
+Dandan Ding <vickyddding@gmail.com>
+Daniele Castagna <dcastagna@chromium.org>
 Daniel Kang <ddkang@google.com>
+Daniel Max Valenzuela <daniel.vt@samsung.com>
+Danil Chapovalov <danilchap@google.com>
+David Barker <david.barker@argondesign.com>
+David Major <dmajor@mozilla.com>
+David Michael Barr <b@rr-dav.id.au>
+David Turner <david.turner@argondesign.com>
 Deb Mukherjee <debargha@google.com>
+Deepa K G <deepa.kg@ittiam.com>
+Deng <zhipin.deng@intel.com>
+Di Chen <chendixi@google.com>
 Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
+Dominic Symes <dominic.symes@arm.com>
 Dragan Mrdjan <dmrdjan@mips.com>
 Ed Baker <edward.baker@intel.com>
+Edward Hervey <edward@centricular.com>
 Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Elliott Karpilovsky <elliottk@google.com>
+Emil Keyder <emilkeyder@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com>
 Fabio Pedretti <fabio.ped@libero.it>
+Fangwen Fu <fangwen.fu@intel.com>
+Fergus Simpson <afergs@google.com>
+Frank Bossen <fbossen@gmail.com>
 Frank Galligan <fgalligan@google.com>
+Frederic Barbier <frederic.barbier@allegrodvt.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
+Fyodor Kyslov <kyslov@google.com>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Geza Lore <gezalore@gmail.com>
 Ghislain MARY <ghislainmary2@gmail.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
-Guillaume Martres <gmartres@google.com>
+Grant Hsu <grant.hsu@cidana.com>
+Guillaume Martres <smarter@ubuntu.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
+Hamsalekha S <hamsalekha.s@ittiam.com>
 Hangyu Kuang <hkuang@google.com>
 Hanno Böck <hanno@hboeck.de>
+Harish Mahendrakar <harish.mahendrakar@ittiam.com>
 Henrik Lundin <hlundin@google.com>
+Hien Ho <hienho@google.com>
 Hui Su <huisu@google.com>
+Ilie Halip <ilie.halip@gmail.com>
+Ilya Brailovskiy <brailovs@lab126.com>
+Imdad Sardharwalla <imdad.sardharwalla@argondesign.com>
+iole moccagatta <iole.moccagatta@gmail.com>
+Ivan Krasin <krasin@chromium.org>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
+Jack Haughton <jack.haughton@argondesign.com>
 Jacky Chen <jackychen@google.com>
 James Berry <jamesberry@google.com>
 James Yu <james.yu@linaro.org>
@@ -56,29 +102,42 @@
 Jan Gerber <j@mailb.org>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Janne Salonen <jsalonen@google.com>
-Jean-Marc Valin <jmvalin@jmvalin.ca>
+Jayasanker J <jayasanker.j@ittiam.com>
+Jean-Marc Valin <jmvalin@mozilla.com>
+Jean-Yves Avenard <jyavenard@mozilla.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
+Jerome Jiang <jianj@google.com>
 Jia Jia <jia.jia@linaro.org>
 Jian Zhou <zhoujian@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Jingning Han <jingning@google.com>
+Joe Young <joeyoung@google.com>
 Joey Parrish <joeyparrish@google.com>
-Johann Koenig <johannkoenig@chromium.org>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Johnny Klonaris <google@jawknee.com>
 John Stark <jhnstrk@gmail.com>
+Jonathan Matthews <jonathan.matthews@argondesign.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
 Julia Robson <juliamrobson@gmail.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
+Katsuhisa Yuasa <berupon@gmail.com>
 KO Myung-Hun <komh@chollian.net>
+Krishna Malladi <kmalladi@google.com>
+Kyle Siefring <kylesiefring@gmail.com>
+Larisa Markeeva <lmarkeeva@google.com>
 Lawrence Velázquez <larryv@macports.org>
+Lester Lu <kslu@google.com>
+Linfeng Zhang <linfengz@google.com>
+Logan Goldberg <logangw@google.com>
+Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
+Luc Trudeau <ltrudeau@mozilla.com>
 Makoto Kato <makoto.kt@gmail.com>
 Mans Rullgard <mans@mansr.com>
 Marco Paniconi <marpan@google.com>
@@ -86,40 +145,78 @@
 Martin Ettl <ettl.martin78@googlemail.com>
 Martin Storsjo <martin@martin.st>
 Matthew Heaney <matthewjheaney@chromium.org>
+Matthieu Vaudano <matthieu.vaudano@allegrodvt.com>
+Mattias Hansson <mattias.hansson@arm.com>
+Maxym Dmytrychenko <maxim.d33@gmail.com>
+Michael Bebenita <mbebenita@mozilla.com>
+Michael Horowitz <mhoro@webrtc.org>
 Michael Kohler <michaelkohler@live.com>
+Michelle Findlay-Olynyk <mfo@google.com>
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
 Minghai Shang <minghai@google.com>
+Mingliang Chen <mlchen@google.com>
+Mirko Bonadei <mbonadei@google.com>
+Monty Montgomery <cmontgomery@mozilla.com>
 Morton Jonuschat <yabawock@gmail.com>
-Nathan E. Egge <negge@dgql.org>
+Mufaddal Chakera <mufaddal.chakera@ittiam.com>
+Nathan E. Egge <negge@mozilla.com>
+Neil Birkbeck <birkbeck@google.com>
 Nico Weber <thakis@chromium.org>
+Nithya V S <nithya.vs@ittiam.com>
+Ola Hugosson <ola.hugosson@arm.com>
+Oleg Nalivayko <o13g86@gmail.com>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
 Patrik Westin <patrik.westin@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
+Pavel Frolov <pavel.frolov@vicuesoft.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
+Peng Bin <binpengsmail@gmail.com>
 Pengchong Jin <pengchong@google.com>
-Peter de Rivaz <peter.derivaz@argondesign.com>
+Peter Boström <pbos@google.com>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
+Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
+Ravi Chaudhary <ravi.chaudhary@ittiam.com>
+Remya Prakasan <remya.prakasan@ittiam.com>
+Remy Foray <remy.foray@allegrodvt.com>
 Rob Bradford <rob@linux.intel.com>
+Robert-André Mauchin <zebob.m@gmail.com>
+RogerZhou <youzhou@microsoft.com>
+Rohit Athavale <rathaval@xilinx.com>
 Ronald S. Bultje <rsbultje@gmail.com>
+Rostislav Pehlivanov <rpehlivanov@mozilla.com>
+Ruiling Song <ruiling.song@intel.com>
 Rui Ueyama <ruiu@google.com>
+Rupert Swarbrick <rupert.swarbrick@argondesign.com>
+Ryan Lei <ryan.lei@intel.com>
+Ryan Overbeck <rover@google.com>
+Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
+Sai Deng <sdeng@google.com>
 Sami Pietilä <samipietila@google.com>
+Sarah Parker <sarahparker@google.com>
 Sasi Inguva <isasi@google.com>
+Satish Kumar Suman <satish.suman@ittiam.com>
 Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
+Sean DuBois <sean@siobud.com>
 Sean McGovern <gseanmcg@gmail.com>
+Sean Purser-Haskell <seanhaskell@google.com>
+Sebastien Alaiwan <sebastien.alaiwan@allegrodvt.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
 Shunyao Li <shunyaoli@google.com>
+SmilingWolf <lupo996@gmail.com>
+Soo-Chul Han <shan@vidyo.com>
+Stanislav Vitvitskyy <vitvitskyy@google.com>
 Stefan Holmer <holmer@google.com>
 Steinar Midtskogen <stemidts@cisco.com>
 Suman Sunkara <sunkaras@google.com>
@@ -127,18 +224,37 @@
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
 Tao Bai <michaelbai@chromium.org>
+Tarek AMARA <amatarek@justin.tv>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
 Thomas Daede <tdaede@mozilla.com>
-Thomas Davies <thdavies@cisco.com>
-Thomas <thdavies@cisco.com>
+Thomas Davies Thomas <thdavies@cisco.com>
 Tim Kopp <tkopp@google.com>
 Timothy B. Terriberry <tterribe@xiph.org>
+Timo Witte <timo.witte@gmail.com>
+Todd Nguyen <toddnguyen@google.com>
+Tom Anderson <thomasanderson@google.com>
 Tom Finegan <tomfinegan@google.com>
-Tristan Matthews <le.businessman@gmail.com>
 Tristan Matthews <tmatth@videolan.org>
+Umang Saini <umang.saini@ittiam.com>
+Urvang Joshi <urvang@google.com>
+Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
+Victoria Zhislina <niva213@gmail.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
+Vishesh <vishesh.garg@ittiam.com>
+Wan-Teh Chang <wtc@google.com>
+Wei-Ting Lin <weitinglin@google.com>
+Wenyao Liu <wenyao.liu@cidana.com>
+Xing Jin <ddvfinite@gmail.com>
+Xin Zhao <xinzzhao@tencent.com>
+Yaowu Xu <yaowu.google.com>
 Yaowu Xu <yaowu@google.com>
+Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
+Yue Chen <yuec@google.com>
 Yunqing Wang <yunqingwang@google.com>
-Zoe Liu <zoeliu@google.com>
+Yury Gitman <yuryg@google.com>
+Yushin Cho <ycho@mozilla.com>
+Zhijie Yang <zhijie.yang@broadcom.com>
+zhipin deng <zhipin.deng@intel.com>
+Zoe Liu <zoeliu@gmail.com>
diff --git a/libaom/CHANGELOG b/libaom/CHANGELOG
index d84aa02..9536707 100644
--- a/libaom/CHANGELOG
+++ b/libaom/CHANGELOG
@@ -1,3 +1,20 @@
+2020-05-07 v2.0.0 "Applejack"
+  First official release of libaom.
+  This release includes new real-time mode and SVC support.
+
+  - Upgrading:
+    AOM_SET_POSTPROC, AOM_CODEC_CAP_POSTPROC and AOM_CODEC_USE_POSTPROC are
+    removed.
+
+    AOM_SET_DBG_* is removed.
+
+    Multi-resolution encoding is removed.
+
+    put_frame and put_slice callbacks are removed.
+
+  - Enhancements:
+    Full-sweep document update for codec controls.
+
 2018-06-28 v1.0.0
   AOMedia Codec Workgroup Approved version 1.0
 
diff --git a/libaom/CMakeLists.txt b/libaom/CMakeLists.txt
index 2c35a0f..2ef0863 100644
--- a/libaom/CMakeLists.txt
+++ b/libaom/CMakeLists.txt
@@ -9,28 +9,34 @@
 # can obtain it at www.aomedia.org/license/patent.
 #
 cmake_minimum_required(VERSION 3.5)
+project(AOM C CXX)
 
 if(NOT EMSCRIPTEN)
-  if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release"
-        CACHE "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" STRING
+  if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+    set(CMAKE_BUILD_TYPE
+        "Release"
+        CACHE STRING "Build type: Debug, Release, RelWithDebInfo or MinSizeRel"
               FORCE)
   endif()
 endif()
 
-project(AOM C CXX)
-
 set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
 set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
 
 if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
-  message(FATAL_ERROR
-            "Building from within the aom source tree is not supported.\n"
-            "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
-            "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n"
-            "And re-run CMake from the aom_build directory.")
+  message(
+    FATAL_ERROR "Building from within the aom source tree is not supported.\n"
+                "Hint: Run these commands\n"
+                "$ rm -rf CMakeCache.txt CMakeFiles\n"
+                "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n"
+                "And re-run CMake from the aom_build directory.")
 endif()
 
+# Updating version info.
+# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+set(SO_VERSION 2)
+set(SO_FILE_VERSION 2.0.0)
+
 include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
 include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake")
 include("${AOM_ROOT}/aom_mem/aom_mem.cmake")
@@ -113,6 +119,7 @@
             "${AOM_ROOT}/aom/aomcx.h"
             "${AOM_ROOT}/aom/aomdx.h"
             "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
+            "${AOM_ROOT}/aom/internal/aom_image_internal.h"
             "${AOM_ROOT}/aom/src/aom_codec.c"
             "${AOM_ROOT}/aom/src/aom_decoder.c"
             "${AOM_ROOT}/aom/src/aom_encoder.c"
@@ -176,15 +183,30 @@
                            -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
                            -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
                            "${AOM_ROOT}/build/cmake/version.cmake"
-                   COMMENT "Writing aom_version.h" VERBATIM)
+                   COMMENT "Writing aom_version.h"
+                   VERBATIM)
 
 add_custom_target(aom_version_check
-                  COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                  COMMAND ${CMAKE_COMMAND}
+                          -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
                           -DAOM_ROOT=${AOM_ROOT}
                           -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
                           -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
                           "${AOM_ROOT}/build/cmake/version.cmake"
-                  COMMENT "Updating version info if necessary." VERBATIM)
+                  COMMENT "Updating version info if necessary."
+                  VERBATIM)
+
+if(BUILD_SHARED_LIBS AND NOT MSVC)
+  # Generate version file immediately for non-MSVC shared builds: The version
+  # string is needed for the aom target.
+  execute_process(COMMAND ${CMAKE_COMMAND}
+                          -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                          -DAOM_ROOT=${AOM_ROOT}
+                          -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+                          -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+                          "${AOM_ROOT}/build/cmake/version.cmake")
+endif()
+
 add_dependencies(aom_version aom_version_check)
 
 # TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd
@@ -206,14 +228,46 @@
   add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES})
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_encoder_stats)
 endif()
+
 add_library(aom ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>)
+if(BUILD_SHARED_LIBS)
+  add_library(aom_static STATIC ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>)
+  set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom)
+
+  if(NOT MSVC)
+    # Extract version string and set VERSION/SOVERSION for the aom target.
+    extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h"
+                           aom_version_triple)
+
+    # Strip any trailing version information, if present.
+    string(FIND "${aom_version_triple}" "-" dash_pos)
+    if(NOT dash_pos EQUAL -1)
+      string(SUBSTRING "${aom_version_triple}" 0 ${dash_pos} aom_version_triple)
+    endif()
+
+    # cmake-format: off
+    # VERSION is embedded in the .so file name.
+    # libaom.so -> libaom.so.SOVERSION
+    # libaom.so.SOVERSION -> libaom.so.VERSION
+    # libaom.so.VERSION
+    # cmake-format: on
+    set_target_properties(aom PROPERTIES SOVERSION ${SO_VERSION})
+    set_target_properties(aom PROPERTIES VERSION ${SO_FILE_VERSION})
+  endif()
+endif()
 
 if(NOT MSVC AND NOT APPLE)
   target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m)
+  if(BUILD_SHARED_LIBS)
+    target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} m)
+  endif()
 endif()
 
 # List of object and static library targets.
 set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom)
+if(BUILD_SHARED_LIBS)
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_static)
+endif()
 
 # Setup dependencies.
 setup_aom_dsp_targets()
@@ -236,7 +290,7 @@
 # other pieces of the util support without defining usage_exit().
 file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c" "void usage_exit(void) {}")
 file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.cc"
-           "extern \"C\" void usage_exit(void) {}")
+     "extern \"C\" void usage_exit(void) {}")
 
 #
 # Application and application support targets.
@@ -255,31 +309,31 @@
 
 if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
   add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>)
+                             $<TARGET_OBJECTS:aom_common_app_util>)
   list(APPEND AOM_APP_TARGETS resize_util)
 endif()
 
 if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
   add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+                        $<TARGET_OBJECTS:aom_common_app_util>
+                        $<TARGET_OBJECTS:aom_decoder_app_util>)
   add_executable(decode_to_md5 "${AOM_ROOT}/examples/decode_to_md5.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+                               $<TARGET_OBJECTS:aom_common_app_util>
+                               $<TARGET_OBJECTS:aom_decoder_app_util>)
   add_executable(decode_with_drops "${AOM_ROOT}/examples/decode_with_drops.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+                                   $<TARGET_OBJECTS:aom_common_app_util>
+                                   $<TARGET_OBJECTS:aom_decoder_app_util>)
   add_executable(simple_decoder "${AOM_ROOT}/examples/simple_decoder.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+                                $<TARGET_OBJECTS:aom_common_app_util>
+                                $<TARGET_OBJECTS:aom_decoder_app_util>)
   add_executable(scalable_decoder "${AOM_ROOT}/examples/scalable_decoder.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+                                  $<TARGET_OBJECTS:aom_common_app_util>
+                                  $<TARGET_OBJECTS:aom_decoder_app_util>)
 
   if(CONFIG_ANALYZER)
     add_executable(analyzer "${AOM_ROOT}/examples/analyzer.cc"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_decoder_app_util>)
+                            $<TARGET_OBJECTS:aom_common_app_util>
+                            $<TARGET_OBJECTS:aom_decoder_app_util>)
     target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES})
     list(APPEND AOM_APP_TARGETS analyzer)
     list(APPEND AOM_DECODER_EXAMPLE_TARGETS analyzer)
@@ -287,8 +341,8 @@
 
   if(CONFIG_INSPECTION)
     add_executable(inspect "${AOM_ROOT}/examples/inspect.c"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_decoder_app_util>)
+                           $<TARGET_OBJECTS:aom_common_app_util>
+                           $<TARGET_OBJECTS:aom_decoder_app_util>)
     list(APPEND AOM_DECODER_EXAMPLE_TARGETS inspect)
 
     if(EMSCRIPTEN)
@@ -315,8 +369,8 @@
   endif()
 
   # Maintain a list of decoder example targets.
-  list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5
-              decode_with_drops scalable_decoder simple_decoder)
+  list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5 decode_with_drops
+              scalable_decoder simple_decoder)
 
   # Add decoder examples to the app targets list.
   list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS})
@@ -325,31 +379,36 @@
 if(CONFIG_AV1_ENCODER)
   if(ENABLE_EXAMPLES)
     add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_encoder_app_util>
-                   $<TARGET_OBJECTS:aom_encoder_stats>)
+                          $<TARGET_OBJECTS:aom_common_app_util>
+                          $<TARGET_OBJECTS:aom_encoder_app_util>
+                          $<TARGET_OBJECTS:aom_encoder_stats>)
     add_executable(lossless_encoder "${AOM_ROOT}/examples/lossless_encoder.c"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+                                    $<TARGET_OBJECTS:aom_common_app_util>
+                                    $<TARGET_OBJECTS:aom_encoder_app_util>)
     add_executable(set_maps "${AOM_ROOT}/examples/set_maps.c"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+                            $<TARGET_OBJECTS:aom_common_app_util>
+                            $<TARGET_OBJECTS:aom_encoder_app_util>)
     add_executable(simple_encoder "${AOM_ROOT}/examples/simple_encoder.c"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+                                  $<TARGET_OBJECTS:aom_common_app_util>
+                                  $<TARGET_OBJECTS:aom_encoder_app_util>)
     add_executable(twopass_encoder "${AOM_ROOT}/examples/twopass_encoder.c"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+                                   $<TARGET_OBJECTS:aom_common_app_util>
+                                   $<TARGET_OBJECTS:aom_encoder_app_util>)
     add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+                               $<TARGET_OBJECTS:aom_common_app_util>
+                               $<TARGET_OBJECTS:aom_encoder_app_util>)
     add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+                                    $<TARGET_OBJECTS:aom_common_app_util>
+                                    $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+    add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.c"
+                                   $<TARGET_OBJECTS:aom_common_app_util>
+                                   $<TARGET_OBJECTS:aom_encoder_app_util>)
 
     # Maintain a list of encoder example targets.
     list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
-                set_maps simple_encoder scalable_encoder twopass_encoder)
+                set_maps simple_encoder scalable_encoder twopass_encoder
+                svc_encoder_rtc)
   endif()
 
   if(ENABLE_TOOLS)
@@ -358,7 +417,8 @@
       # TODO(tomfinegan): Sort out why a simple link command with
       # aom_entropy_optimizer.c won't work on macos, but dragging in all the
       # helper machinery allows the link to succeed.
-      add_executable(aom_entropy_optimizer "${AOM_GEN_SRC_DIR}/usage_exit.c"
+      add_executable(aom_entropy_optimizer
+                     "${AOM_GEN_SRC_DIR}/usage_exit.c"
                      "${AOM_ROOT}/tools/aom_entropy_optimizer.c"
                      $<TARGET_OBJECTS:aom_common_app_util>
                      $<TARGET_OBJECTS:aom_encoder_app_util>)
@@ -371,6 +431,19 @@
   # Add encoder examples and tools to the targets list.
   list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS}
               ${AOM_ENCODER_TOOL_TARGETS})
+
+  if(CONFIG_TUNE_VMAF)
+    find_library(VMAF libvmaf.a vmaf)
+    if(NOT VMAF)
+      message(FATAL_ERROR "VMAF library not found.")
+    endif()
+    message("-- Found VMAF library: " ${VMAF})
+    set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX)
+    if(BUILD_SHARED_LIBS)
+      set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX)
+    endif()
+    target_link_libraries(aom PRIVATE ${VMAF})
+  endif()
 endif()
 
 if(ENABLE_EXAMPLES)
@@ -385,11 +458,11 @@
 if(ENABLE_TOOLS)
   if(CONFIG_AV1_DECODER)
     add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.cc"
-                   "${AOM_ROOT}/tools/dump_obu.cc"
-                   "${AOM_ROOT}/tools/obu_parser.cc"
-                   "${AOM_ROOT}/tools/obu_parser.h"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_decoder_app_util>)
+                            "${AOM_ROOT}/tools/dump_obu.cc"
+                            "${AOM_ROOT}/tools/obu_parser.cc"
+                            "${AOM_ROOT}/tools/obu_parser.h"
+                            $<TARGET_OBJECTS:aom_common_app_util>
+                            $<TARGET_OBJECTS:aom_decoder_app_util>)
 
     list(APPEND AOM_TOOL_TARGETS dump_obu)
     list(APPEND AOM_APP_TARGETS dump_obu)
@@ -404,16 +477,16 @@
 
 if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
   add_executable(aom_cx_set_ref "${AOM_ROOT}/examples/aom_cx_set_ref.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_encoder_app_util>)
+                                $<TARGET_OBJECTS:aom_common_app_util>
+                                $<TARGET_OBJECTS:aom_encoder_app_util>)
   list(APPEND AOM_EXAMPLE_TARGETS aom_cx_set_ref)
   list(APPEND AOM_APP_TARGETS aom_cx_set_ref)
 endif()
 
 if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER)
   add_executable(lightfield_encoder "${AOM_ROOT}/examples/lightfield_encoder.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_encoder_app_util>)
+                                    $<TARGET_OBJECTS:aom_common_app_util>
+                                    $<TARGET_OBJECTS:aom_encoder_app_util>)
   list(APPEND AOM_EXAMPLE_TARGETS lightfield_encoder)
   list(APPEND AOM_APP_TARGETS lightfield_encoder)
 endif()
@@ -429,8 +502,8 @@
 
 if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER)
   add_executable(lightfield_decoder "${AOM_ROOT}/examples/lightfield_decoder.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+                                    $<TARGET_OBJECTS:aom_common_app_util>
+                                    $<TARGET_OBJECTS:aom_decoder_app_util>)
   list(APPEND AOM_EXAMPLE_TARGETS lightfield_decoder)
   list(APPEND AOM_APP_TARGETS lightfield_decoder)
 endif()
@@ -499,6 +572,9 @@
 if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD)
   find_package(Threads)
   target_link_libraries(aom ${AOM_LIB_LINK_TYPE} Threads::Threads)
+  if(BUILD_SHARED_LIBS)
+    target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} Threads::Threads)
+  endif()
 endif()
 
 if(XCODE)
@@ -548,7 +624,6 @@
 if(BUILD_SHARED_LIBS)
   include("${AOM_ROOT}/build/cmake/exports.cmake")
   setup_exports_target()
-  set_target_properties(aom PROPERTIES SOVERSION 0)
 endif()
 
 # Handle user supplied compile and link flags last to ensure they're obeyed.
@@ -591,7 +666,8 @@
 endif()
 
 add_custom_target(dist
-                  COMMAND ${CMAKE_COMMAND} -DAOM_ROOT=${AOM_ROOT}
+                  COMMAND ${CMAKE_COMMAND}
+                          -DAOM_ROOT=${AOM_ROOT}
                           -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
                           -DAOM_DIST_DIR=${AOM_DIST_DIR}
                           -DAOM_DIST_APPS="${AOM_DIST_APPS}"
@@ -612,8 +688,8 @@
 # Collect all variables containing libaom source files.
 get_cmake_property(all_cmake_vars VARIABLES)
 foreach(var ${all_cmake_vars})
-  if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" AND NOT "${var}" MATCHES
-     "_APP_\|DOXYGEN\|LIBWEBM\|LIBYUV\|_PKG_\|TEST")
+  if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_"
+     AND NOT "${var}" MATCHES "_APP_\|DOXYGEN\|LIBWEBM\|LIBYUV\|_PKG_\|TEST")
     list(APPEND aom_source_vars ${var})
   endif()
 endforeach()
@@ -632,9 +708,8 @@
   endforeach()
 endforeach()
 
-file(APPEND
-       "${libaom_srcs_txt_file}"
-       "# Files below this line are generated by the libaom build system.\n")
+file(APPEND "${libaom_srcs_txt_file}"
+     "# Files below this line are generated by the libaom build system.\n")
 foreach(aom_source_var ${aom_source_vars})
   foreach(file ${${aom_source_var}})
     if("${file}" MATCHES "${AOM_CONFIG_DIR}")
@@ -667,15 +742,14 @@
   endif()
 endforeach()
 
-file(APPEND
-       "${libaom_srcs_gni_file}"
-       "\n# Files below this line are generated by the libaom build system.\n")
+file(APPEND "${libaom_srcs_gni_file}"
+     "\n# Files below this line are generated by the libaom build system.\n")
 
 foreach(aom_source_var ${aom_source_vars})
   if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}")
     string(TOLOWER ${aom_source_var} aom_source_var_lowercase)
     file(APPEND "${libaom_srcs_gni_file}"
-                "\n${aom_source_var_lowercase}_gen = [\n")
+         "\n${aom_source_var_lowercase}_gen = [\n")
   endif()
   foreach(file ${${aom_source_var}})
     if(NOT "${file}" MATCHES "${AOM_ROOT}")
diff --git a/libaom/README.md b/libaom/README.md
index 6b58d35..cf057ae 100644
--- a/libaom/README.md
+++ b/libaom/README.md
@@ -14,6 +14,7 @@
     - [Xcode builds](#xcode-builds)
     - [Emscripten builds](#emscripten-builds)
     - [Extra Build Flags](#extra-build-flags)
+    - [Build with VMAF support](#build-with-vmaf-support)
 2. [Testing the library](#testing-the-av1-codec)
     - [Basics](#testing-basics)
         - [Unit tests](#1_unit-tests)
@@ -47,7 +48,9 @@
  2. [Git](https://git-scm.com/).
  3. [Perl](https://www.perl.org/).
  4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a
-    recent version of [nasm](http://www.nasm.us/).
+    recent version of [nasm](http://www.nasm.us/). If you download yasm with
+    the intention to work with Visual Studio, please download win32.exe or
+    win64.exe and rename it to yasm.exe. DO NOT download or use vsyasm.exe.
  5. Building the documentation requires [doxygen](http://doxygen.org).
  6. Building the unit tests requires [Python](https://www.python.org/).
  7. Emscripten builds require the portable
@@ -211,12 +214,28 @@
 ### Microsoft Visual Studio builds
 
 Building the AV1 codec library in Microsoft Visual Studio is supported. Visual
-Studio 2015 (14.0) or later is required. The following example demonstrates
+Studio 2017 (15.0) or later is required. The following example demonstrates
 generating projects and a solution for the Microsoft IDE:
 
 ~~~
-    # This does not require a bash shell; command.exe is fine.
-    $ cmake path/to/aom -G "Visual Studio 15 2017"
+    # This does not require a bash shell; Command Prompt (cmd.exe) is fine.
+    # This assumes the build host is a Windows x64 computer.
+
+    # To build with Visual Studio 2019 for the x64 target:
+    $ cmake path/to/aom -G "Visual Studio 16 2019"
+    $ cmake --build .
+
+    # To build with Visual Studio 2019 for the 32-bit x86 target:
+    $ cmake path/to/aom -G "Visual Studio 16 2019" -A Win32
+    $ cmake --build .
+
+    # To build with Visual Studio 2017 for the x64 target:
+    $ cmake path/to/aom -G "Visual Studio 15 2017" -T host=x64 -A x64
+    $ cmake --build .
+
+    # To build with Visual Studio 2017 for the 32-bit x86 target:
+    $ cmake path/to/aom -G "Visual Studio 15 2017" -T host=x64
+    $ cmake --build .
 ~~~
 
 NOTE: The build system targets Windows 7 or later by compiling files with
@@ -293,6 +312,24 @@
         -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG
 ~~~
 
+### Build with VMAF support
+
+After installing
+[libvmaf.a](https://github.com/Netflix/vmaf/blob/master/resource/doc/libvmaf.md),
+you can use it with the encoder:
+
+~~~
+    $ cmake path/to/aom -DCONFIG_TUNE_VMAF=1
+~~~
+
+Please note that the default VMAF model
+("/usr/local/share/model/vmaf_v0.6.1.pkl")
+will be used unless you set the following flag when running the encoder:
+
+~~~
+    # --vmaf-model-path=path/to/model
+~~~
+
 ## Testing the AV1 codec
 
 ### Testing basics
diff --git a/libaom/Sample.cfg b/libaom/Sample.cfg
new file mode 100644
index 0000000..d5dbe66
--- /dev/null
+++ b/libaom/Sample.cfg
@@ -0,0 +1,35 @@
+#sample config file
+super_block_size = 128                  # super block size. 0, 64 or 128
+max_partition_size = 128                # max partition size(8, 16, 32, 64, 128)
+min_partition_size = 4                  # min partition size(4, 8, 16, 32, 64)
+disable_rect_partition_type = 0         # disable rectangle partition type
+disable_ab_partition_type = 0           # disable AB partition type
+disable_1to4_partition_type = 0         # disable 1 to 4 and 4 to 1 partition type
+disable_intra_angle_delta = 0           # disable intra angle delta
+disable_paeth_intra = 0                 # disable paeth intra
+disable_smooth_intra = 0                # disable intra smooth mode
+disable_intra_edge_filter = 0           # disable intra edge filter
+disable_filter_intra = 0                # disable filter intra
+disable_intrabc = 0                     # disable Intra Block Copy
+disable_cfl = 0                         # disable chroma from luma prediction
+disable_palette = 0                     # disable Palette
+disable_flip_idtx = 0                   # disable flip and identity transform
+disable_tx_64x64 = 0                    # disable 64x64 transform
+reduced_tx_type_set = 0                 # use reduced transform type set
+reduced_reference_set = 0               # use reduced reference frame set
+disable_obmc = 0                        # disable OBMC
+disable_warp_motion = 0                 # disable Warped Motion
+disable_global_motion = 0               # disable global motion
+disable_ref_frame_mv = 0                # disable ref mv
+disable_dual_filter = 0                 # disable dual interpolation filter
+disable_one_sided_comp = 0              # disable one sided compound mode
+disable_masked_comp = 0                 # disable masked compound prediction
+disable_diff_wtd_comp = 0               # disable difference weighted compound mode
+disable_inter_inter_wedge = 0           # disable inter/inter wedge comp
+disable_dist_wtd_comp = 0               # disable distant weighted compound mode
+disable_inter_intra_comp = 0            # disable inter/intra compound mode.
+disable_inter_intra_wedge = 0           # disable inter/intra wedge comp
+disable_smooth_inter_intra = 0          # disable smooth inter/intra
+disable_cdef = 0                        # disable CDEF filter
+disable_lr = 0                          # disable Loop Restoration Filter
+disable_trellis_quant = 0               # disable trellis quantization
\ No newline at end of file
diff --git a/libaom/aom/aom.h b/libaom/aom/aom.h
index b1cc1ec..c591dc9 100644
--- a/libaom/aom/aom.h
+++ b/libaom/aom/aom.h
@@ -43,64 +43,27 @@
  * The set of macros define the control functions of AOM interface
  */
 enum aom_com_control_id {
-  /*!\brief pass in an external frame into decoder to be used as reference frame
+  /* TODO(https://crbug.com/aomedia/2671): The encoder overlaps the range of
+   * these values for its control ids, see the NOTEs in aom/aomcx.h. These
+   * should be migrated to something like the AOM_DECODER_CTRL_ID_START range
+   * next time we're ready to break the ABI.
    */
-  AOM_SET_POSTPROC = 3, /**< set the decoder's post processing settings  */
-  AOM_SET_DBG_COLOR_REF_FRAME =
-      4, /**< set the reference frames to color for each macroblock */
-  AOM_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
-  AOM_SET_DBG_COLOR_B_MODES = 6,  /**< set which blocks modes to color */
-  AOM_SET_DBG_DISPLAY_MV = 7,     /**< set which motion vector modes to draw */
-
-  /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
-   * for its control ids. These should be migrated to something like the
-   * AOM_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
-   */
-  AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame */
-  AV1_SET_REFERENCE = 129, /**< write a frame into a reference buffer */
-  AV1_COPY_REFERENCE =
-      130, /**< get a copy of reference frame from the decoder */
+  AV1_GET_REFERENCE = 128,  /**< get a pointer to a reference frame,
+                               av1_ref_frame_t* parameter */
+  AV1_SET_REFERENCE = 129,  /**< write a frame into a reference buffer,
+                               av1_ref_frame_t* parameter */
+  AV1_COPY_REFERENCE = 130, /**< get a copy of reference frame from the decoder,
+                               av1_ref_frame_t* parameter */
   AOM_COMMON_CTRL_ID_MAX,
 
-  AV1_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */
-  AV1_COPY_NEW_FRAME_IMAGE =
-      193, /**< copy the new frame to an external buffer */
+  AV1_GET_NEW_FRAME_IMAGE =
+      192, /**< get a pointer to the new frame, aom_image_t* parameter */
+  AV1_COPY_NEW_FRAME_IMAGE = 193, /**< copy the new frame to an external buffer,
+                                     aom_image_t* parameter */
 
   AOM_DECODER_CTRL_ID_START = 256
 };
 
-/*!\brief post process flags
- *
- * The set of macros define AOM decoder post processing flags
- */
-enum aom_postproc_level {
-  AOM_NOFILTERING = 0,
-  AOM_DEBLOCK = 1 << 0,
-  AOM_DEMACROBLOCK = 1 << 1,
-  AOM_ADDNOISE = 1 << 2,
-  AOM_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */
-  AOM_DEBUG_TXT_MBLK_MODES =
-      1 << 4, /**< print macro block modes over each macro block */
-  AOM_DEBUG_TXT_DC_DIFF = 1 << 5,   /**< print dc diff for each macro block */
-  AOM_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */
-  AOM_MFQE = 1 << 10
-};
-
-/*!\brief post process flags
- *
- * This define a structure that describe the post processing settings. For
- * the best objective measure (using the PSNR metric) set post_proc_flag
- * to AOM_DEBLOCK and deblocking_level to 1.
- */
-
-typedef struct aom_postproc_cfg {
-  /*!\brief the types of post processing to be done, should be combination of
-   * "aom_postproc_level" */
-  int post_proc_flag;
-  int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
-  int noise_level; /**< the strength of additive noise, valid range [0, 16] */
-} aom_postproc_cfg_t;
-
 /*!\brief AV1 specific reference frame data struct
  *
  * Define the data struct to access av1 reference frames.
@@ -114,26 +77,25 @@
 /*!\cond */
 /*!\brief aom decoder control function parameter type
  *
- * defines the data type for each of AOM decoder control function requires
+ * Defines the data type that each AOM decoder control function requires.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * if the control ID is supported by the libaom library available,
+ * when the libaom version cannot be controlled.
  */
-AOM_CTRL_USE_TYPE(AOM_SET_POSTPROC, aom_postproc_cfg_t *)
-#define AOM_CTRL_AOM_SET_POSTPROC
-AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_REF_FRAME, int)
-#define AOM_CTRL_AOM_SET_DBG_COLOR_REF_FRAME
-AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_MB_MODES, int)
-#define AOM_CTRL_AOM_SET_DBG_COLOR_MB_MODES
-AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_B_MODES, int)
-#define AOM_CTRL_AOM_SET_DBG_COLOR_B_MODES
-AOM_CTRL_USE_TYPE(AOM_SET_DBG_DISPLAY_MV, int)
-#define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
 AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
 #define AOM_CTRL_AV1_GET_REFERENCE
+
 AOM_CTRL_USE_TYPE(AV1_SET_REFERENCE, av1_ref_frame_t *)
 #define AOM_CTRL_AV1_SET_REFERENCE
+
 AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *)
 #define AOM_CTRL_AV1_COPY_REFERENCE
+
 AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
 #define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
+
 AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *)
 #define AOM_CTRL_AV1_COPY_NEW_FRAME_IMAGE
 
diff --git a/libaom/aom/aom_codec.h b/libaom/aom/aom_codec.h
index fc0df5b..75f6a1a 100644
--- a/libaom/aom/aom_codec.h
+++ b/libaom/aom/aom_codec.h
@@ -95,7 +95,7 @@
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define AOM_CODEC_ABI_VERSION (3 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+#define AOM_CODEC_ABI_VERSION (5 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
 
 /*!\brief Algorithm return codes */
 typedef enum {
@@ -173,10 +173,25 @@
  */
 typedef long aom_codec_flags_t;
 
+/*!\brief Time Stamp Type
+ *
+ * An integer, which when multiplied by the stream's time base, provides
+ * the absolute time of a sample.
+ */
+typedef int64_t aom_codec_pts_t;
+
 /*!\brief Codec interface structure.
  *
  * Contains function pointers and other data private to the codec
- * implementation. This structure is opaque to the application.
+ * implementation. This structure is opaque to the application. Common
+ * functions used with this structure:
+ *   - aom_codec_iface_name: get the name of the codec
+ *   - aom_codec_get_caps: returns the capabilities of the codec (see
+ *     aom_encoder.h for more details)
+ *   - aom_codec_enc_config_default: generate the default config to use
+ *     when initializing the encoder
+ *   - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context
+ *     structure (see documentation on aom_codec_ctx for more information).
  */
 typedef const struct aom_codec_iface aom_codec_iface_t;
 
@@ -370,19 +385,24 @@
  */
 aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface);
 
-/*!\brief Control algorithm
+/*!\name Codec Control
  *
- * This function is used to exchange algorithm specific data with the codec
- * instance. This can be used to implement features specific to a particular
- * algorithm.
+ * The aom_codec_control function exchanges algorithm specific data with the
+ * codec instance. Additionally, the macro AOM_CODEC_CONTROL_TYPECHECKED is
+ * provided, which will type-check the parameter against the control ID before
+ * calling aom_codec_control - note that this macro requires the control ID
+ * to be directly encoded in it, e.g.,
+ * AOM_CODEC_CONTROL_TYPECHECKED(&ctx, AOME_SET_CPUUSED, 8).
  *
- * This wrapper function dispatches the request to the helper function
- * associated with the given ctrl_id. It tries to call this function
- * transparently, but will return #AOM_CODEC_ERROR if the request could not
- * be dispatched.
+ * The codec control IDs can be found in aom.h, aomcx.h, and aomdx.h
+ * (defined as aom_com_control_id, aome_enc_control_id, and aom_dec_control_id).
+ * @{
+ */
+/*!\brief Algorithm Control
  *
- * Note that this function should not be used directly. Call the
- * #aom_codec_control wrapper macro instead.
+ * aom_codec_control takes a context, a control ID, and a third parameter
+ * (with varying type). If the context is non-null and an error occurs,
+ * ctx->err will be set to the same value as the return value.
  *
  * \param[in]     ctx              Pointer to this instance's context
  * \param[in]     ctrl_id          Algorithm specific control identifier
@@ -394,85 +414,33 @@
  * \retval #AOM_CODEC_INVALID_PARAM
  *     The data was not valid.
  */
-aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
-#if defined(AOM_DISABLE_CTRL_TYPECHECKS) && AOM_DISABLE_CTRL_TYPECHECKS
-#define aom_codec_control(ctx, id, data) aom_codec_control_(ctx, id, data)
-#define AOM_CTRL_USE_TYPE(id, typ)
-#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)
-#define AOM_CTRL_VOID(id, typ)
+aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...);
 
-#else
-/*!\brief aom_codec_control wrapper macro
+/*!\brief aom_codec_control wrapper macro (adds type-checking, less flexible)
  *
  * This macro allows for type safe conversions across the variadic parameter
- * to aom_codec_control_().
- *
- * \internal
- * It works by dispatching the call to the control function through a wrapper
- * function named with the id parameter.
+ * to aom_codec_control(). However, it requires the explicit control ID
+ * be passed in (it cannot be passed in via a variable) -- otherwise a compiler
+ * error will occur. After the type checking, it calls aom_codec_control.
  */
-#define aom_codec_control(ctx, id, data) \
-  aom_codec_control_##id(ctx, id, data) /**<\hideinitializer*/
+#define AOM_CODEC_CONTROL_TYPECHECKED(ctx, id, data) \
+  aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/
 
-/*!\brief aom_codec_control type definition macro
+/*!\brief Creates typechecking mechanisms for aom_codec_control
  *
- * This macro allows for type safe conversions across the variadic parameter
- * to aom_codec_control_(). It defines the type of the argument for a given
- * control identifier.
- *
- * \internal
- * It defines a static function with
- * the correctly typed arguments as a wrapper to the type-unsafe internal
- * function.
+ * It defines a static function with the correctly typed arguments as a wrapper
+ * to the type-unsafe aom_codec_control function. It also creates a typedef
+ * for each type.
  */
-#define AOM_CTRL_USE_TYPE(id, typ)                                           \
-  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int, typ) \
-      AOM_UNUSED;                                                            \
-                                                                             \
-  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx,        \
-                                                int ctrl_id, typ data) {     \
-    return aom_codec_control_(ctx, ctrl_id, data);                           \
-  } /**<\hideinitializer*/
-
-/*!\brief aom_codec_control deprecated type definition macro
- *
- * Like #AOM_CTRL_USE_TYPE, but indicates that the specified control is
- * deprecated and should not be used. Consult the documentation for your
- * codec for more information.
- *
- * \internal
- * It defines a static function with the correctly typed arguments as a
- * wrapper to the type-unsafe internal function.
- */
-#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)                            \
-  AOM_DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
-      aom_codec_ctx_t *, int, typ) AOM_DEPRECATED AOM_UNUSED;            \
-                                                                         \
-  AOM_DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
-      aom_codec_ctx_t *ctx, int ctrl_id, typ data) {                     \
-    return aom_codec_control_(ctx, ctrl_id, data);                       \
-  } /**<\hideinitializer*/
-
-/*!\brief aom_codec_control void type definition macro
- *
- * This macro allows for type safe conversions across the variadic parameter
- * to aom_codec_control_(). It indicates that a given control identifier takes
- * no argument.
- *
- * \internal
- * It defines a static function without a data argument as a wrapper to the
- * type-unsafe internal function.
- */
-#define AOM_CTRL_VOID(id)                                               \
-  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int) \
-      AOM_UNUSED;                                                       \
-                                                                        \
-  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx,   \
-                                                int ctrl_id) {          \
-    return aom_codec_control_(ctx, ctrl_id);                            \
-  } /**<\hideinitializer*/
-
-#endif
+#define AOM_CTRL_USE_TYPE(id, typ)                           \
+  static aom_codec_err_t aom_codec_control_typechecked_##id( \
+      aom_codec_ctx_t *, int, typ) AOM_UNUSED;               \
+  static aom_codec_err_t aom_codec_control_typechecked_##id( \
+      aom_codec_ctx_t *ctx, int ctrl, typ data) {            \
+    return aom_codec_control(ctx, ctrl, data);               \
+  } /**<\hideinitializer*/                                   \
+  typedef typ aom_codec_control_type_##id;
+/*!@} end Codec Control group */
 
 /*!\brief OBU types. */
 typedef enum ATTRIBUTE_PACKED {
@@ -503,19 +471,6 @@
  */
 const char *aom_obu_type_to_string(OBU_TYPE type);
 
-/*!\brief Config Options
- *
- * This type allows to enumerate and control options defined for control
- * via config file at runtime.
- */
-typedef struct cfg_options {
-  /*!\brief Reflects if ext_partition should be enabled
-   *
-   * If this value is non-zero it enabled the feature
-   */
-  unsigned int ext_partition;
-} cfg_options_t;
-
 /*!@} - end defgroup codec*/
 #ifdef __cplusplus
 }
diff --git a/libaom/aom/aom_decoder.h b/libaom/aom/aom_decoder.h
index 70420c3..5ce7c7b 100644
--- a/libaom/aom/aom_decoder.h
+++ b/libaom/aom/aom_decoder.h
@@ -42,7 +42,7 @@
  * fields to structures
  */
 #define AOM_DECODER_ABI_VERSION \
-  (3 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
+  (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
 
 /*! \brief Decoder capabilities bitfield
  *
@@ -52,9 +52,8 @@
  *
  *  The available flags are specified by AOM_CODEC_CAP_* defines.
  */
-#define AOM_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */
-#define AOM_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */
-#define AOM_CODEC_CAP_POSTPROC 0x40000  /**< Can postprocess decoded frame */
+/*!brief Can support external frame buffers */
+#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x200000
 
 /*! \brief Initialization-time Feature Enabling
  *
@@ -63,10 +62,6 @@
  *
  *  The available flags are specified by AOM_CODEC_USE_* defines.
  */
-/*!brief Can support external frame buffers */
-#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x200000
-
-#define AOM_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
 
 /*!\brief Stream properties
  *
@@ -98,7 +93,6 @@
   unsigned int w;       /**< Width */
   unsigned int h;       /**< Height */
   unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */
-  cfg_options_t cfg;              /**< Options defined per config attributes */
 } aom_codec_dec_cfg_t;            /**< alias for struct aom_codec_dec_cfg */
 
 /*!\brief Initialize a decoder instance
@@ -108,8 +102,8 @@
  * function directly, to ensure that the ABI version number parameter
  * is properly initialized.
  *
- * If the library was configured with --disable-multithread, this call
- * is not thread safe and should be guarded with a lock if being used
+ * If the library was configured with cmake -DCONFIG_MULTITHREAD=0, this
+ * call is not thread safe and should be guarded with a lock if being used
  * in a multithreaded context.
  *
  * \param[in]    ctx     Pointer to this instance's context.
@@ -179,16 +173,12 @@
 
 /*!\brief Decode data
  *
- * Processes a buffer of coded data. If the processing results in a new
- * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be
- * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode
- * time stamp) order. Frames produced will always be in PTS (presentation
- * time stamp) order.
+ * Processes a buffer of coded data. Encoded data \ref MUST be passed in DTS
+ * (decode time stamp) order. Frames produced will always be in PTS
+ * (presentation time stamp) order.
  *
  * \param[in] ctx          Pointer to this instance's context
- * \param[in] data         Pointer to this block of new coded data. If
- *                         NULL, a AOM_CODEC_CB_PUT_FRAME event is posted
- *                         for the previously decoded frame.
+ * \param[in] data         Pointer to this block of new coded data.
  * \param[in] data_sz      Size of the coded data, in bytes.
  * \param[in] user_priv    Application specific data to associate with
  *                         this frame.
@@ -219,95 +209,12 @@
  */
 aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter);
 
-/*!\defgroup cap_put_frame Frame-Based Decoding Functions
- *
- * The following functions are required to be implemented for all decoders
- * that advertise the AOM_CODEC_CAP_PUT_FRAME capability. Calling these
- * functions
- * for codecs that don't advertise this capability will result in an error
- * code being returned, usually AOM_CODEC_ERROR
- * @{
- */
-
-/*!\brief put frame callback prototype
- *
- * This callback is invoked by the decoder to notify the application of
- * the availability of decoded image data.
- */
-typedef void (*aom_codec_put_frame_cb_fn_t)(void *user_priv,
-                                            const aom_image_t *img);
-
-/*!\brief Register for notification of frame completion.
- *
- * Registers a given function to be called when a decoded frame is
- * available.
- *
- * \param[in] ctx          Pointer to this instance's context
- * \param[in] cb           Pointer to the callback function
- * \param[in] user_priv    User's private data
- *
- * \retval #AOM_CODEC_OK
- *     Callback successfully registered.
- * \retval #AOM_CODEC_ERROR
- *     Decoder context not initialized, or algorithm not capable of
- *     posting slice completion.
- */
-aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
-                                                aom_codec_put_frame_cb_fn_t cb,
-                                                void *user_priv);
-
-/*!@} - end defgroup cap_put_frame */
-
-/*!\defgroup cap_put_slice Slice-Based Decoding Functions
- *
- * The following functions are required to be implemented for all decoders
- * that advertise the AOM_CODEC_CAP_PUT_SLICE capability. Calling these
- * functions
- * for codecs that don't advertise this capability will result in an error
- * code being returned, usually AOM_CODEC_ERROR
- * @{
- */
-
-/*!\brief put slice callback prototype
- *
- * This callback is invoked by the decoder to notify the application of
- * the availability of partially decoded image data. The
- */
-typedef void (*aom_codec_put_slice_cb_fn_t)(void *user_priv,
-                                            const aom_image_t *img,
-                                            const aom_image_rect_t *valid,
-                                            const aom_image_rect_t *update);
-
-/*!\brief Register for notification of slice completion.
- *
- * Registers a given function to be called when a decoded slice is
- * available.
- *
- * \param[in] ctx          Pointer to this instance's context
- * \param[in] cb           Pointer to the callback function
- * \param[in] user_priv    User's private data
- *
- * \retval #AOM_CODEC_OK
- *     Callback successfully registered.
- * \retval #AOM_CODEC_ERROR
- *     Decoder context not initialized, or algorithm not capable of
- *     posting slice completion.
- */
-aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
-                                                aom_codec_put_slice_cb_fn_t cb,
-                                                void *user_priv);
-
-/*!@} - end defgroup cap_put_slice*/
-
 /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
  *
- * The following section is required to be implemented for all decoders
+ * The following function is required to be implemented for all decoders
  * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
  * Calling this function for codecs that don't advertise this capability
- * will result in an error code being returned, usually AOM_CODEC_ERROR.
- *
- * \note
- * Currently this only works with AV1.
+ * will result in an error code being returned, usually AOM_CODEC_INCAPABLE.
  * @{
  */
 
@@ -329,13 +236,13 @@
  * \retval #AOM_CODEC_INVALID_PARAM
  *     One or more of the callbacks were NULL.
  * \retval #AOM_CODEC_ERROR
- *     Decoder context not initialized, or algorithm not capable of
- *     using external frame buffers.
+ *     Decoder context not initialized.
+ * \retval #AOM_CODEC_INCAPABLE
+ *     Algorithm not capable of using external frame buffers.
  *
  * \note
  * When decoding AV1, the application may be required to pass in at least
- * #AOM_MAXIMUM_WORK_BUFFERS external frame
- * buffers.
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame buffers.
  */
 aom_codec_err_t aom_codec_set_frame_buffer_functions(
     aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
diff --git a/libaom/aom/aom_encoder.h b/libaom/aom/aom_encoder.h
index f8a7cec..a494c17 100644
--- a/libaom/aom/aom_encoder.h
+++ b/libaom/aom/aom_encoder.h
@@ -41,7 +41,7 @@
  * fields to structures
  */
 #define AOM_ENCODER_ABI_VERSION \
-  (5 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
+  (8 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
 
 /*! \brief Encoder capabilities bitfield
  *
@@ -74,17 +74,10 @@
  * This structure is able to hold a reference to any fixed size buffer.
  */
 typedef struct aom_fixed_buf {
-  void *buf;       /**< Pointer to the data */
+  void *buf;       /**< Pointer to the data. Does NOT own the data! */
   size_t sz;       /**< Length of the buffer, in chars */
 } aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */
 
-/*!\brief Time Stamp Type
- *
- * An integer, which when multiplied by the stream's time base, provides
- * the absolute time of a sample.
- */
-typedef int64_t aom_codec_pts_t;
-
 /*!\brief Compressed Frame Flags
  *
  * This type represents a bitfield containing information about a compressed
@@ -209,6 +202,154 @@
   AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
 };
 
+/*!\brief Encoder Config Options
+ *
+ * This type allows to enumerate and control flags defined for encoder control
+ * via config file at runtime.
+ */
+typedef struct cfg_options {
+  /*!\brief Indicate init by cfg file
+   * 0 or 1
+   */
+  unsigned int init_by_cfg_file;
+  /*!\brief Superblock size
+   * 0, 64 or 128
+   */
+  unsigned int super_block_size;
+  /*!\brief max partition size
+   * 8, 16, 32, 64, 128
+   */
+  unsigned int max_partition_size;
+  /*!\brief min partition size
+   * 8, 16, 32, 64, 128
+   */
+  unsigned int min_partition_size;
+  /*!\brief disable AB Shape partition type
+   *
+   */
+  unsigned int disable_ab_partition_type;
+  /*!\brief disable rectangular partition type
+   *
+   */
+  unsigned int disable_rect_partition_type;
+  /*!\brief disable 1:4/4:1 partition type
+   *
+   */
+  unsigned int disable_1to4_partition_type;
+  /*!\brief disable flip and identity transform type
+   *
+   */
+  unsigned int disable_flip_idtx;
+  /*!\brief disable CDEF filter
+   *
+   */
+  unsigned int disable_cdef;
+  /*!\brief disable Loop Restoration Filter
+   *
+   */
+  unsigned int disable_lr;
+  /*!\brief disable OBMC
+   *
+   */
+  unsigned int disable_obmc;
+  /*!\brief disable Warped Motion
+   *
+   */
+  unsigned int disable_warp_motion;
+  /*!\brief disable global motion
+   *
+   */
+  unsigned int disable_global_motion;
+  /*!\brief disable dist weighted compound
+   *
+   */
+  unsigned int disable_dist_wtd_comp;
+  /*!\brief disable diff weighted compound
+   *
+   */
+  unsigned int disable_diff_wtd_comp;
+  /*!\brief disable inter/intra compound
+   *
+   */
+  unsigned int disable_inter_intra_comp;
+  /*!\brief disable masked compound
+   *
+   */
+  unsigned int disable_masked_comp;
+  /*!\brief disable one sided compound
+   *
+   */
+  unsigned int disable_one_sided_comp;
+  /*!\brief disable Palette
+   *
+   */
+  unsigned int disable_palette;
+  /*!\brief disable Intra Block Copy
+   *
+   */
+  unsigned int disable_intrabc;
+  /*!\brief disable chroma from luma
+   *
+   */
+  unsigned int disable_cfl;
+  /*!\brief disable intra smooth mode
+   *
+   */
+  unsigned int disable_smooth_intra;
+  /*!\brief disable filter intra
+   *
+   */
+  unsigned int disable_filter_intra;
+  /*!\brief disable dual filter
+   *
+   */
+  unsigned int disable_dual_filter;
+  /*!\brief disable intra angle delta
+   *
+   */
+  unsigned int disable_intra_angle_delta;
+  /*!\brief disable intra edge filter
+   *
+   */
+  unsigned int disable_intra_edge_filter;
+  /*!\brief disable 64x64 transform
+   *
+   */
+  unsigned int disable_tx_64x64;
+  /*!\brief disable smooth inter/intra
+   *
+   */
+  unsigned int disable_smooth_inter_intra;
+  /*!\brief disable inter/inter wedge comp
+   *
+   */
+  unsigned int disable_inter_inter_wedge;
+  /*!\brief disable inter/intra wedge comp
+   *
+   */
+  unsigned int disable_inter_intra_wedge;
+  /*!\brief disable paeth intra
+   *
+   */
+  unsigned int disable_paeth_intra;
+  /*!\brief disable trellis quantization
+   *
+   */
+  unsigned int disable_trellis_quant;
+  /*!\brief disable ref frame MV
+   *
+   */
+  unsigned int disable_ref_frame_mv;
+  /*!\brief use reduced reference frame set
+   *
+   */
+  unsigned int reduced_reference_set;
+  /*!\brief use reduced transform type set
+   *
+   */
+  unsigned int reduced_tx_type_set;
+} cfg_options_t;
+
 /*!\brief Encoded Frame Flags
  *
  * This type indicates a bitfield to be passed to aom_codec_encode(), defining
@@ -724,10 +865,46 @@
    */
   int tile_heights[MAX_TILE_HEIGHTS];
 
+  /*!\brief Whether encoder should use fixed QP offsets.
+   *
+   * If a value of 1 is provided, encoder will use fixed QP offsets for frames
+   * at different levels of the pyramid.
+   * - If 'fixed_qp_offsets' is also provided, encoder will use the given
+   * offsets
+   * - If not, encoder will select the fixed offsets based on the cq-level
+   *   provided.
+   * If a value of 0 is provided and fixed_qp_offset are not provided, encoder
+   * will NOT use fixed QP offsets.
+   * Note: This option is only relevant for --end-usage=q.
+   */
+  unsigned int use_fixed_qp_offsets;
+
+/*!\brief Number of fixed QP offsets
+ *
+ * This defines the number of elements in the fixed_qp_offsets array.
+ */
+#define FIXED_QP_OFFSET_COUNT 5
+
+  /*!\brief Array of fixed QP offsets
+   *
+   * This array specifies fixed QP offsets (range: 0 to 63) for frames at
+   * different levels of the pyramid. It is a comma-separated list of 5 values:
+   * - QP offset for keyframe
+   * - QP offset for ALTREF frame
+   * - QP offset for 1st level internal ARF
+   * - QP offset for 2nd level internal ARF
+   * - QP offset for 3rd level internal ARF
+   * Notes:
+   * - QP offset for leaf level frames is not explicitly specified. These frames
+   *   use the worst quality allowed (--cq-level).
+   * - This option is only relevant for --end-usage=q.
+   */
+  int fixed_qp_offsets[FIXED_QP_OFFSET_COUNT];
+
   /*!\brief Options defined per config file
    *
    */
-  cfg_options_t cfg;
+  cfg_options_t encoder_cfg;
 } aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */
 
 /*!\brief Initialize an encoder instance
@@ -764,41 +941,9 @@
 #define aom_codec_enc_init(ctx, iface, cfg, flags) \
   aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION)
 
-/*!\brief Initialize multi-encoder instance
+/*!\brief Get the default configuration for a usage.
  *
- * Initializes multi-encoder context using the given interface.
- * Applications should call the aom_codec_enc_init_multi convenience macro
- * instead of this function directly, to ensure that the ABI version number
- * parameter is properly initialized.
- *
- * \param[in]    ctx     Pointer to this instance's context.
- * \param[in]    iface   Pointer to the algorithm interface to use.
- * \param[in]    cfg     Configuration to use, if known.
- * \param[in]    num_enc Total number of encoders.
- * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
- * \param[in]    dsf     Pointer to down-sampling factors.
- * \param[in]    ver     ABI version number. Must be set to
- *                       AOM_ENCODER_ABI_VERSION
- * \retval #AOM_CODEC_OK
- *     The decoder algorithm initialized.
- * \retval #AOM_CODEC_MEM_ERROR
- *     Memory allocation failed.
- */
-aom_codec_err_t aom_codec_enc_init_multi_ver(
-    aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
-    int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver);
-
-/*!\brief Convenience macro for aom_codec_enc_init_multi_ver()
- *
- * Ensures the ABI version parameter is properly set.
- */
-#define aom_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
-  aom_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf,   \
-                               AOM_ENCODER_ABI_VERSION)
-
-/*!\brief Get a default configuration
- *
- * Initializes a encoder configuration structure with default values. Supports
+ * Initializes an encoder configuration structure with default values. Supports
  * the notion of "usages" so that an algorithm may offer different default
  * settings depending on the user's intended goal. This function \ref SHOULD
  * be called by all applications to initialize the configuration structure
@@ -806,7 +951,9 @@
  *
  * \param[in]    iface     Pointer to the algorithm interface to use.
  * \param[out]   cfg       Configuration buffer to populate.
- * \param[in]    reserved  Must set to 0.
+ * \param[in]    usage     Algorithm specific usage value. For AV1, must be
+ *                         set to AOM_USAGE_GOOD_QUALITY (0) or
+ *                         AOM_USAGE_REALTIME (1).
  *
  * \retval #AOM_CODEC_OK
  *     The configuration was populated.
@@ -817,7 +964,7 @@
  */
 aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
                                              aom_codec_enc_cfg_t *cfg,
-                                             unsigned int reserved);
+                                             unsigned int usage);
 
 /*!\brief Set or change configuration
  *
diff --git a/libaom/aom/aom_image.h b/libaom/aom/aom_image.h
index 245ef2c..bb6973f 100644
--- a/libaom/aom/aom_image.h
+++ b/libaom/aom/aom_image.h
@@ -30,7 +30,7 @@
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define AOM_IMAGE_ABI_VERSION (5) /**<\hideinitializer*/
+#define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/
 
 #define AOM_IMG_FMT_PLANAR 0x100  /**< Image is a planar format. */
 #define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */
@@ -137,6 +137,36 @@
   AOM_CSP_RESERVED = 3          /**< Reserved value */
 } aom_chroma_sample_position_t; /**< alias for enum aom_transfer_function */
 
+/*!\brief List of insert flags for Metadata
+ *
+ * These flags control how the library treats metadata during encode.
+ *
+ * While encoding, when metadata is added to an aom_image via
+ * aom_img_add_metadata(), the flag passed along with the metadata will
+ * determine where the metadata OBU will be placed in the encoded OBU stream.
+ * Metadata will be emitted into the output stream within the next temporal unit
+ * if it satisfies the specified insertion flag.
+ *
+ * During decoding, when the library encounters a metadata OBU, it is always
+ * flagged as AOM_MIF_ANY_FRAME and emitted with the next output aom_image.
+ */
+typedef enum aom_metadata_insert_flags {
+  AOM_MIF_NON_KEY_FRAME = 0, /**< Adds metadata if it's not keyframe */
+  AOM_MIF_KEY_FRAME = 1,     /**< Adds metadata only if it's a keyframe */
+  AOM_MIF_ANY_FRAME = 2      /**< Adds metadata to any type of frame */
+} aom_metadata_insert_flags_t;
+
+/*!\brief Array of aom_metadata structs for an image. */
+typedef struct aom_metadata_array aom_metadata_array_t;
+
+/*!\brief Metadata payload. */
+typedef struct aom_metadata {
+  uint32_t type;                           /**< Metadata type */
+  uint8_t *payload;                        /**< Metadata payload data */
+  size_t sz;                               /**< Metadata payload size */
+  aom_metadata_insert_flags_t insert_flag; /**< Metadata insertion flag */
+} aom_metadata_t;
+
 /**\brief Image Descriptor */
 typedef struct aom_image {
   aom_img_fmt_t fmt;                 /**< Image Format */
@@ -188,21 +218,16 @@
   int img_data_owner;      /**< private */
   int self_allocd;         /**< private */
 
+  aom_metadata_array_t
+      *metadata; /**< Metadata payloads associated with the image. */
+
   void *fb_priv; /**< Frame buffer data associated with the image. */
 } aom_image_t;   /**< alias for struct aom_image */
 
-/**\brief Representation of a rectangle on a surface */
-typedef struct aom_image_rect {
-  unsigned int x;   /**< leftmost column */
-  unsigned int y;   /**< topmost row */
-  unsigned int w;   /**< width */
-  unsigned int h;   /**< height */
-} aom_image_rect_t; /**< alias for struct aom_image_rect */
-
 /*!\brief Open a descriptor, allocating storage for the underlying image
  *
  * Returns a descriptor for storing an image of the given format. The
- * storage for the descriptor is allocated on the heap.
+ * storage for the image is allocated on the heap.
  *
  * \param[in]    img       Pointer to storage for descriptor. If this parameter
  *                         is NULL, the storage for the descriptor will be
@@ -211,7 +236,7 @@
  * \param[in]    d_w       Width of the image
  * \param[in]    d_h       Height of the image
  * \param[in]    align     Alignment, in bytes, of the image buffer and
- *                         each row in the image(stride).
+ *                         each row in the image (stride).
  *
  * \return Returns a pointer to the initialized image descriptor. If the img
  *         parameter is non-null, the value of the img parameter will be
@@ -224,7 +249,7 @@
 /*!\brief Open a descriptor, using existing storage for the underlying image
  *
  * Returns a descriptor for storing an image of the given format. The
- * storage for descriptor has been allocated elsewhere, and a descriptor is
+ * storage for the image has been allocated elsewhere, and a descriptor is
  * desired to "wrap" that storage.
  *
  * \param[in]    img       Pointer to storage for descriptor. If this parameter
@@ -233,7 +258,8 @@
  * \param[in]    fmt       Format for the image
  * \param[in]    d_w       Width of the image
  * \param[in]    d_h       Height of the image
- * \param[in]    align     Alignment, in bytes, of each row in the image.
+ * \param[in]    align     Alignment, in bytes, of each row in the image
+ *                         (stride).
  * \param[in]    img_data  Storage to use for the image
  *
  * \return Returns a pointer to the initialized image descriptor. If the img
@@ -248,7 +274,7 @@
  * border
  *
  * Returns a descriptor for storing an image of the given format and its
- * borders. The storage for the descriptor is allocated on the heap.
+ * borders. The storage for the image is allocated on the heap.
  *
  * \param[in]    img        Pointer to storage for descriptor. If this parameter
  *                          is NULL, the storage for the descriptor will be
@@ -257,8 +283,8 @@
  * \param[in]    d_w        Width of the image
  * \param[in]    d_h        Height of the image
  * \param[in]    align      Alignment, in bytes, of the image buffer and
- *                          each row in the image(stride).
- * \param[in]    size_align Alignment, in bytes, of the image width and height.
+ *                          each row in the image (stride).
+ * \param[in]    size_align Alignment, in pixels, of the image width and height.
  * \param[in]    border     A border that is padded on four sides of the image.
  *
  * \return Returns a pointer to the initialized image descriptor. If the img
@@ -323,6 +349,80 @@
  */
 int aom_img_plane_height(const aom_image_t *img, int plane);
 
+/*!\brief Add metadata to image.
+ *
+ * Adds metadata to aom_image_t.
+ * Function makes a copy of the provided data parameter.
+ * Metadata insertion point is controlled by insert_flag.
+ *
+ * \param[in]    img          Image descriptor
+ * \param[in]    type         Metadata type
+ * \param[in]    data         Metadata contents
+ * \param[in]    sz           Metadata contents size
+ * \param[in]    insert_flag  Metadata insert flag
+ */
+int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
+                         size_t sz, aom_metadata_insert_flags_t insert_flag);
+
+/*!\brief Return a metadata payload stored within the image metadata array.
+ *
+ * Gets the metadata (aom_metadata_t) at the indicated index in the image
+ * metadata array.
+ *
+ * \param[in] img          Pointer to image descriptor to get metadata from
+ * \param[in] index        Metadata index to get from metadata array
+ *
+ * \return Returns a const pointer to the selected metadata, if img and/or index
+ * is invalid, it returns NULL.
+ */
+const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img,
+                                           size_t index);
+
+/*!\brief Return the number of metadata blocks within the image.
+ *
+ * Gets the number of metadata blocks contained within the provided image
+ * metadata array.
+ *
+ * \param[in] img          Pointer to image descriptor to get metadata number
+ * from.
+ *
+ * \return Returns the size of the metadata array. If img or metadata is NULL,
+ * it returns 0.
+ */
+size_t aom_img_num_metadata(const aom_image_t *img);
+
+/*!\brief Remove metadata from image.
+ *
+ * Removes all metadata in image metadata list and sets metadata list pointer
+ * to NULL.
+ *
+ * \param[in]    img       Image descriptor
+ */
+void aom_img_remove_metadata(aom_image_t *img);
+
+/*!\brief Allocate memory for aom_metadata struct.
+ *
+ * Allocates storage for the metadata payload, sets its type and copies the
+ * payload data into the aom_metadata struct. A metadata payload buffer of size
+ * sz is allocated and sz bytes are copied from data into the payload buffer.
+ *
+ * \param[in]    type         Metadata type
+ * \param[in]    data         Metadata data pointer
+ * \param[in]    sz           Metadata size
+ * \param[in]    insert_flag  Metadata insert flag
+ */
+aom_metadata_t *aom_img_metadata_alloc(uint32_t type, const uint8_t *data,
+                                       size_t sz,
+                                       aom_metadata_insert_flags_t insert_flag);
+
+/*!\brief Free metadata struct.
+ *
+ * Free metadata struct and its buffer.
+ *
+ * \param[in]    metadata       Metadata struct pointer
+ */
+void aom_img_metadata_free(aom_metadata_t *metadata);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/aom/aom_integer.h b/libaom/aom/aom_integer.h
index 90263bd..113671e 100644
--- a/libaom/aom/aom_integer.h
+++ b/libaom/aom/aom_integer.h
@@ -19,7 +19,6 @@
 #define AOM_INLINE __inline
 #else
 #define AOM_FORCE_INLINE __inline__ __attribute__((always_inline))
-// TODO(jbb): Allow a way to force inline off for older compilers.
 #define AOM_INLINE inline
 #endif
 
@@ -72,8 +71,6 @@
 #define INT32_MIN (-2147483647 - 1)
 #endif
 
-#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0]))
-
 #if defined(__cplusplus)
 extern "C" {
 #endif  // __cplusplus
diff --git a/libaom/aom/aomcx.h b/libaom/aom/aomcx.h
index da7498f..051d33e 100644
--- a/libaom/aom/aomcx.h
+++ b/libaom/aom/aomcx.h
@@ -148,85 +148,114 @@
  * This set of macros define the control functions available for AVx
  * encoder interface.
  *
- * \sa #aom_codec_control
+ * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...)
  */
 enum aome_enc_control_id {
-  /*!\brief Codec control function to set which reference frame encoder can use.
+  /*!\brief Codec control function to set which reference frame encoder can use,
+   * int parameter.
    */
   AOME_USE_REFERENCE = 7,
 
-  /*!\brief Codec control function to pass an ROI map to encoder.
+  /*!\brief Codec control function to pass an ROI map to encoder, aom_roi_map_t*
+   * parameter.
    */
   AOME_SET_ROI_MAP = 8,
 
-  /*!\brief Codec control function to pass an Active map to encoder.
+  /*!\brief Codec control function to pass an Active map to encoder,
+   * aom_active_map_t* parameter.
    */
-  AOME_SET_ACTIVEMAP,
+  AOME_SET_ACTIVEMAP = 9,
 
-  /*!\brief Codec control function to set encoder scaling mode.
+  /* NOTE: enum 10 unused */
+
+  /*!\brief Codec control function to set encoder scaling mode,
+   * aom_scaling_mode_t* parameter.
    */
   AOME_SET_SCALEMODE = 11,
 
-  /*!\brief Codec control function to set encoder spatial layer id.
+  /*!\brief Codec control function to set encoder spatial layer id, unsigned int
+   * parameter.
    */
   AOME_SET_SPATIAL_LAYER_ID = 12,
 
-  /*!\brief Codec control function to set encoder internal speed settings.
+  /*!\brief Codec control function to set encoder internal speed settings,
+   * int parameter
    *
-   * Changes in this value influences, among others, the encoder's selection
-   * of motion estimation methods. Values greater than 0 will increase encoder
-   * speed at the expense of quality.
+   * Changes in this value influences the complexity of algorithms used in
+   * encoding process, values greater than 0 will increase encoder speed at
+   * the expense of quality.
    *
-   * \note Valid range: 0..8
+   * Valid range: 0..8. 0 runs the slowest, and 8 runs the fastest;
+   * quality improves as speed decreases (since more compression
+   * possibilities are explored).
    */
   AOME_SET_CPUUSED = 13,
 
-  /*!\brief Codec control function to enable automatic set and use alf frames.
+  /*!\brief Codec control function to enable automatic set and use alf frames,
+   * unsigned int parameter
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AOME_SET_ENABLEAUTOALTREF,
+  AOME_SET_ENABLEAUTOALTREF = 14,
 
-  /*!\brief Codec control function to set sharpness.
+  /* NOTE: enum 15 unused */
+
+  /*!\brief Codec control function to set sharpness, unsigned int parameter.
    */
-  AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2,
+  AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2,  // 16
 
-  /*!\brief Codec control function to set the threshold for MBs treated static.
+  /*!\brief Codec control function to set the threshold for MBs treated static,
+   * unsigned int parameter
    */
-  AOME_SET_STATIC_THRESHOLD,
+  AOME_SET_STATIC_THRESHOLD = 17,
 
-  /*!\brief Codec control function to get last quantizer chosen by the encoder.
+  /* NOTE: enum 18 unused */
+
+  /*!\brief Codec control function to get last quantizer chosen by the encoder,
+   * int* parameter
    *
    * Return value uses internal quantizer scale defined by the codec.
    */
-  AOME_GET_LAST_QUANTIZER = AOME_SET_STATIC_THRESHOLD + 2,
+  AOME_GET_LAST_QUANTIZER = AOME_SET_STATIC_THRESHOLD + 2,  // 19
 
-  /*!\brief Codec control function to get last quantizer chosen by the encoder.
+  /*!\brief Codec control function to get last quantizer chosen by the encoder,
+   * int* parameter
    *
    * Return value uses the 0..63 scale as used by the rc_*_quantizer config
    * parameters.
    */
-  AOME_GET_LAST_QUANTIZER_64,
+  AOME_GET_LAST_QUANTIZER_64 = 20,
 
-  /*!\brief Codec control function to set the max no of frames to create arf.
+  /*!\brief Codec control function to set the max no of frames to create arf,
+   * unsigned int parameter
    */
-  AOME_SET_ARNR_MAXFRAMES,
+  AOME_SET_ARNR_MAXFRAMES = 21,
 
-  /*!\brief Codec control function to set the filter strength for the arf.
+  /*!\brief Codec control function to set the filter strength for the arf,
+   * unsigned int parameter
    */
-  AOME_SET_ARNR_STRENGTH,
+  AOME_SET_ARNR_STRENGTH = 22,
 
-  /*!\brief Codec control function to set visual tuning.
+  /* NOTE: enum 23 unused */
+
+  /*!\brief Codec control function to set visual tuning, aom_tune_metric (int)
+   * parameter
    */
-  AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2,
+  AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2,  // 24
 
-  /*!\brief Codec control function to set constrained quality level.
+  /*!\brief Codec control function to set constrained / constant quality level,
+   * unsigned int parameter
    *
-   * \attention For this value to be used aom_codec_enc_cfg_t::g_usage must be
-   *            set to #AOM_CQ.
-   * \note Valid range: 0..63
+   * Valid range: 0..63
+   *
+   * \attention For this value to be used aom_codec_enc_cfg_t::rc_end_usage
+   *            must be set to #AOM_CQ or #AOM_Q.
    */
-  AOME_SET_CQ_LEVEL,
+  AOME_SET_CQ_LEVEL = 25,
 
-  /*!\brief Codec control function to set Max data rate for Intra frames.
+  /*!\brief Codec control function to set max data rate for intra frames,
+   * unsigned int parameter
    *
    * This value controls additional clamping on the maximum size of a
    * keyframe. It is expressed as a percentage of the average
@@ -237,13 +266,15 @@
    * For example, to allocate no more than 4.5 frames worth of bitrate
    * to a keyframe, set this to 450.
    */
-  AOME_SET_MAX_INTRA_BITRATE_PCT,
+  AOME_SET_MAX_INTRA_BITRATE_PCT = 26,
 
-  /*!\brief Codec control function to set number of spatial layers.
+  /*!\brief Codec control function to set number of spatial layers, int
+   * parameter
    */
-  AOME_SET_NUMBER_SPATIAL_LAYERS,
+  AOME_SET_NUMBER_SPATIAL_LAYERS = 27,
 
-  /*!\brief Codec control function to set max data rate for Inter frames.
+  /*!\brief Codec control function to set max data rate for inter frames,
+   * unsigned int parameter
    *
    * This value controls additional clamping on the maximum size of an
    * inter frame. It is expressed as a percentage of the average
@@ -254,9 +285,10 @@
    * For example, to allow no more than 4.5 frames worth of bitrate
    * to an inter frame, set this to 450.
    */
-  AV1E_SET_MAX_INTER_BITRATE_PCT = AOME_SET_MAX_INTRA_BITRATE_PCT + 2,
+  AV1E_SET_MAX_INTER_BITRATE_PCT = AOME_SET_MAX_INTRA_BITRATE_PCT + 2,  // 28
 
-  /*!\brief Boost percentage for Golden Frame in CBR mode.
+  /*!\brief Boost percentage for Golden Frame in CBR mode, unsigned int
+   * parameter
    *
    * This value controls the amount of boost given to Golden Frame in
    * CBR mode. It is expressed as a percentage of the average
@@ -267,68 +299,85 @@
    * For example, to allow 100% more bits, i.e, 2X, in a golden frame
    * than average frame, set this to 100.
    */
-  AV1E_SET_GF_CBR_BOOST_PCT,
+  AV1E_SET_GF_CBR_BOOST_PCT = 29,
 
-  /*!\brief Codec control function to set lossless encoding mode.
+  /* NOTE: enum 30 unused */
+
+  /*!\brief Codec control function to set lossless encoding mode, unsigned int
+   * parameter
    *
    * AV1 can operate in lossless encoding mode, in which the bitstream
    * produced will be able to decode and reconstruct a perfect copy of
-   * input source. This control function provides a mean to switch encoder
-   * into lossless coding mode(1) or normal coding mode(0) that may be lossy.
-   *                          0 = lossy coding mode
-   *                          1 = lossless coding mode
+   * input source.
    *
-   *  By default, encoder operates in normal coding mode (maybe lossy).
+   * - 0 = normal coding mode, may be lossy (default)
+   * - 1 = lossless coding mode
    */
-  AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2,
+  AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2,  // 31
 
-  /** control function to enable the row based multi-threading of encoder. A
-   * value that is equal to 1 indicates that row based multi-threading is
-   * enabled.
+  /*!\brief Codec control function to enable the row based multi-threading
+   * of the encoder, unsigned int parameter
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ROW_MT,
+  AV1E_SET_ROW_MT = 32,
 
-  /*!\brief Codec control function to set number of tile columns.
+  /*!\brief Codec control function to set number of tile columns. unsigned int
+   * parameter
    *
    * In encoding and decoding, AV1 allows an input image frame be partitioned
    * into separate vertical tile columns, which can be encoded or decoded
    * independently. This enables easy implementation of parallel encoding and
    * decoding. The parameter for this control describes the number of tile
    * columns (in log2 units), which has a valid range of [0, 6]:
-   *             0 = 1 tile column
-   *             1 = 2 tile columns
-   *             2 = 4 tile columns
-   *             .....
-   *             n = 2**n tile columns
-   *
+   * \verbatim
+                 0 = 1 tile column
+                 1 = 2 tile columns
+                 2 = 4 tile columns
+                 .....
+                 n = 2**n tile columns
+     \endverbatim
    * By default, the value is 0, i.e. one single column tile for entire image.
    */
-  AV1E_SET_TILE_COLUMNS,
+  AV1E_SET_TILE_COLUMNS = 33,
 
-  /*!\brief Codec control function to set number of tile rows.
+  /*!\brief Codec control function to set number of tile rows, unsigned int
+   * parameter
    *
    * In encoding and decoding, AV1 allows an input image frame be partitioned
    * into separate horizontal tile rows, which can be encoded or decoded
    * independently. The parameter for this control describes the number of tile
    * rows (in log2 units), which has a valid range of [0, 6]:
-   *            0 = 1 tile row
-   *            1 = 2 tile rows
-   *            2 = 4 tile rows
-   *            .....
-   *            n = 2**n tile rows
-   *
+   * \verbatim
+                0 = 1 tile row
+                1 = 2 tile rows
+                2 = 4 tile rows
+                .....
+                n = 2**n tile rows
+   \endverbatim
    * By default, the value is 0, i.e. one single row tile for entire image.
    */
-  AV1E_SET_TILE_ROWS,
+  AV1E_SET_TILE_ROWS = 34,
 
   /*!\brief Codec control function to enable RDO modulated by frame temporal
-   * dependency.
+   * dependency, unsigned int parameter
    *
-   * By default, this feature is off.
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_TPL_MODEL,
+  AV1E_SET_ENABLE_TPL_MODEL = 35,
 
-  /*!\brief Codec control function to enable frame parallel decoding feature.
+  /*!\brief Codec control function to enable temporal filtering on key frame,
+   * unsigned int parameter
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
+   */
+  AV1E_SET_ENABLE_KEYFRAME_FILTERING = 36,
+
+  /*!\brief Codec control function to enable frame parallel decoding feature,
+   * unsigned int parameter
    *
    * AV1 has a bitstream feature to reduce decoding dependency between frames
    * by turning off backward update of probability context used in encoding
@@ -336,291 +385,301 @@
    * video frames in the decoder. This control function provides a mean to
    * turn this feature on or off for bitstreams produced by encoder.
    *
-   * By default, this feature is off.
+   * - 0 = disable (default)
+   * - 1 = enable
    */
-  AV1E_SET_FRAME_PARALLEL_DECODING,
+  AV1E_SET_FRAME_PARALLEL_DECODING = 37,
 
-  /*!\brief Codec control function to enable error_resilient_mode
+  /*!\brief Codec control function to enable error_resilient_mode, int parameter
    *
    * AV1 has a bitstream feature to guarantee parseability of a frame
    * by turning on the error_resilient_decoding mode, even though the
    * reference buffers are unreliable or not received.
    *
-   * By default, this feature is off.
+   * - 0 = disable (default)
+   * - 1 = enable
    */
-  AV1E_SET_ERROR_RESILIENT_MODE,
+  AV1E_SET_ERROR_RESILIENT_MODE = 38,
 
-  /*!\brief Codec control function to enable s_frame_mode
+  /*!\brief Codec control function to enable s_frame_mode, int parameter
    *
    * AV1 has a bitstream feature to designate certain frames as S-frames,
    * from where we can switch to a different stream,
    * even though the reference buffers may not be exactly identical.
    *
-   * By default, this feature is off.
+   * - 0 = disable (default)
+   * - 1 = enable
    */
-  AV1E_SET_S_FRAME_MODE,
+  AV1E_SET_S_FRAME_MODE = 39,
 
-  /*!\brief Codec control function to set adaptive quantization mode.
+  /*!\brief Codec control function to set adaptive quantization mode, unsigned
+   * int parameter
    *
    * AV1 has a segment based feature that allows encoder to adaptively change
    * quantization parameter for each segment within a frame to improve the
    * subjective quality. This control makes encoder operate in one of the
    * several AQ_modes supported.
    *
-   * By default, encoder operates with AQ_Mode 0(adaptive quantization off).
+   * - 0 = disable (default)
+   * - 1 = enable
    */
-  AV1E_SET_AQ_MODE,
+  AV1E_SET_AQ_MODE = 40,
 
-  /*!\brief Codec control function to enable/disable periodic Q boost.
+  /*!\brief Codec control function to enable/disable periodic Q boost, unsigned
+   * int parameter
    *
    * One AV1 encoder speed feature is to enable quality boost by lowering
    * frame level Q periodically. This control function provides a mean to
    * turn on/off this feature.
-   *               0 = off
-   *               1 = on
    *
-   * By default, the encoder is allowed to use this feature for appropriate
-   * encoding modes.
+   * - 0 = disable (default)
+   * - 1 = enable
    */
-  AV1E_SET_FRAME_PERIODIC_BOOST,
+  AV1E_SET_FRAME_PERIODIC_BOOST = 41,
 
-  /*!\brief Codec control function to set noise sensitivity.
+  /*!\brief Codec control function to set noise sensitivity, unsigned int
+   * parameter
    *
-   *  0: off, 1: On(YOnly)
+   * - 0 = disable (default)
+   * - 1 = enable (Y only)
    */
-  AV1E_SET_NOISE_SENSITIVITY,
+  AV1E_SET_NOISE_SENSITIVITY = 42,
 
-  /*!\brief Codec control function to set content type.
-   * \note Valid parameter range:
-   *              AOM_CONTENT_DEFAULT = Regular video content (Default)
-   *              AOM_CONTENT_SCREEN  = Screen capture content
-   */
-  AV1E_SET_TUNE_CONTENT,
-
-  /*!\brief Codec control function to set CDF update mode.
+  /*!\brief Codec control function to set content type, aom_tune_content
+   * parameter
    *
-   *  0: no update          1: update on every frame
-   *  2: selectively update
+   *  - AOM_CONTENT_DEFAULT = Regular video content (default)
+   *  - AOM_CONTENT_SCREEN  = Screen capture content
    */
-  AV1E_SET_CDF_UPDATE_MODE,
+  AV1E_SET_TUNE_CONTENT = 43,
 
-  /*!\brief Codec control function to set color space info.
-   * \note Valid ranges: 0..23, default is "Unspecified".
-   *                     0 = For future use
-   *                     1 = BT.709
-   *                     2 = Unspecified
-   *                     3 = For future use
-   *                     4 = BT.470 System M (historical)
-   *                     5 = BT.470 System B, G (historical)
-   *                     6 = BT.601
-   *                     7 = SMPTE 240
-   *                     8 = Generic film (color filters using illuminant C)
-   *                     9 = BT.2020, BT.2100
-   *                     10 = SMPTE 428 (CIE 1921 XYZ)
-   *                     11 = SMPTE RP 431-2
-   *                     12 = SMPTE EG 432-1
-   *                     13 = For future use (values 13 - 21)
-   *                     22 = EBU Tech. 3213-E
-   *                     23 = For future use
+  /*!\brief Codec control function to set CDF update mode, unsigned int
+   * parameter
    *
+   *  - 0: no update
+   *  - 1: update on every frame (default)
+   *  - 2: selectively update
    */
-  AV1E_SET_COLOR_PRIMARIES,
+  AV1E_SET_CDF_UPDATE_MODE = 44,
 
-  /*!\brief Codec control function to set transfer function info.
-   * \note Valid ranges: 0..19, default is "Unspecified".
-   *                     0 = For future use
-   *                     1 = BT.709
-   *                     2 = Unspecified
-   *                     3 = For future use
-   *                     4 = BT.470 System M (historical)
-   *                     5 = BT.470 System B, G (historical)
-   *                     6 = BT.601
-   *                     7 = SMPTE 240 M
-   *                     8 = Linear
-   *                     9 = Logarithmic (100 : 1 range)
-   *                     10 = Logarithmic (100 * Sqrt(10) : 1 range)
-   *                     11 = IEC 61966-2-4
-   *                     12 = BT.1361
-   *                     13 = sRGB or sYCC
-   *                     14 = BT.2020 10-bit systems
-   *                     15 = BT.2020 12-bit systems
-   *                     16 = SMPTE ST 2084, ITU BT.2100 PQ
-   *                     17 = SMPTE ST 428
-   *                     18 = BT.2100 HLG, ARIB STD-B67
-   *                     19 = For future use
+  /*!\brief Codec control function to set color primaries info, int parameter
    *
+   *  - 0 = For future use
+   *  - 1 = BT.709
+   *  - 2 = Unspecified (default)
+   *  - 3 = For future use
+   *  - 4 = BT.470 System M (historical)
+   *  - 5 = BT.470 System B, G (historical)
+   *  - 6 = BT.601
+   *  - 7 = SMPTE 240
+   *  - 8 = Generic film (color filters using illuminant C)
+   *  - 9 = BT.2020, BT.2100
+   *  - 10 = SMPTE 428 (CIE 1931 XYZ)
+   *  - 11 = SMPTE RP 431-2
+   *  - 12 = SMPTE EG 432-1
+   *  - 13..21 = For future use
+   *  - 22 = EBU Tech. 3213-E
+   *  - 23 = For future use
    */
-  AV1E_SET_TRANSFER_CHARACTERISTICS,
+  AV1E_SET_COLOR_PRIMARIES = 45,
 
-  /*!\brief Codec control function to set transfer function info.
-   * \note Valid ranges: 0..15, default is "Unspecified".
-   *                     0 = Identity matrix
-   *                     1 = BT.709
-   *                     2 = Unspecified
-   *                     3 = For future use
-   *                     4 = US FCC 73.628
-   *                     5 = BT.470 System B, G (historical)
-   *                     6 = BT.601
-   *                     7 = SMPTE 240 M
-   *                     8 = YCgCo
-   *                     9 = BT.2020 non-constant luminance, BT.2100 YCbCr
-   *                     10 = BT.2020 constant luminance
-   *                     11 = SMPTE ST 2085 YDzDx
-   *                     12 = Chromaticity-derived non-constant luminance
-   *                     13 = Chromaticity-derived constant luminance
-   *                     14 = BT.2100 ICtCp
-   *                     15 = For future use
+  /*!\brief Codec control function to set transfer function info, int parameter
    *
+   * - 0 = For future use
+   * - 1 = BT.709
+   * - 2 = Unspecified (default)
+   * - 3 = For future use
+   * - 4 = BT.470 System M (historical)
+   * - 5 = BT.470 System B, G (historical)
+   * - 6 = BT.601
+   * - 7 = SMPTE 240 M
+   * - 8 = Linear
+   * - 9 = Logarithmic (100 : 1 range)
+   * - 10 = Logarithmic (100 * Sqrt(10) : 1 range)
+   * - 11 = IEC 61966-2-4
+   * - 12 = BT.1361
+   * - 13 = sRGB or sYCC
+   * - 14 = BT.2020 10-bit systems
+   * - 15 = BT.2020 12-bit systems
+   * - 16 = SMPTE ST 2084, ITU BT.2100 PQ
+   * - 17 = SMPTE ST 428
+   * - 18 = BT.2100 HLG, ARIB STD-B67
+   * - 19 = For future use
    */
-  AV1E_SET_MATRIX_COEFFICIENTS,
+  AV1E_SET_TRANSFER_CHARACTERISTICS = 46,
 
-  /*!\brief Codec control function to set chroma 4:2:0 sample position info.
-   * \note Valid ranges: 0..3, default is "UNKNOWN".
-   *                     0 = UNKNOWN,
-   *                     1 = VERTICAL
-   *                     2 = COLOCATED
-   *                     3 = RESERVED
+  /*!\brief Codec control function to set matrix coefficients info, int
+   * parameter
+   *
+   * - 0 = Identity matrix
+   * - 1 = BT.709
+   * - 2 = Unspecified (default)
+   * - 3 = For future use
+   * - 4 = US FCC 73.628
+   * - 5 = BT.470 System B, G (historical)
+   * - 6 = BT.601
+   * - 7 = SMPTE 240 M
+   * - 8 = YCgCo
+   * - 9 = BT.2020 non-constant luminance, BT.2100 YCbCr
+   * - 10 = BT.2020 constant luminance
+   * - 11 = SMPTE ST 2085 YDzDx
+   * - 12 = Chromaticity-derived non-constant luminance
+   * - 13 = Chromaticity-derived constant luminance
+   * - 14 = BT.2100 ICtCp
+   * - 15 = For future use
    */
-  AV1E_SET_CHROMA_SAMPLE_POSITION,
+  AV1E_SET_MATRIX_COEFFICIENTS = 47,
 
-  /*!\brief Codec control function to set minimum interval between GF/ARF frames
+  /*!\brief Codec control function to set chroma 4:2:0 sample position info,
+   * aom_chroma_sample_position_t parameter
+   *
+   * AOM_CSP_UNKNOWN is default
+   */
+  AV1E_SET_CHROMA_SAMPLE_POSITION = 48,
+
+  /*!\brief Codec control function to set minimum interval between GF/ARF
+   * frames, unsigned int parameter
    *
    * By default the value is set as 4.
    */
-  AV1E_SET_MIN_GF_INTERVAL,
+  AV1E_SET_MIN_GF_INTERVAL = 49,
 
-  /*!\brief Codec control function to set minimum interval between GF/ARF frames
+  /*!\brief Codec control function to set minimum interval between GF/ARF
+   * frames, unsigned int parameter
    *
    * By default the value is set as 16.
    */
-  AV1E_SET_MAX_GF_INTERVAL,
+  AV1E_SET_MAX_GF_INTERVAL = 50,
 
-  /*!\brief Codec control function to get an Active map back from the encoder.
+  /*!\brief Codec control function to get an active map back from the encoder,
+    aom_active_map_t* parameter
    */
-  AV1E_GET_ACTIVEMAP,
+  AV1E_GET_ACTIVEMAP = 51,
 
-  /*!\brief Codec control function to set color range bit.
-   * \note Valid ranges: 0..1, default is 0
-   *                     0 = Limited range (16..235 or HBD equivalent)
-   *                     1 = Full range (0..255 or HBD equivalent)
+  /*!\brief Codec control function to set color range bit, int parameter
+   *
+   * - 0 = Limited range, 16..235 or HBD equivalent (default)
+   * - 1 = Full range, 0..255 or HBD equivalent
    */
-  AV1E_SET_COLOR_RANGE,
+  AV1E_SET_COLOR_RANGE = 52,
 
-  /*!\brief Codec control function to set intended rendering image size.
+  /*!\brief Codec control function to set intended rendering image size,
+   * int32_t[2] parameter
    *
    * By default, this is identical to the image size in pixels.
    */
-  AV1E_SET_RENDER_SIZE,
+  AV1E_SET_RENDER_SIZE = 53,
 
   /*!\brief Control to set target sequence level index for a certain operating
-   * point(OP).
+   * point(OP), int parameter
    * Possible values are in the form of "ABxy"(pad leading zeros if less than
    * 4 digits).
-   *   AB: OP index.
-   *   xy: Target level index for the OP. Can be values 0~23(corresponding to
-   *   level 2.0 ~ 7.3) or 31(maximum level parameter, no level-based
-   *   constraints).
-   * E.g. "0" means target level index 0 for the 0th OP;
-   *      "111" means target level index 11 for the 1st OP;
-   *      "1021" means target level index 21 for the 10th OP.
+   *  - AB: OP index.
+   *  - xy: Target level index for the OP. Can be values 0~23(corresponding to
+   *    level 2.0 ~ 7.3) or 24(keep level stats only for level monitoring) or
+   *    31(maximum level parameter, no level-based constraints).
+   *
+   * E.g.:
+   * - "0" means target level index 0 for the 0th OP;
+   * - "111" means target level index 11 for the 1st OP;
+   * - "1021" means target level index 21 for the 10th OP.
+   *
    * If the target level is not specified for an OP, the maximum level parameter
    * of 31 is used as default.
    */
-  AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+  AV1E_SET_TARGET_SEQ_LEVEL_IDX = 54,
 
-  /*!\brief Codec control function to get sequence level index.
+  /*!\brief Codec control function to get sequence level index for each
+   * operating point. int* parameter. There can be at most 32 operating points.
+   * The results will be written into a provided integer array of sufficient
+   * size.
    */
-  AV1E_GET_SEQ_LEVEL_IDX,
+  AV1E_GET_SEQ_LEVEL_IDX = 55,
 
-  /*!\brief Codec control function to set intended superblock size.
+  /*!\brief Codec control function to set intended superblock size, unsigned int
+   * parameter
    *
    * By default, the superblock size is determined separately for each
    * frame by the encoder.
-   *
-   * Experiment: EXT_PARTITION
    */
-  AV1E_SET_SUPERBLOCK_SIZE,
+  AV1E_SET_SUPERBLOCK_SIZE = 56,
 
-  /*!\brief Codec control function to enable automatic set and use
-   * bwd-pred frames.
+  /*!\brief Codec control function to enable automatic set and use of
+   * bwd-pred frames, unsigned int parameter
    *
+   * - 0 = disable (default)
+   * - 1 = enable
    */
-  AOME_SET_ENABLEAUTOBWDREF,
+  AOME_SET_ENABLEAUTOBWDREF = 57,
 
-  /*!\brief Codec control function to encode with CDEF.
+  /*!\brief Codec control function to encode with CDEF, unsigned int parameter
    *
    * CDEF is the constrained directional enhancement filter which is an
    * in-loop filter aiming to remove coding artifacts
-   *                          0 = do not apply CDEF
-   *                          1 = apply CDEF
    *
-   *  By default, the encoder applies CDEF.
-   *
-   * Experiment: AOM_CDEF
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_CDEF,
+  AV1E_SET_ENABLE_CDEF = 58,
 
-  /*!\brief Codec control function to encode with Loop Restoration Filter.
+  /*!\brief Codec control function to encode with Loop Restoration Filter,
+   * unsigned int parameter
    *
-   *                          0 = do not apply Restoration Filter
-   *                          1 = apply Restoration Filter
-   *
-   *  By default, the encoder applies Restoration Filter.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_RESTORATION,
+  AV1E_SET_ENABLE_RESTORATION = 59,
 
-  /*!\brief Codec control function to predict with OBMC mode.
+  /*!\brief Codec control function to force video mode, unsigned int parameter
    *
-   *                          0 = do not allow OBMC mode
-   *                          1 = allow OBMC mode
-   *
-   *  By default, the encoder allows OBMC prediction mode.
-   *
+   * - 0 = do not force video mode (default)
+   * - 1 = force video mode even for a single frame
    */
-  AV1E_SET_ENABLE_OBMC,
+  AV1E_SET_FORCE_VIDEO_MODE = 60,
 
-  /*!\brief Codec control function to encode without trellis quantization.
+  /*!\brief Codec control function to predict with OBMC mode, unsigned int
+   * parameter
    *
-   *                          0 = apply trellis quantization
-   *                          1 = do not apply trellis quantization
-   *                          2 = disable trellis quantization partially
-   *
-   *  By default, the encoder applies optimization on quantized
-   *  coefficients.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_DISABLE_TRELLIS_QUANT,
+  AV1E_SET_ENABLE_OBMC = 61,
 
-  /*!\brief Codec control function to encode with quantisation matrices.
+  /*!\brief Codec control function to encode without trellis quantization,
+   * unsigned int parameter
+   *
+   * - 0 = apply trellis quantization (default)
+   * - 1 = do not apply trellis quantization
+   * - 2 = disable trellis quantization in rd search
+   * - 3 = disable trellis quantization in estimate yrd
+   */
+  AV1E_SET_DISABLE_TRELLIS_QUANT = 62,
+
+  /*!\brief Codec control function to encode with quantisation matrices,
+   * unsigned int parameter
    *
    * AOM can operate with default quantisation matrices dependent on
    * quantisation level and block type.
-   *                          0 = do not use quantisation matrices
-   *                          1 = use quantisation matrices
    *
-   *  By default, the encoder operates without quantisation matrices.
-   *
-   * Experiment: AOM_QM
+   * - 0 = disable (default)
+   * - 1 = enable
    */
+  AV1E_SET_ENABLE_QM = 63,
 
-  AV1E_SET_ENABLE_QM,
-
-  /*!\brief Codec control function to set the min quant matrix flatness.
+  /*!\brief Codec control function to set the min quant matrix flatness,
+   * unsigned int parameter
    *
    * AOM can operate with different ranges of quantisation matrices.
    * As quantisation levels increase, the matrices get flatter. This
    * control sets the minimum level of flatness from which the matrices
    * are determined.
    *
-   *  By default, the encoder sets this minimum at half the available
-   *  range.
-   *
-   * Experiment: AOM_QM
+   * By default, the encoder sets this minimum at half the available
+   * range.
    */
-  AV1E_SET_QM_MIN,
+  AV1E_SET_QM_MIN = 64,
 
-  /*!\brief Codec control function to set the max quant matrix flatness.
+  /*!\brief Codec control function to set the max quant matrix flatness,
+   * unsigned int parameter
    *
    * AOM can operate with different ranges of quantisation matrices.
    * As quantisation levels increase, the matrices get flatter. This
@@ -628,470 +687,575 @@
    *
    * By default, the encoder sets this maximum at the top of the
    * available range.
-   *
-   * Experiment: AOM_QM
    */
-  AV1E_SET_QM_MAX,
+  AV1E_SET_QM_MAX = 65,
 
-  /*!\brief Codec control function to set the min quant matrix flatness.
+  /*!\brief Codec control function to set the min quant matrix flatness,
+   * unsigned int parameter
    *
    * AOM can operate with different ranges of quantisation matrices.
    * As quantisation levels increase, the matrices get flatter. This
    * control sets the flatness for luma (Y).
    *
-   *  By default, the encoder sets this minimum at half the available
-   *  range.
-   *
-   * Experiment: AOM_QM
+   * By default, the encoder sets this minimum at half the available
+   * range.
    */
-  AV1E_SET_QM_Y,
+  AV1E_SET_QM_Y = 66,
 
-  /*!\brief Codec control function to set the min quant matrix flatness.
+  /*!\brief Codec control function to set the min quant matrix flatness,
+   * unsigned int parameter
    *
    * AOM can operate with different ranges of quantisation matrices.
    * As quantisation levels increase, the matrices get flatter. This
    * control sets the flatness for chroma (U).
    *
-   *  By default, the encoder sets this minimum at half the available
-   *  range.
-   *
-   * Experiment: AOM_QM
+   * By default, the encoder sets this minimum at half the available
+   * range.
    */
-  AV1E_SET_QM_U,
+  AV1E_SET_QM_U = 67,
 
-  /*!\brief Codec control function to set the min quant matrix flatness.
+  /*!\brief Codec control function to set the min quant matrix flatness,
+   * unsigned int parameter
    *
    * AOM can operate with different ranges of quantisation matrices.
    * As quantisation levels increase, the matrices get flatter. This
    * control sets the flatness for chrome (V).
    *
-   *  By default, the encoder sets this minimum at half the available
-   *  range.
-   *
-   * Experiment: AOM_QM
+   * By default, the encoder sets this minimum at half the available
+   * range.
    */
-  AV1E_SET_QM_V,
+  AV1E_SET_QM_V = 68,
 
-  /*!\brief Codec control function to encode with dist_8x8.
-   *
-   *  The dist_8x8 is enabled automatically for model tuning parameters that
-   *  require measuring distortion at the 8x8 level. This control also allows
-   *  measuring distortion at the 8x8 level for other tuning options
-   *  (e.g., PSNR), for testing purposes.
-   *                          0 = do not use dist_8x8
-   *                          1 = use dist_8x8
-   *
-   *  By default, the encoder does not use dist_8x8
-   *
-   * Experiment: DIST_8X8
-   */
-  AV1E_SET_ENABLE_DIST_8X8,
+  /* NOTE: enum 69 unused */
 
-  /*!\brief Codec control function to set a maximum number of tile groups.
+  /*!\brief Codec control function to set a maximum number of tile groups,
+   * unsigned int parameter
    *
    * This will set the maximum number of tile groups. This will be
    * overridden if an MTU size is set. The default value is 1.
-   *
-   * Experiment: TILE_GROUPS
    */
-  AV1E_SET_NUM_TG,
+  AV1E_SET_NUM_TG = 70,
 
-  /*!\brief Codec control function to set an MTU size for a tile group.
+  /*!\brief Codec control function to set an MTU size for a tile group, unsigned
+   * int parameter
    *
    * This will set the maximum number of bytes in a tile group. This can be
    * exceeded only if a single tile is larger than this amount.
    *
    * By default, the value is 0, in which case a fixed number of tile groups
    * is used.
-   *
-   * Experiment: TILE_GROUPS
    */
-  AV1E_SET_MTU,
+  AV1E_SET_MTU = 71,
 
-  /*!\brief Codec control function to set the number of symbols in an ANS data
-   * window.
+  /* NOTE: enum 72 unused */
+
+  /*!\brief Codec control function to enable/disable rectangular partitions, int
+   * parameter
    *
-   * The number of ANS symbols (both boolean and non-booleans alphabets) in an
-   * ANS data window is set to 1 << value.
-   *
-   * \note Valid range: [8, 23]
-   *
-   * Experiment: ANS
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ANS_WINDOW_SIZE_LOG2,
+  AV1E_SET_ENABLE_RECT_PARTITIONS = 73,
 
-  /*!\brief Codec control function to enable/disable rectangular partitions.
+  /*!\brief Codec control function to enable/disable AB partitions, int
+   * parameter
    *
-   * This will enable or disable usage of rectangular partitions. The default
-   * value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_RECT_PARTITIONS,
+  AV1E_SET_ENABLE_AB_PARTITIONS = 74,
 
-  /*!\brief Codec control function to enable/disable AB partitions.
+  /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions, int
+   * parameter
    *
-   * This will enable or disable usage of AB partitions. The default
-   * value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_AB_PARTITIONS,
+  AV1E_SET_ENABLE_1TO4_PARTITIONS = 75,
 
-  /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions.
+  /*!\brief Codec control function to set min partition size, int parameter
    *
-   * This will enable or disable usage of 1:4 and 4:1 partitions. The default
-   * value is 1.
-   *
-   */
-  AV1E_SET_ENABLE_1TO4_PARTITIONS,
-
-  /*!\brief Codec control function to set min partition size.
-   *
-   * This will set min partition size. The default value is 4 for 4x4.
-   * valid values are [4, 8, 16, 32, 64, 128]
    * min_partition_size is applied to both width and height of the partition.
    * i.e, both width and height of a partition can not be smaller than
    * the min_partition_size, except the partition at the picture boundary.
    *
+   * Valid values: [4, 8, 16, 32, 64, 128]. The default value is 4 for
+   * 4x4.
    */
-  AV1E_SET_MIN_PARTITION_SIZE,
+  AV1E_SET_MIN_PARTITION_SIZE = 76,
 
-  /*!\brief Codec control function to set max partition size.
+  /*!\brief Codec control function to set max partition size, int parameter
    *
-   * This will set max partition size. The default value is 128 for 128x128.
-   * valid values are [4, 8, 16, 32, 64, 128]
    * max_partition_size is applied to both width and height of the partition.
    * i.e, both width and height of a partition can not be larger than
    * the max_partition_size.
+   *
+   * Valid values: [4, 8, 16, 32, 64, 128]. The default value is 128 for
+   * 128x128.
    */
-  AV1E_SET_MAX_PARTITION_SIZE,
+  AV1E_SET_MAX_PARTITION_SIZE = 77,
 
   /*!\brief Codec control function to turn on / off intra edge filter
-   * at sequence level.
+   * at sequence level, int parameter
    *
-   * This will enable or disable usage of intra-edge filtering. The default
-   * value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_INTRA_EDGE_FILTER,
+  AV1E_SET_ENABLE_INTRA_EDGE_FILTER = 78,
 
-  /*!\brief Codec control function to turn on / off frame order hint for a
-   * few tools:
-   *
-   * joint compound mode
-   * motion field motion vector
+  /*!\brief Codec control function to turn on / off frame order hint (int
+   * parameter). Affects: joint compound mode, motion field motion vector,
    * ref frame sign bias
    *
-   * The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_ORDER_HINT,
+  AV1E_SET_ENABLE_ORDER_HINT = 79,
 
-  /*!\brief Codec control function to turn on / off 64-length transforms.
+  /*!\brief Codec control function to turn on / off 64-length transforms, int
+   * parameter
    *
    * This will enable or disable usage of length 64 transforms in any
-   * direction. The default value is 1.
+   * direction.
    *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_TX64,
+  AV1E_SET_ENABLE_TX64 = 80,
 
   /*!\brief Codec control function to turn on / off flip and identity
-   * transforms.
+   * transforms, int parameter
    *
    * This will enable or disable usage of flip and identity transform
-   * types in any direction. The default value is 1. Including:
-   * FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, ADST_FLIPADST,
-   * FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST,
-   * H_FLIPADST
-   */
-  AV1E_SET_ENABLE_FLIP_IDTX,
-
-  /*!\brief Codec control function to set transform block size search method.
+   * types in any direction. If enabled, this includes:
+   * - FLIPADST_DCT
+   * - DCT_FLIPADST
+   * - FLIPADST_FLIPADST
+   * - ADST_FLIPADST
+   * - FLIPADST_ADST
+   * - IDTX
+   * - V_DCT
+   * - H_DCT
+   * - V_ADST
+   * - H_ADST
+   * - V_FLIPADST
+   * - H_FLIPADST
    *
-   * This will set the transform block size search method.
-   * 0: use Full RD search, 1: use Fast RD search, 2: always use largest
-   * allowed transform block size based on partition size.
+   * Valid values:
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_TX_SIZE_SEARCH_METHOD,
+  AV1E_SET_ENABLE_FLIP_IDTX = 81,
+
+  /* Note: enum value 82 unused */
 
   /*!\brief Codec control function to turn on / off dist-wtd compound mode
-   * at sequence level.
+   * at sequence level, int parameter
    *
-   * This will enable or disable distance-weighted compound mode. The default
-   * value is 1. If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced
+   * This will enable or disable distance-weighted compound mode.
+   * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced
    * to 0.
    *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_DIST_WTD_COMP,
+  AV1E_SET_ENABLE_DIST_WTD_COMP = 83,
 
   /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage
-   * at sequence level.
+   * at sequence level, int parameter
    *
-   * This will enable or disable usage of MFMV. The default value is 1.
-   * If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced to 0.
+   * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced
+   * to 0.
    *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_REF_FRAME_MVS,
+  AV1E_SET_ENABLE_REF_FRAME_MVS = 84,
 
   /*!\brief Codec control function to set temporal mv prediction
-   * enabling/disabling at frame level.
+   * enabling/disabling at frame level, int parameter
    *
-   * This will enable or disable temporal mv predicton. The default value is 1.
-   * If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is forced to 0.
+   * \attention If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is
+   * forced to 0.
    *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ALLOW_REF_FRAME_MVS,
+  AV1E_SET_ALLOW_REF_FRAME_MVS = 85,
 
-  /*!\brief Codec control function to turn on / off dual filter usage
-   * for a sequence.
+  /*!\brief Codec control function to turn on / off dual interpolation filter
+   * for a sequence, int parameter
    *
-   * This will enable or disable use of dual interpolation filter.
-   * The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable
    */
-  AV1E_SET_ENABLE_DUAL_FILTER,
+  AV1E_SET_ENABLE_DUAL_FILTER = 86,
+
+  /*!\brief Codec control function to turn on / off delta quantization in chroma
+   * planes usage for a sequence, int parameter
+   *
+   * - 0 = disable (default)
+   * - 1 = enable
+   */
+  AV1E_SET_ENABLE_CHROMA_DELTAQ = 87,
 
   /*!\brief Codec control function to turn on / off masked compound usage
-   * for a sequence.
+   * (wedge and diff-wtd compound modes) for a sequence, int parameter
    *
-   * This will enable or disable usage of wedge and diff-wtd compound
-   * modes. The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_MASKED_COMP,
+  AV1E_SET_ENABLE_MASKED_COMP = 88,
 
   /*!\brief Codec control function to turn on / off one sided compound usage
-   * for a sequence.
+   * for a sequence, int parameter
    *
-   * This will enable or disable usage of one sided compound
-   * modes. The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_ONESIDED_COMP,
+  AV1E_SET_ENABLE_ONESIDED_COMP = 89,
 
   /*!\brief Codec control function to turn on / off interintra compound
-   * for a sequence.
+   * for a sequence, int parameter
    *
-   * This will enable or disable usage of inter-intra compound modes.
-   * The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_INTERINTRA_COMP,
+  AV1E_SET_ENABLE_INTERINTRA_COMP = 90,
 
   /*!\brief Codec control function to turn on / off smooth inter-intra
-   * mode for a sequence.
+   * mode for a sequence, int parameter
    *
-   * This will enable or disable usage of smooth inter-intra mode.
-   * The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_SMOOTH_INTERINTRA,
+  AV1E_SET_ENABLE_SMOOTH_INTERINTRA = 91,
 
   /*!\brief Codec control function to turn on / off difference weighted
-   * compound.
+   * compound, int parameter
    *
-   * This will enable or disable usage of difference weighted compound.
-   * The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_DIFF_WTD_COMP,
+  AV1E_SET_ENABLE_DIFF_WTD_COMP = 92,
 
   /*!\brief Codec control function to turn on / off interinter wedge
-   * compound.
+   * compound, int parameter
    *
-   * This will enable or disable usage of interinter wedge compound.
-   * The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_INTERINTER_WEDGE,
+  AV1E_SET_ENABLE_INTERINTER_WEDGE = 93,
 
   /*!\brief Codec control function to turn on / off interintra wedge
-   * compound.
+   * compound, int parameter
    *
-   * This will enable or disable usage of interintra wedge compound.
-   * The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_INTERINTRA_WEDGE,
+  AV1E_SET_ENABLE_INTERINTRA_WEDGE = 94,
 
   /*!\brief Codec control function to turn on / off global motion usage
-   * for a sequence.
+   * for a sequence, int parameter
    *
-   * This will enable or disable usage of global motion. The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_GLOBAL_MOTION,
+  AV1E_SET_ENABLE_GLOBAL_MOTION = 95,
 
   /*!\brief Codec control function to turn on / off warped motion usage
-   * at sequence level.
+   * at sequence level, int parameter
    *
-   * This will enable or disable usage of warped motion. The default value is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_WARPED_MOTION,
+  AV1E_SET_ENABLE_WARPED_MOTION = 96,
 
   /*!\brief Codec control function to turn on / off warped motion usage
-   * at frame level.
+   * at frame level, int parameter
    *
-   * This will enable or disable usage of warped motion. The default value is 1.
-   * If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is forced to 0.
+   * \attention If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is
+   * forced to 0.
    *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ALLOW_WARPED_MOTION,
+  AV1E_SET_ALLOW_WARPED_MOTION = 97,
 
   /*!\brief Codec control function to turn on / off filter intra usage at
-   * sequence level.
+   * sequence level, int parameter
    *
-   * This will enable or disable usage of filter intra. The default value is 1.
-   * If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is forced to 0.
+   * \attention If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is
+   * forced to 0.
    *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_FILTER_INTRA,
+  AV1E_SET_ENABLE_FILTER_INTRA = 98,
 
-  /*!\brief Codec control function to turn on / off smooth intra modes usage.
+  /*!\brief Codec control function to turn on / off smooth intra modes usage,
+   * int parameter
    *
    * This will enable or disable usage of smooth, smooth_h and smooth_v intra
-   * modes. The default value is 1.
+   * modes.
    *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_SMOOTH_INTRA,
+  AV1E_SET_ENABLE_SMOOTH_INTRA = 99,
 
-  /*!\brief Codec control function to turn on / off Paeth intra mode usage.
+  /*!\brief Codec control function to turn on / off Paeth intra mode usage, int
+   * parameter
    *
-   * This will enable or disable usage of Paeth intra mode. The default value
-   * is 1.
-   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_PAETH_INTRA,
+  AV1E_SET_ENABLE_PAETH_INTRA = 100,
 
-  /*!\brief Codec control function to turn on / off CFL uv intra mode usage.
+  /*!\brief Codec control function to turn on / off CFL uv intra mode usage, int
+   * parameter
    *
-   * This will enable or disable usage of chroma-from-luma intra mode. The
-   * default value is 1.
+   * This will enable or disable usage of chroma-from-luma intra mode.
    *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_CFL_INTRA,
+  AV1E_SET_ENABLE_CFL_INTRA = 101,
 
-  /*!\brief Codec control function to turn on / off frame superresolution.
+  /*!\brief Codec control function to turn on / off frame superresolution, int
+   * parameter
    *
-   * This will enable or disable frame superresolution. The default value is 1
-   * If AV1E_SET_ENABLE_SUPERRES is 0, then this flag is forced to 0.
+   * \attention If AV1E_SET_ENABLE_SUPERRES is 0, then this flag is forced to 0.
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
    */
-  AV1E_SET_ENABLE_SUPERRES,
+  AV1E_SET_ENABLE_SUPERRES = 102,
 
-  /*!\brief Codec control function to turn on/off palette mode */
-  AV1E_SET_ENABLE_PALETTE,
-
-  /*!\brief Codec control function to turn on/off intra block copy mode */
-  AV1E_SET_ENABLE_INTRABC,
-
-  /*!\brief Codec control function to turn on/off intra angle delta */
-  AV1E_SET_ENABLE_ANGLE_DELTA,
-
-  /*!\brief Codec control function to set the delta q mode
+  /*!\brief Codec control function to turn on / off overlay frames for
+   * filtered ALTREF frames, int parameter
    *
-   * AV1 has a segment based feature that allows encoder to adaptively change
-   * quantization parameter for each segment within a frame to improve the
-   * subjective quality. the delta q mode is added on top of segment based
-   * feature, and allows control per 64x64 q and lf delta.This control makes
-   * encoder operate in one of the several DELTA_Q_modes supported.
-   *
-   * By default, encoder operates with DELTAQ_Mode 0(deltaq signaling off).
+   * This will enable or disable coding of overlay frames for filtered ALTREF
+   * frames. When set to 0, overlay frames are not used but show existing frame
+   * is used to display the filtered ALTREF frame as is. As a result the decoded
+   * frame rate remains the same as the display frame rate. The default is 1.
    */
-  AV1E_SET_DELTAQ_MODE,
+  AV1E_SET_ENABLE_OVERLAY = 103,
 
-  /*!\brief Codec control function to set the single tile decoding mode to 0 or
-   * 1.
-   *
-   * 0 means that the single tile decoding is off, and 1 means that the single
-   * tile decoding is on.
-   *
-   * Experiment: EXT_TILE
-   */
-  AV1E_SET_SINGLE_TILE_DECODING,
+  /*!\brief Codec control function to turn on/off palette mode, int parameter */
+  AV1E_SET_ENABLE_PALETTE = 104,
 
-  /*!\brief Codec control function to enable the extreme motion vector unit test
-   * in AV1. Please note that this is only used in motion vector unit test.
+  /*!\brief Codec control function to turn on/off intra block copy mode, int
+     parameter */
+  AV1E_SET_ENABLE_INTRABC = 105,
+
+  /*!\brief Codec control function to turn on/off intra angle delta, int
+     parameter */
+  AV1E_SET_ENABLE_ANGLE_DELTA = 106,
+
+  /*!\brief Codec control function to set the delta q mode, unsigned int
+   * parameter
    *
-   * 0 : off, 1 : MAX_EXTREME_MV, 2 : MIN_EXTREME_MV
+   * AV1 supports a delta q mode feature, that allows modulating q per
+   * superblock.
+   *
+   * - 0 = deltaq signaling off
+   * - 1 = use modulation to maximize objective quality (default)
+   * - 2 = use modulation to maximize perceptual quality
    */
-  AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST,
+  AV1E_SET_DELTAQ_MODE = 107,
+
+  /*!\brief Codec control function to turn on/off loopfilter modulation
+   * when delta q modulation is enabled, unsigned int parameter.
+   *
+   * \attention AV1 only supports loopfilter modulation when delta q
+   * modulation is enabled as well.
+   */
+  AV1E_SET_DELTALF_MODE = 108,
+
+  /*!\brief Codec control function to set the single tile decoding mode,
+   * unsigned int parameter
+   *
+   * \attention Only applicable if large scale tiling is on.
+   *
+   * - 0 = single tile decoding is off
+   * - 1 = single tile decoding is on (default)
+   */
+  AV1E_SET_SINGLE_TILE_DECODING = 109,
+
+  /*!\brief Codec control function to enable the extreme motion vector unit
+   * test, unsigned int parameter
+   *
+   * - 0 = off
+   * - 1 = MAX_EXTREME_MV
+   * - 2 = MIN_EXTREME_MV
+   *
+   * \note This is only used in motion vector unit test.
+   */
+  AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST = 110,
 
   /*!\brief Codec control function to signal picture timing info in the
-   * bitstream. \note Valid ranges: 0..1, default is "UNKNOWN". 0 = UNKNOWN, 1 =
-   * EQUAL
+   * bitstream, aom_timing_info_type_t parameter. Default is
+   * AOM_TIMING_UNSPECIFIED.
    */
-  AV1E_SET_TIMING_INFO_TYPE,
+  AV1E_SET_TIMING_INFO_TYPE = 111,
 
   /*!\brief Codec control function to add film grain parameters (one of several
-   * preset types) info in the bitstream.
-   * \note Valid ranges: 0..11, default is "0". 0 = UNKNOWN,
-   * 1..16 = different test vectors for grain
+   * preset types) info in the bitstream, int parameter
+   *
+   * Valid range: 0..16, 0 is unknown, 1..16 are test vectors
    */
-  AV1E_SET_FILM_GRAIN_TEST_VECTOR,
+  AV1E_SET_FILM_GRAIN_TEST_VECTOR = 112,
 
-  /*!\brief Codec control function to set the path to the film grain parameters
+  /*!\brief Codec control function to set the path to the film grain parameters,
+   * const char* parameter
    */
-  AV1E_SET_FILM_GRAIN_TABLE,
+  AV1E_SET_FILM_GRAIN_TABLE = 113,
 
-  /*!\brief Sets the noise level */
-  AV1E_SET_DENOISE_NOISE_LEVEL,
+  /*!\brief Sets the noise level, int parameter */
+  AV1E_SET_DENOISE_NOISE_LEVEL = 114,
 
-  /*!\brief Sets the denoisers block size */
-  AV1E_SET_DENOISE_BLOCK_SIZE,
+  /*!\brief Sets the denoisers block size, unsigned int parameter */
+  AV1E_SET_DENOISE_BLOCK_SIZE = 115,
 
-  /*!\brief Sets the chroma subsampling x value */
-  AV1E_SET_CHROMA_SUBSAMPLING_X,
+  /*!\brief Sets the chroma subsampling x value, unsigned int parameter */
+  AV1E_SET_CHROMA_SUBSAMPLING_X = 116,
 
-  /*!\brief Sets the chroma subsampling y value */
-  AV1E_SET_CHROMA_SUBSAMPLING_Y,
+  /*!\brief Sets the chroma subsampling y value, unsigned int parameter */
+  AV1E_SET_CHROMA_SUBSAMPLING_Y = 117,
 
-  /*!\brief Control to use a reduced tx type set */
-  AV1E_SET_REDUCED_TX_TYPE_SET,
+  /*!\brief Control to use a reduced tx type set, int parameter */
+  AV1E_SET_REDUCED_TX_TYPE_SET = 118,
 
-  /*!\brief Control to use dct only for intra modes */
-  AV1E_SET_INTRA_DCT_ONLY,
+  /*!\brief Control to use dct only for intra modes, int parameter */
+  AV1E_SET_INTRA_DCT_ONLY = 119,
 
-  /*!\brief Control to use dct only for inter modes */
-  AV1E_SET_INTER_DCT_ONLY,
+  /*!\brief Control to use dct only for inter modes, int parameter */
+  AV1E_SET_INTER_DCT_ONLY = 120,
 
-  /*!\brief Control to use default tx type only for intra modes */
-  AV1E_SET_INTRA_DEFAULT_TX_ONLY,
-
-  /*!\brief Control to use adaptive quantize_b */
-  AV1E_SET_QUANT_B_ADAPT,
-
-  /*!\brief Control to select maximum height for the GF group pyramid structure
-   * (valid values: 0 - 4) */
-  AV1E_SET_GF_MAX_PYRAMID_HEIGHT,
-
-  /*!\brief Control to select maximum reference frames allowed per frame
-   * (valid values: 3 - 7) */
-  AV1E_SET_MAX_REFERENCE_FRAMES,
-
-  /*!\brief Control to use reduced set of single and compound references. */
-  AV1E_SET_REDUCED_REFERENCE_SET,
-
-  /*!\brief Control to set frequency of the cost updates for coefficients
-   * Possible values are:
-   * 0: Update at SB level (default)
-   * 1: Update at SB row level in tile
-   * 2: Update at tile level
+  /*!\brief Control to use default tx type only for intra modes, int parameter
    */
-  AV1E_SET_COEFF_COST_UPD_FREQ,
+  AV1E_SET_INTRA_DEFAULT_TX_ONLY = 121,
 
-  /*!\brief Control to set frequency of the cost updates for mode
-   * Possible values are:
-   * 0: Update at SB level (default)
-   * 1: Update at SB row level in tile
-   * 2: Update at tile level
+  /*!\brief Control to use adaptive quantize_b, int parameter */
+  AV1E_SET_QUANT_B_ADAPT = 122,
+
+  /*!\brief Control to select maximum height for the GF group pyramid structure,
+   * unsigned int parameter
+   *
+   * Valid range: 0..4
    */
-  AV1E_SET_MODE_COST_UPD_FREQ,
+  AV1E_SET_GF_MAX_PYRAMID_HEIGHT = 123,
+
+  /*!\brief Control to select maximum reference frames allowed per frame, int
+   * parameter
+   *
+   * Valid range: 3..7
+   */
+  AV1E_SET_MAX_REFERENCE_FRAMES = 124,
+
+  /*!\brief Control to use reduced set of single and compound references, int
+     parameter */
+  AV1E_SET_REDUCED_REFERENCE_SET = 125,
+
+  /* NOTE: enums 126-139 unused */
+  /* NOTE: Need a gap in enum values to avoid conflict with 128, 129, 130 */
+
+  /*!\brief Control to set frequency of the cost updates for coefficients,
+   * unsigned int parameter
+   *
+   * - 0 = update at SB level (default)
+   * - 1 = update at SB row level in tile
+   * - 2 = update at tile level
+   * - 3 = turn off
+   */
+  AV1E_SET_COEFF_COST_UPD_FREQ = 140,
+
+  /*!\brief Control to set frequency of the cost updates for mode, unsigned int
+   * parameter
+   *
+   * - 0 = update at SB level (default)
+   * - 1 = update at SB row level in tile
+   * - 2 = update at tile level
+   * - 3 = turn off
+   */
+  AV1E_SET_MODE_COST_UPD_FREQ = 141,
+
+  /*!\brief Control to set frequency of the cost updates for motion vectors,
+   * unsigned int parameter
+   *
+   * - 0 = update at SB level (default)
+   * - 1 = update at SB row level in tile
+   * - 2 = update at tile level
+   * - 3 = turn off
+   */
+  AV1E_SET_MV_COST_UPD_FREQ = 142,
 
   /*!\brief Control to set bit mask that specifies which tier each of the 32
-   * possible operating points conforms to.
-   * Bit value 0: Main Tier; 1: High Tier.
+   * possible operating points conforms to, unsigned int parameter
+   *
+   * - 0 = main tier (default)
+   * - 1 = high tier
    */
-  AV1E_SET_TIER_MASK,
+  AV1E_SET_TIER_MASK = 143,
+
+  /*!\brief Control to set minimum compression ratio, unsigned int parameter
+   * Take integer values. If non-zero, encoder will try to keep the compression
+   * ratio of each frame to be higher than the given value divided by 100.
+   * E.g. 850 means minimum compression ratio of 8.5.
+   */
+  AV1E_SET_MIN_CR = 144,
+
+  /* NOTE: enums 145-149 unused */
+
+  /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t*
+   * parameter
+   */
+  AV1E_SET_SVC_LAYER_ID = 150,
+
+  /*!\brief Codec control function to set SVC parameters, aom_svc_params_t*
+   * parameter
+   */
+  AV1E_SET_SVC_PARAMS = 151,
+
+  /*!\brief Codec control function to set reference frame config:
+   * the ref_idx and the refresh flags for each buffer slot.
+   * aom_svc_ref_frame_config_t* parameter
+   */
+  AV1E_SET_SVC_REF_FRAME_CONFIG = 152,
+
+  /*!\brief Codec control function to set the path to the VMAF model used when
+   * tuning the encoder for VMAF, const char* parameter
+   */
+  AV1E_SET_VMAF_MODEL_PATH = 153,
+
+  /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder,
+   * unsigned int parameter
+   *
+   * - 0 = disable (default)
+   * - 1 = enable
+   *
+   * \note This is only used in lightfield example test.
+   */
+  AV1E_ENABLE_EXT_TILE_DEBUG = 154,
+
+  /*!\brief Codec control function to enable the superblock multipass unit test
+   * in AV1 to ensure that the encoder does not leak state between different
+   * passes. unsigned int parameter.
+   *
+   * - 0 = disable (default)
+   * - 1 = enable
+   *
+   * \note This is only used in sb_multipass unit test.
+   */
+  AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 155,
+
+  /*!\brief Control to select minimum height for the GF group pyramid structure,
+   * unsigned int parameter
+   *
+   * Valid values: 0..4
+   */
+  AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 156,
 };
 
 /*!\brief aom 1-D scaling mode
@@ -1175,26 +1339,69 @@
  *
  */
 typedef enum {
-  AOM_TUNE_PSNR,
-  AOM_TUNE_SSIM,
-  AOM_TUNE_CDEF_DIST,
-  AOM_TUNE_DAALA_DIST
+  AOM_TUNE_PSNR = 0,
+  AOM_TUNE_SSIM = 1,
+  /* NOTE: enums 2 and 3 unused */
+  AOM_TUNE_VMAF_WITH_PREPROCESSING = 4,
+  AOM_TUNE_VMAF_WITHOUT_PREPROCESSING = 5,
+  AOM_TUNE_VMAF_MAX_GAIN = 6
 } aom_tune_metric;
 
+#define AOM_MAX_LAYERS 32   /**< Max number of layers */
+#define AOM_MAX_SS_LAYERS 4 /**< Max number of spatial layers */
+#define AOM_MAX_TS_LAYERS 8 /**< Max number of temporal layers */
+
+/*!\brief Struct for spatial and temporal layer ID */
+typedef struct aom_svc_layer_id {
+  int spatial_layer_id;  /**< Spatial layer ID */
+  int temporal_layer_id; /**< Temporal layer ID */
+} aom_svc_layer_id_t;
+
+/*!\brief Parameter type for SVC */
+typedef struct aom_svc_params {
+  int number_spatial_layers;                 /**< Number of spatial layers */
+  int number_temporal_layers;                /**< Number of temporal layers */
+  int max_quantizers[AOM_MAX_LAYERS];        /**< Max Q for each layer */
+  int min_quantizers[AOM_MAX_LAYERS];        /**< Min Q for each layer */
+  int scaling_factor_num[AOM_MAX_SS_LAYERS]; /**< Scaling factor-numerator */
+  int scaling_factor_den[AOM_MAX_SS_LAYERS]; /**< Scaling factor-denominator */
+  /*! Target bitrate for each layer */
+  int layer_target_bitrate[AOM_MAX_LAYERS];
+  /*! Frame rate factor for each temporal layer */
+  int framerate_factor[AOM_MAX_TS_LAYERS];
+} aom_svc_params_t;
+
+/*!\brief Parameters for setting ref frame config */
+typedef struct aom_svc_ref_frame_config {
+  // 7 references: LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2),
+  // GOLDEN_FRAME(3), BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  int reference[7]; /**< Reference flag for each of the 7 references. */
+  /*! Buffer slot index for each of 7 references. */
+  int ref_idx[7];
+  int refresh[8]; /**< Refresh flag for each of the 8 slots. */
+} aom_svc_ref_frame_config_t;
+
 /*!\cond */
 /*!\brief Encoder control function parameter type
  *
- * Defines the data types that AOME/AV1E control functions take. Note that
- * additional common controls are defined in aom.h
+ * Defines the data types that AOME/AV1E control functions take.
  *
+ * \note Additional common controls are defined in aom.h.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * if the control ID is supported by the libaom library available,
+ * when the libaom version cannot be controlled.
  */
-
 AOM_CTRL_USE_TYPE(AOME_USE_REFERENCE, int)
 #define AOM_CTRL_AOME_USE_REFERENCE
+
 AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
 #define AOM_CTRL_AOME_SET_ROI_MAP
+
 AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
 #define AOM_CTRL_AOME_SET_ACTIVEMAP
+
 AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
 #define AOM_CTRL_AOME_SET_SCALEMODE
 
@@ -1203,6 +1410,7 @@
 
 AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
 #define AOM_CTRL_AOME_SET_CPUUSED
+
 AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
 #define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
 
@@ -1211,15 +1419,19 @@
 
 AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
 #define AOM_CTRL_AOME_SET_SHARPNESS
+
 AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
 #define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
 
 AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
 #define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
+
 AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int)
 #define AOM_CTRL_AOME_SET_ARNR_STRENGTH
+
 AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
 #define AOM_CTRL_AOME_SET_TUNING
+
 AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
 #define AOM_CTRL_AOME_SET_CQ_LEVEL
 
@@ -1228,19 +1440,25 @@
 
 AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, unsigned int)
 #define AOM_CTRL_AV1E_SET_TILE_COLUMNS
+
 AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, unsigned int)
 #define AOM_CTRL_AV1E_SET_TILE_ROWS
 
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TPL_MODEL, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_TPL_MODEL
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_KEYFRAME_FILTERING, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_KEYFRAME_FILTERING
+
 AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
 #define AOM_CTRL_AOME_GET_LAST_QUANTIZER
+
 AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
 #define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
 
 AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
 #define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
+
 AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int)
 #define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
 
@@ -1259,6 +1477,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION
 
+AOM_CTRL_USE_TYPE(AV1E_SET_FORCE_VIDEO_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_FORCE_VIDEO_MODE
+
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OBMC, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_OBMC
 
@@ -1288,6 +1509,7 @@
 
 AOM_CTRL_USE_TYPE(AV1E_SET_NUM_TG, unsigned int)
 #define AOM_CTRL_AV1E_SET_NUM_TG
+
 AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int)
 #define AOM_CTRL_AV1E_SET_MTU
 
@@ -1318,9 +1540,6 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX64, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_TX64
 
-AOM_CTRL_USE_TYPE(AV1E_SET_TX_SIZE_SEARCH_METHOD, int)
-#define AOM_CTRL_AV1E_SET_TXSIZE_SEARCH_METHOD
-
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX
 
@@ -1336,6 +1555,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DUAL_FILTER, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_DUAL_FILTER
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CHROMA_DELTAQ, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_CHROMA_DELTAQ
+
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_MASKED_COMP, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_MASKED_COMP
 
@@ -1381,6 +1603,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OVERLAY, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_OVERLAY
+
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PALETTE, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_PALETTE
 
@@ -1405,6 +1630,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_DELTAQ_MODE, unsigned int)
 #define AOM_CTRL_AV1E_SET_DELTAQ_MODE
 
+AOM_CTRL_USE_TYPE(AV1E_SET_DELTALF_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_DELTALF_MODE
+
 AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
 #define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
 
@@ -1453,6 +1681,12 @@
 AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
 #define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST
 
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_EXT_TILE_DEBUG, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_EXT_TILE_DEBUG
+
+AOM_CTRL_USE_TYPE(AV1E_SET_VMAF_MODEL_PATH, const char *)
+#define AOM_CTRL_AV1E_SET_VMAF_MODEL_PATH
+
 AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int)
 #define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR
 
@@ -1462,13 +1696,11 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int)
 #define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
 
-#ifdef CONFIG_DENOISE
-AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int);
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int)
 #define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL
 
-AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int);
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int)
 #define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE
-#endif
 
 AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int)
 #define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_X
@@ -1491,6 +1723,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, int)
 #define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT
 
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_MIN_PYRAMID_HEIGHT
+
 AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int)
 #define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT
 
@@ -1506,12 +1741,30 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int)
 #define AOM_CTRL_AV1E_SET_MODE_COST_UPD_FREQ
 
+AOM_CTRL_USE_TYPE(AV1E_SET_MV_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_MV_COST_UPD_FREQ
+
 AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int)
 #define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX
 
 AOM_CTRL_USE_TYPE(AV1E_SET_TIER_MASK, unsigned int)
 #define AOM_CTRL_AV1E_SET_TIER_MASK
 
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_CR, unsigned int)
+#define AOM_CTRL_AV1E_SET_MIN_CR
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_LAYER_ID, aom_svc_layer_id_t *)
+#define AOME_CTRL_AV1E_SET_SVC_LAYER_ID
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_PARAMS, aom_svc_params_t *)
+#define AOME_CTRL_AV1E_SET_SVC_PARAMS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_REF_FRAME_CONFIG, aom_svc_ref_frame_config_t *)
+#define AOME_CTRL_AV1E_SET_SVC_REF_FRAME_CONFIG
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
diff --git a/libaom/aom/aomdx.h b/libaom/aom/aomdx.h
index c71eaf9..8cd5de3 100644
--- a/libaom/aom/aomdx.h
+++ b/libaom/aom/aomdx.h
@@ -106,149 +106,195 @@
  * This set of macros define the control functions available for the AOM
  * decoder interface.
  *
- * \sa #aom_codec_control
+ * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...)
  */
 enum aom_dec_control_id {
-  /** control function to get info on which reference frames were updated
-   *  by the last decode
+  /*!\brief Codec control function to get info on which reference frames were
+   * updated by the last decode, int* parameter
    */
   AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START,
 
-  /** check if the indicated frame is corrupted */
+  /*!\brief Codec control function to check if the indicated frame is
+    corrupted, int* parameter
+  */
   AOMD_GET_FRAME_CORRUPTED,
 
-  /** control function to get info on which reference frames were used
-   *  by the last decode
+  /*!\brief Codec control function to get info on which reference frames were
+   * used by the last decode, int* parameter
    */
   AOMD_GET_LAST_REF_USED,
 
-  /** control function to get the dimensions that the current frame is decoded
-   * at. This may be different to the intended display size for the frame as
-   * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */
+  /*!\brief Codec control function to get the dimensions that the current
+   * frame is decoded at, int* parameter. This may be different to the
+   * intended display size for the frame as specified in the wrapper or frame
+   * header (see AV1D_GET_DISPLAY_SIZE).
+   */
   AV1D_GET_FRAME_SIZE,
 
-  /** control function to get the current frame's intended display dimensions
-   * (as specified in the wrapper or frame header). This may be different to
-   * the decoded dimensions of this frame (see AV1D_GET_FRAME_SIZE). */
+  /*!\brief Codec control function to get the current frame's intended display
+   * dimensions (as specified in the wrapper or frame header), int* parameter.
+   * This may be different to the decoded dimensions of this frame (see
+   * AV1D_GET_FRAME_SIZE).
+   */
   AV1D_GET_DISPLAY_SIZE,
 
-  /** control function to get the bit depth of the stream. */
+  /*!\brief Codec control function to get the bit depth of the stream,
+   * unsigned int* parameter
+   */
   AV1D_GET_BIT_DEPTH,
 
-  /** control function to get the image format of the stream. */
+  /*!\brief Codec control function to get the image format of the stream,
+   * aom_img_fmt_t* parameter
+   */
   AV1D_GET_IMG_FORMAT,
 
-  /** control function to get the size of the tile. */
+  /*!\brief Codec control function to get the size of the tile, unsigned int
+    parameter */
   AV1D_GET_TILE_SIZE,
 
-  /** control function to get the tile count in a tile list. */
+  /*!\brief Codec control function to get the tile count in a tile list, int*
+   * parameter
+   */
   AV1D_GET_TILE_COUNT,
 
-  /** control function to set the byte alignment of the planes in the reference
-   * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets
+  /*!\brief Codec control function to set the byte alignment of the planes in
+   * the reference buffers, int parameter
+   *
+   * Valid values are power of 2, from 32 to 1024. A value of 0 sets
    * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
    * follows Y plane, and V plane directly follows U plane. Default value is 0.
    */
   AV1_SET_BYTE_ALIGNMENT,
 
-  /** control function to invert the decoding order to from right to left. The
-   * function is used in a test to confirm the decoding independence of tile
+  /*!\brief Codec control function to invert the decoding order to from right to
+   * left, int parameter
+   *
+   * The function is used in a test to confirm the decoding independence of tile
    * columns. The function may be used in application where this order
-   * of decoding is desired.
+   * of decoding is desired.
    *
    * TODO(yaowu): Rework the unit test that uses this control, and in a future
    *              release, this test-only control shall be removed.
    */
   AV1_INVERT_TILE_DECODE_ORDER,
 
-  /** control function to set the skip loop filter flag. Valid values are
-   * integers. The decoder will skip the loop filter when its value is set to
-   * nonzero. If the loop filter is skipped the decoder may accumulate decode
-   * artifacts. The default value is 0.
+  /*!\brief Codec control function to set the skip loop filter flag, int
+   * parameter
+   *
+   * Valid values are integers. The decoder will skip the loop filter
+   * when its value is set to nonzero. If the loop filter is skipped the
+   * decoder may accumulate decode artifacts. The default value is 0.
    */
   AV1_SET_SKIP_LOOP_FILTER,
 
-  /** control function to retrieve a pointer to the Accounting struct.  When
-   * compiled without --enable-accounting, this returns AOM_CODEC_INCAPABLE.
+  /*!\brief Codec control function to retrieve a pointer to the Accounting
+   * struct, takes Accounting** as parameter
+   *
    * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
    * The caller should ensure that AOM_CODEC_OK is returned before attempting
    * to dereference the Accounting pointer.
+   *
+   * \attention When compiled without --enable-accounting, this returns
+   * AOM_CODEC_INCAPABLE.
    */
   AV1_GET_ACCOUNTING,
 
-  /** control function to get last decoded frame quantizer. Returned value uses
-   * internal quantizer scale defined by the codec.
+  /*!\brief Codec control function to get last decoded frame quantizer,
+   * int* parameter
+   *
+   * Returned value uses internal quantizer scale defined by the codec.
    */
   AOMD_GET_LAST_QUANTIZER,
 
-  /** control function to set the range of tile decoding. A value that is
-   * greater and equal to zero indicates only the specific row/column is
-   * decoded. A value that is -1 indicates the whole row/column is decoded.
-   * A special case is both values are -1 that means the whole frame is
-   * decoded.
+  /*!\brief Codec control function to set the range of tile decoding, int
+   * parameter
+   *
+   * A value that is greater and equal to zero indicates only the specific
+   * row/column is decoded. A value that is -1 indicates the whole row/column
+   * is decoded. A special case is both values are -1 that means the whole
+   * frame is decoded.
    */
   AV1_SET_DECODE_TILE_ROW,
   AV1_SET_DECODE_TILE_COL,
-  /** control function to set the tile coding mode. A value that is equal to
-   *  zero indicates the tiles are coded in normal tile mode. A value that is
-   *  1 indicates the tiles are coded in large-scale tile mode.
+
+  /*!\brief Codec control function to set the tile coding mode, int parameter
+   *
+   * - 0 = tiles are coded in normal tile mode
+   * - 1 = tiles are coded in large-scale tile mode
    */
   AV1_SET_TILE_MODE,
-  /** control function to get the frame header information of an encoded frame
-   * in the bitstream. This provides a way to access a frame's header data.
+
+  /*!\brief Codec control function to get the frame header information of an
+   * encoded frame, unsigned int* parameter
    */
   AV1D_GET_FRAME_HEADER_INFO,
-  /** control function to get the start address and size of a tile in the coded
-   * bitstream. This provides a way to access a specific tile's bitstream data.
+
+  /*!\brief Codec control function to get the start address and size of a
+   * tile in the coded bitstream, aom_tile_data* parameter.
    */
   AV1D_GET_TILE_DATA,
-  /** control function to set the external references' pointers in the decoder.
-   *  This is used while decoding the tile list OBU in large-scale tile coding
-   *  mode.
+
+  /*!\brief Codec control function to set the external references' pointers in
+   * the decoder, av1_ext_ref_frame_t* parameter.
+   *
+   * This is used while decoding the tile list OBU in large-scale tile coding
+   * mode.
    */
   AV1D_SET_EXT_REF_PTR,
-  /** control function to enable the ext-tile software debug and testing code in
-   * the decoder.
+
+  /*!\brief Codec control function to enable the ext-tile software debug and
+   * testing code in the decoder, unsigned int parameter
    */
   AV1D_EXT_TILE_DEBUG,
 
-  /** control function to enable the row based multi-threading of decoding. A
-   * value that is equal to 1 indicates that row based multi-threading is
-   * enabled.
+  /*!\brief Codec control function to enable the row based multi-threading of
+   * decoding, unsigned int parameter
+   *
+   * - 0 = disabled
+   * - 1 = enabled (default)
    */
   AV1D_SET_ROW_MT,
 
-  /** control function to indicate whether bitstream is in Annex-B format. */
+  /*!\brief Codec control function to indicate whether bitstream is in
+   * Annex-B format, unsigned int parameter
+   */
   AV1D_SET_IS_ANNEXB,
 
-  /** control function to indicate which operating point to use. A scalable
-   *  stream may define multiple operating points, each of which defines a
-   *  set of temporal and spatial layers to be processed. The operating point
-   *  index may take a value between 0 and operating_points_cnt_minus_1 (which
-   *  is at most 31).
+  /*!\brief Codec control function to indicate which operating point to use,
+   * int parameter
+   *
+   * A scalable stream may define multiple operating points, each of which
+   * defines a set of temporal and spatial layers to be processed. The
+   * operating point index may take a value between 0 and
+   * operating_points_cnt_minus_1 (which is at most 31).
    */
   AV1D_SET_OPERATING_POINT,
 
-  /** control function to indicate whether to output one frame per temporal
-   *  unit (the default), or one frame per spatial layer.
-   *  In a scalable stream, each temporal unit corresponds to a single "frame"
-   *  of video, and within a temporal unit there may be multiple spatial layers
-   *  with different versions of that frame.
-   *  For video playback, only the highest-quality version (within the
-   *  selected operating point) is needed, but for some use cases it is useful
-   *  to have access to multiple versions of a frame when they are available.
+  /*!\brief Codec control function to indicate whether to output one frame per
+   * temporal unit (the default), or one frame per spatial layer. int parameter
+   *
+   * In a scalable stream, each temporal unit corresponds to a single "frame"
+   * of video, and within a temporal unit there may be multiple spatial layers
+   * with different versions of that frame.
+   * For video playback, only the highest-quality version (within the
+   * selected operating point) is needed, but for some use cases it is useful
+   * to have access to multiple versions of a frame when they are available.
    */
   AV1D_SET_OUTPUT_ALL_LAYERS,
 
-  /** control function to set an aom_inspect_cb callback that is invoked each
-   * time a frame is decoded.  When compiled without --enable-inspection, this
+  /*!\brief Codec control function to set an aom_inspect_cb callback that is
+   * invoked each time a frame is decoded, aom_inspect_init* parameter
+   *
+   * \attention When compiled without --enable-inspection, this
    * returns AOM_CODEC_INCAPABLE.
    */
   AV1_SET_INSPECTION_CALLBACK,
 
-  /** control function to set the skip film grain flag. Valid values are
-   * integers. The decoder will skip the film grain when its value is set to
-   * nonzero. The default value is 0.
+  /*!\brief Codec control function to set the skip film grain flag, int
+   * parameter
+   *
+   * Valid values are integers. The decoder will skip the film grain when its
+   * value is set to nonzero. The default value is 0.
    */
   AV1D_SET_SKIP_FILM_GRAIN,
 
@@ -258,59 +304,87 @@
 /*!\cond */
 /*!\brief AOM decoder control function parameter type
  *
- * Defines the data types that AOMD control functions take. Note that
- * additional common controls are defined in aom.h
+ * Defines the data types that AOMD control functions take.
  *
+ * \note Additional common controls are defined in aom.h.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * if the control ID is supported by the libaom library available,
+ * when the libaom version cannot be controlled.
  */
-
 AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *)
 #define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES
+
 AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
 #define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED
+
 AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
 #define AOM_CTRL_AOMD_GET_LAST_REF_USED
+
 AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *)
 #define AOM_CTRL_AOMD_GET_LAST_QUANTIZER
+
 AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
+
 AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
 #define AOM_CTRL_AV1D_GET_BIT_DEPTH
+
 AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *)
 #define AOM_CTRL_AV1D_GET_IMG_FORMAT
+
 AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *)
 #define AOM_CTRL_AV1D_GET_TILE_SIZE
+
 AOM_CTRL_USE_TYPE(AV1D_GET_TILE_COUNT, unsigned int *)
 #define AOM_CTRL_AV1D_GET_TILE_COUNT
+
 AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_FRAME_SIZE
+
 AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
 #define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
+
 AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
 #define AOM_CTRL_AV1_GET_ACCOUNTING
+
 AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
+
 AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_COL
+
 AOM_CTRL_USE_TYPE(AV1_SET_TILE_MODE, unsigned int)
 #define AOM_CTRL_AV1_SET_TILE_MODE
+
 AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_HEADER_INFO, aom_tile_data *)
 #define AOM_CTRL_AV1D_GET_FRAME_HEADER_INFO
+
 AOM_CTRL_USE_TYPE(AV1D_GET_TILE_DATA, aom_tile_data *)
 #define AOM_CTRL_AV1D_GET_TILE_DATA
+
 AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *)
 #define AOM_CTRL_AV1D_SET_EXT_REF_PTR
+
 AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int)
 #define AOM_CTRL_AV1D_EXT_TILE_DEBUG
+
 AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int)
 #define AOM_CTRL_AV1D_SET_ROW_MT
+
 AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int)
 #define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN
+
 AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int)
 #define AOM_CTRL_AV1D_SET_IS_ANNEXB
+
 AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int)
 #define AOM_CTRL_AV1D_SET_OPERATING_POINT
+
 AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int)
 #define AOM_CTRL_AV1D_SET_OUTPUT_ALL_LAYERS
+
 AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *)
 #define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK
 /*!\endcond */
diff --git a/libaom/aom/exports_com b/libaom/aom/exports_com
index cf99bc5..6f796f5 100644
--- a/libaom/aom/exports_com
+++ b/libaom/aom/exports_com
@@ -1,5 +1,5 @@
 text aom_codec_build_config
-text aom_codec_control_
+text aom_codec_control
 text aom_codec_destroy
 text aom_codec_err_to_string
 text aom_codec_error
@@ -10,12 +10,20 @@
 text aom_codec_version_extra_str
 text aom_codec_version_str
 text aom_free
+text aom_img_add_metadata
 text aom_img_alloc
 text aom_img_alloc_with_border
 text aom_img_flip
 text aom_img_free
+text aom_img_get_metadata
+text aom_img_metadata_array_free
+text aom_img_metadata_array_alloc
+text aom_img_metadata_free
+text aom_img_metadata_alloc
+text aom_img_num_metadata
 text aom_img_plane_height
 text aom_img_plane_width
+text aom_img_remove_metadata
 text aom_img_set_rect
 text aom_img_wrap
 text aom_malloc
diff --git a/libaom/aom/exports_dec b/libaom/aom/exports_dec
index d7d1c4f..ffff023 100644
--- a/libaom/aom/exports_dec
+++ b/libaom/aom/exports_dec
@@ -3,8 +3,6 @@
 text aom_codec_get_frame
 text aom_codec_get_stream_info
 text aom_codec_peek_stream_info
-text aom_codec_register_put_frame_cb
-text aom_codec_register_put_slice_cb
 text aom_codec_set_frame_buffer_functions
 text aom_obu_type_to_string
 text aom_read_obu_header
diff --git a/libaom/aom/exports_enc b/libaom/aom/exports_enc
index 918d742..1473d9d 100644
--- a/libaom/aom/exports_enc
+++ b/libaom/aom/exports_enc
@@ -1,6 +1,5 @@
 text aom_codec_enc_config_default
 text aom_codec_enc_config_set
-text aom_codec_enc_init_multi_ver
 text aom_codec_enc_init_ver
 text aom_codec_encode
 text aom_codec_get_cx_data
diff --git a/libaom/aom/exports_test b/libaom/aom/exports_test
index 01b864b..452a532 100644
--- a/libaom/aom/exports_test
+++ b/libaom/aom/exports_test
@@ -1,2 +1,4 @@
+text aom_copy_metadata_to_frame_buffer
 text aom_dsp_rtcd
+text aom_remove_metadata_from_frame_buffer
 text aom_scale_rtcd
diff --git a/libaom/aom/internal/aom_codec_internal.h b/libaom/aom/internal/aom_codec_internal.h
index 21c0dc6..efe09ac 100644
--- a/libaom/aom/internal/aom_codec_internal.h
+++ b/libaom/aom/internal/aom_codec_internal.h
@@ -59,10 +59,9 @@
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define AOM_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
+#define AOM_CODEC_INTERNAL_ABI_VERSION (7) /**<\hideinitializer*/
 
 typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
-typedef struct aom_codec_priv_enc_mr_cfg aom_codec_priv_enc_mr_cfg_t;
 
 /*!\brief init function pointer prototype
  *
@@ -77,8 +76,7 @@
  * \retval #AOM_CODEC_MEM_ERROR
  *     Memory operation failed.
  */
-typedef aom_codec_err_t (*aom_codec_init_fn_t)(
-    aom_codec_ctx_t *ctx, aom_codec_priv_enc_mr_cfg_t *data);
+typedef aom_codec_err_t (*aom_codec_init_fn_t)(aom_codec_ctx_t *ctx);
 
 /*!\brief destroy function pointer prototype
  *
@@ -138,7 +136,7 @@
  * function, so plugins implementing this interface may trust the input
  * parameters to be properly initialized. However,  this interface does not
  * provide type safety for the exchanged data or assign meanings to the
- * control codes. Those details should be specified in the algorithm's
+ * control IDs. Those details should be specified in the algorithm's
  * header file. In particular, the ctrl_id parameter is guaranteed to exist
  * in the algorithm's control mapping table, and the data parameter may be NULL.
  *
@@ -171,17 +169,12 @@
 
 /*!\brief decode data function pointer prototype
  *
- * Processes a buffer of coded data. If the processing results in a new
- * decoded frame becoming available, #AOM_CODEC_CB_PUT_SLICE and
- * #AOM_CODEC_CB_PUT_FRAME events are generated as appropriate. This
- * function is called by the generic aom_codec_decode() wrapper function,
- * so plugins implementing this interface may trust the input parameters
- * to be properly initialized.
+ * Processes a buffer of coded data. This function is called by the generic
+ * aom_codec_decode() wrapper function, so plugins implementing this interface
+ * may trust the input parameters to be properly initialized.
  *
  * \param[in] ctx          Pointer to this instance's context
- * \param[in] data         Pointer to this block of new coded data. If
- *                         NULL, a #AOM_CODEC_CB_PUT_FRAME event is posted
- *                         for the previously decoded frame.
+ * \param[in] data         Pointer to this block of new coded data.
  * \param[in] data_sz      Size of the coded data, in bytes.
  *
  * \return Returns #AOM_CODEC_OK if the coded data was processed completely
@@ -259,24 +252,6 @@
 typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
     aom_codec_alg_priv_t *ctx);
 
-typedef aom_codec_err_t (*aom_codec_enc_mr_get_mem_loc_fn_t)(
-    const aom_codec_enc_cfg_t *cfg, void **mem_loc);
-
-/*!\brief usage configuration mapping
- *
- * This structure stores the mapping between usage identifiers and
- * configuration structures. Each algorithm provides a list of these
- * mappings. This list is searched by the aom_codec_enc_config_default()
- * wrapper function to determine which config to return. The special value
- * {-1, {0}} is used to indicate end-of-list, and must be present. At least
- * one mapping must be present, in addition to the end-of-list.
- *
- */
-typedef const struct aom_codec_enc_cfg_map {
-  int usage;
-  aom_codec_enc_cfg_t cfg;
-} aom_codec_enc_cfg_map_t;
-
 /*!\brief Decoder algorithm interface interface
  *
  * All decoders \ref MUST expose a variable of this type.
@@ -297,10 +272,9 @@
     aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
   } dec;
   struct aom_codec_enc_iface {
-    int cfg_map_count;
-    aom_codec_enc_cfg_map_t
-        *cfg_maps;                /**< \copydoc ::aom_codec_enc_cfg_map_t */
-    aom_codec_encode_fn_t encode; /**< \copydoc ::aom_codec_encode_fn_t */
+    int cfg_count;
+    const aom_codec_enc_cfg_t *cfgs; /**< \copydoc ::aom_codec_enc_cfg_t */
+    aom_codec_encode_fn_t encode;    /**< \copydoc ::aom_codec_encode_fn_t */
     aom_codec_get_cx_data_fn_t
         get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
     aom_codec_enc_config_set_fn_t
@@ -309,20 +283,9 @@
         get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
     aom_codec_get_preview_frame_fn_t
         get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
-    aom_codec_enc_mr_get_mem_loc_fn_t
-        mr_get_mem_loc; /**< \copydoc ::aom_codec_enc_mr_get_mem_loc_fn_t */
   } enc;
 };
 
-/*!\brief Callback function pointer / user data pair storage */
-typedef struct aom_codec_priv_cb_pair {
-  union {
-    aom_codec_put_frame_cb_fn_t put_frame;
-    aom_codec_put_slice_cb_fn_t put_slice;
-  } u;
-  void *user_priv;
-} aom_codec_priv_cb_pair_t;
-
 /*!\brief Instance private storage
  *
  * This structure is allocated by the algorithm's init function. It can be
@@ -335,37 +298,14 @@
   const char *err_detail;
   aom_codec_flags_t init_flags;
   struct {
-    aom_codec_priv_cb_pair_t put_frame_cb;
-    aom_codec_priv_cb_pair_t put_slice_cb;
-  } dec;
-  struct {
     aom_fixed_buf_t cx_data_dst_buf;
     unsigned int cx_data_pad_before;
     unsigned int cx_data_pad_after;
     aom_codec_cx_pkt_t cx_data_pkt;
-    unsigned int total_encoders;
   } enc;
 };
 
-/*
- * Multi-resolution encoding internal configuration
- */
-struct aom_codec_priv_enc_mr_cfg {
-  unsigned int mr_total_resolutions;
-  unsigned int mr_encoder_id;
-  struct aom_rational mr_down_sampling_factor;
-  void *mr_low_res_mode_info;
-};
-
-#undef AOM_CTRL_USE_TYPE
-#define AOM_CTRL_USE_TYPE(id, typ) \
-  static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
-
-#undef AOM_CTRL_USE_TYPE_DEPRECATED
-#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ) \
-  static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
-
-#define CAST(id, arg) id##__value(arg)
+#define CAST(id, arg) va_arg((arg), aom_codec_control_type_##id)
 
 /* CODEC_INTERFACE convenience macro
  *
diff --git a/libaom/aom/internal/aom_image_internal.h b/libaom/aom/internal/aom_image_internal.h
new file mode 100644
index 0000000..7f2fd18
--- /dev/null
+++ b/libaom/aom/internal/aom_image_internal.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the internal functions associated with the aom image
+ * descriptor.
+ *
+ */
+#ifndef AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_
+#define AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_
+
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Array of aom_metadata structs for an image. */
+struct aom_metadata_array {
+  size_t sz;                       /* Number of metadata structs in the list */
+  aom_metadata_t **metadata_array; /* Array of metadata structs */
+};
+
+/*!\brief Alloc memory for aom_metadata_array struct.
+ *
+ * Allocate memory for aom_metadata_array struct.
+ * If sz is 0 the aom_metadata_array structs internal buffer list will be NULL,
+ * but the aom_metadata_array struct itself will still be allocated.
+ * Returns a pointer to the allocated struct or NULL on failure.
+ *
+ * \param[in]    sz       Size of internal metadata list buffer
+ */
+aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz);
+
+/*!\brief Free metadata array struct.
+ *
+ * Free metadata array struct and all metadata structs inside.
+ *
+ * \param[in]    arr       Metadata array struct pointer
+ */
+void aom_img_metadata_array_free(aom_metadata_array_t *arr);
+
+typedef void *(*aom_alloc_img_data_cb_fn_t)(void *priv, size_t size);
+
+/*!\brief Open a descriptor, allocating storage for the underlying image by
+ * using the provided callback function.
+ *
+ * Returns a descriptor for storing an image of the given format. The storage
+ * for the image is allocated by using the provided callback function. Unlike
+ * aom_img_alloc(), the returned descriptor does not own the storage for the
+ * image. The caller is responsible for freeing the storage for the image.
+ *
+ * Note: If the callback function is invoked and succeeds,
+ * aom_img_alloc_with_cb() is guaranteed to succeed. Therefore, if
+ * aom_img_alloc_with_cb() fails, the caller is assured that no storage was
+ * allocated.
+ *
+ * \param[in]    img       Pointer to storage for descriptor. If this parameter
+ *                         is NULL, the storage for the descriptor will be
+ *                         allocated on the heap.
+ * \param[in]    fmt       Format for the image
+ * \param[in]    d_w       Width of the image
+ * \param[in]    d_h       Height of the image
+ * \param[in]    align     Alignment, in bytes, of the image buffer and
+ *                         each row in the image (stride).
+ * \param[in]    alloc_cb  Callback function used to allocate storage for the
+ *                         image.
+ * \param[in]    cb_priv   The first argument ('priv') for the callback
+ *                         function.
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ *         parameter is non-null, the value of the img parameter will be
+ *         returned.
+ */
+aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt,
+                                   unsigned int d_w, unsigned int d_h,
+                                   unsigned int align,
+                                   aom_alloc_img_data_cb_fn_t alloc_cb,
+                                   void *cb_priv);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_
diff --git a/libaom/aom/src/aom_codec.c b/libaom/aom/src/aom_codec.c
index 733bffb..196ab83 100644
--- a/libaom/aom/src/aom_codec.c
+++ b/libaom/aom/src/aom_codec.c
@@ -89,7 +89,7 @@
   return (iface) ? iface->caps : 0;
 }
 
-aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
+aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
   aom_codec_err_t res;
 
   if (!ctx || !ctrl_id)
diff --git a/libaom/aom/src/aom_decoder.c b/libaom/aom/src/aom_decoder.c
index 282ec8a..49fff26 100644
--- a/libaom/aom/src/aom_decoder.c
+++ b/libaom/aom/src/aom_decoder.c
@@ -34,9 +34,6 @@
     res = AOM_CODEC_INVALID_PARAM;
   else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
     res = AOM_CODEC_ABI_MISMATCH;
-  else if ((flags & AOM_CODEC_USE_POSTPROC) &&
-           !(iface->caps & AOM_CODEC_CAP_POSTPROC))
-    res = AOM_CODEC_INCAPABLE;
   else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
     res = AOM_CODEC_INCAPABLE;
   else {
@@ -47,7 +44,7 @@
     ctx->init_flags = flags;
     ctx->config.dec = cfg;
 
-    res = ctx->iface->init(ctx, NULL);
+    res = ctx->iface->init(ctx);
     if (res) {
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       aom_codec_destroy(ctx);
@@ -120,44 +117,6 @@
   return img;
 }
 
-aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
-                                                aom_codec_put_frame_cb_fn_t cb,
-                                                void *user_priv) {
-  aom_codec_err_t res;
-
-  if (!ctx || !cb)
-    res = AOM_CODEC_INVALID_PARAM;
-  else if (!ctx->iface || !ctx->priv ||
-           !(ctx->iface->caps & AOM_CODEC_CAP_PUT_FRAME))
-    res = AOM_CODEC_ERROR;
-  else {
-    ctx->priv->dec.put_frame_cb.u.put_frame = cb;
-    ctx->priv->dec.put_frame_cb.user_priv = user_priv;
-    res = AOM_CODEC_OK;
-  }
-
-  return SAVE_STATUS(ctx, res);
-}
-
-aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
-                                                aom_codec_put_slice_cb_fn_t cb,
-                                                void *user_priv) {
-  aom_codec_err_t res;
-
-  if (!ctx || !cb)
-    res = AOM_CODEC_INVALID_PARAM;
-  else if (!ctx->iface || !ctx->priv ||
-           !(ctx->iface->caps & AOM_CODEC_CAP_PUT_SLICE))
-    res = AOM_CODEC_ERROR;
-  else {
-    ctx->priv->dec.put_slice_cb.u.put_slice = cb;
-    ctx->priv->dec.put_slice_cb.user_priv = user_priv;
-    res = AOM_CODEC_OK;
-  }
-
-  return SAVE_STATUS(ctx, res);
-}
-
 aom_codec_err_t aom_codec_set_frame_buffer_functions(
     aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
     aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
@@ -165,9 +124,10 @@
 
   if (!ctx || !cb_get || !cb_release) {
     res = AOM_CODEC_INVALID_PARAM;
-  } else if (!ctx->iface || !ctx->priv ||
-             !(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+  } else if (!ctx->iface || !ctx->priv) {
     res = AOM_CODEC_ERROR;
+  } else if (!(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+    res = AOM_CODEC_INCAPABLE;
   } else {
     res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
                                     cb_priv);
diff --git a/libaom/aom/src/aom_encoder.c b/libaom/aom/src/aom_encoder.c
index 523f40b..bb51c93 100644
--- a/libaom/aom/src/aom_encoder.c
+++ b/libaom/aom/src/aom_encoder.c
@@ -24,6 +24,8 @@
 
 #include <limits.h>
 #include <string.h>
+
+#include "aom/aom_encoder.h"
 #include "aom/internal/aom_codec_internal.h"
 
 #define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
@@ -54,7 +56,7 @@
     ctx->priv = NULL;
     ctx->init_flags = flags;
     ctx->config.enc = cfg;
-    res = ctx->iface->init(ctx, NULL);
+    res = ctx->iface->init(ctx);
 
     if (res) {
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
@@ -65,113 +67,35 @@
   return SAVE_STATUS(ctx, res);
 }
 
-aom_codec_err_t aom_codec_enc_init_multi_ver(
-    aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
-    int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver) {
-  aom_codec_err_t res = AOM_CODEC_OK;
-
-  if (ver != AOM_ENCODER_ABI_VERSION)
-    res = AOM_CODEC_ABI_MISMATCH;
-  else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1))
-    res = AOM_CODEC_INVALID_PARAM;
-  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
-    res = AOM_CODEC_ABI_MISMATCH;
-  else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
-    res = AOM_CODEC_INCAPABLE;
-  else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
-    res = AOM_CODEC_INCAPABLE;
-  else {
-    int i;
-    void *mem_loc = NULL;
-
-    if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) {
-      for (i = 0; i < num_enc; i++) {
-        aom_codec_priv_enc_mr_cfg_t mr_cfg;
-
-        /* Validate down-sampling factor. */
-        if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
-            dsf->den > dsf->num) {
-          res = AOM_CODEC_INVALID_PARAM;
-          break;
-        }
-
-        mr_cfg.mr_low_res_mode_info = mem_loc;
-        mr_cfg.mr_total_resolutions = num_enc;
-        mr_cfg.mr_encoder_id = num_enc - 1 - i;
-        mr_cfg.mr_down_sampling_factor.num = dsf->num;
-        mr_cfg.mr_down_sampling_factor.den = dsf->den;
-
-        /* Force Key-frame synchronization. Namely, encoder at higher
-         * resolution always use the same frame_type chosen by the
-         * lowest-resolution encoder.
-         */
-        if (mr_cfg.mr_encoder_id) cfg->kf_mode = AOM_KF_DISABLED;
-
-        ctx->iface = iface;
-        ctx->name = iface->name;
-        ctx->priv = NULL;
-        ctx->init_flags = flags;
-        ctx->config.enc = cfg;
-        res = ctx->iface->init(ctx, &mr_cfg);
-
-        if (res) {
-          const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL;
-          /* Destroy current ctx */
-          ctx->err_detail = error_detail;
-          aom_codec_destroy(ctx);
-
-          /* Destroy already allocated high-level ctx */
-          while (i) {
-            ctx--;
-            ctx->err_detail = error_detail;
-            aom_codec_destroy(ctx);
-            i--;
-          }
-        }
-
-        if (res) break;
-
-        ctx++;
-        cfg++;
-        dsf++;
-      }
-      ctx--;
-    }
-  }
-
-  return SAVE_STATUS(ctx, res);
-}
-
 aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
                                              aom_codec_enc_cfg_t *cfg,
                                              unsigned int usage) {
   aom_codec_err_t res;
-  aom_codec_enc_cfg_map_t *map;
   int i;
 
-  if (!iface || !cfg || usage > INT_MAX)
+  if (!iface || !cfg)
     res = AOM_CODEC_INVALID_PARAM;
   else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
     res = AOM_CODEC_INCAPABLE;
   else {
     res = AOM_CODEC_INVALID_PARAM;
 
-    for (i = 0; i < iface->enc.cfg_map_count; ++i) {
-      map = iface->enc.cfg_maps + i;
-      if (map->usage == (int)usage) {
-        *cfg = map->cfg;
-        cfg->g_usage = usage;
+    for (i = 0; i < iface->enc.cfg_count; ++i) {
+      if (iface->enc.cfgs[i].g_usage == usage) {
+        *cfg = iface->enc.cfgs[i];
         res = AOM_CODEC_OK;
         break;
       }
     }
   }
-
   /* default values */
   if (cfg) {
-    cfg->cfg.ext_partition = 1;
+    memset(&cfg->encoder_cfg, 0, sizeof(cfg->encoder_cfg));
+    cfg->encoder_cfg.super_block_size = 0;  // Dynamic
+    cfg->encoder_cfg.max_partition_size = 128;
+    cfg->encoder_cfg.min_partition_size = 4;
+    cfg->encoder_cfg.disable_trellis_quant = 3;
   }
-
   return res;
 }
 
@@ -190,8 +114,11 @@
 
 #if HAVE_FEXCEPT && CONFIG_DEBUG
 #define FLOATING_POINT_SET_EXCEPTIONS \
-  const int float_excepts = feenableexcept(FE_DIVBYZERO);
-#define FLOATING_POINT_RESTORE_EXCEPTIONS feenableexcept(float_excepts);
+  const int float_excepts =           \
+      feenableexcept(FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW);
+#define FLOATING_POINT_RESTORE_EXCEPTIONS \
+  fedisableexcept(FE_ALL_EXCEPT);         \
+  feenableexcept(float_excepts);
 #else
 #define FLOATING_POINT_SET_EXCEPTIONS
 #define FLOATING_POINT_RESTORE_EXCEPTIONS
@@ -221,38 +148,11 @@
   else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
     res = AOM_CODEC_INCAPABLE;
   else {
-    unsigned int num_enc = ctx->priv->enc.total_encoders;
-
     /* Execute in a normalized floating point environment, if the platform
      * requires it.
      */
     FLOATING_POINT_INIT
-
-    if (num_enc == 1)
-      res =
-          ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags);
-    else {
-      /* Multi-resolution encoding:
-       * Encode multi-levels in reverse order. For example,
-       * if mr_total_resolutions = 3, first encode level 2,
-       * then encode level 1, and finally encode level 0.
-       */
-      int i;
-
-      ctx += num_enc - 1;
-      if (img) img += num_enc - 1;
-
-      for (i = num_enc - 1; i >= 0; i--) {
-        if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration,
-                                          flags)))
-          break;
-
-        ctx--;
-        if (img) img--;
-      }
-      ctx++;
-    }
-
+    res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags);
     FLOATING_POINT_RESTORE
   }
 
diff --git a/libaom/aom/src/aom_image.c b/libaom/aom/src/aom_image.c
index 6504cdd..cd0b5ed 100644
--- a/libaom/aom/src/aom_image.c
+++ b/libaom/aom/src/aom_image.c
@@ -14,6 +14,7 @@
 
 #include "aom/aom_image.h"
 #include "aom/aom_integer.h"
+#include "aom/internal/aom_image_internal.h"
 #include "aom_mem/aom_mem.h"
 
 static INLINE unsigned int align_image_dimension(unsigned int d,
@@ -29,8 +30,12 @@
 static aom_image_t *img_alloc_helper(
     aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h,
     unsigned int buf_align, unsigned int stride_align, unsigned int size_align,
-    unsigned char *img_data, unsigned int border) {
-  unsigned int h, w, s, xcs, ycs, bps;
+    unsigned int border, unsigned char *img_data,
+    aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv) {
+  /* NOTE: In this function, bit_depth is either 8 or 16 (if
+   * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12.
+   */
+  unsigned int h, w, s, xcs, ycs, bps, bit_depth;
   unsigned int stride_in_bytes;
 
   /* Treat align==0 like align==1 */
@@ -57,15 +62,17 @@
     case AOM_IMG_FMT_YV12:
     case AOM_IMG_FMT_AOMI420:
     case AOM_IMG_FMT_AOMYV12: bps = 12; break;
-    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I422: bps = 16; break;
     case AOM_IMG_FMT_I444: bps = 24; break;
     case AOM_IMG_FMT_YV1216:
     case AOM_IMG_FMT_I42016: bps = 24; break;
-    case AOM_IMG_FMT_I42216:
+    case AOM_IMG_FMT_I42216: bps = 32; break;
     case AOM_IMG_FMT_I44416: bps = 48; break;
     default: bps = 16; break;
   }
 
+  bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+
   /* Get chroma shift values for this format */
   switch (fmt) {
     case AOM_IMG_FMT_I420:
@@ -93,9 +100,9 @@
   w = align_image_dimension(d_w, xcs, size_align);
   h = align_image_dimension(d_h, ycs, size_align);
 
-  s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / 8;
+  s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / bit_depth;
   s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1);
-  stride_in_bytes = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
+  stride_in_bytes = s * bit_depth / 8;
 
   /* Allocate the new image */
   if (!img) {
@@ -113,20 +120,29 @@
   if (!img_data) {
     const uint64_t alloc_size =
         (fmt & AOM_IMG_FMT_PLANAR)
-            ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / 8
+            ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / bit_depth
             : (uint64_t)(h + 2 * border) * stride_in_bytes;
 
     if (alloc_size != (size_t)alloc_size) goto fail;
 
-    img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
-    img->img_data_owner = 1;
+    if (alloc_cb) {
+      const size_t padded_alloc_size = (size_t)alloc_size + buf_align - 1;
+      img->img_data = (uint8_t *)alloc_cb(cb_priv, padded_alloc_size);
+      if (img->img_data) {
+        img->img_data = (uint8_t *)aom_align_addr(img->img_data, buf_align);
+      }
+      img->img_data_owner = 0;
+    } else {
+      img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
+      img->img_data_owner = 1;
+    }
     img->sz = (size_t)alloc_size;
   }
 
   if (!img->img_data) goto fail;
 
   img->fmt = fmt;
-  img->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+  img->bit_depth = bit_depth;
   // aligned width and aligned height
   img->w = w;
   img->h = h;
@@ -138,8 +154,10 @@
   img->stride[AOM_PLANE_Y] = stride_in_bytes;
   img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
 
-  /* Default viewport to entire image */
-  if (!aom_img_set_rect(img, 0, 0, d_w, d_h, border)) return img;
+  /* Default viewport to entire image. (This aom_img_set_rect call always
+   * succeeds.) */
+  aom_img_set_rect(img, 0, 0, d_w, d_h, border);
+  return img;
 
 fail:
   aom_img_free(img);
@@ -149,15 +167,26 @@
 aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
                            unsigned int d_w, unsigned int d_h,
                            unsigned int align) {
-  return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, NULL, 0);
+  return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, NULL,
+                          NULL);
+}
+
+aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt,
+                                   unsigned int d_w, unsigned int d_h,
+                                   unsigned int align,
+                                   aom_alloc_img_data_cb_fn_t alloc_cb,
+                                   void *cb_priv) {
+  return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL,
+                          alloc_cb, cb_priv);
 }
 
 aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
                           unsigned int d_h, unsigned int stride_align,
                           unsigned char *img_data) {
-  /* By setting buf_align = 1, we don't change buffer alignment in this
-   * function. */
-  return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, img_data, 0);
+  /* Set buf_align = 1. It is ignored by img_alloc_helper because img_data is
+   * not NULL. */
+  return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, 0, img_data,
+                          NULL, NULL);
 }
 
 aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
@@ -165,8 +194,8 @@
                                        unsigned int align,
                                        unsigned int size_align,
                                        unsigned int border) {
-  return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, NULL,
-                          border);
+  return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, border,
+                          NULL, NULL, NULL);
 }
 
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
@@ -238,6 +267,7 @@
 
 void aom_img_free(aom_image_t *img) {
   if (img) {
+    aom_img_remove_metadata(img);
     if (img->img_data && img->img_data_owner) aom_free(img->img_data);
 
     if (img->self_allocd) free(img);
@@ -257,3 +287,109 @@
   else
     return img->d_h;
 }
+
+aom_metadata_t *aom_img_metadata_alloc(
+    uint32_t type, const uint8_t *data, size_t sz,
+    aom_metadata_insert_flags_t insert_flag) {
+  if (!data || sz == 0) return NULL;
+  aom_metadata_t *metadata = (aom_metadata_t *)malloc(sizeof(aom_metadata_t));
+  if (!metadata) return NULL;
+  metadata->type = type;
+  metadata->payload = (uint8_t *)malloc(sz);
+  if (!metadata->payload) {
+    free(metadata);
+    return NULL;
+  }
+  memcpy(metadata->payload, data, sz);
+  metadata->sz = sz;
+  metadata->insert_flag = insert_flag;
+  return metadata;
+}
+
+void aom_img_metadata_free(aom_metadata_t *metadata) {
+  if (metadata) {
+    if (metadata->payload) free(metadata->payload);
+    free(metadata);
+  }
+}
+
+aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz) {
+  aom_metadata_array_t *arr =
+      (aom_metadata_array_t *)calloc(1, sizeof(aom_metadata_array_t));
+  if (!arr) return NULL;
+  if (sz > 0) {
+    arr->metadata_array =
+        (aom_metadata_t **)calloc(sz, sizeof(aom_metadata_t *));
+    if (!arr->metadata_array) {
+      aom_img_metadata_array_free(arr);
+      return NULL;
+    }
+    arr->sz = sz;
+  }
+  return arr;
+}
+
+void aom_img_metadata_array_free(aom_metadata_array_t *arr) {
+  if (arr) {
+    if (arr->metadata_array) {
+      for (size_t i = 0; i < arr->sz; i++) {
+        aom_img_metadata_free(arr->metadata_array[i]);
+      }
+      free(arr->metadata_array);
+    }
+    free(arr);
+  }
+}
+
+int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
+                         size_t sz, aom_metadata_insert_flags_t insert_flag) {
+  if (!img) return -1;
+  if (!img->metadata) {
+    img->metadata = aom_img_metadata_array_alloc(0);
+    if (!img->metadata) return -1;
+  }
+  aom_metadata_t *metadata =
+      aom_img_metadata_alloc(type, data, sz, insert_flag);
+  if (!metadata) goto fail;
+  if (!img->metadata->metadata_array) {
+    img->metadata->metadata_array =
+        (aom_metadata_t **)calloc(1, sizeof(metadata));
+    if (!img->metadata->metadata_array || img->metadata->sz != 0) {
+      aom_img_metadata_free(metadata);
+      goto fail;
+    }
+  } else {
+    img->metadata->metadata_array =
+        (aom_metadata_t **)realloc(img->metadata->metadata_array,
+                                   (img->metadata->sz + 1) * sizeof(metadata));
+  }
+  img->metadata->metadata_array[img->metadata->sz] = metadata;
+  img->metadata->sz++;
+  return 0;
+fail:
+  aom_img_metadata_array_free(img->metadata);
+  img->metadata = NULL;
+  return -1;
+}
+
+void aom_img_remove_metadata(aom_image_t *img) {
+  if (img && img->metadata) {
+    aom_img_metadata_array_free(img->metadata);
+    img->metadata = NULL;
+  }
+}
+
+const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img,
+                                           size_t index) {
+  if (!img) return NULL;
+  const aom_metadata_array_t *array = img->metadata;
+  if (array && index < array->sz) {
+    return array->metadata_array[index];
+  }
+  return NULL;
+}
+
+size_t aom_img_num_metadata(const aom_image_t *img) {
+  if (!img || !img->metadata) return 0;
+  return img->metadata->sz;
+}
diff --git a/libaom/aom_dsp/add_noise.c b/libaom/aom_dsp/add_noise.c
deleted file mode 100644
index 43587ca..0000000
--- a/libaom/aom_dsp/add_noise.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
-                           char whiteclamp[16], char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  for (i = 0; i < height; ++i) {
-    uint8_t *pos = start + i * pitch;
-    char *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; ++j) {
-      int v = pos[j];
-
-      v = clamp(v - blackclamp[0], 0, 255);
-      v = clamp(v + bothclamp[0], 0, 255);
-      v = clamp(v - whiteclamp[0], 0, 255);
-
-      pos[j] = v + ref[j];
-    }
-  }
-}
-
-static double gaussian(double sigma, double mu, double x) {
-  return 1 / (sigma * sqrt(2.0 * PI)) *
-         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
-}
-
-int aom_setup_noise(double sigma, int size, char *noise) {
-  char char_dist[256];
-  int next = 0, i, j;
-
-  // set up a 256 entry lookup that matches gaussian distribution
-  for (i = -32; i < 32; ++i) {
-    const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
-    if (a_i) {
-      for (j = 0; j < a_i; ++j) {
-        char_dist[next + j] = (char)i;
-      }
-      next = next + j;
-    }
-  }
-
-  // Rounding error - might mean we have less than 256.
-  for (; next < 256; ++next) {
-    char_dist[next] = 0;
-  }
-
-  for (i = 0; i < size; ++i) {
-    noise[i] = char_dist[rand() & 0xff];  // NOLINT
-  }
-
-  // Returns the highest non 0 value used in distribution.
-  return -char_dist[0];
-}
diff --git a/libaom/aom_dsp/aom_convolve.c b/libaom/aom_dsp/aom_convolve.c
index 4791826..7879b88 100644
--- a/libaom/aom_dsp/aom_convolve.c
+++ b/libaom/aom_dsp/aom_convolve.c
@@ -74,7 +74,6 @@
 
 static const InterpKernel *get_filter_base(const int16_t *filter) {
   // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
 }
 
@@ -130,6 +129,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
                                              ptrdiff_t a_stride,
                                              const int16_t *b) {
@@ -236,3 +236,4 @@
     dst += dst_stride;
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/aom_dsp.cmake b/libaom/aom_dsp/aom_dsp.cmake
index abf6a60..f1b61f0 100644
--- a/libaom/aom_dsp/aom_dsp.cmake
+++ b/libaom/aom_dsp/aom_dsp.cmake
@@ -67,6 +67,7 @@
             "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h"
             "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
             "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
@@ -76,6 +77,12 @@
             "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_sse2.h")
 
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_SSE2
+                   "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
+                   "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
+endif()
+
 list(APPEND AOM_DSP_COMMON_ASM_SSSE3
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
@@ -85,6 +92,11 @@
             "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
 
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_SSSE3
+                   "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c")
+endif()
+
 list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
             "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
@@ -104,8 +116,13 @@
             "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h")
 
-list(APPEND AOM_DSP_COMMON_INTRIN_NEON
-            "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_AVX2
+                   "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+                   "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
+endif()
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
@@ -138,10 +155,9 @@
   list(APPEND AOM_DSP_DECODER_SOURCES
               "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
               "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
-              "${AOM_ROOT}/aom_dsp/bitreader.h"
-              "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
-              "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
-              "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h"
+              "${AOM_ROOT}/aom_dsp/bitreader.c"
+              "${AOM_ROOT}/aom_dsp/bitreader.h" "${AOM_ROOT}/aom_dsp/entdec.c"
+              "${AOM_ROOT}/aom_dsp/entdec.h"
               "${AOM_ROOT}/aom_dsp/grain_synthesis.c"
               "${AOM_ROOT}/aom_dsp/grain_synthesis.h")
 endif()
@@ -150,9 +166,9 @@
   list(APPEND AOM_DSP_ENCODER_SOURCES
               "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
               "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
+              "${AOM_ROOT}/aom_dsp/bitwriter.c"
               "${AOM_ROOT}/aom_dsp/bitwriter.h"
-              "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
-              "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
+              "${AOM_ROOT}/aom_dsp/blk_sse_sum.c"
               "${AOM_ROOT}/aom_dsp/entenc.c"
               "${AOM_ROOT}/aom_dsp/entenc.h"
               "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
@@ -195,9 +211,18 @@
               "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
+              "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
+  if(NOT CONFIG_AV1_HIGHBITDEPTH)
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
+                     "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c"
+                     "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
+                     "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
+                     "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c")
+  endif()
 
   list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
               "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
@@ -207,6 +232,8 @@
               "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
@@ -217,6 +244,7 @@
               "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
 
   list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64
@@ -225,6 +253,7 @@
   list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
               "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
               "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/masked_sad4d_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
               "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
@@ -238,11 +267,18 @@
               "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
               "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
 
-  list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
-              "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
+  if(NOT CONFIG_AV1_HIGHBITDEPTH)
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1
+                     "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
+  endif()
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
+              "${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/sse_neon.c")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
               "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
@@ -254,6 +290,11 @@
                 "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c"
                 "${AOM_ROOT}/aom_dsp/ssim.h")
   endif()
+
+  if(CONFIG_TUNE_VMAF)
+    list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/vmaf.c"
+                "${AOM_ROOT}/aom_dsp/vmaf.h")
+  endif()
 endif()
 
 # Creates aom_dsp build targets. Must not be called until after libaom target
@@ -264,6 +305,9 @@
   create_dummy_source_file("aom_av1" "c" "dummy_source_file")
   add_library(aom_dsp OBJECT "${dummy_source_file}")
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_common>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_common>)
+  endif()
   list(APPEND AOM_LIB_TARGETS aom_dsp)
 
   # Not all generators support libraries consisting only of object files. Add a
@@ -274,96 +318,103 @@
     add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
     list(APPEND AOM_LIB_TARGETS aom_dsp_decoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
+    endif()
   endif()
 
   if(CONFIG_AV1_ENCODER)
     add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
     list(APPEND AOM_LIB_TARGETS aom_dsp_encoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
+    endif()
   endif()
 
   if(HAVE_SSE2)
-    add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom")
+    add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2")
     add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_SSE2" "aom")
+                                  "AOM_DSP_COMMON_INTRIN_SSE2")
 
     if(CONFIG_AV1_ENCODER)
       if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
         list(APPEND AOM_DSP_ENCODER_ASM_SSE2 ${AOM_DSP_ENCODER_ASM_SSE2_X86_64})
       endif()
-      add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom")
+      add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2")
       add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_SSE2" "aom")
+                                    "AOM_DSP_ENCODER_INTRIN_SSE2")
     endif()
   endif()
 
   if(HAVE_SSSE3)
-    add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom")
+    add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3")
     add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_SSSE3" "aom")
+                                  "AOM_DSP_COMMON_INTRIN_SSSE3")
 
     if(CONFIG_AV1_ENCODER)
       if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
         list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
                     ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
       endif()
-      add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom")
+      add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3")
       add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom")
+                                    "AOM_DSP_ENCODER_INTRIN_SSSE3")
     endif()
   endif()
 
   if(HAVE_SSE4_1)
     add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom")
+                                  "AOM_DSP_COMMON_INTRIN_SSE4_1")
     if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom")
+                                    "AOM_DSP_ENCODER_INTRIN_SSE4_1")
     endif()
   endif()
 
   if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
     if(CONFIG_AV1_ENCODER)
-      add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64"
-                      "aom")
+      add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64")
     endif()
   endif()
 
   if(HAVE_AVX2)
     add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_AVX2" "aom")
+                                  "AOM_DSP_COMMON_INTRIN_AVX2")
     if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_AVX2" "aom")
+                                    "AOM_DSP_ENCODER_INTRIN_AVX2")
     endif()
   endif()
 
   if(HAVE_NEON)
     add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
-                                  "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON"
-                                  "aom")
+                                  "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON")
     if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
                                     "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_NEON" "aom")
+                                    "AOM_DSP_ENCODER_INTRIN_NEON")
     endif()
   endif()
 
   if(HAVE_DSPR2)
     add_intrinsics_object_library("" "dspr2" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_DSPR2" "aom")
+                                  "AOM_DSP_COMMON_INTRIN_DSPR2")
   endif()
 
   if(HAVE_MSA)
     add_intrinsics_object_library("" "msa" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_MSA" "aom")
+                                  "AOM_DSP_COMMON_INTRIN_MSA")
     if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("" "msa" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_MSA" "aom")
+                                    "AOM_DSP_ENCODER_INTRIN_MSA")
     endif()
   endif()
 
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
+  endif()
 
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
diff --git a/libaom/aom_dsp/aom_dsp_common.h b/libaom/aom_dsp/aom_dsp_common.h
index a185b23..150d35d 100644
--- a/libaom/aom_dsp/aom_dsp_common.h
+++ b/libaom/aom_dsp/aom_dsp_common.h
@@ -27,6 +27,9 @@
 
 #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
+#define AOMSIGN(x) ((x) < 0 ? -1 : 0)
+
+#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0]))
 
 #define IMPLIES(a, b) (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
 
diff --git a/libaom/aom_dsp/aom_dsp_rtcd_defs.pl b/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
index f56a117..b7d5a41 100755
--- a/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -75,8 +75,10 @@
   foreach $pred_name (@pred_names) {
     add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
               "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-    add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
-              "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+        add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+                  "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    }
   }
 }
 
@@ -90,6 +92,7 @@
 specialize qw/aom_dc_top_predictor_16x4 sse2/;
 specialize qw/aom_dc_top_predictor_16x8 sse2/;
 specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
+
 specialize qw/aom_dc_top_predictor_16x32 sse2/;
 specialize qw/aom_dc_top_predictor_16x64 sse2/;
 specialize qw/aom_dc_top_predictor_32x8 sse2/;
@@ -280,7 +283,7 @@
 specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
 specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
 specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
-
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_v_predictor_4x4 sse2/;
   specialize qw/aom_highbd_v_predictor_4x8 sse2/;
   specialize qw/aom_highbd_v_predictor_8x4 sse2/;
@@ -346,11 +349,11 @@
   specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
   specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
   specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
-
+}
 #
 # Sub Pixel Filters
 #
-add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h";
 add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
@@ -358,14 +361,16 @@
 specialize qw/aom_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
 
-add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd";
+  specialize qw/aom_highbd_convolve_copy sse2 avx2/;
 
-add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_horiz sse2 avx2/;
+  add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
+  specialize qw/aom_highbd_convolve8_horiz sse2 avx2/;
 
-add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_vert sse2 avx2/;
+  add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
+  specialize qw/aom_highbd_convolve8_vert sse2 avx2/;
+}
 
 #
 # Loopfilter
@@ -415,56 +420,58 @@
 add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_horizontal_4_dual sse2/;
 
-add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_14 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_6 sse2/;
-
 add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_vertical_6_dual sse2/;
 
-add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/aom_highbd_lpf_vertical_14 sse2/;
 
-add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
+  add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
 
-add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_4 sse2/;
+  add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/aom_highbd_lpf_vertical_8 sse2/;
 
-add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
+  add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/aom_highbd_lpf_vertical_6 sse2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
+  add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd";
-specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
+  add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
+  add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/aom_highbd_lpf_vertical_4 sse2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
+  add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
+  add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
+  add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd";
+  specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+  add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
 
-add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
+  add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
+
+  add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
+
+  add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
+
+  add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+
+  add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
+}
 
 #
 # Encoder functions.
@@ -474,13 +481,19 @@
 # Forward transform
 #
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
+    add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct4x4 neon sse2/;
+
+    add_proto qw/void aom_fdct4x4_lp/, "const int16_t *input, int16_t *output, int stride";
+    specialize qw/aom_fdct4x4_lp neon sse2/;
+
     add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
-
+    specialize qw/aom_fdct8x8 neon sse2/, "$ssse3_x86_64";
     # High bit depth
-    add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct8x8 sse2/;
-
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+      specialize qw/aom_highbd_fdct8x8 sse2/;
+    }
     # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
     add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
 
@@ -519,7 +532,7 @@
   specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
 
   add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b_adaptive sse2/;
+  specialize qw/aom_quantize_b_adaptive sse2 avx2/;
 
   add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
@@ -529,39 +542,53 @@
 
   add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_64x64 ssse3/;
+
+  add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/aom_quantize_b_64x64_adaptive sse2/;
 }  # CONFIG_AV1_ENCODER
 
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes" && aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_highbd_quantize_b sse2 avx2/;
 
+  add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/aom_highbd_quantize_b_adaptive sse2 avx2/;
+
   add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_highbd_quantize_b_32x32 sse2/;
 
+  add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/aom_highbd_quantize_b_32x32_adaptive sse2 avx2/;
+
   add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_highbd_quantize_b_64x64 sse2/;
+
+  add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/aom_highbd_quantize_b_64x64_adaptive sse2/;
 }  # CONFIG_AV1_ENCODER
 
 #
 # Alpha blending with mask
 #
-add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params";
+add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params";
 specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
-add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
+add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
 add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
 add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
 specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
 specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
 specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
 
-add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd";
-add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
-add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
-add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd";
-specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
-specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
-specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
-specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 avx2/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd";
+  add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+  add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+  add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd";
+  specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
+  specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
+  specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
+  specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 avx2/;
+}
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
@@ -570,14 +597,19 @@
   add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
   specialize qw/aom_subtract_block neon msa sse2 avx2/;
 
-  add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/aom_highbd_subtract_block sse2/;
-
   add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
-  specialize qw/aom_sse  sse4_1 avx2/;
+  specialize qw/aom_sse  sse4_1 avx2 neon/;
 
-  add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
-  specialize qw/aom_highbd_sse  sse4_1 avx2/;
+  add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
+  specialize qw/aom_get_blk_sse_sum sse2 avx2/;
+
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+    specialize qw/aom_highbd_subtract_block sse2/;
+
+    add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
+    specialize qw/aom_highbd_sse  sse4_1 avx2 neon/;
+  }
 
   if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     #
@@ -589,8 +621,12 @@
     add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
     specialize qw/aom_sum_squares_i16 sse2/;
 
-  }
+    add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+    specialize qw/aom_var_2d_u8 sse2 avx2/;
 
+    add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
+    specialize qw/aom_var_2d_u16 sse2 avx2/;
+  }
 
   #
   # Single block SAD / Single block Avg SAD
@@ -671,8 +707,8 @@
   specialize qw/aom_dist_wtd_sad16x4_avg     ssse3/;
   specialize qw/aom_dist_wtd_sad8x32_avg     ssse3/;
   specialize qw/aom_dist_wtd_sad32x8_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad16x64_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad64x16_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad16x64_avg    ssse3/;
+  specialize qw/aom_dist_wtd_sad64x16_avg    ssse3/;
 
   add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
   add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
@@ -688,7 +724,7 @@
   specialize qw/aom_sad64xh  sse2/;
   specialize qw/aom_sad128xh sse2/;
 
-
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
@@ -711,6 +747,8 @@
     specialize qw/aom_highbd_sad16x16   avx2 sse2/;
     specialize qw/aom_highbd_sad16x8    avx2 sse2/;
     specialize qw/aom_highbd_sad8x4     sse2/;
+    specialize qw/aom_highbd_sad4x8     sse2/;
+    specialize qw/aom_highbd_sad4x4     sse2/;
 
     specialize qw/aom_highbd_sad128x128_avg avx2/;
     specialize qw/aom_highbd_sad128x64_avg  avx2/;
@@ -724,19 +762,23 @@
     specialize qw/aom_highbd_sad16x16_avg   avx2 sse2/;
     specialize qw/aom_highbd_sad16x8_avg    avx2 sse2/;
     specialize qw/aom_highbd_sad8x4_avg     sse2/;
+    specialize qw/aom_highbd_sad4x8_avg     sse2/;
+    specialize qw/aom_highbd_sad4x4_avg     sse2/;
 
-    specialize qw/aom_highbd_sad16x4       sse2/;
-    specialize qw/aom_highbd_sad8x32       sse2/;
-    specialize qw/aom_highbd_sad32x8       sse2/;
-    specialize qw/aom_highbd_sad16x64      sse2/;
-    specialize qw/aom_highbd_sad64x16      sse2/;
+    specialize qw/aom_highbd_sad4x16        sse2/;
+    specialize qw/aom_highbd_sad16x4        avx2 sse2/;
+    specialize qw/aom_highbd_sad8x32        sse2/;
+    specialize qw/aom_highbd_sad32x8        avx2 sse2/;
+    specialize qw/aom_highbd_sad16x64       avx2 sse2/;
+    specialize qw/aom_highbd_sad64x16       avx2 sse2/;
 
-    specialize qw/aom_highbd_sad16x4_avg   sse2/;
-    specialize qw/aom_highbd_sad8x32_avg   sse2/;
-    specialize qw/aom_highbd_sad32x8_avg   sse2/;
-    specialize qw/aom_highbd_sad16x64_avg  sse2/;
-    specialize qw/aom_highbd_sad64x16_avg  sse2/;
-
+    specialize qw/aom_highbd_sad4x16_avg    sse2/;
+    specialize qw/aom_highbd_sad16x4_avg    avx2 sse2/;
+    specialize qw/aom_highbd_sad8x32_avg    sse2/;
+    specialize qw/aom_highbd_sad32x8_avg    avx2 sse2/;
+    specialize qw/aom_highbd_sad16x64_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad64x16_avg   avx2 sse2/;
+  }
   #
   # Masked SAD
   #
@@ -746,13 +788,13 @@
     specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/;
   }
 
-
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
       specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
     }
-
+  }
 
   #
   # OBMC SAD
@@ -765,7 +807,7 @@
     }
   }
 
-
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
@@ -773,7 +815,7 @@
         specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
       }
     }
-
+  }
 
   #
   # Multi-block SAD, comparing a reference to N independent blocks
@@ -781,6 +823,8 @@
   foreach (@block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+    add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, const uint8_t *second_pred, uint32_t *sad_array";
+    add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[]";
   }
 
   specialize qw/aom_sad128x128x4d avx2          sse2/;
@@ -788,78 +832,145 @@
   specialize qw/aom_sad64x128x4d  avx2          sse2/;
   specialize qw/aom_sad64x64x4d   avx2 neon msa sse2/;
   specialize qw/aom_sad64x32x4d   avx2      msa sse2/;
+  specialize qw/aom_sad64x16x4d   avx2          sse2/;
   specialize qw/aom_sad32x64x4d   avx2      msa sse2/;
   specialize qw/aom_sad32x32x4d   avx2 neon msa sse2/;
-  specialize qw/aom_sad32x16x4d             msa sse2/;
+  specialize qw/aom_sad32x16x4d   avx2      msa sse2/;
+  specialize qw/aom_sad32x8x4d    avx2          sse2/;
+  specialize qw/aom_sad16x64x4d                 sse2/;
   specialize qw/aom_sad16x32x4d             msa sse2/;
-  specialize qw/aom_sad16x16x4d        neon msa sse2/;
-  specialize qw/aom_sad16x8x4d              msa sse2/;
+  specialize qw/aom_sad16x16x4d         neon msa sse2/;
+  specialize qw/aom_sad16x8x4d               msa sse2/;
+
   specialize qw/aom_sad8x16x4d              msa sse2/;
   specialize qw/aom_sad8x8x4d               msa sse2/;
   specialize qw/aom_sad8x4x4d               msa sse2/;
+  specialize qw/aom_sad4x16x4d              msa sse2/;
   specialize qw/aom_sad4x8x4d               msa sse2/;
   specialize qw/aom_sad4x4x4d               msa sse2/;
 
+
   specialize qw/aom_sad4x16x4d  sse2/;
   specialize qw/aom_sad16x4x4d  sse2/;
   specialize qw/aom_sad8x32x4d  sse2/;
   specialize qw/aom_sad32x8x4d  sse2/;
-  specialize qw/aom_sad16x64x4d sse2/;
   specialize qw/aom_sad64x16x4d sse2/;
 
+  specialize qw/aom_sad128x128x4d_avg sse2/;
+  specialize qw/aom_sad128x64x4d_avg  sse2/;
+  specialize qw/aom_sad64x128x4d_avg  sse2/;
+  specialize qw/aom_sad64x64x4d_avg   sse2/;
+  specialize qw/aom_sad64x32x4d_avg   sse2/;
+  specialize qw/aom_sad64x16x4d_avg   sse2/;
+  specialize qw/aom_sad32x64x4d_avg   sse2/;
+  specialize qw/aom_sad32x32x4d_avg   sse2/;
+  specialize qw/aom_sad32x16x4d_avg   sse2/;
+  specialize qw/aom_sad32x8x4d_avg    sse2/;
+  specialize qw/aom_sad16x64x4d_avg   sse2/;
+  specialize qw/aom_sad16x32x4d_avg   sse2/;
+  specialize qw/aom_sad16x16x4d_avg   sse2/;
+  specialize qw/aom_sad16x8x4d_avg    sse2/;
+
+  specialize qw/aom_sad8x16x4d_avg    sse2/;
+  specialize qw/aom_sad8x8x4d_avg     sse2/;
+  specialize qw/aom_sad8x4x4d_avg     sse2/;
+  specialize qw/aom_sad4x16x4d_avg    sse2/;
+  specialize qw/aom_sad4x8x4d_avg     sse2/;
+  specialize qw/aom_sad4x4x4d_avg     sse2/;
+
+
+  specialize qw/aom_sad4x16x4d_avg    sse2/;
+  specialize qw/aom_sad16x4x4d_avg    sse2/;
+  specialize qw/aom_sad8x32x4d_avg    sse2/;
+  specialize qw/aom_sad32x8x4d_avg    sse2/;
+  specialize qw/aom_sad64x16x4d_avg   sse2/;
+
+  specialize qw/aom_masked_sad128x128x4d  ssse3/;
+  specialize qw/aom_masked_sad128x64x4d   ssse3/;
+  specialize qw/aom_masked_sad64x128x4d   ssse3/;
+  specialize qw/aom_masked_sad64x64x4d    ssse3/;
+  specialize qw/aom_masked_sad64x32x4d    ssse3/;
+  specialize qw/aom_masked_sad64x16x4d    ssse3/;
+  specialize qw/aom_masked_sad32x64x4d    ssse3/;
+  specialize qw/aom_masked_sad32x32x4d    ssse3/;
+  specialize qw/aom_masked_sad32x16x4d    ssse3/;
+  specialize qw/aom_masked_sad32x8x4d     ssse3/;
+  specialize qw/aom_masked_sad16x64x4d    ssse3/;
+  specialize qw/aom_masked_sad16x32x4d    ssse3/;
+  specialize qw/aom_masked_sad16x16x4d    ssse3/;
+  specialize qw/aom_masked_sad16x8x4d     ssse3/;
+
+  specialize qw/aom_masked_sad8x16x4d     ssse3/;
+  specialize qw/aom_masked_sad8x8x4d      ssse3/;
+  specialize qw/aom_masked_sad8x4x4d      ssse3/;
+  specialize qw/aom_masked_sad4x16x4d     ssse3/;
+  specialize qw/aom_masked_sad4x8x4d      ssse3/;
+  specialize qw/aom_masked_sad4x4x4d      ssse3/;
+
+
+  specialize qw/aom_masked_sad4x16x4d     ssse3/;
+  specialize qw/aom_masked_sad16x4x4d     ssse3/;
+  specialize qw/aom_masked_sad8x32x4d     ssse3/;
+  specialize qw/aom_masked_sad32x8x4d     ssse3/;
+  specialize qw/aom_masked_sad64x16x4d    ssse3/;
   #
   # Multi-block SAD, comparing a reference to N independent blocks
   #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-    if ($w != 128 && $h != 128) {
-      specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+      if ($w != 128 && $h != 128) {
+        specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
+      }
     }
+    specialize qw/aom_highbd_sad128x128x4d avx2/;
+    specialize qw/aom_highbd_sad128x64x4d  avx2/;
+    specialize qw/aom_highbd_sad64x128x4d  avx2/;
+    specialize qw/aom_highbd_sad64x64x4d   sse2 avx2/;
+    specialize qw/aom_highbd_sad64x32x4d   sse2 avx2/;
+    specialize qw/aom_highbd_sad32x64x4d   sse2 avx2/;
+    specialize qw/aom_highbd_sad32x32x4d   sse2 avx2/;
+    specialize qw/aom_highbd_sad32x16x4d   sse2 avx2/;
+    specialize qw/aom_highbd_sad16x32x4d   sse2 avx2/;
+    specialize qw/aom_highbd_sad16x16x4d   sse2 avx2/;
+    specialize qw/aom_highbd_sad16x8x4d    sse2 avx2/;
+    specialize qw/aom_highbd_sad8x16x4d    sse2/;
+    specialize qw/aom_highbd_sad8x8x4d     sse2/;
+    specialize qw/aom_highbd_sad8x4x4d     sse2/;
+    specialize qw/aom_highbd_sad4x8x4d     sse2/;
+    specialize qw/aom_highbd_sad4x4x4d     sse2/;
+
+    specialize qw/aom_highbd_sad4x16x4d         sse2/;
+    specialize qw/aom_highbd_sad16x4x4d    avx2 sse2/;
+    specialize qw/aom_highbd_sad8x32x4d         sse2/;
+    specialize qw/aom_highbd_sad32x8x4d    avx2 sse2/;
+    specialize qw/aom_highbd_sad16x64x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad64x16x4d   avx2 sse2/;
   }
-  specialize qw/aom_highbd_sad128x128x4d avx2/;
-  specialize qw/aom_highbd_sad128x64x4d  avx2/;
-  specialize qw/aom_highbd_sad64x128x4d  avx2/;
-  specialize qw/aom_highbd_sad64x64x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad64x32x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad32x64x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad32x32x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad32x16x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad16x32x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad16x16x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad16x8x4d    sse2 avx2/;
-  specialize qw/aom_highbd_sad8x16x4d    sse2/;
-  specialize qw/aom_highbd_sad8x8x4d     sse2/;
-  specialize qw/aom_highbd_sad8x4x4d     sse2/;
-  specialize qw/aom_highbd_sad4x8x4d     sse2/;
-  specialize qw/aom_highbd_sad4x4x4d     sse2/;
-
-  specialize qw/aom_highbd_sad4x16x4d  sse2/;
-  specialize qw/aom_highbd_sad16x4x4d  sse2/;
-  specialize qw/aom_highbd_sad8x32x4d  sse2/;
-  specialize qw/aom_highbd_sad32x8x4d  sse2/;
-  specialize qw/aom_highbd_sad16x64x4d sse2/;
-  specialize qw/aom_highbd_sad64x16x4d sse2/;
-
   #
   # Avg
   #
   add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/aom_avg_8x8 sse2/;
+  specialize qw/aom_avg_8x8 sse2 neon/;
 
   add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/aom_avg_4x4 sse2/;
+  specialize qw/aom_avg_4x4 sse2 neon/;
 
   add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/aom_minmax_8x8 sse2/;
 
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p";
+    add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
+    add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  }
+
   add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
-  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
-  #specialize qw/aom_int_pro_row sse2/;
+  specialize qw/aom_int_pro_row sse2/;
 
   add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
-  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
-  #specialize qw/aom_int_pro_col sse2/;
+  specialize qw/aom_int_pro_col sse2/;
 
   add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
   # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
@@ -869,16 +980,37 @@
   # hamadard transform and satd for implmenting temporal dependency model
   #
   add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/aom_hadamard_8x8 sse2/;
+  specialize qw/aom_hadamard_8x8 sse2 neon/;
 
   add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/aom_hadamard_16x16 avx2 sse2/;
+  specialize qw/aom_hadamard_16x16 avx2 sse2 neon/;
 
   add_proto qw/void aom_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
   specialize qw/aom_hadamard_32x32 avx2 sse2/;
 
+  add_proto qw/void aom_hadamard_lp_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+  specialize qw/aom_hadamard_lp_8x8 sse2 neon/;
+
+  add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+  specialize qw/aom_hadamard_lp_16x16 avx2 neon/;
+
+
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/aom_highbd_hadamard_8x8 avx2/;
+
+    add_proto qw/void aom_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/aom_highbd_hadamard_16x16 avx2/;
+
+    add_proto qw/void aom_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/aom_highbd_hadamard_32x32 avx2/;
+  }
   add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
-  specialize qw/aom_satd avx2 sse2/;
+  specialize qw/aom_satd avx2/;
+
+  add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
+  specialize qw/aom_satd_lp avx2 neon/;
+
 
   #
   # Structured Similarity (SSIM)
@@ -890,8 +1022,9 @@
     add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
     specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
 
-    add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    }
   }
 }  # CONFIG_AV1_ENCODER
 
@@ -904,8 +1037,8 @@
 
   add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
-  specialize qw/aom_get16x16var           neon msa/;
-  specialize qw/aom_get8x8var             neon msa/;
+  specialize qw/aom_get16x16var                neon msa/;
+  specialize qw/aom_get8x8var             sse2 neon msa/;
 
 
   add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
@@ -918,6 +1051,7 @@
   specialize qw/aom_mse8x16           sse2           msa/;
   specialize qw/aom_mse8x8            sse2           msa/;
 
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach $bd (8, 10, 12) {
       add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
       add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
@@ -930,7 +1064,7 @@
       specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
       specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
     }
-
+  }
 
   #
   #
@@ -959,22 +1093,23 @@
                                                        int subpel_search";
   specialize qw/aom_comp_mask_upsampled_pred sse2/;
 
-  add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                 const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
-                                                 int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-  specialize qw/aom_highbd_upsampled_pred sse2/;
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                   const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
+                                                   int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+    specialize qw/aom_highbd_upsampled_pred sse2/;
 
-  add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                          const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-                                                          int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-  specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+    add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                            const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+                                                            int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
 
-  add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                              const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-                                                              int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-                                                              int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
-
+    add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                                const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+                                                                int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+                                                                int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+    specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
+  }
 
   #
   #
@@ -1001,15 +1136,15 @@
     add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
     add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
-  specialize qw/aom_variance128x128   sse2 avx2         /;
+  specialize qw/aom_variance128x128   sse2 avx2 neon    /;
   specialize qw/aom_variance128x64    sse2 avx2         /;
   specialize qw/aom_variance64x128    sse2 avx2         /;
   specialize qw/aom_variance64x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance64x32     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x16     sse2 avx2 msa/;
-  specialize qw/aom_variance16x32     sse2 avx2 msa/;
+  specialize qw/aom_variance32x16     sse2 avx2      msa/;
+  specialize qw/aom_variance16x32     sse2 avx2      msa/;
   specialize qw/aom_variance16x16     sse2 avx2 neon msa/;
   specialize qw/aom_variance16x8      sse2 avx2 neon msa/;
   specialize qw/aom_variance8x16      sse2      neon msa/;
@@ -1026,9 +1161,9 @@
   specialize qw/aom_sub_pixel_variance32x64     avx2      msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x32     avx2 neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x16     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x32               msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x16          neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x8                msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x32     avx2      msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x16     avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x8      avx2      msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x16                msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x8            neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x4                 msa sse2 ssse3/;
@@ -1058,11 +1193,12 @@
   specialize qw/aom_variance32x8 sse2 avx2/;
   specialize qw/aom_variance16x64 sse2 avx2/;
   specialize qw/aom_variance64x16 sse2 avx2/;
+
   specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x4 avx2 sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x64 avx2 sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
@@ -1096,40 +1232,40 @@
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   ssse3/;
 
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    foreach $bd (8, 10, 12) {
+      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-  foreach $bd (8, 10, 12) {
-    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
-        specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
-      }
-      # TODO(david.barker): When ext-partition-types is enabled, we currently
-      # don't have vectorized 4x16 highbd variance functions
-      if ($w == 4 && $h == 4) {
-          specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+        add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+        add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+        if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+          specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
         }
-      if ($w != 128 && $h != 128 && $w != 4) {
-        specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
-        specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
-      }
-      if ($w == 4 && $h == 4) {
-        specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
-        specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
-      }
+        # TODO(david.barker): When ext-partition-types is enabled, we currently
+        # don't have vectorized 4x16 highbd variance functions
+        if ($w == 4 && $h == 4) {
+            specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
+          }
+        if ($w != 128 && $h != 128 && $w != 4) {
+          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
+          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
+        }
+        if ($w == 4 && $h == 4) {
+          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
+        }
 
-      add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
+        add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
+      }
     }
   }
-
   #
   # Masked Variance / Masked Subpixel Variance
   #
@@ -1139,7 +1275,7 @@
     specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
   }
 
-
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach $bd ("_8_", "_10_", "_12_") {
       foreach (@block_sizes) {
         ($w, $h) = @$_;
@@ -1147,7 +1283,7 @@
         specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
       }
     }
-
+  }
 
   #
   # OBMC Variance / OBMC Subpixel Variance
@@ -1160,7 +1296,7 @@
     specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
   }
 
-
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach $bd ("_", "_10_", "_12_") {
       foreach (@block_sizes) {
         ($w, $h) = @$_;
@@ -1169,7 +1305,7 @@
         specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
       }
     }
-
+  }
 
   add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
@@ -1218,13 +1354,15 @@
   add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
   specialize qw/aom_dist_wtd_comp_avg_pred ssse3/;
 
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+
     add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_12_variance128x128 sse2/;
 
-	add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+    add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_12_variance128x64 sse2/;
 
-	add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+    add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_12_variance64x128 sse2/;
 
     add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
@@ -1261,13 +1399,13 @@
     add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-	add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+    add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_10_variance128x128 sse2 avx2/;
 
-	add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+    add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_10_variance128x64 sse2 avx2/;
 
-	add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+    add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_10_variance64x128 sse2 avx2/;
 
     add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
@@ -1304,13 +1442,13 @@
     add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-	add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+    add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_8_variance128x128 sse2/;
 
-	add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+    add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_8_variance128x64 sse2/;
 
-	add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+    add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_8_variance64x128 sse2/;
 
     add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
@@ -1384,260 +1522,263 @@
 
     add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
     specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
-
+  }
     #
     # Subpixel Variance
     #
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
 
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
 
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    }
 
 
   add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
   specialize qw/aom_comp_mask_pred ssse3 avx2/;
 
-  add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+    specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+  }
 
 }  # CONFIG_AV1_ENCODER
 
diff --git a/libaom/aom_dsp/arm/avg_neon.c b/libaom/aom_dsp/arm/avg_neon.c
new file mode 100644
index 0000000..af3769e
--- /dev/null
+++ b/libaom/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
+  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+#if defined(__aarch64__)
+  const uint32_t d = vaddlvq_u16(c);
+  return (d + 8) >> 4;
+#else
+  const uint32x2_t d = horizontal_add_u16x8(c);
+  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
+#endif
+}
+
+unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  uint16x8_t sum;
+  uint32x2_t d;
+  uint8x8_t b = vld1_u8(a);
+  a += a_stride;
+  uint8x8_t c = vld1_u8(a);
+  a += a_stride;
+  sum = vaddl_u8(b, c);
+
+  for (int i = 0; i < 6; ++i) {
+    const uint8x8_t e = vld1_u8(a);
+    a += a_stride;
+    sum = vaddw_u8(sum, e);
+  }
+
+  d = horizontal_add_u16x8(sum);
+
+  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+}
+
+int aom_satd_lp_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
diff --git a/libaom/aom_dsp/arm/fwd_txfm_neon.c b/libaom/aom_dsp/arm/fwd_txfm_neon.c
index e4300c9..ce93523 100644
--- a/libaom/aom_dsp/arm/fwd_txfm_neon.c
+++ b/libaom/aom_dsp/arm/fwd_txfm_neon.c
@@ -14,9 +14,103 @@
 #include "config/aom_config.h"
 
 #include "aom_dsp/txfm_common.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static void aom_fdct4x4_helper(const int16_t *input, int stride,
+                               int16x4_t *input_0, int16x4_t *input_1,
+                               int16x4_t *input_2, int16x4_t *input_3) {
+  *input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  *input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  *input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  *input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+  // If the very first value != 0, then add 1.
+  if (input[0] != 0) {
+    const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+    *input_0 = vadd_s16(*input_0, one);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    const int16x8_t input_01 = vcombine_s16(*input_0, *input_1);
+    const int16x8_t input_32 = vcombine_s16(*input_3, *input_2);
+
+    // in_0 +/- in_3, in_1 +/- in_2
+    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+    const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+    // step_0 +/- step_1, step_2 +/- step_3
+    const int16x4_t s_0 = vget_low_s16(s_01);
+    const int16x4_t s_1 = vget_high_s16(s_01);
+    const int16x4_t s_2 = vget_high_s16(s_32);
+    const int16x4_t s_3 = vget_low_s16(s_32);
+
+    // (s_0 +/- s_1) * cospi_16_64
+    // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+    const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+    const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+
+    // fdct_round_shift
+    int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+    int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+    // s_3 * cospi_8_64 + s_2 * cospi_24_64
+    // s_3 * cospi_24_64 - s_2 * cospi_8_64
+    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+
+    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+
+    // fdct_round_shift
+    int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+    int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+    transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+    *input_0 = out_0;
+    *input_1 = out_1;
+    *input_2 = out_2;
+    *input_3 = out_3;
+  }
+}
+
+void aom_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  // input[M * stride] * 16
+  int16x4_t input_0, input_1, input_2, input_3;
+
+  aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3);
+
+  // Not quite a rounding shift. Only add 1 despite shifting by 2.
+  const int16x8_t one = vdupq_n_s16(1);
+  int16x8_t out_01 = vcombine_s16(input_0, input_1);
+  int16x8_t out_23 = vcombine_s16(input_2, input_3);
+  out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+  out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+  store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+  store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+}
+
+void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output,
+                         int stride) {
+  // input[M * stride] * 16
+  int16x4_t input_0, input_1, input_2, input_3;
+
+  aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3);
+
+  // Not quite a rounding shift. Only add 1 despite shifting by 2.
+  const int16x8_t one = vdupq_n_s16(1);
+  int16x8_t out_01 = vcombine_s16(input_0, input_1);
+  int16x8_t out_23 = vcombine_s16(input_2, input_3);
+  out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+  out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+  vst1q_s16(final_output + 0 * 8, out_01);
+  vst1q_s16(final_output + 1 * 8, out_23);
+}
 
 void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
-  int i;
   // stage 1
   int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
   int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
@@ -26,7 +120,7 @@
   int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
   int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
   int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
-  for (i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
     const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
     const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
diff --git a/libaom/aom_dsp/arm/hadamard_neon.c b/libaom/aom_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000..929792a
--- /dev/null
+++ b/libaom/aom_dsp/arm/hadamard_neon.c
@@ -0,0 +1,183 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+                                 int16x8_t *a6, int16x8_t *a7) {
+  const int16x8_t b0 = vaddq_s16(*a0, *a1);
+  const int16x8_t b1 = vsubq_s16(*a0, *a1);
+  const int16x8_t b2 = vaddq_s16(*a2, *a3);
+  const int16x8_t b3 = vsubq_s16(*a2, *a3);
+  const int16x8_t b4 = vaddq_s16(*a4, *a5);
+  const int16x8_t b5 = vsubq_s16(*a4, *a5);
+  const int16x8_t b6 = vaddq_s16(*a6, *a7);
+  const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  const int16x8_t c0 = vaddq_s16(b0, b2);
+  const int16x8_t c1 = vaddq_s16(b1, b3);
+  const int16x8_t c2 = vsubq_s16(b0, b2);
+  const int16x8_t c3 = vsubq_s16(b1, b3);
+  const int16x8_t c4 = vaddq_s16(b4, b6);
+  const int16x8_t c5 = vaddq_s16(b5, b7);
+  const int16x8_t c6 = vsubq_s16(b4, b6);
+  const int16x8_t c7 = vsubq_s16(b5, b7);
+
+  *a0 = vaddq_s16(c0, c4);
+  *a1 = vsubq_s16(c2, c6);
+  *a2 = vsubq_s16(c0, c4);
+  *a3 = vaddq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+  *a6 = vsubq_s16(c1, c5);
+  *a7 = vaddq_s16(c1, c5);
+}
+
+void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  int16x8_t a0 = vld1q_s16(src_diff);
+  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  // Skip the second transpose because it is not required.
+
+  store_s16q_to_tran_low(coeff + 0, a0);
+  store_s16q_to_tran_low(coeff + 8, a1);
+  store_s16q_to_tran_low(coeff + 16, a2);
+  store_s16q_to_tran_low(coeff + 24, a3);
+  store_s16q_to_tran_low(coeff + 32, a4);
+  store_s16q_to_tran_low(coeff + 40, a5);
+  store_s16q_to_tran_low(coeff + 48, a6);
+  store_s16q_to_tran_low(coeff + 56, a7);
+}
+
+void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                              int16_t *coeff) {
+  int16x8_t a0 = vld1q_s16(src_diff);
+  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  // Skip the second transpose because it is not required.
+
+  vst1q_s16(coeff + 0, a0);
+  vst1q_s16(coeff + 8, a1);
+  vst1q_s16(coeff + 16, a2);
+  vst1q_s16(coeff + 24, a3);
+  vst1q_s16(coeff + 32, a4);
+  vst1q_s16(coeff + 40, a5);
+  vst1q_s16(coeff + 48, a6);
+  vst1q_s16(coeff + 56, a7);
+}
+
+void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
+                           coeff + 0);
+  /* Top right. */
+  aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
+                           coeff + 64);
+  /* Bottom left. */
+  aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
+                           coeff + 128);
+  /* Bottom right. */
+  aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
+                           coeff + 192);
+
+  for (int i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = vld1q_s16(coeff + 0);
+    const int16x8_t a1 = vld1q_s16(coeff + 64);
+    const int16x8_t a2 = vld1q_s16(coeff + 128);
+    const int16x8_t a3 = vld1q_s16(coeff + 192);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    vst1q_s16(coeff + 0, c0);
+    vst1q_s16(coeff + 64, c1);
+    vst1q_s16(coeff + 128, c2);
+    vst1q_s16(coeff + 192, c3);
+
+    coeff += 8;
+  }
+}
+
+void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  /* Top right. */
+  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  /* Bottom left. */
+  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  /* Bottom right. */
+  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+  for (int i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+    const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64);
+    const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128);
+    const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    store_s16q_to_tran_low(coeff + 0, c0);
+    store_s16q_to_tran_low(coeff + 64, c1);
+    store_s16q_to_tran_low(coeff + 128, c2);
+    store_s16q_to_tran_low(coeff + 192, c3);
+
+    coeff += 8;
+  }
+}
diff --git a/libaom/aom_dsp/arm/sse_neon.c b/libaom/aom_dsp/arm/sse_neon.c
new file mode 100644
index 0000000..06b81cc
--- /dev/null
+++ b/libaom/aom_dsp/arm/sse_neon.c
@@ -0,0 +1,487 @@
+/*
+ *  Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE uint32_t sse_W16x1_neon(uint8x16_t q2, uint8x16_t q3) {
+  const uint16_t sse1 = 0;
+  const uint16x8_t q1 = vld1q_dup_u16(&sse1);
+
+  uint32_t sse;
+
+  uint8x16_t q4 = vabdq_u8(q2, q3);  // diff = abs(a[x] - b[x])
+  uint8x8_t d0 = vget_low_u8(q4);
+  uint8x8_t d1 = vget_high_u8(q4);
+
+  uint16x8_t q6 = vmlal_u8(q1, d0, d0);
+  uint16x8_t q7 = vmlal_u8(q1, d1, d1);
+
+  uint32x4_t q8 = vaddl_u16(vget_low_u16(q6), vget_high_u16(q6));
+  uint32x4_t q9 = vaddl_u16(vget_low_u16(q7), vget_high_u16(q7));
+
+  uint32x2_t d4 = vadd_u32(vget_low_u32(q8), vget_high_u32(q8));
+  uint32x2_t d5 = vadd_u32(vget_low_u32(q9), vget_high_u32(q9));
+
+  uint32x2_t d6 = vadd_u32(d4, d5);
+
+  sse = vget_lane_u32(d6, 0);
+  sse += vget_lane_u32(d6, 1);
+
+  return sse;
+}
+
+int64_t aom_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int width, int height) {
+  const uint8x16_t q0 = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  };
+  int addinc, x, y;
+  uint8x8_t d0, d1, d2, d3;
+  uint8_t dx;
+  uint8x16_t q2, q3, q4, q5;
+  uint32_t sse = 0;
+  uint8x8x2_t tmp, tmp2;
+
+  switch (width) {
+    case 4:
+      for (y = 0; y < height; y += 4) {
+        d0 = vld1_u8(a);  // loads 8 bytes; only the low 4 are used
+        a += a_stride;
+        d1 = vld1_u8(a);
+        a += a_stride;
+        d2 = vld1_u8(a);
+        a += a_stride;
+        d3 = vld1_u8(a);
+        a += a_stride;
+        tmp = vzip_u8(d0, d1);
+        tmp2 = vzip_u8(d2, d3);
+        q2 = vcombine_u8(tmp.val[0], tmp2.val[0]);  // pack 4 rows of 4 into one 16-lane vector
+
+        d0 = vld1_u8(b);
+        b += b_stride;
+        d1 = vld1_u8(b);
+        b += b_stride;
+        d2 = vld1_u8(b);
+        b += b_stride;
+        d3 = vld1_u8(b);
+        b += b_stride;
+        tmp = vzip_u8(d0, d1);
+        tmp2 = vzip_u8(d2, d3);
+        q3 = vcombine_u8(tmp.val[0], tmp2.val[0]);
+
+        sse += sse_W16x1_neon(q2, q3);
+      }
+      break;
+    case 8:
+      for (y = 0; y < height; y += 2) {
+        d0 = vld1_u8(a);  // load 8 bytes (one row)
+        d1 = vld1_u8(a + a_stride);
+        q2 = vcombine_u8(d0, d1);  // make a 16 data vector
+
+        d0 = vld1_u8(b);
+        d1 = vld1_u8(b + b_stride);
+        q3 = vcombine_u8(d0, d1);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        a += 2 * a_stride;
+        b += 2 * b_stride;
+      }
+      break;
+    case 16:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u8(a);
+        q3 = vld1q_u8(b);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    case 32:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u8(a);
+        q3 = vld1q_u8(b);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 16);
+        q3 = vld1q_u8(b + 16);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    case 64:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u8(a);
+        q3 = vld1q_u8(b);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 16);
+        q3 = vld1q_u8(b + 16);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 32);
+        q3 = vld1q_u8(b + 32);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 48);
+        q3 = vld1q_u8(b + 48);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    case 128:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u8(a);
+        q3 = vld1q_u8(b);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 16);
+        q3 = vld1q_u8(b + 16);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 32);
+        q3 = vld1q_u8(b + 32);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 48);
+        q3 = vld1q_u8(b + 48);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 64);
+        q3 = vld1q_u8(b + 64);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 80);
+        q3 = vld1q_u8(b + 80);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 96);
+        q3 = vld1q_u8(b + 96);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        q2 = vld1q_u8(a + 112);
+        q3 = vld1q_u8(b + 112);
+
+        sse += sse_W16x1_neon(q2, q3);
+
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    default:
+      for (y = 0; y < height; y++) {
+        x = width;
+        while (x > 0) {
+          addinc = width - x;
+          q2 = vld1q_u8(a + addinc);
+          q3 = vld1q_u8(b + addinc);
+          if (x < 16) {
+            dx = x;
+            q4 = vld1q_dup_u8(&dx);
+            q5 = vcltq_u8(q0, q4);
+            q2 = vandq_u8(q2, q5);
+            q3 = vandq_u8(q3, q5);
+          }
+          sse += sse_W16x1_neon(q2, q3);
+          x -= 16;
+        }
+        a += a_stride;
+        b += b_stride;
+      }
+  }
+  return (int64_t)sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint32_t highbd_sse_W8x1_neon(uint16x8_t q2, uint16x8_t q3) {
+  uint32_t sse;
+  const uint32_t sse1 = 0;
+  const uint32x4_t q1 = vld1q_dup_u32(&sse1);
+
+  uint16x8_t q4 = vabdq_u16(q2, q3);  // diff = abs(a[x] - b[x])
+  uint16x4_t d0 = vget_low_u16(q4);
+  uint16x4_t d1 = vget_high_u16(q4);
+
+  uint32x4_t q6 = vmlal_u16(q1, d0, d0);
+  uint32x4_t q7 = vmlal_u16(q1, d1, d1);
+
+  uint32x2_t d4 = vadd_u32(vget_low_u32(q6), vget_high_u32(q6));
+  uint32x2_t d5 = vadd_u32(vget_low_u32(q7), vget_high_u32(q7));
+
+  uint32x2_t d6 = vadd_u32(d4, d5);
+
+  sse = vget_lane_u32(d6, 0);
+  sse += vget_lane_u32(d6, 1);
+
+  return sse;
+}
+
+int64_t aom_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8,
+                            int b_stride, int width, int height) {
+  const uint16x8_t q0 = { 0, 1, 2, 3, 4, 5, 6, 7 };
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int x, y;
+  int addinc;
+  uint16x4_t d0, d1, d2, d3;
+  uint16_t dx;
+  uint16x8_t q2, q3, q4, q5;
+
+  switch (width) {
+    case 4:
+      for (y = 0; y < height; y += 2) {
+        d0 = vld1_u16(a);  // load 4 uint16 values (one row)
+        a += a_stride;
+        d1 = vld1_u16(a);
+        a += a_stride;
+
+        d2 = vld1_u16(b);
+        b += b_stride;
+        d3 = vld1_u16(b);
+        b += b_stride;
+        q2 = vcombine_u16(d0, d1);  // combine two rows into an 8-lane vector
+        q3 = vcombine_u16(d2, d3);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+      }
+      break;
+    case 8:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u16(a);
+        q3 = vld1q_u16(b);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    case 16:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u16(a);
+        q3 = vld1q_u16(b);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 8);
+        q3 = vld1q_u16(b + 8);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    case 32:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u16(a);
+        q3 = vld1q_u16(b);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 8);
+        q3 = vld1q_u16(b + 8);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 16);
+        q3 = vld1q_u16(b + 16);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 24);
+        q3 = vld1q_u16(b + 24);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    case 64:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u16(a);
+        q3 = vld1q_u16(b);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 8);
+        q3 = vld1q_u16(b + 8);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 16);
+        q3 = vld1q_u16(b + 16);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 24);
+        q3 = vld1q_u16(b + 24);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 32);
+        q3 = vld1q_u16(b + 32);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 40);
+        q3 = vld1q_u16(b + 40);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 48);
+        q3 = vld1q_u16(b + 48);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 56);
+        q3 = vld1q_u16(b + 56);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    case 128:
+      for (y = 0; y < height; y++) {
+        q2 = vld1q_u16(a);
+        q3 = vld1q_u16(b);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 8);
+        q3 = vld1q_u16(b + 8);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 16);
+        q3 = vld1q_u16(b + 16);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 24);
+        q3 = vld1q_u16(b + 24);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 32);
+        q3 = vld1q_u16(b + 32);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 40);
+        q3 = vld1q_u16(b + 40);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 48);
+        q3 = vld1q_u16(b + 48);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 56);
+        q3 = vld1q_u16(b + 56);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 64);
+        q3 = vld1q_u16(b + 64);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 72);
+        q3 = vld1q_u16(b + 72);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 80);
+        q3 = vld1q_u16(b + 80);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 88);
+        q3 = vld1q_u16(b + 88);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 96);
+        q3 = vld1q_u16(b + 96);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 104);
+        q3 = vld1q_u16(b + 104);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 112);
+        q3 = vld1q_u16(b + 112);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+
+        q2 = vld1q_u16(a + 120);
+        q3 = vld1q_u16(b + 120);
+
+        sse += highbd_sse_W8x1_neon(q2, q3);
+        a += a_stride;
+        b += b_stride;
+      }
+      break;
+    default:
+
+      for (y = 0; y < height; y++) {
+        x = width;
+        while (x > 0) {
+          addinc = width - x;
+          q2 = vld1q_u16(a + addinc);
+          q3 = vld1q_u16(b + addinc);
+          if (x < 8) {
+            dx = x;
+            q4 = vld1q_dup_u16(&dx);
+            q5 = vcltq_u16(q0, q4);
+            q2 = vandq_u16(q2, q5);
+            q3 = vandq_u16(q3, q5);
+          }
+          sse += highbd_sse_W8x1_neon(q2, q3);
+          x -= 8;
+        }
+        a += a_stride;
+        b += b_stride;
+      }
+  }
+  return (int64_t)sse;
+}
+#endif
diff --git a/libaom/aom_dsp/arm/sum_neon.h b/libaom/aom_dsp/arm/sum_neon.h
new file mode 100644
index 0000000..809e51c
--- /dev/null
+++ b/libaom/aom_dsp/arm/sum_neon.h
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static INLINE uint32x2_t horizontal_add_u16x8(const uint16x8_t a) {
+  const uint32x4_t b = vpaddlq_u16(a);
+  const uint64x2_t c = vpaddlq_u32(b);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                  vreinterpret_u32_u64(vget_high_u64(c)));
+}
diff --git a/libaom/aom_dsp/arm/variance_neon.c b/libaom/aom_dsp/arm/variance_neon.c
index 74385a6..d4107ce 100644
--- a/libaom/aom_dsp/arm/variance_neon.c
+++ b/libaom/aom_dsp/arm/variance_neon.c
@@ -13,25 +13,10 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_config.h"
-
+#include "aom_dsp/arm/sum_neon.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
 // w * h must be less than 2048 or local variable v_sum may overflow.
 static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
                              int b_stride, int w, int h, uint32_t *sse,
@@ -145,6 +130,24 @@
   return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
 }
 
+unsigned int aom_variance128x128_neon(const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  sum1 = sse1 = 0;
+  for (int i = 0; i < 16; i++) {
+    variance_neon_w8(a + (8 * i * a_stride), a_stride, b + (8 * i * b_stride),
+                     b_stride, 128, 8, &sse2, &sum2);
+    sse1 += sse2;
+    sum1 += sum2;
+  }
+
+  *sse = sse1;
+
+  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 14);
+}
+
 unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
@@ -370,9 +373,7 @@
   d6u8 = vld1_u8(ref_ptr);
   ref_ptr += recon_stride;
   d3u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
   d7u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
 
   q11u16 = vsubl_u8(d0u8, d4u8);
   q12u16 = vsubl_u8(d1u8, d5u8);
diff --git a/libaom/aom_dsp/avg.c b/libaom/aom_dsp/avg.c
index 43d2760..7386296 100644
--- a/libaom/aom_dsp/avg.c
+++ b/libaom/aom_dsp/avg.c
@@ -48,6 +48,46 @@
   return (sum + 32) >> 6;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 8; ++i, s += p)
+    for (j = 0; j < 8; sum += s[j], ++j) {
+    }
+
+  return (sum + 32) >> 6;
+}
+
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 4; ++i, s += p)
+    for (j = 0; j < 4; sum += s[j], ++j) {
+    }
+
+  return (sum + 8) >> 4;
+}
+
+void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+                             int dp, int *min, int *max) {
+  int i, j;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j] - d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
 //           second pass, 12 bit, dynamic range [-2040, 2040]
 static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
@@ -107,6 +147,30 @@
   for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
 }
 
+void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                           int16_t *coeff) {
+  int16_t buffer[64];
+  int16_t buffer2[64];
+  int16_t *tmp_buf = &buffer[0];
+  for (int idx = 0; idx < 8; ++idx) {
+    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
+                                                   // dynamic range [-255, 255]
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (int idx = 0; idx < 8; ++idx) {
+    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
+    // dynamic range [-2040, 2040]
+    // buffer2: 15 bit
+    // dynamic range [-16320, 16320]
+    ++tmp_buf;
+  }
+
+  for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx];
+}
+
 // In place 16x16 2D Hadamard transform
 void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
@@ -139,6 +203,35 @@
   }
 }
 
+void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                             int16_t *coeff) {
+  for (int idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  for (int idx = 0; idx < 64; ++idx) {
+    int16_t a0 = coeff[0];
+    int16_t a1 = coeff[64];
+    int16_t a2 = coeff[128];
+    int16_t a3 = coeff[192];
+
+    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    int16_t b3 = (a2 - a3) >> 1;
+
+    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[64] = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
+
+    ++coeff;
+  }
+}
+
 void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
   int idx;
@@ -170,6 +263,164 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
+                                            ptrdiff_t src_stride,
+                                            int16_t *coeff) {
+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int16_t c0 = b0 + b2;
+  int16_t c1 = b1 + b3;
+  int16_t c2 = b0 - b2;
+  int16_t c3 = b1 - b3;
+  int16_t c4 = b4 + b6;
+  int16_t c5 = b5 + b7;
+  int16_t c6 = b4 - b6;
+  int16_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+// src_diff: 16 bit, dynamic range [-32760, 32760]
+// coeff: 19 bit
+static void hadamard_highbd_col8_second_pass(const int16_t *src_diff,
+                                             ptrdiff_t src_stride,
+                                             int32_t *coeff) {
+  int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int32_t c0 = b0 + b2;
+  int32_t c1 = b1 + b3;
+  int32_t c2 = b0 - b2;
+  int32_t c3 = b1 - b3;
+  int32_t c4 = b4 + b6;
+  int32_t c5 = b5 + b7;
+  int32_t c6 = b4 - b6;
+  int32_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+// The order of the output coeff of the hadamard is not important. For
+// optimization purposes the final transpose may be skipped.
+void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                               tran_low_t *coeff) {
+  int idx;
+  int16_t buffer[64];
+  int32_t buffer2[64];
+  int16_t *tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    // src_diff: 13 bit
+    // buffer: 16 bit, dynamic range [-32760, 32760]
+    hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf);
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    // buffer: 16 bit
+    // buffer2: 19 bit, dynamic range [-262080, 262080]
+    hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx);
+    ++tmp_buf;
+  }
+
+  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+}
+
+// In place 16x16 2D Hadamard transform
+void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                 tran_low_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 13 bit, dynamic range [-4095, 4095]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  // coeff: 19 bit, dynamic range [-262080, 262080]
+  for (idx = 0; idx < 64; ++idx) {
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[64];
+    tran_low_t a2 = coeff[128];
+    tran_low_t a3 = coeff[192];
+
+    tran_low_t b0 = (a0 + a1) >> 1;
+    tran_low_t b1 = (a0 - a1) >> 1;
+    tran_low_t b2 = (a2 + a3) >> 1;
+    tran_low_t b3 = (a2 - a3) >> 1;
+
+    // new coeff dynamic range: 20 bit
+    coeff[0] = b0 + b2;
+    coeff[64] = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
+
+    ++coeff;
+  }
+}
+
+void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                 tran_low_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 13 bit, dynamic range [-4095, 4095]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
+  }
+
+  // coeff: 20 bit
+  for (idx = 0; idx < 256; ++idx) {
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[256];
+    tran_low_t a2 = coeff[512];
+    tran_low_t a3 = coeff[768];
+
+    tran_low_t b0 = (a0 + a1) >> 2;
+    tran_low_t b1 = (a0 - a1) >> 2;
+    tran_low_t b2 = (a2 + a3) >> 2;
+    tran_low_t b3 = (a2 - a3) >> 2;
+
+    // new coeff dynamic range: 20 bit
+    coeff[0] = b0 + b2;
+    coeff[256] = b1 + b3;
+    coeff[512] = b0 - b2;
+    coeff[768] = b1 - b3;
+
+    ++coeff;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
 int aom_satd_c(const tran_low_t *coeff, int length) {
@@ -181,6 +432,14 @@
   return satd;
 }
 
+int aom_satd_lp_c(const int16_t *coeff, int length) {
+  int satd = 0;
+  for (int i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+  return satd;
+}
+
 // Integer projection onto row vectors.
 // height: value range {16, 32, 64, 128}.
 void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
diff --git a/libaom/aom_dsp/bitreader.c b/libaom/aom_dsp/bitreader.c
new file mode 100644
index 0000000..4c70a91
--- /dev/null
+++ b/libaom/aom_dsp/bitreader.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitreader.h"
+
+int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) {
+  if (size && !buffer) {
+    return 1;
+  }
+  r->buffer_end = buffer + size;
+  r->buffer = buffer;
+  od_ec_dec_init(&r->ec, buffer, (uint32_t)size);
+#if CONFIG_ACCOUNTING
+  r->accounting = NULL;
+#endif
+  return 0;
+}
+
+const uint8_t *aom_reader_find_begin(aom_reader *r) { return r->buffer; }
+
+const uint8_t *aom_reader_find_end(aom_reader *r) { return r->buffer_end; }
+
+uint32_t aom_reader_tell(const aom_reader *r) { return od_ec_dec_tell(&r->ec); }
+
+uint32_t aom_reader_tell_frac(const aom_reader *r) {
+  return od_ec_dec_tell_frac(&r->ec);
+}
+
+int aom_reader_has_overflowed(const aom_reader *r) {
+  const uint32_t tell_bits = aom_reader_tell(r);
+  const uint32_t tell_bytes = (tell_bits + 7) >> 3;
+  return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
+}
diff --git a/libaom/aom_dsp/bitreader.h b/libaom/aom_dsp/bitreader.h
index 38b17ea..a8b3f55 100644
--- a/libaom/aom_dsp/bitreader.h
+++ b/libaom/aom_dsp/bitreader.h
@@ -19,7 +19,7 @@
 
 #include "aom/aomdx.h"
 #include "aom/aom_integer.h"
-#include "aom_dsp/daalaboolreader.h"
+#include "aom_dsp/entdec.h"
 #include "aom_dsp/prob.h"
 #include "av1/common/odintrin.h"
 
@@ -50,36 +50,33 @@
 extern "C" {
 #endif
 
-typedef struct daala_reader aom_reader;
+struct aom_reader {
+  const uint8_t *buffer;
+  const uint8_t *buffer_end;
+  od_ec_dec ec;
+#if CONFIG_ACCOUNTING
+  Accounting *accounting;
+#endif
+  uint8_t allow_update_cdf;
+};
 
-static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
-                                  size_t size) {
-  return aom_daala_reader_init(r, buffer, (int)size);
-}
+typedef struct aom_reader aom_reader;
 
-static INLINE const uint8_t *aom_reader_find_begin(aom_reader *r) {
-  return aom_daala_reader_find_begin(r);
-}
+int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size);
 
-static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
-  return aom_daala_reader_find_end(r);
-}
+const uint8_t *aom_reader_find_begin(aom_reader *r);
+
+const uint8_t *aom_reader_find_end(aom_reader *r);
 
 // Returns true if the bit reader has tried to decode more data from the buffer
 // than was actually provided.
-static INLINE int aom_reader_has_overflowed(const aom_reader *r) {
-  return aom_daala_reader_has_overflowed(r);
-}
+int aom_reader_has_overflowed(const aom_reader *r);
 
 // Returns the position in the bit reader in bits.
-static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
-  return aom_daala_reader_tell(r);
-}
+uint32_t aom_reader_tell(const aom_reader *r);
 
 // Returns the position in the bit reader in 1/8th bits.
-static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
-  return aom_daala_reader_tell_frac(r);
-}
+uint32_t aom_reader_tell_frac(const aom_reader *r);
 
 #if CONFIG_ACCOUNTING
 static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
@@ -101,13 +98,48 @@
 #endif
 
 static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
-  int ret;
-  ret = aom_daala_read(r, prob);
+  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
+  int bit = od_ec_decode_bool_q15(&r->ec, p);
+
+#if CONFIG_BITSTREAM_DEBUG
+  {
+    int i;
+    int ref_bit, ref_nsymbs;
+    aom_cdf_prob ref_cdf[16];
+    const int queue_r = bitstream_queue_get_read();
+    const int frame_idx = aom_bitstream_queue_get_frame_read();
+    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
+    if (ref_nsymbs != 2) {
+      fprintf(stderr,
+              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
+              "%d queue_r %d\n",
+              frame_idx, 2, ref_nsymbs, queue_r);
+      assert(0);
+    }
+    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
+        (ref_cdf[1] != 32767)) {
+      fprintf(stderr,
+              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
+              frame_idx, p, 32767, ref_cdf[0]);
+      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+      fprintf(stderr, "} queue_r %d\n", queue_r);
+      assert(0);
+    }
+    if (bit != ref_bit) {
+      fprintf(stderr,
+              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
+              "queue_r %d\n",
+              frame_idx, bit, ref_bit, queue_r);
+      assert(0);
+    }
+  }
+#endif
+
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
   aom_update_symb_counts(r, 1);
 #endif
-  return ret;
+  return bit;
 }
 
 static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
@@ -131,14 +163,54 @@
 
 static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
                                 int nsymbs ACCT_STR_PARAM) {
-  int ret;
-  ret = daala_read_symbol(r, cdf, nsymbs);
+  int symb;
+  assert(cdf != NULL);
+  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
+
+#if CONFIG_BITSTREAM_DEBUG
+  {
+    int i;
+    int cdf_error = 0;
+    int ref_symb, ref_nsymbs;
+    aom_cdf_prob ref_cdf[16];
+    const int queue_r = bitstream_queue_get_read();
+    const int frame_idx = aom_bitstream_queue_get_frame_read();
+    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
+    if (nsymbs != ref_nsymbs) {
+      fprintf(stderr,
+              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
+              "queue_r %d\n",
+              frame_idx, nsymbs, ref_nsymbs, queue_r);
+      cdf_error = 0;
+      assert(0);
+    } else {
+      for (i = 0; i < nsymbs; ++i)
+        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
+    }
+    if (cdf_error) {
+      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
+              cdf[0]);
+      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
+      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
+      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+      fprintf(stderr, "} queue_r %d\n", queue_r);
+      assert(0);
+    }
+    if (symb != ref_symb) {
+      fprintf(
+          stderr,
+          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
+          frame_idx, symb, ref_symb, queue_r);
+      assert(0);
+    }
+  }
+#endif
 
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
   aom_update_symb_counts(r, (nsymbs == 2));
 #endif
-  return ret;
+  return symb;
 }
 
 static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
diff --git a/libaom/aom_dsp/bitwriter.c b/libaom/aom_dsp/bitwriter.c
new file mode 100644
index 0000000..41fcc51
--- /dev/null
+++ b/libaom/aom_dsp/bitwriter.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/bitwriter.h"
+
+void aom_start_encode(aom_writer *w, uint8_t *source) {
+  w->buffer = source;
+  w->pos = 0;
+  od_ec_enc_init(&w->ec, 62025);
+}
+
+int aom_stop_encode(aom_writer *w) {
+  int nb_bits;
+  uint32_t bytes;
+  unsigned char *data;
+  data = od_ec_enc_done(&w->ec, &bytes);
+  nb_bits = od_ec_enc_tell(&w->ec);
+  memcpy(w->buffer, data, bytes);
+  w->pos = bytes;
+  od_ec_enc_clear(&w->ec);
+  return nb_bits;
+}
diff --git a/libaom/aom_dsp/bitwriter.h b/libaom/aom_dsp/bitwriter.h
index b5ecc23..4e77a17 100644
--- a/libaom/aom_dsp/bitwriter.h
+++ b/libaom/aom_dsp/bitwriter.h
@@ -16,7 +16,7 @@
 
 #include "config/aom_config.h"
 
-#include "aom_dsp/daalaboolwriter.h"
+#include "aom_dsp/entenc.h"
 #include "aom_dsp/prob.h"
 
 #if CONFIG_RD_DEBUG
@@ -28,7 +28,14 @@
 extern "C" {
 #endif
 
-typedef struct daala_writer aom_writer;
+struct aom_writer {
+  unsigned int pos;
+  uint8_t *buffer;
+  od_ec_enc ec;
+  uint8_t allow_update_cdf;
+};
+
+typedef struct aom_writer aom_writer;
 
 typedef struct TOKEN_STATS {
   int cost;
@@ -49,16 +56,26 @@
   token_stats->cost = 0;
 }
 
-static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
-  aom_daala_start_encode(bc, buffer);
-}
+void aom_start_encode(aom_writer *w, uint8_t *buffer);
 
-static INLINE int aom_stop_encode(aom_writer *bc) {
-  return aom_daala_stop_encode(bc);
-}
+int aom_stop_encode(aom_writer *w);
 
-static INLINE void aom_write(aom_writer *br, int bit, int probability) {
-  aom_daala_write(br, bit, probability);
+static INLINE void aom_write(aom_writer *w, int bit, int probability) {
+  int p = (0x7FFFFF - (probability << 15) + probability) >> 8;
+#if CONFIG_BITSTREAM_DEBUG
+  aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
+  /*int queue_r = 0;
+  int frame_idx_r = 0;
+  int queue_w = bitstream_queue_get_write();
+  int frame_idx_w = aom_bitstream_queue_get_frame_writee();
+  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+    frame_idx_w, queue_w);
+  }*/
+  bitstream_queue_push(bit, cdf, 2);
+#endif
+
+  od_ec_encode_bool_q15(&w->ec, bit, p);
 }
 
 static INLINE void aom_write_bit(aom_writer *w, int bit) {
@@ -73,7 +90,19 @@
 
 static INLINE void aom_write_cdf(aom_writer *w, int symb,
                                  const aom_cdf_prob *cdf, int nsymbs) {
-  daala_write_symbol(w, symb, cdf, nsymbs);
+#if CONFIG_BITSTREAM_DEBUG
+  /*int queue_r = 0;
+  int frame_idx_r = 0;
+  int queue_w = bitstream_queue_get_write();
+  int frame_idx_w = aom_bitstream_queue_get_frame_writee();
+  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+    frame_idx_w, queue_w);
+  }*/
+  bitstream_queue_push(symb, cdf, nsymbs);
+#endif
+
+  od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
 }
 
 static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
diff --git a/libaom/aom_dsp/bitwriter_buffer.c b/libaom/aom_dsp/bitwriter_buffer.c
index c08cc9d..7d0ab94 100644
--- a/libaom/aom_dsp/bitwriter_buffer.c
+++ b/libaom/aom_dsp/bitwriter_buffer.c
@@ -88,8 +88,8 @@
   aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
 }
 
-static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
-                                            uint16_t n, uint16_t v) {
+static void wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
+                                        uint16_t n, uint16_t v) {
   if (n <= 1) return;
   const int l = get_msb(n) + 1;
   const int m = (1 << l) - n;
@@ -101,16 +101,15 @@
   }
 }
 
-static void aom_wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb,
-                                             uint16_t n, uint16_t k,
-                                             uint16_t v) {
+static void wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb,
+                                         uint16_t n, uint16_t k, uint16_t v) {
   int i = 0;
   int mk = 0;
   while (1) {
     int b = (i ? k + i - 1 : k);
     int a = (1 << b);
     if (n <= mk + 3 * a) {
-      aom_wb_write_primitive_quniform(wb, n - mk, v - mk);
+      wb_write_primitive_quniform(wb, n - mk, v - mk);
       break;
     } else {
       int t = (v >= mk + a);
@@ -126,10 +125,10 @@
   }
 }
 
-static void aom_wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
-                                                uint16_t n, uint16_t k,
-                                                uint16_t ref, uint16_t v) {
-  aom_wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v));
+static void wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+                                            uint16_t n, uint16_t k,
+                                            uint16_t ref, uint16_t v) {
+  wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v));
 }
 
 void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
@@ -138,5 +137,5 @@
   ref += n - 1;
   v += n - 1;
   const uint16_t scaled_n = (n << 1) - 1;
-  aom_wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v);
+  wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v);
 }
diff --git a/libaom/aom_dsp/blend_a64_hmask.c b/libaom/aom_dsp/blend_a64_hmask.c
index 0554b43..e9e38ef 100644
--- a/libaom/aom_dsp/blend_a64_hmask.c
+++ b/libaom/aom_dsp/blend_a64_hmask.c
@@ -40,6 +40,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                   const uint8_t *src0_8, uint32_t src0_stride,
                                   const uint8_t *src1_8, uint32_t src1_stride,
@@ -67,3 +68,4 @@
     }
   }
 }
+#endif
diff --git a/libaom/aom_dsp/blend_a64_mask.c b/libaom/aom_dsp/blend_a64_mask.c
index 79956c3..32f2dc6 100644
--- a/libaom/aom_dsp/blend_a64_mask.c
+++ b/libaom/aom_dsp/blend_a64_mask.c
@@ -120,6 +120,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_blend_a64_d16_mask_c(
     uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
@@ -219,6 +220,7 @@
     }
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 // Blending with alpha mask. Mask values come from the range [0, 64],
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
@@ -281,6 +283,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
@@ -343,3 +346,4 @@
     }
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/blend_a64_vmask.c b/libaom/aom_dsp/blend_a64_vmask.c
index 4f222e1..c938bb3 100644
--- a/libaom/aom_dsp/blend_a64_vmask.c
+++ b/libaom/aom_dsp/blend_a64_vmask.c
@@ -41,6 +41,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                   const uint8_t *src0_8, uint32_t src0_stride,
                                   const uint8_t *src1_8, uint32_t src1_stride,
@@ -69,3 +70,4 @@
     }
   }
 }
+#endif
diff --git a/libaom/aom_dsp/blk_sse_sum.c b/libaom/aom_dsp/blk_sse_sum.c
new file mode 100644
index 0000000..d76c3f8
--- /dev/null
+++ b/libaom/aom_dsp/blk_sse_sum.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh,
+                           int *x_sum, int64_t *x2_sum) {
+  *x_sum = 0;
+  *x2_sum = 0;
+  for (int i = 0; i < bh; ++i) {
+    for (int j = 0; j < bw; ++j) {
+      const int val = data[j];
+      *x_sum += val;
+      *x2_sum += val * val;
+    }
+    data += stride;
+  }
+}
diff --git a/libaom/aom_dsp/buf_ans.c b/libaom/aom_dsp/buf_ans.c
deleted file mode 100644
index f7703df..0000000
--- a/libaom/aom_dsp/buf_ans.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-
-#include "aom_dsp/buf_ans.h"
-#include "aom_mem/aom_mem.h"
-#include "aom/internal/aom_codec_internal.h"
-
-void aom_buf_ans_alloc(struct BufAnsCoder *c,
-                       struct aom_internal_error_info *error) {
-  c->error = error;
-  assert(c->size > 1);
-  AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
-  // Initialize to overfull to trigger the assert in write.
-  c->offset = c->size + 1;
-}
-
-void aom_buf_ans_free(struct BufAnsCoder *c) {
-  aom_free(c->buf);
-  c->buf = NULL;
-  c->size = 0;
-}
-
-#if !ANS_MAX_SYMBOLS
-void aom_buf_ans_grow(struct BufAnsCoder *c) {
-  struct buffered_ans_symbol *new_buf = NULL;
-  int new_size = c->size * 2;
-  AOM_CHECK_MEM_ERROR(c->error, new_buf,
-                      aom_malloc(new_size * sizeof(*new_buf)));
-  memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
-  aom_free(c->buf);
-  c->buf = new_buf;
-  c->size = new_size;
-}
-#endif
-
-void aom_buf_ans_flush(struct BufAnsCoder *const c) {
-  int offset;
-#if ANS_MAX_SYMBOLS
-  if (c->offset == 0) return;
-#endif
-  assert(c->offset > 0);
-  offset = c->offset - 1;
-  // Code the first symbol such that it brings the state to the smallest normal
-  // state from an initial state that would have been a subnormal/refill state.
-  if (c->buf[offset].method == ANS_METHOD_RANS) {
-    c->ans.state += c->buf[offset].val_start;
-  } else {
-    c->ans.state += c->buf[offset].val_start ? c->buf[offset].prob : 0;
-  }
-  for (offset = offset - 1; offset >= 0; --offset) {
-    if (c->buf[offset].method == ANS_METHOD_RANS) {
-      rans_write(&c->ans, c->buf[offset].val_start, c->buf[offset].prob);
-    } else {
-      rabs_write(&c->ans, (uint8_t)c->buf[offset].val_start,
-                 (AnsP8)c->buf[offset].prob);
-    }
-  }
-  c->offset = 0;
-  c->output_bytes += ans_write_end(&c->ans);
-}
diff --git a/libaom/aom_dsp/buf_ans.h b/libaom/aom_dsp/buf_ans.h
deleted file mode 100644
index 985fcdf..0000000
--- a/libaom/aom_dsp/buf_ans.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BUF_ANS_H_
-#define AOM_AOM_DSP_BUF_ANS_H_
-// Buffered forward ANS writer.
-// Symbols are written to the writer in forward (decode) order and serialized
-// backwards due to ANS's stack like behavior.
-
-#include <assert.h>
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/ans.h"
-#include "aom_dsp/answriter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-#define ANS_METHOD_RABS 0
-#define ANS_METHOD_RANS 1
-
-struct buffered_ans_symbol {
-  unsigned int method : 1;  // one of ANS_METHOD_RABS or ANS_METHOD_RANS
-  // TODO(aconverse): Should be possible to write this in terms of start for ABS
-  unsigned int val_start : RANS_PROB_BITS;  // Boolean value for ABS
-                                            // start in symbol cycle for Rans
-  unsigned int prob : RANS_PROB_BITS;       // Probability of this symbol
-};
-
-struct BufAnsCoder {
-  struct aom_internal_error_info *error;
-  struct buffered_ans_symbol *buf;
-  struct AnsCoder ans;
-  int size;
-  int offset;
-  int output_bytes;
-#if ANS_MAX_SYMBOLS
-  int window_size;
-#endif
-  int pos;  // Dummy variable to store the output buffer after closing
-  uint8_t allow_update_cdf;
-};
-
-// Allocate a buffered ANS coder to store size symbols.
-// When ANS_MAX_SYMBOLS is turned on, the size is the fixed size of each ANS
-// partition.
-// When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the
-// buffer will grow on demand
-void aom_buf_ans_alloc(struct BufAnsCoder *c,
-                       struct aom_internal_error_info *error);
-
-void aom_buf_ans_free(struct BufAnsCoder *c);
-
-#if !ANS_MAX_SYMBOLS
-void aom_buf_ans_grow(struct BufAnsCoder *c);
-#endif
-
-void aom_buf_ans_flush(struct BufAnsCoder *const c);
-
-static INLINE void buf_ans_write_init(struct BufAnsCoder *const c,
-                                      uint8_t *const output_buffer) {
-  c->offset = 0;
-  c->output_bytes = 0;
-  ans_write_init(&c->ans, output_buffer);
-}
-
-static INLINE void buf_rabs_write(struct BufAnsCoder *const c, uint8_t val,
-                                  AnsP8 prob) {
-  assert(c->offset <= c->size);
-#if !ANS_MAX_SYMBOLS
-  if (c->offset == c->size) {
-    aom_buf_ans_grow(c);
-  }
-#endif
-  c->buf[c->offset].method = ANS_METHOD_RABS;
-  c->buf[c->offset].val_start = val;
-  c->buf[c->offset].prob = prob;
-  ++c->offset;
-#if ANS_MAX_SYMBOLS
-  if (c->offset == c->size) aom_buf_ans_flush(c);
-#endif
-}
-
-// Buffer one symbol for encoding using rANS.
-// cum_prob: The cumulative probability before this symbol (the offset of
-// the symbol in the symbol cycle)
-// prob: The probability of this symbol (l_s from the paper)
-// RANS_PRECISION takes the place of m from the paper.
-static INLINE void buf_rans_write(struct BufAnsCoder *const c,
-                                  aom_cdf_prob cum_prob, aom_cdf_prob prob) {
-  assert(c->offset <= c->size);
-#if !ANS_MAX_SYMBOLS
-  if (c->offset == c->size) {
-    aom_buf_ans_grow(c);
-  }
-#endif
-  c->buf[c->offset].method = ANS_METHOD_RANS;
-  c->buf[c->offset].val_start = cum_prob;
-  c->buf[c->offset].prob = prob;
-  ++c->offset;
-#if ANS_MAX_SYMBOLS
-  if (c->offset == c->size) aom_buf_ans_flush(c);
-#endif
-}
-
-static INLINE void buf_rabs_write_bit(struct BufAnsCoder *c, int bit) {
-  buf_rabs_write(c, bit, 128);
-}
-
-static INLINE void buf_rabs_write_literal(struct BufAnsCoder *c, int literal,
-                                          int bits) {
-  int bit;
-
-  assert(bits < 31);
-  for (bit = bits - 1; bit >= 0; bit--)
-    buf_rabs_write_bit(c, 1 & (literal >> bit));
-}
-
-static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) {
-  assert(c->offset == 0);
-  return c->output_bytes;
-}
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_AOM_DSP_BUF_ANS_H_
diff --git a/libaom/aom_dsp/daalaboolreader.c b/libaom/aom_dsp/daalaboolreader.c
deleted file mode 100644
index 6c2259f..0000000
--- a/libaom/aom_dsp/daalaboolreader.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/daalaboolreader.h"
-
-int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
-  if (size && !buffer) {
-    return 1;
-  }
-  r->buffer_end = buffer + size;
-  r->buffer = buffer;
-  od_ec_dec_init(&r->ec, buffer, size);
-#if CONFIG_ACCOUNTING
-  r->accounting = NULL;
-#endif
-  return 0;
-}
-
-const uint8_t *aom_daala_reader_find_begin(daala_reader *r) {
-  return r->buffer;
-}
-
-const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
-  return r->buffer_end;
-}
-
-uint32_t aom_daala_reader_tell(const daala_reader *r) {
-  return od_ec_dec_tell(&r->ec);
-}
-
-uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
-  return od_ec_dec_tell_frac(&r->ec);
-}
-
-int aom_daala_reader_has_overflowed(const daala_reader *r) {
-  const uint32_t tell_bits = aom_daala_reader_tell(r);
-  const uint32_t tell_bytes = (tell_bits + 7) >> 3;
-  return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
-}
diff --git a/libaom/aom_dsp/daalaboolreader.h b/libaom/aom_dsp/daalaboolreader.h
deleted file mode 100644
index b1810fc..0000000
--- a/libaom/aom_dsp/daalaboolreader.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_DAALABOOLREADER_H_
-#define AOM_AOM_DSP_DAALABOOLREADER_H_
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/entdec.h"
-#include "aom_dsp/prob.h"
-#if CONFIG_ACCOUNTING
-#include "av1/decoder/accounting.h"
-#endif
-#if CONFIG_BITSTREAM_DEBUG
-#include <stdio.h>
-#include "aom_util/debug_util.h"
-#endif  // CONFIG_BITSTREAM_DEBUG
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct daala_reader {
-  const uint8_t *buffer;
-  const uint8_t *buffer_end;
-  od_ec_dec ec;
-#if CONFIG_ACCOUNTING
-  Accounting *accounting;
-#endif
-  uint8_t allow_update_cdf;
-};
-
-typedef struct daala_reader daala_reader;
-
-int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
-const uint8_t *aom_daala_reader_find_begin(daala_reader *r);
-const uint8_t *aom_daala_reader_find_end(daala_reader *r);
-uint32_t aom_daala_reader_tell(const daala_reader *r);
-uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
-// Returns true if the reader has tried to decode more data from the buffer
-// than was actually provided.
-int aom_daala_reader_has_overflowed(const daala_reader *r);
-
-static INLINE int aom_daala_read(daala_reader *r, int prob) {
-  int bit;
-  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
-#if CONFIG_BITSTREAM_DEBUG
-/*{
-  const int queue_r = bitstream_queue_get_read();
-  const int frame_idx = bitstream_queue_get_frame_read();
-  if (frame_idx == 0 && queue_r == 0) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_r %d queue_r %d\n",
-            frame_idx, queue_r);
-  }
-}*/
-#endif
-
-  bit = od_ec_decode_bool_q15(&r->ec, p);
-
-#if CONFIG_BITSTREAM_DEBUG
-  {
-    int i;
-    int ref_bit, ref_nsymbs;
-    aom_cdf_prob ref_cdf[16];
-    const int queue_r = bitstream_queue_get_read();
-    const int frame_idx = bitstream_queue_get_frame_read();
-    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
-    if (ref_nsymbs != 2) {
-      fprintf(stderr,
-              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
-              "%d queue_r %d\n",
-              frame_idx, 2, ref_nsymbs, queue_r);
-      assert(0);
-    }
-    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
-        (ref_cdf[1] != 32767)) {
-      fprintf(stderr,
-              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
-              frame_idx, p, 32767, ref_cdf[0]);
-      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
-      fprintf(stderr, "} queue_r %d\n", queue_r);
-      assert(0);
-    }
-    if (bit != ref_bit) {
-      fprintf(stderr,
-              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
-              "queue_r %d\n",
-              frame_idx, bit, ref_bit, queue_r);
-      assert(0);
-    }
-  }
-#endif
-
-  return bit;
-}
-
-static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
-                                    int nsymbs) {
-  int symb;
-  assert(cdf != NULL);
-  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
-
-#if CONFIG_BITSTREAM_DEBUG
-  {
-    int i;
-    int cdf_error = 0;
-    int ref_symb, ref_nsymbs;
-    aom_cdf_prob ref_cdf[16];
-    const int queue_r = bitstream_queue_get_read();
-    const int frame_idx = bitstream_queue_get_frame_read();
-    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
-    if (nsymbs != ref_nsymbs) {
-      fprintf(stderr,
-              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
-              "queue_r %d\n",
-              frame_idx, nsymbs, ref_nsymbs, queue_r);
-      cdf_error = 0;
-      assert(0);
-    } else {
-      for (i = 0; i < nsymbs; ++i)
-        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
-    }
-    if (cdf_error) {
-      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
-              cdf[0]);
-      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
-      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
-      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
-      fprintf(stderr, "} queue_r %d\n", queue_r);
-      assert(0);
-    }
-    if (symb != ref_symb) {
-      fprintf(
-          stderr,
-          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
-          frame_idx, symb, ref_symb, queue_r);
-      assert(0);
-    }
-  }
-#endif
-
-  return symb;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_DAALABOOLREADER_H_
diff --git a/libaom/aom_dsp/daalaboolwriter.c b/libaom/aom_dsp/daalaboolwriter.c
deleted file mode 100644
index b24ffbf..0000000
--- a/libaom/aom_dsp/daalaboolwriter.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-#include "aom_dsp/daalaboolwriter.h"
-
-void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
-  br->buffer = source;
-  br->pos = 0;
-  od_ec_enc_init(&br->ec, 62025);
-}
-
-int aom_daala_stop_encode(daala_writer *br) {
-  int nb_bits;
-  uint32_t daala_bytes;
-  unsigned char *daala_data;
-  daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
-  nb_bits = od_ec_enc_tell(&br->ec);
-  memcpy(br->buffer, daala_data, daala_bytes);
-  br->pos = daala_bytes;
-  od_ec_enc_clear(&br->ec);
-  return nb_bits;
-}
diff --git a/libaom/aom_dsp/daalaboolwriter.h b/libaom/aom_dsp/daalaboolwriter.h
deleted file mode 100644
index 3848877..0000000
--- a/libaom/aom_dsp/daalaboolwriter.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_DAALABOOLWRITER_H_
-#define AOM_AOM_DSP_DAALABOOLWRITER_H_
-
-#include <stdio.h>
-
-#include "aom_dsp/entenc.h"
-#include "aom_dsp/prob.h"
-#if CONFIG_BITSTREAM_DEBUG
-#include "aom_util/debug_util.h"
-#endif  // CONFIG_BITSTREAM_DEBUG
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct daala_writer {
-  unsigned int pos;
-  uint8_t *buffer;
-  od_ec_enc ec;
-  uint8_t allow_update_cdf;
-};
-
-typedef struct daala_writer daala_writer;
-
-void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
-int aom_daala_stop_encode(daala_writer *w);
-
-static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
-  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
-#if CONFIG_BITSTREAM_DEBUG
-  aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
-  /*int queue_r = 0;
-  int frame_idx_r = 0;
-  int queue_w = bitstream_queue_get_write();
-  int frame_idx_w = bitstream_queue_get_frame_write();
-  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
-    frame_idx_w, queue_w);
-  }*/
-  bitstream_queue_push(bit, cdf, 2);
-#endif
-
-  od_ec_encode_bool_q15(&w->ec, bit, p);
-}
-
-static INLINE void daala_write_symbol(daala_writer *w, int symb,
-                                      const aom_cdf_prob *cdf, int nsymbs) {
-#if CONFIG_BITSTREAM_DEBUG
-  /*int queue_r = 0;
-  int frame_idx_r = 0;
-  int queue_w = bitstream_queue_get_write();
-  int frame_idx_w = bitstream_queue_get_frame_write();
-  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
-    frame_idx_w, queue_w);
-  }*/
-  bitstream_queue_push(symb, cdf, nsymbs);
-#endif
-
-  od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_DAALABOOLWRITER_H_
diff --git a/libaom/aom_dsp/fwd_txfm.c b/libaom/aom_dsp/fwd_txfm.c
index e50f951..3d30444 100644
--- a/libaom/aom_dsp/fwd_txfm.c
+++ b/libaom/aom_dsp/fwd_txfm.c
@@ -13,6 +13,130 @@
 #include "aom_dsp/txfm_common.h"
 #include "config/aom_dsp_rtcd.h"
 
+void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[4 * 4];
+  const tran_low_t *in_low = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (int pass = 0; pass < 2; ++pass) {
+    tran_high_t in_high[4];    // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    for (int i = 0; i < 4; ++i) {
+      // Load inputs.
+      if (pass == 0) {
+        in_high[0] = input[0 * stride] * 16;
+        in_high[1] = input[1 * stride] * 16;
+        in_high[2] = input[2 * stride] * 16;
+        in_high[3] = input[3 * stride] * 16;
+        if (i == 0 && in_high[0]) {
+          ++in_high[0];
+        }
+      } else {
+        assert(in_low != NULL);
+        in_high[0] = in_low[0 * 4];
+        in_high[1] = in_low[1 * 4];
+        in_high[2] = in_low[2 * 4];
+        in_high[3] = in_low[3 * 4];
+        ++in_low;
+      }
+      // Transform.
+      step[0] = in_high[0] + in_high[3];
+      step[1] = in_high[1] + in_high[2];
+      step[2] = in_high[1] - in_high[2];
+      step[3] = in_high[0] - in_high[3];
+      temp1 = (step[0] + step[1]) * cospi_16_64;
+      temp2 = (step[0] - step[1]) * cospi_16_64;
+      out[0] = (tran_low_t)fdct_round_shift(temp1);
+      out[2] = (tran_low_t)fdct_round_shift(temp2);
+      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+      out[1] = (tran_low_t)fdct_round_shift(temp1);
+      out[3] = (tran_low_t)fdct_round_shift(temp2);
+      // Do next column (which is a transposed row in second/horizontal pass)
+      ++input;
+      out += 4;
+    }
+    // Setup in/out for next pass.
+    in_low = intermediate;
+    out = output;
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j)
+      output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+  }
+}
+
+void aom_fdct4x4_lp_c(const int16_t *input, int16_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  // We need an intermediate buffer between passes.
+  int16_t intermediate[4 * 4];
+  const int16_t *in_low = NULL;
+  int16_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (int pass = 0; pass < 2; ++pass) {
+    int32_t in_high[4];    // canbe16
+    int32_t step[4];       // canbe16
+    int32_t temp1, temp2;  // needs32
+    for (int i = 0; i < 4; ++i) {
+      // Load inputs.
+      if (pass == 0) {
+        in_high[0] = input[0 * stride] * 16;
+        in_high[1] = input[1 * stride] * 16;
+        in_high[2] = input[2 * stride] * 16;
+        in_high[3] = input[3 * stride] * 16;
+        if (i == 0 && in_high[0]) {
+          ++in_high[0];
+        }
+      } else {
+        assert(in_low != NULL);
+        in_high[0] = in_low[0 * 4];
+        in_high[1] = in_low[1 * 4];
+        in_high[2] = in_low[2 * 4];
+        in_high[3] = in_low[3 * 4];
+        ++in_low;
+      }
+      // Transform.
+      step[0] = in_high[0] + in_high[3];
+      step[1] = in_high[1] + in_high[2];
+      step[2] = in_high[1] - in_high[2];
+      step[3] = in_high[0] - in_high[3];
+      temp1 = (step[0] + step[1]) * (int32_t)cospi_16_64;
+      temp2 = (step[0] - step[1]) * (int32_t)cospi_16_64;
+      out[0] = (int16_t)fdct_round_shift(temp1);
+      out[2] = (int16_t)fdct_round_shift(temp2);
+      temp1 = step[2] * (int32_t)cospi_24_64 + step[3] * (int32_t)cospi_8_64;
+      temp2 = -step[2] * (int32_t)cospi_8_64 + step[3] * (int32_t)cospi_24_64;
+      out[1] = (int16_t)fdct_round_shift(temp1);
+      out[3] = (int16_t)fdct_round_shift(temp2);
+      // Do next column (which is a transposed row in second/horizontal pass)
+      ++input;
+      out += 4;
+    }
+    // Setup in/out for next pass.
+    in_low = intermediate;
+    out = output;
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j)
+      output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+  }
+}
+
 void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
   int i, j;
   tran_low_t intermediate[64];
@@ -97,7 +221,9 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
                           int stride) {
   aom_fdct8x8_c(input, final_output, stride);
 }
+#endif
diff --git a/libaom/aom_dsp/grain_synthesis.c b/libaom/aom_dsp/grain_synthesis.c
index 4b94dbc..626eb76 100644
--- a/libaom/aom_dsp/grain_synthesis.c
+++ b/libaom/aom_dsp/grain_synthesis.c
@@ -1078,7 +1078,7 @@
 
   const int grain_center = 128 << (bit_depth - 8);
   grain_min = 0 - grain_center;
-  grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
+  grain_max = grain_center - 1;
 
   init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
               &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
diff --git a/libaom/aom_dsp/grain_table.c b/libaom/aom_dsp/grain_table.c
index 5eb5b68..e03f04d 100644
--- a/libaom/aom_dsp/grain_table.c
+++ b/libaom/aom_dsp/grain_table.c
@@ -203,7 +203,7 @@
                                 aom_film_grain_t *grain) {
   aom_film_grain_table_entry_t *entry = t->head;
   aom_film_grain_table_entry_t *prev_entry = 0;
-  int16_t random_seed = grain ? grain->random_seed : 0;
+  uint16_t random_seed = grain ? grain->random_seed : 0;
   if (grain) memset(grain, 0, sizeof(*grain));
 
   while (entry) {
diff --git a/libaom/aom_dsp/loopfilter.c b/libaom/aom_dsp/loopfilter.c
index a3f2618..903ebcd 100644
--- a/libaom/aom_dsp/loopfilter.c
+++ b/libaom/aom_dsp/loopfilter.c
@@ -21,6 +21,7 @@
   return (int8_t)clamp(t, -128, 127);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE int16_t signed_char_clamp_high(int t, int bd) {
   switch (bd) {
     case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
@@ -29,6 +30,7 @@
     default: return (int16_t)clamp(t, -128, 128 - 1);
   }
 }
+#endif
 
 // should we apply any filter at all: 11111111 yes, 00000000 no
 static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
@@ -103,11 +105,11 @@
                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
   int8_t filter1, filter2;
 
-  const int8_t ps1 = (int8_t)*op1 ^ 0x80;
-  const int8_t ps0 = (int8_t)*op0 ^ 0x80;
-  const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
-  const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
-  const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+  const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
+  const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
+  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
+  const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
+  const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
 
   // add outer taps if we have high edge variance
   int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
@@ -121,14 +123,14 @@
   filter1 = signed_char_clamp(filter + 4) >> 3;
   filter2 = signed_char_clamp(filter + 3) >> 3;
 
-  *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-  *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
+  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
+  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
 
   // outer tap adjustments
   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
 
-  *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-  *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
+  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
 }
 
 void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
@@ -442,6 +444,7 @@
   mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
                                          uint16_t p1, uint16_t p0, uint16_t q0,
@@ -539,7 +542,7 @@
   const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
   const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
   const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
-  const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+  const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
 
   // Add outer taps if we have high edge variance.
   int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
@@ -865,10 +868,10 @@
   }
 }
 
-void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
 }
 
 void aom_highbd_lpf_horizontal_14_dual_c(
@@ -923,3 +926,4 @@
   highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
                                 4, bd);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/mips/add_noise_msa.c b/libaom/aom_dsp/mips/add_noise_msa.c
deleted file mode 100644
index 96d04cf..0000000
--- a/libaom/aom_dsp/mips/add_noise_msa.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "aom_dsp/mips/macros_msa.h"
-
-void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
-                             char blackclamp[16], char whiteclamp[16],
-                             char bothclamp[16], uint32_t width,
-                             uint32_t height, int32_t pitch) {
-  uint32_t i, j;
-
-  for (i = 0; i < height / 2; ++i) {
-    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
-    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
-    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
-    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
-    for (j = width / 16; j--;) {
-      v16i8 temp00_s, temp01_s;
-      v16u8 temp00, temp01, black_clamp, white_clamp;
-      v16u8 pos0, ref0, pos1, ref1;
-      v16i8 const127 = __msa_ldi_b(127);
-
-      pos0 = LD_UB(pos0_ptr);
-      ref0 = LD_UB(ref0_ptr);
-      pos1 = LD_UB(pos1_ptr);
-      ref1 = LD_UB(ref1_ptr);
-      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
-      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
-      temp00 = (pos0 < black_clamp);
-      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
-      temp01 = (pos1 < black_clamp);
-      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
-      XORI_B2_128_UB(pos0, pos1);
-      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-      temp00 = (v16u8)(temp00_s < pos0);
-      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
-      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-      temp01 = (temp01_s < pos1);
-      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
-      XORI_B2_128_UB(pos0, pos1);
-      pos0 += ref0;
-      ST_UB(pos0, pos0_ptr);
-      pos1 += ref1;
-      ST_UB(pos1, pos1_ptr);
-      pos0_ptr += 16;
-      pos1_ptr += 16;
-      ref0_ptr += 16;
-      ref1_ptr += 16;
-    }
-  }
-}
diff --git a/libaom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/libaom/aom_dsp/mips/aom_convolve8_horiz_msa.c
index 363fad3..c8ab612 100644
--- a/libaom/aom_dsp/mips/aom_convolve8_horiz_msa.c
+++ b/libaom/aom_dsp/mips/aom_convolve8_horiz_msa.c
@@ -446,7 +446,6 @@
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
     LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
 
     PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
     ST8x4_UB(out0, out1, dst, dst_stride);
diff --git a/libaom/aom_dsp/mips/aom_convolve8_vert_msa.c b/libaom/aom_dsp/mips/aom_convolve8_vert_msa.c
index aa962b4..2c3bc08 100644
--- a/libaom/aom_dsp/mips/aom_convolve8_vert_msa.c
+++ b/libaom/aom_dsp/mips/aom_convolve8_vert_msa.c
@@ -313,7 +313,6 @@
   filt0 = (v16u8)__msa_splati_h(filt, 0);
 
   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-  src += (5 * src_stride);
 
   ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
              src32_r, src43_r);
@@ -341,7 +340,6 @@
   src += (8 * src_stride);
 
   src8 = LD_SB(src);
-  src += src_stride;
 
   ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
              src32_r, src43_r);
diff --git a/libaom/aom_dsp/noise_model.c b/libaom/aom_dsp/noise_model.c
index 2faee85..c7a0003 100644
--- a/libaom/aom_dsp/noise_model.c
+++ b/libaom/aom_dsp/noise_model.c
@@ -214,6 +214,7 @@
 
 int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
   if (!lut) return 0;
+  lut->num_points = 0;
   lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
   if (!lut->points) return 0;
   lut->num_points = num_points;
@@ -426,6 +427,9 @@
   double *AtA_inv = 0;
   double *A = 0;
   int x = 0, y = 0, i = 0, j = 0;
+  block_finder->A = NULL;
+  block_finder->AtA_inv = NULL;
+
   if (!equation_system_init(&eqns, kLowPolyNumParams)) {
     fprintf(stderr, "Failed to init equation system for block_size=%d\n",
             block_size);
@@ -632,10 +636,12 @@
         //    [{var}, {ratio}, {trace}, {norm}, offset]
         // with one of the most discriminative being simply the variance.
         const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 };
-        const float score =
-            (float)(1.0 / (1 + exp(-(weights[0] * var + weights[1] * ratio +
-                                     weights[2] * trace + weights[3] * norm +
-                                     weights[4]))));
+        double sum_weights = weights[0] * var + weights[1] * ratio +
+                             weights[2] * trace + weights[3] * norm +
+                             weights[4];
+        // clamp the value to [-25.0, 100.0] to prevent overflow
+        sum_weights = fclamp(sum_weights, -25.0, 100.0);
+        const float score = (float)(1.0 / (1 + exp(-sum_weights)));
         flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0;
         scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0;
         scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
diff --git a/libaom/aom_dsp/noise_util.c b/libaom/aom_dsp/noise_util.c
index 87e8e9f..7e7e380 100644
--- a/libaom/aom_dsp/noise_util.c
+++ b/libaom/aom_dsp/noise_util.c
@@ -96,7 +96,9 @@
     for (int x = 0; x < block_size; ++x) {
       int i = y * block_size + x;
       float *c = noise_tx->tx_block + 2 * i;
-      const float p = c[0] * c[0] + c[1] * c[1];
+      const float c0 = AOMMAX((float)fabs(c[0]), 1e-8f);
+      const float c1 = AOMMAX((float)fabs(c[1]), 1e-8f);
+      const float p = c0 * c0 + c1 * c1;
       if (p > kBeta * psd[i] && p > 1e-6) {
         noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
         noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
diff --git a/libaom/aom_dsp/postproc.h b/libaom/aom_dsp/postproc.h
deleted file mode 100644
index f3d87f2..0000000
--- a/libaom/aom_dsp/postproc.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_POSTPROC_H_
-#define AOM_AOM_DSP_POSTPROC_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Fills a noise buffer with gaussian noise strength determined by sigma.
-int aom_setup_noise(double sigma, int size, char *noise);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // AOM_AOM_DSP_POSTPROC_H_
diff --git a/libaom/aom_dsp/prob.h b/libaom/aom_dsp/prob.h
index 20ffdea..ea5e4cb 100644
--- a/libaom/aom_dsp/prob.h
+++ b/libaom/aom_dsp/prob.h
@@ -26,7 +26,6 @@
 extern "C" {
 #endif
 
-// TODO(negge): Rename this aom_prob once we remove vpxbool.
 typedef uint16_t aom_cdf_prob;
 
 #define CDF_SIZE(x) ((x) + 1)
diff --git a/libaom/aom_dsp/psnr.c b/libaom/aom_dsp/psnr.c
index 50f376a..c66dd52 100644
--- a/libaom/aom_dsp/psnr.c
+++ b/libaom/aom_dsp/psnr.c
@@ -26,9 +26,6 @@
   }
 }
 
-/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
 static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
                              int b_stride, int w, int h, unsigned int *sse,
                              int *sum) {
@@ -49,6 +46,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
                                       const uint8_t *b8, int b_stride, int w,
                                       int h, uint64_t *sse, int64_t *sum) {
@@ -81,6 +79,7 @@
   *sse = (unsigned int)sse_long;
   *sum = (int)sum_long;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, int width, int height) {
@@ -122,6 +121,7 @@
   return total_sse;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
                                     const uint8_t *b8, int b_stride, int width,
                                     int height, unsigned int input_shift) {
@@ -174,6 +174,28 @@
   }
   return total_sse;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  return aom_var_2d_u8(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+                       width, height) /
+         (width * height);
+}
+
+uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  return aom_var_2d_u8(a->u_buffer + vstart * a->uv_stride + hstart,
+                       a->uv_stride, width, height) /
+         (width * height);
+}
+
+uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  return aom_var_2d_u8(a->v_buffer + vstart * a->uv_stride + hstart,
+                       a->uv_stride, width, height) /
+         (width * height);
+}
 
 int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                            const YV12_BUFFER_CONFIG *b, int hstart, int width,
@@ -226,6 +248,28 @@
                  a->uv_crop_width, a->uv_crop_height);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  return aom_var_2d_u16(a->y_buffer + vstart * a->y_stride + hstart,
+                        a->y_stride, width, height) /
+         (width * height);
+}
+
+uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  return aom_var_2d_u16(a->u_buffer + vstart * a->uv_stride + hstart,
+                        a->uv_stride, width, height) /
+         (width * height);
+}
+
+uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  return aom_var_2d_u16(a->v_buffer + vstart * a->uv_stride + hstart,
+                        a->uv_stride, width, height) /
+         (width * height);
+}
+
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                   const YV12_BUFFER_CONFIG *b, int hstart,
                                   int width, int vstart, int height) {
@@ -284,9 +328,11 @@
   return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
                         a->uv_crop_width, a->uv_crop_height);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
   if (highbd) {
     switch (plane) {
       case 0: return aom_highbd_get_y_sse(a, b);
@@ -294,15 +340,26 @@
       case 2: return aom_highbd_get_v_sse(a, b);
       default: assert(plane >= 0 && plane <= 2); return 0;
     }
+  } else {
+    switch (plane) {
+      case 0: return aom_get_y_sse(a, b);
+      case 1: return aom_get_u_sse(a, b);
+      case 2: return aom_get_v_sse(a, b);
+      default: assert(plane >= 0 && plane <= 2); return 0;
+    }
   }
+#else
+  (void)highbd;
   switch (plane) {
     case 0: return aom_get_y_sse(a, b);
     case 1: return aom_get_u_sse(a, b);
     case 2: return aom_get_v_sse(a, b);
     default: assert(plane >= 0 && plane <= 2); return 0;
   }
+#endif
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           uint32_t bit_depth, uint32_t in_bit_depth) {
@@ -347,6 +404,7 @@
   psnr->psnr[0] =
       aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
 }
+#endif
 
 void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                    PSNR_STATS *psnr) {
diff --git a/libaom/aom_dsp/psnr.h b/libaom/aom_dsp/psnr.h
index 58e4e71..7f40b8b 100644
--- a/libaom/aom_dsp/psnr.h
+++ b/libaom/aom_dsp/psnr.h
@@ -35,6 +35,12 @@
  * \param[in]    sse           Sum of squared errors
  */
 double aom_sse_to_psnr(double samples, double peak, double sse);
+uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height);
+uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height);
+uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height);
 int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                            const YV12_BUFFER_CONFIG *b, int hstart, int width,
                            int vstart, int height);
@@ -49,6 +55,13 @@
 int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int plane, int highbd);
+#if CONFIG_AV1_HIGHBITDEPTH
+uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height);
+uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height);
+uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height);
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                   const YV12_BUFFER_CONFIG *b, int hstart,
                                   int width, int vstart, int height);
@@ -67,6 +80,7 @@
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           unsigned int bit_depth, unsigned int in_bit_depth);
+#endif
 void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                    PSNR_STATS *psnr);
 
diff --git a/libaom/aom_dsp/psnrhvs.c b/libaom/aom_dsp/psnrhvs.c
index 30fe21d..69a1d99 100644
--- a/libaom/aom_dsp/psnrhvs.c
+++ b/libaom/aom_dsp/psnrhvs.c
@@ -102,13 +102,8 @@
     0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
 };
 
-static double convert_score_db(double _score, double _weight, int bit_depth) {
-  int16_t pix_max = 255;
+static double convert_score_db(double _score, double _weight, int16_t pix_max) {
   assert(_score * _weight >= 0.0);
-  if (bit_depth == 10)
-    pix_max = 1023;
-  else if (bit_depth == 12)
-    pix_max = 4095;
 
   if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
   return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
@@ -117,7 +112,8 @@
 static double calc_psnrhvs(const unsigned char *src, int _systride,
                            const unsigned char *dst, int _dystride, double _par,
                            int _w, int _h, int _step, const double _csf[8][8],
-                           uint32_t _shift, int buf_is_hbd) {
+                           uint32_t _shift, int buf_is_hbd, int16_t pix_max,
+                           int luma) {
   double ret;
   const uint8_t *_src8 = src;
   const uint8_t *_dst8 = dst;
@@ -131,8 +127,24 @@
   int pixels;
   int x;
   int y;
+  float sum1;
+  float sum2;
+  float delt;
   (void)_par;
   ret = pixels = 0;
+  sum1 = sum2 = delt = 0.0f;
+  for (y = 0; y < _h; y++) {
+    for (x = 0; x < _w; x++) {
+      if (!buf_is_hbd) {
+        sum1 += _src8[y * _systride + x];
+        sum2 += _dst8[y * _dystride + x];
+      } else {
+        sum1 += _src16[y * _systride + x] >> _shift;
+        sum2 += _dst16[y * _dystride + x] >> _shift;
+      }
+    }
+  }
+  if (luma) delt = (sum1 - sum2) / (_w * _h);
   /*In the PSNR-HVS-M paper[1] the authors describe the construction of
    their masking table as "we have used the quantization table for the
    color component Y of JPEG [6] that has been also obtained on the
@@ -140,7 +152,7 @@
    been normalized and then squared." Their CSF matrix (from PSNR-HVS)
    was also constructed from the JPEG matrices. I can not find any obvious
    scheme of normalizing to produce their table, but if I multiply their
-   CSF by 0.38857 and square the result I get their masking table.
+   CSF by 0.3885746225901003 and square the result I get their masking table.
    I have no idea where this constant comes from, but deviating from it
    too greatly hurts MOS agreement.
 
@@ -148,30 +160,28 @@
    Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
    of DCT basis functions", CD-ROM Proceedings of the Third
    International Workshop on Video Processing and Quality Metrics for Consumer
-   Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
+   Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.
+
+   Suggested in aomedia issue#2363:
+   0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509)
+   of the old JPEG based matrix from the paper. Since you are not using that,
+   divide by actual maximum coefficient. */
   for (x = 0; x < 8; x++)
     for (y = 0; y < 8; y++)
-      mask[x][y] =
-          (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003);
+      mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]);
   for (y = 0; y < _h - 7; y += _step) {
     for (x = 0; x < _w - 7; x += _step) {
       int i;
       int j;
-      double s_means[4];
-      double d_means[4];
-      double s_vars[4];
-      double d_vars[4];
+      int n = 0;
+      double s_gx = 0;
+      double s_gy = 0;
+      double g = 0;
       double s_gmean = 0;
-      double d_gmean = 0;
       double s_gvar = 0;
-      double d_gvar = 0;
       double s_mask = 0;
-      double d_mask = 0;
-      for (i = 0; i < 4; i++)
-        s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
       for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
-          int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
           if (!buf_is_hbd) {
             dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
             dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
@@ -179,35 +189,27 @@
             dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
             dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
           }
-          s_gmean += dct_s[i * 8 + j];
-          d_gmean += dct_d[i * 8 + j];
-          s_means[sub] += dct_s[i * 8 + j];
-          d_means[sub] += dct_d[i * 8 + j];
+          dct_d[i * 8 + j] += (int)(delt + 0.5f);
         }
       }
-      s_gmean /= 64.f;
-      d_gmean /= 64.f;
-      for (i = 0; i < 4; i++) s_means[i] /= 16.f;
-      for (i = 0; i < 4; i++) d_means[i] /= 16.f;
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-          int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
-          s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
-          d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
-          s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) *
-                         (dct_s[i * 8 + j] - s_means[sub]);
-          d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) *
-                         (dct_d[i * 8 + j] - d_means[sub]);
+      for (i = 1; i < 7; i++) {
+        for (j = 1; j < 7; j++) {
+          s_gx = (dct_s[(i - 1) * 8 + j - 1] * 3 -
+                  dct_s[(i - 1) * 8 + j + 1] * 3 + dct_s[i * 8 + j - 1] * 10 -
+                  dct_s[i * 8 + j + 1] * 10 + dct_s[(i + 1) * 8 + j - 1] * 3 -
+                  dct_s[(i + 1) * 8 + j + 1] * 3) /
+                 (pix_max * 16.f);
+          s_gy = (dct_s[(i - 1) * 8 + j - 1] * 3 -
+                  dct_s[(i + 1) * 8 + j - 1] * 3 + dct_s[(i - 1) * 8 + j] * 10 -
+                  dct_s[(i + 1) * 8 + j] * 10 + dct_s[(i - 1) * 8 + j + 1] * 3 -
+                  dct_s[(i + 1) * 8 + j + 1] * 3) /
+                 (pix_max * 16.f);
+          g = sqrt(s_gx * s_gx + s_gy * s_gy);
+          if (g > 0.1f) n++;
+          s_gmean += g;
         }
       }
-      s_gvar *= 1 / 63.f * 64;
-      d_gvar *= 1 / 63.f * 64;
-      for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16;
-      for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16;
-      if (s_gvar > 0)
-        s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
-      if (d_gvar > 0)
-        d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
+      s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f;
       if (!buf_is_hbd) {
         od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
@@ -218,12 +220,7 @@
       for (i = 0; i < 8; i++)
         for (j = (i == 0); j < 8; j++)
           s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
-      for (i = 0; i < 8; i++)
-        for (j = (i == 0); j < 8; j++)
-          d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
-      s_mask = sqrt(s_mask * s_gvar) / 32.f;
-      d_mask = sqrt(d_mask * d_gvar) / 32.f;
-      if (d_mask > s_mask) s_mask = d_mask;
+      s_mask = sqrt(s_mask * s_gvar) / 8.f;
       for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
           double err;
@@ -238,6 +235,7 @@
   }
   if (pixels <= 0) return 0;
   ret /= pixels;
+  ret += 0.04 * delt * delt;
   return ret;
 }
 
@@ -254,19 +252,26 @@
   assert(src->flags == dst->flags);
   const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
 
+  int16_t pix_max = 255;
+  if (in_bd == 10)
+    pix_max = 1023;
+  else if (in_bd == 12)
+    pix_max = 4095;
+
   bd_shift = bd - in_bd;
 
-  *y_psnrhvs = calc_psnrhvs(
-      src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par,
-      src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd);
+  *y_psnrhvs =
+      calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride,
+                   par, src->y_crop_width, src->y_crop_height, step, csf_y,
+                   bd_shift, buf_is_hbd, pix_max, 1);
   *u_psnrhvs =
       calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
                    par, src->uv_crop_width, src->uv_crop_height, step,
-                   csf_cb420, bd_shift, buf_is_hbd);
+                   csf_cb420, bd_shift, buf_is_hbd, pix_max, 0);
   *v_psnrhvs =
       calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
                    par, src->uv_crop_width, src->uv_crop_height, step,
-                   csf_cr420, bd_shift, buf_is_hbd);
+                   csf_cr420, bd_shift, buf_is_hbd, pix_max, 0);
   psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
-  return convert_score_db(psnrhvs, 1.0, in_bd);
+  return convert_score_db(psnrhvs, 1.0, pix_max);
 }
diff --git a/libaom/aom_dsp/quantize.c b/libaom/aom_dsp/quantize.c
index ced34b4..edd4d96 100644
--- a/libaom/aom_dsp/quantize.c
+++ b/libaom/aom_dsp/quantize.c
@@ -13,7 +13,7 @@
 #include "aom_mem/aom_mem.h"
 #include "av1/encoder/av1_quantize.h"
 
-void quantize_b_adaptive_helper_c(
+void aom_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
@@ -54,7 +54,7 @@
   for (i = 0; i < non_zero_count; i++) {
     const int rc = scan[i];
     const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
+    const int coeff_sign = AOMSIGN(coeff);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     int tmp32;
 
@@ -104,14 +104,15 @@
   *eob_ptr = eob + 1;
 }
 
-void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan, const qm_val_t *qm_ptr,
-                         const qm_val_t *iqm_ptr, const int log_scale) {
+void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan,
+                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+                             const int log_scale) {
   const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                          ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
@@ -139,7 +140,7 @@
   for (i = 0; i < non_zero_count; i++) {
     const int rc = scan[i];
     const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
+    const int coeff_sign = AOMSIGN(coeff);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     int tmp32;
 
@@ -166,21 +167,19 @@
   *eob_ptr = eob + 1;
 }
 
-void highbd_quantize_b_adaptive_helper_c(
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
     const qm_val_t *iqm_ptr, const int log_scale) {
-  int i, eob = -1;
   const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                          ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  int dequant;
-  int idx_arr[4096];
   (void)iscan;
-  int idx = 0;
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -190,47 +189,49 @@
     prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
 
   // Pre-scan pass
-  for (i = 0; i < n_coeffs; i++) {
+  for (i = (int)n_coeffs - 1; i >= 0; i--) {
     const int rc = scan[i];
     const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
     const int coeff = coeff_ptr[rc] * wt;
-
-    // If the coefficient is out of the base ZBIN range, keep it for
-    // quantization.
     const int prescan_add_val = prescan_add[rc != 0];
-    if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
-        coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
-      idx_arr[idx++] = i;
+    if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+        coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+      non_zero_count--;
+    else
+      break;
   }
 
-  // Quantization pass: only process the coefficients selected in
-  // pre-scan pass. Note: idx can be zero.
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
 #if SKIP_EOB_FACTOR_ADJUST
   int first = -1;
 #endif  // SKIP_EOB_FACTOR_ADJUST
-  for (i = 0; i < idx; i++) {
-    const int rc = scan[idx_arr[i]];
+  for (i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
     const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
+    const int coeff_sign = AOMSIGN(coeff);
     const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp1 =
-        abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
-    const int64_t tmpw = tmp1 * wt;
-    const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
-    const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
-                                 (16 - log_scale + AOM_QM_BITS));
-    qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dequant =
-        (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
-    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
-    if (abs_qcoeff) {
-      eob = idx_arr[i];
+    if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+      const int64_t tmp1 =
+          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+      const int64_t tmpw = tmp1 * wt;
+      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+      const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+                                   (16 - log_scale + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+      if (abs_qcoeff) {
+        eob = i;
 #if SKIP_EOB_FACTOR_ADJUST
-      if (first == -1) first = eob;
+        if (first == -1) first = eob;
 #endif  // SKIP_EOB_FACTOR_ADJUST
+      }
     }
   }
 #if SKIP_EOB_FACTOR_ADJUST
@@ -254,7 +255,7 @@
   *eob_ptr = eob + 1;
 }
 
-void highbd_quantize_b_helper_c(
+void aom_highbd_quantize_b_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
@@ -291,7 +292,7 @@
   for (i = 0; i < idx; i++) {
     const int rc = scan[idx_arr[i]];
     const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
+    const int coeff_sign = AOMSIGN(coeff);
     const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
     const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
@@ -310,6 +311,7 @@
   }
   *eob_ptr = eob + 1;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 /* These functions should only be called when quantisation matrices
    are not used. */
@@ -321,10 +323,10 @@
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
-  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                               NULL, NULL, 0);
+  aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                   quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                   dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                   iscan, NULL, NULL, 0);
 }
 
 void aom_quantize_b_32x32_adaptive_c(
@@ -333,10 +335,10 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                               NULL, NULL, 1);
+  aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                   quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                   dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                   iscan, NULL, NULL, 1);
 }
 
 void aom_quantize_b_64x64_adaptive_c(
@@ -345,22 +347,23 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                               NULL, NULL, 2);
+  aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                   quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                   dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                   iscan, NULL, NULL, 2);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_quantize_b_adaptive_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
-                                      iscan, NULL, NULL, 0);
+  aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+                                          round_ptr, quant_ptr, quant_shift_ptr,
+                                          qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                                          eob_ptr, scan, iscan, NULL, NULL, 0);
 }
 
 void aom_highbd_quantize_b_32x32_adaptive_c(
@@ -369,10 +372,10 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
-                                      iscan, NULL, NULL, 1);
+  aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+                                          round_ptr, quant_ptr, quant_shift_ptr,
+                                          qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                                          eob_ptr, scan, iscan, NULL, NULL, 1);
 }
 
 void aom_highbd_quantize_b_64x64_adaptive_c(
@@ -381,11 +384,12 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
-                                      iscan, NULL, NULL, 2);
+  aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+                                          round_ptr, quant_ptr, quant_shift_ptr,
+                                          qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                                          eob_ptr, scan, iscan, NULL, NULL, 2);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -393,9 +397,9 @@
                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
-  quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                      quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                      eob_ptr, scan, iscan, NULL, NULL, 0);
+  aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                          quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                          eob_ptr, scan, iscan, NULL, NULL, 0);
 }
 
 void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -405,9 +409,9 @@
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
-  quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                      quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                      eob_ptr, scan, iscan, NULL, NULL, 1);
+  aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                          quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                          eob_ptr, scan, iscan, NULL, NULL, 1);
 }
 
 void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -417,11 +421,12 @@
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
-  quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                      quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                      eob_ptr, scan, iscan, NULL, NULL, 2);
+  aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                          quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                          eob_ptr, scan, iscan, NULL, NULL, 2);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const int16_t *zbin_ptr, const int16_t *round_ptr,
                              const int16_t *quant_ptr,
@@ -429,10 +434,10 @@
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                             NULL, NULL, 0);
+  aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                 quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                 dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                                 NULL, NULL, 0);
 }
 
 void aom_highbd_quantize_b_32x32_c(
@@ -441,10 +446,10 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                             NULL, NULL, 1);
+  aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                 quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                 dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                                 NULL, NULL, 1);
 }
 
 void aom_highbd_quantize_b_64x64_c(
@@ -453,8 +458,9 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                             NULL, NULL, 2);
+  aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                 quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                 dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                                 NULL, NULL, 2);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/quantize.h b/libaom/aom_dsp/quantize.h
index 43c30ee..3956318 100644
--- a/libaom/aom_dsp/quantize.h
+++ b/libaom/aom_dsp/quantize.h
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-void quantize_b_adaptive_helper_c(
+void aom_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
@@ -51,7 +51,8 @@
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan);
 
-void highbd_quantize_b_adaptive_helper_c(
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
@@ -79,15 +80,17 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
-void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan, const qm_val_t *qm_ptr,
-                         const qm_val_t *iqm_ptr, const int log_scale);
+void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan,
+                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+                             const int log_scale);
 
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -96,7 +99,8 @@
                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan);
 
-void highbd_quantize_b_helper_c(
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
@@ -111,6 +115,7 @@
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/aom_dsp/sad.c b/libaom/aom_dsp/sad.c
index 9169e78..8ddc683 100644
--- a/libaom/aom_dsp/sad.c
+++ b/libaom/aom_dsp/sad.c
@@ -64,15 +64,24 @@
   }
 
 // Calculate sad against 4 reference locations and store each in sad_array
-#define sadMxNx4D(m, n)                                                    \
-  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \
-                               const uint8_t *const ref_array[],           \
-                               int ref_stride, uint32_t *sad_array) {      \
-    int i;                                                                 \
-    for (i = 0; i < 4; ++i) {                                              \
-      sad_array[i] =                                                       \
-          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
-    }                                                                      \
+#define sadMxNx4D(m, n)                                                      \
+  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,           \
+                               const uint8_t *const ref_array[],             \
+                               int ref_stride, uint32_t *sad_array) {        \
+    int i;                                                                   \
+    for (i = 0; i < 4; ++i) {                                                \
+      sad_array[i] =                                                         \
+          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride);   \
+    }                                                                        \
+  }                                                                          \
+  void aom_sad##m##x##n##x4d_avg_c(                                          \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],  \
+      int ref_stride, const uint8_t *second_pred, uint32_t *sad_array) {     \
+    int i;                                                                   \
+    for (i = 0; i < 4; ++i) {                                                \
+      sad_array[i] = aom_sad##m##x##n##_avg_c(src, src_stride, ref_array[i], \
+                                              ref_stride, second_pred);      \
+    }                                                                        \
   }
 
 // 128x128
@@ -159,6 +168,7 @@
 sadMxN(64, 16);
 sadMxNx4D(64, 16);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
                                       const uint8_t *b8, int b_stride,
                                       int width, int height) {
@@ -178,11 +188,12 @@
 }
 
 static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
-                                       const uint16_t *b, int b_stride,
+                                       const uint8_t *b8, int b_stride,
                                        int width, int height) {
   int y, x;
   unsigned int sad = 0;
   const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
   for (y = 0; y < height; y++) {
     for (x = 0; x < width; x++) {
       sad += abs(a[x] - b[x]);
@@ -204,18 +215,18 @@
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
       const uint8_t *second_pred) {                                            \
     uint16_t comp_pred[m * n];                                                 \
-    aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, m, n, \
-                             ref, ref_stride);                                 \
-    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
+    uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);                 \
+    aom_highbd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, ref_stride);  \
+    return highbd_sadb(src, src_stride, comp_pred8, m, m, n);                  \
   }                                                                            \
   unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c(                       \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
       const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
     uint16_t comp_pred[m * n];                                                 \
-    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred),           \
-                                      second_pred, m, n, ref, ref_stride,      \
-                                      jcp_param);                              \
-    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
+    uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);                 \
+    aom_highbd_dist_wtd_comp_avg_pred(comp_pred8, second_pred, m, n, ref,      \
+                                      ref_stride, jcp_param);                  \
+    return highbd_sadb(src, src_stride, comp_pred8, m, m, n);                  \
   }
 
 #define highbd_sadMxNx4D(m, n)                                               \
@@ -305,3 +316,4 @@
 highbd_sadMxNx4D(16, 64);
 highbd_sadMxN(64, 16);
 highbd_sadMxNx4D(64, 16);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/sad_av1.c b/libaom/aom_dsp/sad_av1.c
index c176001..4675181 100644
--- a/libaom/aom_dsp/sad_av1.c
+++ b/libaom/aom_dsp/sad_av1.c
@@ -35,7 +35,6 @@
     b += b_stride;
     m += m_stride;
   }
-  sad = (sad + 31) >> 6;
   return sad;
 }
 
@@ -50,6 +49,21 @@
     else                                                                       \
       return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
                         msk_stride, m, n);                                     \
+  }                                                                            \
+  void aom_masked_sad##m##x##n##x4d_c(                                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref[],                \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
+      int msk_stride, int invert_mask, unsigned sads[]) {                      \
+    if (!invert_mask)                                                          \
+      for (int i = 0; i < 4; i++) {                                            \
+        sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \
+                             m, msk, msk_stride, m, n);                        \
+      }                                                                        \
+    else                                                                       \
+      for (int i = 0; i < 4; i++) {                                            \
+        sads[i] = masked_sad(src, src_stride, second_pred, m, ref[i],          \
+                             ref_stride, msk, msk_stride, m, n);               \
+      }                                                                        \
   }
 
 /* clang-format off */
@@ -75,10 +89,10 @@
 MASKSADMxN(32, 8)
 MASKSADMxN(16, 64)
 MASKSADMxN(64, 16)
+/* clang-format on */
 
-    /* clang-format on */
-
-    static INLINE
+#if CONFIG_AV1_HIGHBITDEPTH
+                            static INLINE
     unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
                                    const uint8_t *a8, int a_stride,
                                    const uint8_t *b8, int b_stride,
@@ -101,7 +115,6 @@
     b += b_stride;
     m += m_stride;
   }
-  sad = (sad + 31) >> 6;
 
   return sad;
 }
@@ -141,6 +154,7 @@
 HIGHBD_MASKSADMXN(32, 8)
 HIGHBD_MASKSADMXN(16, 64)
 HIGHBD_MASKSADMXN(64, 16)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 // pre: predictor being evaluated
 // wsrc: target weighted prediction (has been *4096 to keep precision)
@@ -193,9 +207,10 @@
 OBMCSADMxN(32, 8)
 OBMCSADMxN(16, 64)
 OBMCSADMxN(64, 16)
-    /* clang-format on */
+/* clang-format on */
 
-    static INLINE
+#if CONFIG_AV1_HIGHBITDEPTH
+                            static INLINE
     unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
                                  const int32_t *wsrc, const int32_t *mask,
                                  int width, int height) {
@@ -246,3 +261,4 @@
 HIGHBD_OBMCSADMXN(16, 64)
 HIGHBD_OBMCSADMXN(64, 16)
 /* clang-format on */
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/simd/v128_intrinsics.h b/libaom/aom_dsp/simd/v128_intrinsics.h
index 01dbb8f..218a7a6 100644
--- a/libaom/aom_dsp/simd/v128_intrinsics.h
+++ b/libaom/aom_dsp/simd/v128_intrinsics.h
@@ -54,26 +54,28 @@
   return c_v128_align(a, b, c);
 }
 
-SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
+SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); }
 SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
 SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
 SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
 
-typedef uint32_t sad128_internal;
-SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) {
+  return c_v128_sad_u8_init();
+}
+SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a, v128 b) {
   return c_v128_sad_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) {
   return c_v128_sad_u8_sum(s);
 }
-typedef uint32_t ssd128_internal;
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) {
+  return c_v128_ssd_u8_init();
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a, v128 b) {
   return c_v128_ssd_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
+SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) {
   return c_v128_ssd_u8_sum(s);
 }
 SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
@@ -318,7 +320,7 @@
 }
 
 typedef uint32_t sad128_internal_u16;
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
   return c_v128_sad_u16_init();
 }
 SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
@@ -330,7 +332,7 @@
 }
 
 typedef uint64_t ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) {
   return c_v128_ssd_s16_init();
 }
 SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
diff --git a/libaom/aom_dsp/simd/v128_intrinsics_arm.h b/libaom/aom_dsp/simd/v128_intrinsics_arm.h
index 3c669d5..2d497f4 100644
--- a/libaom/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/libaom/aom_dsp/simd/v128_intrinsics_arm.h
@@ -68,9 +68,11 @@
 #endif
 }
 
-SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
+SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
 
-SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }
+SIMD_INLINE v128 v128_ones(void) {
+  return vreinterpretq_s64_u8(vdupq_n_u8(-1));
+}
 
 SIMD_INLINE v128 v128_dup_8(uint8_t x) {
   return vreinterpretq_s64_u8(vdupq_n_u8(x));
@@ -136,7 +138,7 @@
   sad64_internal hi, lo;
 } sad128_internal;
 
-SIMD_INLINE sad128_internal v128_sad_u8_init() {
+SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
   sad128_internal s;
   s.hi = s.lo = vdupq_n_u16(0);
   return s;
@@ -165,7 +167,7 @@
   ssd64_internal hi, lo;
 } ssd128_internal;
 
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
+SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) {
   ssd128_internal s;
   s.hi = s.lo = v64_ssd_u8_init();
   return s;
@@ -784,68 +786,79 @@
 }
 
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
-  return n < 8
-             ? v128_from_64(
-                   (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                        n * 8),
-                   (uint64_t)vorr_u64(
-                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
-                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                  (8 - n) * 8)))
-             : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
-                                             vget_high_s64(a)))
-                       : v128_from_64(
-                             0, (uint64_t)vshr_n_u64(
-                                    vreinterpret_u64_s64(vget_high_s64(a)),
-                                    (n - 8) * 8)));
+  return n == 0
+             ? a
+             : (n < 8
+                    ? v128_from_64(
+                          (uint64_t)vshr_n_u64(
+                              vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
+                          (uint64_t)vorr_u64(
+                              vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+                                         n * 8),
+                              vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+                                         (8 - n) * 8)))
+                    : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
+                                                    vget_high_s64(a)))
+                              : v128_from_64(0, (uint64_t)vshr_n_u64(
+                                                    vreinterpret_u64_s64(
+                                                        vget_high_s64(a)),
+                                                    (n - 8) * 8))));
 }
 
 SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
+  return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
+  return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
+  return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
+  return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
+  return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
+  return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
+  return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
+  return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
+  return c ? vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
+  return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
+  return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
-  return vshrq_n_s64(a, c);
+  return c ? vshrq_n_s64(a, c) : a;
 }
 
 #else
@@ -920,7 +933,9 @@
 
 typedef uint32x4_t sad128_internal_u16;
 
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
+  return vdupq_n_u32(0);
+}
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_sad_u16_sum(). */
@@ -939,7 +954,7 @@
 }
 
 typedef v128 ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_s16_sum(). */
diff --git a/libaom/aom_dsp/simd/v128_intrinsics_c.h b/libaom/aom_dsp/simd/v128_intrinsics_c.h
index bbe9a9d..466a41e 100644
--- a/libaom/aom_dsp/simd/v128_intrinsics_c.h
+++ b/libaom/aom_dsp/simd/v128_intrinsics_c.h
@@ -93,7 +93,7 @@
   c_v128_store_unaligned(p, a);
 }
 
-SIMD_INLINE c_v128 c_v128_zero() {
+SIMD_INLINE c_v128 c_v128_zero(void) {
   c_v128 t;
   t.u64[1] = t.u64[0] = 0;
   return t;
@@ -145,26 +145,39 @@
   return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
 }
 
-typedef uint32_t c_sad128_internal;
+typedef struct {
+  uint32_t val;
+  int count;
+} c_sad128_internal;
 
-SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) {
+  c_sad128_internal t;
+  t.val = t.count = 0;
+  return t;
+}
 
 /* Implementation dependent return value.  Result must be finalised with
-   v128_sad_u8_sum().
-   The result for more than 32 v128_sad_u8() calls is undefined. */
+ * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is
+ * undefined. */
 SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
                                             c_v128 b) {
   int c;
   for (c = 0; c < 16; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called 32 times returning an undefined result\n");
+    abort();
+  }
   return s;
 }
 
-SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
+SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; }
 
 typedef uint32_t c_ssd128_internal;
 
-SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_u8_sum(). */
@@ -720,6 +733,7 @@
 }
 
 SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
+  if (n == 0) return a;
   if (n < 8)
     return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                     c_v64_shr_n_byte(a.v64[0], 8 - n)),
@@ -729,6 +743,7 @@
 }
 
 SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
+  if (n == 0) return a;
   if (n < 8)
     return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                            c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
@@ -854,7 +869,7 @@
 
 typedef uint32_t c_sad128_internal_u16;
 
-SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_sad_u16_sum(). */
@@ -870,7 +885,7 @@
 
 typedef uint64_t c_ssd128_internal_s16;
 
-SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_s16_sum(). */
diff --git a/libaom/aom_dsp/simd/v128_intrinsics_x86.h b/libaom/aom_dsp/simd/v128_intrinsics_x86.h
index 6c7241f..c404015 100644
--- a/libaom/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/libaom/aom_dsp/simd/v128_intrinsics_x86.h
@@ -45,7 +45,7 @@
 
 SIMD_INLINE v128 v128_load_unaligned(const void *p) {
 #if defined(__SSSE3__)
-  return (__m128i)_mm_lddqu_si128((__m128i *)p);
+  return _mm_lddqu_si128((__m128i *)p);
 #else
   return _mm_loadu_si128((__m128i *)p);
 #endif
@@ -89,7 +89,8 @@
 
 SIMD_INLINE v128 v128_dup_64(uint64_t x) {
   // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
-  return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x);
+  return _mm_set_epi32((uint32_t)(x >> 32), (uint32_t)x, (uint32_t)(x >> 32),
+                       (uint32_t)x);
 }
 
 SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
@@ -538,7 +539,7 @@
 }
 
 SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+  return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
                        _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
diff --git a/libaom/aom_dsp/simd/v256_intrinsics.h b/libaom/aom_dsp/simd/v256_intrinsics.h
index cb99d35..17e36ee 100644
--- a/libaom/aom_dsp/simd/v256_intrinsics.h
+++ b/libaom/aom_dsp/simd/v256_intrinsics.h
@@ -57,29 +57,42 @@
   return c_v256_align(a, b, c);
 }
 
-SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
+SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); }
 SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
 SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
 SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
 
-typedef uint32_t sad256_internal;
-SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) {
+  return c_v256_sad_u8_init();
+}
+SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) {
   return c_v256_sad_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) {
   return c_v256_sad_u8_sum(s);
 }
-typedef uint32_t ssd256_internal;
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) {
+  return c_v256_ssd_u8_init();
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) {
   return c_v256_ssd_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) {
   return c_v256_ssd_u8_sum(s);
 }
 
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) {
+  return c_v256_ssd_s16_init();
+}
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a,
+                                               v256 b) {
+  return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) {
+  return c_v256_ssd_s16_sum(s);
+}
+
 SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
   return c_v256_dotp_su8(a, b);
 }
@@ -350,7 +363,7 @@
 }
 
 typedef uint32_t sad256_internal_u16;
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
   return c_v256_sad_u16_init();
 }
 SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
@@ -361,16 +374,4 @@
   return c_v256_sad_u16_sum(s);
 }
 
-typedef uint64_t ssd256_internal_s16;
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
-  return c_v256_ssd_s16_init();
-}
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
-                                             v256 b) {
-  return c_v256_ssd_s16(s, a, b);
-}
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
-  return c_v256_ssd_s16_sum(s);
-}
-
 #endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
diff --git a/libaom/aom_dsp/simd/v256_intrinsics_c.h b/libaom/aom_dsp/simd/v256_intrinsics_c.h
index a1c08e9..8127ee3 100644
--- a/libaom/aom_dsp/simd/v256_intrinsics_c.h
+++ b/libaom/aom_dsp/simd/v256_intrinsics_c.h
@@ -149,9 +149,16 @@
   return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
 }
 
-typedef uint32_t c_sad256_internal;
+typedef struct {
+  uint32_t val;
+  int count;
+} c_sad256_internal;
 
-SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
+  c_sad256_internal t;
+  t.val = t.count = 0;
+  return t;
+}
 
 /* Implementation dependent return value.  Result must be finalised with
    v256_sad_u8_sum().
@@ -160,11 +167,17 @@
                                             c_v256 b) {
   int c;
   for (c = 0; c < 32; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called 32 times returning an undefined result\n");
+    abort();
+  }
   return s;
 }
 
-SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; }
 
 typedef uint32_t c_ssd256_internal;
 
@@ -746,6 +759,7 @@
 }
 
 SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
+  if (n == 0) return a;
   if (n < 16)
     return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
                                       c_v128_shr_n_byte(a.v128[0], 16 - n)),
@@ -758,6 +772,7 @@
 }
 
 SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
+  if (n == 0) return a;
   if (n < 16)
     return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
                             c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
diff --git a/libaom/aom_dsp/simd/v256_intrinsics_v128.h b/libaom/aom_dsp/simd/v256_intrinsics_v128.h
index d5b7905..0d22667 100644
--- a/libaom/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/libaom/aom_dsp/simd/v256_intrinsics_v128.h
@@ -73,7 +73,7 @@
   v128_store_aligned((uint8_t *)p + 16, a.val[1]);
 }
 
-SIMD_INLINE v256 v256_zero() {
+SIMD_INLINE v256 v256_zero(void) {
   return v256_from_v128(v128_zero(), v128_zero());
 }
 
@@ -117,7 +117,7 @@
   sad128_internal val[2];
 } sad256_internal;
 
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
   sad256_internal t;
   t.val[1] = v128_sad_u8_init();
   t.val[0] = v128_sad_u8_init();
@@ -142,7 +142,7 @@
   ssd128_internal val[2];
 } ssd256_internal;
 
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
   ssd256_internal t;
   t.val[1] = v128_ssd_u8_init();
   t.val[0] = v128_ssd_u8_init();
@@ -780,13 +780,16 @@
                   (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
                   v128_zero()))
 
-#define v256_shr_n_byte(a, n)                                              \
-  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
-                             v128_or(v128_shr_n_byte(a.val[0], n),         \
-                                     v128_shl_n_byte(a.val[1], 16 - (n)))) \
-            : v256_from_v128(                                              \
-                  v128_zero(),                                             \
-                  (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
+#define v256_shr_n_byte(a, n)                                                \
+  (n == 0                                                                    \
+       ? a                                                                   \
+       : ((n) < 16                                                           \
+              ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
+                               v128_or(v128_shr_n_byte(a.val[0], n),         \
+                                       v128_shl_n_byte(a.val[1], 16 - (n)))) \
+              : v256_from_v128(                                              \
+                    v128_zero(),                                             \
+                    (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])))
 
 #define v256_align(a, b, c) \
   ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
@@ -823,7 +826,7 @@
   sad128_internal_u16 val[2];
 } sad256_internal_u16;
 
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
   sad256_internal_u16 t;
   t.val[1] = v128_sad_u16_init();
   t.val[0] = v128_sad_u16_init();
@@ -849,7 +852,7 @@
   ssd128_internal_s16 val[2];
 } ssd256_internal_s16;
 
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
   ssd256_internal_s16 t;
   t.val[1] = v128_ssd_s16_init();
   t.val[0] = v128_ssd_s16_init();
diff --git a/libaom/aom_dsp/simd/v256_intrinsics_x86.h b/libaom/aom_dsp/simd/v256_intrinsics_x86.h
index 44594bc..5983cb8 100644
--- a/libaom/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/libaom/aom_dsp/simd/v256_intrinsics_x86.h
@@ -57,7 +57,7 @@
 }
 
 SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
-  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+  return _mm256_set_epi64x(a, b, c, d);
 }
 
 SIMD_INLINE v256 v256_load_aligned(const void *p) {
@@ -76,7 +76,7 @@
   _mm256_storeu_si256((__m256i *)p, a);
 }
 
-SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
+SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
 
 SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
 
@@ -187,11 +187,11 @@
 }
 
 SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
-  return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
+  return _mm256_permute2x128_si256(a, b, 0x02);
 }
 
 SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
-  return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
+  return _mm256_permute2x128_si256(a, b, 0x13);
 }
 
 SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
@@ -256,9 +256,7 @@
       _MM_SHUFFLE(3, 1, 2, 0));
 }
 
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
-}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); }
 
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
   return _mm256_unpacklo_epi8(
@@ -311,11 +309,11 @@
 }
 
 SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+  return _mm256_cvtepu16_epi32(a);
 }
 
 SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+  return _mm256_cvtepi16_epi32(a);
 }
 
 SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
@@ -442,7 +440,7 @@
 
 typedef v256 sad256_internal;
 
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
   return _mm256_setzero_si256();
 }
 
@@ -460,7 +458,7 @@
 
 typedef v256 ssd256_internal;
 
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
   return _mm256_setzero_si256();
 }
 
@@ -603,7 +601,7 @@
 }
 
 SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
-  return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
+  return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)),
                           _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
@@ -646,7 +644,7 @@
 }
 
 SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
-#if defined(__AVX512F__)
+#if defined(__AVX512VL__)
   return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
 #else
   return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
@@ -670,13 +668,15 @@
   ((n) < 16                                                                  \
        ? _mm256_alignr_epi8(                                                 \
              _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
-       : _mm256_inserti128_si256(                                            \
-             _mm256_setzero_si256(),                                         \
-             v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))
+       : ((n) == 16                                                          \
+              ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3)      \
+              : _mm256_inserti128_si256(                                     \
+                    _mm256_setzero_si256(),                                  \
+                    v128_align(v256_high_v128(a), v256_high_v128(a), n), 0)))
 
 // _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
 #define v256_align(a, b, c) \
-  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
+  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
 
 #define v256_shl_n_8(a, c)                                   \
   _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
@@ -701,7 +701,7 @@
 
 typedef v256 sad256_internal_u16;
 
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v256_sad_u16_sum(). */
@@ -728,7 +728,7 @@
 
 typedef v256 ssd256_internal_s16;
 
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v256_ssd_s16_sum(). */
diff --git a/libaom/aom_dsp/simd/v64_intrinsics.h b/libaom/aom_dsp/simd/v64_intrinsics.h
index afc5542..7079949 100644
--- a/libaom/aom_dsp/simd/v64_intrinsics.h
+++ b/libaom/aom_dsp/simd/v64_intrinsics.h
@@ -65,7 +65,7 @@
   return c_v64_align(a, b, c);
 }
 
-SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
+SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); }
 SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
 SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
 SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
@@ -128,20 +128,22 @@
   return c_v64_shuffle_8(a, pattern);
 }
 
-typedef uint32_t sad64_internal;
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) {
+  return c_v64_sad_u8_init();
+}
+SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) {
   return c_v64_sad_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) {
   return c_v64_sad_u8_sum(s);
 }
-typedef uint32_t ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) {
+  return c_v64_ssd_u8_init();
+}
+SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) {
   return c_v64_ssd_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) {
   return c_v64_ssd_u8_sum(s);
 }
 SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
diff --git a/libaom/aom_dsp/simd/v64_intrinsics_arm.h b/libaom/aom_dsp/simd/v64_intrinsics_arm.h
index 8f39ad6..a4ecdf4 100644
--- a/libaom/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/libaom/aom_dsp/simd/v64_intrinsics_arm.h
@@ -71,7 +71,11 @@
 #elif defined(__CC_ARM)
   *(__packed uint32_t *)p) = a;
 #elif defined(__GNUC__)
-  *((__attribute((packed)) uint32_t *)p) = a;
+  struct Unaligned32Struct {
+    uint32_t value;
+    uint8_t dummy;  // To make the size non-power-of-two.
+  } __attribute__((__packed__));
+  ((struct Unaligned32Struct *)p)->value = a;
 #else
   vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                 0);
@@ -107,7 +111,7 @@
 #endif
 }
 
-SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }
+SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); }
 
 SIMD_INLINE v64 v64_dup_8(uint8_t x) {
   return vreinterpret_s64_u8(vdup_n_u8(x));
@@ -158,7 +162,7 @@
 
 typedef uint16x8_t sad64_internal;
 
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
+SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); }
 
 // Implementation dependent return value. Result must be finalised with
 // v64_sad_u8_sum().
@@ -177,7 +181,7 @@
 
 typedef uint32x4_t ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return vdupq_n_u32(0); }
 
 // Implementation dependent return value. Result must be finalised with
 // v64_ssd_u8_sum().
@@ -604,39 +608,39 @@
 }
 
 SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
+  return c ? vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
+  return c ? vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
-  return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
+  return c ? vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
+  return c ? vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
+  return c ? vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
-  return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
+  return c ? vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
+  return c ? vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
+  return c ? vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
-  return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
+  return c ? vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)) : a;
 }
 
 #else
diff --git a/libaom/aom_dsp/simd/v64_intrinsics_c.h b/libaom/aom_dsp/simd/v64_intrinsics_c.h
index 028d68c..b84f243 100644
--- a/libaom/aom_dsp/simd/v64_intrinsics_c.h
+++ b/libaom/aom_dsp/simd/v64_intrinsics_c.h
@@ -143,7 +143,7 @@
   c_v64_store_unaligned(p, a);
 }
 
-SIMD_INLINE c_v64 c_v64_zero() {
+SIMD_INLINE c_v64 c_v64_zero(void) {
   c_v64 t;
   t.u64 = 0;
   return t;
@@ -171,14 +171,14 @@
 SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
-  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
+  for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]);
   return t;
 }
 
 SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
-  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
+  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]);
   return t;
 }
 
@@ -228,7 +228,7 @@
 SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
-  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
+  for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]);
   return t;
 }
 
@@ -252,7 +252,7 @@
 SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
-  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
+  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]);
   return t;
 }
 
@@ -288,14 +288,15 @@
   c_v64 t;
   int c;
   for (c = 0; c < 4; c++)
-    t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
+    t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]);
   return t;
 }
 
 SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
   c_v64 t;
   int c;
-  for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c];
+  for (c = 0; c < 8; c++)
+    t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]);
   return t;
 }
 
@@ -526,14 +527,14 @@
     a = b;
     b = u;
   }
-  t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
-  t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
-  t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
-  t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
-  t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
-  t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
-  t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
-  t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
+  t.u8[7] = (uint8_t)(a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3]);
+  t.u8[6] = (uint8_t)(a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2]);
+  t.u8[5] = (uint8_t)(a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1]);
+  t.u8[4] = (uint8_t)(a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0]);
+  t.u8[3] = (uint8_t)(b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3]);
+  t.u8[2] = (uint8_t)(b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2]);
+  t.u8[1] = (uint8_t)(b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1]);
+  t.u8[0] = (uint8_t)(b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0]);
   return t;
 }
 
@@ -600,28 +601,41 @@
   return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
 }
 
-typedef uint32_t c_sad64_internal;
+typedef struct {
+  uint32_t val;
+  int count;
+} c_sad64_internal;
+
+SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) {
+  c_sad64_internal t;
+  t.val = t.count = 0;
+  return t;
+}
 
 /* Implementation dependent return value.  Result must be finalised with
-   v64_sad_u8_sum().
-   The result for more than 32 v64_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }
-
+   v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is
+   undefined. */
 SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
                                           c_v64 b) {
   int c;
   for (c = 0; c < 8; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called 32 times returning an undefined result\n");
+    abort();
+  }
   return s;
 }
 
-SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
+SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; }
 
 typedef uint32_t c_ssd64_internal;
 
 /* Implementation dependent return value.  Result must be finalised with
  * v64_ssd_u8_sum(). */
-SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; }
 
 SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
                                           c_v64 b) {
@@ -817,7 +831,7 @@
     fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
     abort();
   }
-  for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
+  for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n);
   return t;
 }
 
@@ -850,7 +864,7 @@
     fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
     abort();
   }
-  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
+  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n);
   return t;
 }
 
diff --git a/libaom/aom_dsp/simd/v64_intrinsics_x86.h b/libaom/aom_dsp/simd/v64_intrinsics_x86.h
index 5f9a57b..1f273fe 100644
--- a/libaom/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/libaom/aom_dsp/simd/v64_intrinsics_x86.h
@@ -99,7 +99,7 @@
        : (b))
 #endif
 
-SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }
+SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
 
 SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
 
@@ -319,7 +319,7 @@
 
 typedef v64 sad64_internal;
 
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); }
 
 /* Implementation dependent return value.  Result must be finalised with
    v64_sad_u8_sum().
@@ -332,7 +332,7 @@
 
 typedef v64 ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v64_ssd_u8_sum(). */
@@ -438,7 +438,7 @@
 }
 
 SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+  return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
                        _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
diff --git a/libaom/aom_dsp/sse.c b/libaom/aom_dsp/sse.c
index 2493948..16f6b58 100644
--- a/libaom/aom_dsp/sse.c
+++ b/libaom/aom_dsp/sse.c
@@ -33,6 +33,7 @@
   return sse;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
                          int b_stride, int width, int height) {
   int y, x;
@@ -50,3 +51,4 @@
   }
   return sse;
 }
+#endif
diff --git a/libaom/aom_dsp/ssim.c b/libaom/aom_dsp/ssim.c
index 681770b..95b8888 100644
--- a/libaom/aom_dsp/ssim.c
+++ b/libaom/aom_dsp/ssim.c
@@ -49,6 +49,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
                                  int rp, uint32_t *sum_s, uint32_t *sum_r,
                                  uint32_t *sum_sq_s, uint32_t *sum_sq_r,
@@ -64,6 +65,7 @@
     }
   }
 }
+#endif
 
 static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2
 static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2
diff --git a/libaom/aom_dsp/subtract.c b/libaom/aom_dsp/subtract.c
index 2f6da96..4f4e355 100644
--- a/libaom/aom_dsp/subtract.c
+++ b/libaom/aom_dsp/subtract.c
@@ -32,6 +32,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
                                  ptrdiff_t diff_stride, const uint8_t *src8,
                                  ptrdiff_t src_stride, const uint8_t *pred8,
@@ -51,3 +52,4 @@
     src += src_stride;
   }
 }
+#endif
diff --git a/libaom/aom_dsp/sum_squares.c b/libaom/aom_dsp/sum_squares.c
index 44ec41f..d739a60 100644
--- a/libaom/aom_dsp/sum_squares.c
+++ b/libaom/aom_dsp/sum_squares.c
@@ -38,3 +38,36 @@
 
   return ss;
 }
+
+uint64_t aom_var_2d_u8_c(uint8_t *src, int src_stride, int width, int height) {
+  int r, c;
+  uint64_t ss = 0, s = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const uint8_t v = src[c];
+      ss += v * v;
+      s += v;
+    }
+    src += src_stride;
+  }
+
+  return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) {
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  int r, c;
+  uint64_t ss = 0, s = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const uint16_t v = srcp[c];
+      ss += v * v;
+      s += v;
+    }
+    srcp += src_stride;
+  }
+
+  return (ss - s * s / (width * height));
+}
diff --git a/libaom/aom_dsp/variance.c b/libaom/aom_dsp/variance.c
index 18a33c5..695f12a 100644
--- a/libaom/aom_dsp/variance.c
+++ b/libaom/aom_dsp/variance.c
@@ -23,9 +23,10 @@
 #include "aom_dsp/blend.h"
 #include "aom_dsp/variance.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
 
 uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride) {
@@ -295,75 +296,24 @@
     const int is_scaled = av1_is_scaled(sf);
 
     if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
       const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
       const struct buf_2d *const dst_buf = &pd->dst;
       const struct buf_2d *const pre_buf =
           is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
 
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
           av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
       return;
     }
   }
@@ -468,6 +418,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void highbd_variance64(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint64_t *sse, int64_t *sum) {
@@ -888,74 +839,24 @@
     const int is_scaled = av1_is_scaled(sf);
 
     if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
       const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
       const struct buf_2d *const dst_buf = &pd->dst;
       const struct buf_2d *const pre_buf =
           is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
 
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
           av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
       return;
     }
   }
@@ -1070,6 +971,7 @@
     pred += width;
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                           int height, const uint8_t *ref, int ref_stride,
@@ -1153,6 +1055,7 @@
 MASK_SUBPIX_VAR(16, 64)
 MASK_SUBPIX_VAR(64, 16)
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                  int width, int height, const uint8_t *ref8,
                                  int ref_stride, const uint8_t *mask,
@@ -1277,6 +1180,7 @@
 HIGHBD_MASK_SUBPIX_VAR(32, 8)
 HIGHBD_MASK_SUBPIX_VAR(16, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 16)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
                                  const int32_t *wsrc, const int32_t *mask,
@@ -1384,6 +1288,7 @@
 OBMC_VAR(64, 16)
 OBMC_SUBPIX_VAR(64, 16)
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
@@ -1575,3 +1480,4 @@
 HIGHBD_OBMC_SUBPIX_VAR(16, 64)
 HIGHBD_OBMC_VAR(64, 16)
 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/vmaf.c b/libaom/aom_dsp/vmaf.c
new file mode 100644
index 0000000..3a012e7
--- /dev/null
+++ b/libaom/aom_dsp/vmaf.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <libvmaf/libvmaf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/vmaf.h"
+#include "aom_ports/system_state.h"
+
+typedef struct FrameData {
+  const YV12_BUFFER_CONFIG *source;
+  const YV12_BUFFER_CONFIG *distorted;
+  int frame_set;
+  int bit_depth;
+} FrameData;
+
+static void vmaf_fatal_error(const char *message) {
+  fprintf(stderr, "Fatal error: %s\n", message);
+  exit(EXIT_FAILURE);
+}
+
+// A callback function used to pass data to VMAF.
+// Returns 0 after reading a frame.
+// Returns 2 when there is no more frame to read.
+static int read_frame(float *ref_data, float *main_data, float *temp_data,
+                      int stride, void *user_data) {
+  FrameData *frames = (FrameData *)user_data;
+
+  if (!frames->frame_set) {
+    const int width = frames->source->y_width;
+    const int height = frames->source->y_height;
+    assert(width == frames->distorted->y_width);
+    assert(height == frames->distorted->y_height);
+
+    if (frames->bit_depth > 8) {
+      const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8));
+      uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer);
+      uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer);
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          ref_data[col] = scale_factor * (float)ref_ptr[col];
+        }
+        ref_ptr += frames->source->y_stride;
+        ref_data += stride / sizeof(*ref_data);
+      }
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          main_data[col] = scale_factor * (float)main_ptr[col];
+        }
+        main_ptr += frames->distorted->y_stride;
+        main_data += stride / sizeof(*main_data);
+      }
+    } else {
+      uint8_t *ref_ptr = frames->source->y_buffer;
+      uint8_t *main_ptr = frames->distorted->y_buffer;
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          ref_data[col] = (float)ref_ptr[col];
+        }
+        ref_ptr += frames->source->y_stride;
+        ref_data += stride / sizeof(*ref_data);
+      }
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          main_data[col] = (float)main_ptr[col];
+        }
+        main_ptr += frames->distorted->y_stride;
+        main_data += stride / sizeof(*main_data);
+      }
+    }
+    frames->frame_set = 1;
+    return 0;
+  }
+
+  (void)temp_data;
+  return 2;
+}
+
+void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, const int bit_depth,
+                   double *const vmaf) {
+  aom_clear_system_state();
+  const int width = source->y_width;
+  const int height = source->y_height;
+  FrameData frames = { source, distorted, 0, bit_depth };
+  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
+  double vmaf_score;
+  const int ret =
+      compute_vmaf(&vmaf_score, fmt, width, height, read_frame,
+                   /*user_data=*/&frames, (char *)model_path,
+                   /*log_path=*/NULL, /*log_fmt=*/NULL, /*disable_clip=*/1,
+                   /*disable_avx=*/0, /*enable_transform=*/0,
+                   /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
+                   /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
+                   /*n_subsample=*/1, /*enable_conf_interval=*/0);
+  if (ret) vmaf_fatal_error("Failed to compute VMAF scores.");
+
+  aom_clear_system_state();
+  *vmaf = vmaf_score;
+}
+
+void aom_calc_vmaf_multi_frame(
+    void *user_data, const char *model_path,
+    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
+                      int stride_byte, void *user_data),
+    int frame_width, int frame_height, int bit_depth, double *vmaf) {
+  aom_clear_system_state();
+
+  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
+  double vmaf_score;
+  const int ret = compute_vmaf(
+      &vmaf_score, fmt, frame_width, frame_height, read_frame,
+      /*user_data=*/user_data, (char *)model_path,
+      /*log_path=*/"vmaf_scores.xml", /*log_fmt=*/NULL, /*disable_clip=*/0,
+      /*disable_avx=*/0, /*enable_transform=*/0,
+      /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
+      /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
+      /*n_subsample=*/1, /*enable_conf_interval=*/0);
+  FILE *vmaf_log = fopen("vmaf_scores.xml", "r");
+  if (vmaf_log == NULL || ret) {
+    vmaf_fatal_error("Failed to compute VMAF scores.");
+  }
+
+  int frame_index = 0;
+  char buf[512];
+  while (fgets(buf, 511, vmaf_log) != NULL) {
+    if (memcmp(buf, "\t\t<frame ", 9) == 0) {
+      char *p = strstr(buf, "vmaf=");
+      if (p != NULL && p[5] == '"') {
+        char *p2 = strstr(&p[6], "\"");
+        *p2 = '\0';
+        const double score = atof(&p[6]);
+        if (score < 0.0 || score > 100.0) {
+          vmaf_fatal_error("Failed to compute VMAF scores.");
+        }
+        vmaf[frame_index++] = score;
+      }
+    }
+  }
+  fclose(vmaf_log);
+
+  aom_clear_system_state();
+}
diff --git a/libaom/aom_dsp/vmaf.h b/libaom/aom_dsp/vmaf.h
new file mode 100644
index 0000000..fb8bf46
--- /dev/null
+++ b/libaom/aom_dsp/vmaf.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_VMAF_H_
+#define AOM_AOM_DSP_VMAF_H_
+
+#include "aom_scale/yv12config.h"
+
+void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                   double *vmaf);
+
+void aom_calc_vmaf_multi_frame(
+    void *user_data, const char *model_path,
+    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
+                      int stride_byte, void *user_data),
+    int frame_width, int frame_height, int bit_depth, double *vmaf);
+
+#endif  // AOM_AOM_DSP_VMAF_H_
diff --git a/libaom/aom_dsp/x86/adaptive_quantize_avx2.c b/libaom/aom_dsp/x86/adaptive_quantize_avx2.c
new file mode 100644
index 0000000..e33dff2
--- /dev/null
+++ b/libaom/aom_dsp/x86/adaptive_quantize_avx2.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+                                      const int16_t *round_ptr, __m256i *round,
+                                      const int16_t *quant_ptr, __m256i *quant,
+                                      const int16_t *dequant_ptr,
+                                      __m256i *dequant,
+                                      const int16_t *shift_ptr,
+                                      __m256i *shift) {
+  *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+  *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+  *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+  *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+  *round = _mm256_permute4x64_epi64(*round, 0x54);
+  *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+  *dequant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+  *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+  *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+  const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr));
+  const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+  return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void update_mask1_avx2(__m256i *cmp_mask,
+                                     const int16_t *iscan_ptr, int *is_found,
+                                     __m256i *mask) {
+  __m256i temp_mask = _mm256_setzero_si256();
+  if (_mm256_movemask_epi8(*cmp_mask)) {
+    __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
+    temp_mask = _mm256_and_si256(*cmp_mask, iscan);
+    *is_found = 1;
+  }
+  *mask = _mm256_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold,
+                                     const int16_t *iscan_ptr, int *is_found,
+                                     __m256i *mask) {
+  __m256i zero = _mm256_setzero_si256();
+  __m256i coeff[2], cmp_mask0, cmp_mask1;
+  coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero);
+  coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero);
+  coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS);
+  cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
+  coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS);
+  cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
+  cmp_mask0 =
+      _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
+  update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round,
+                                         const __m256i *quant,
+                                         const __m256i *shift) {
+  __m256i tmp, qcoeff;
+  qcoeff = _mm256_adds_epi16(*coeff, *round);
+  tmp = _mm256_mulhi_epi16(qcoeff, *quant);
+  qcoeff = _mm256_add_epi16(tmp, qcoeff);
+  *coeff = _mm256_mulhi_epi16(qcoeff, *shift);
+}
+
+static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) {
+  return _mm256_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+                                           tran_low_t *coeff_ptr) {
+  __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+  __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+  __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+  _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo);
+  _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
+void aom_quantize_b_adaptive_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 16;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i zbin, round, quant, dequant, shift;
+  __m256i coeff, qcoeff;
+  __m256i cmp_mask, mask0 = zero, mask1 = zero;
+  __m128i temp_mask0, temp_mask1;
+  int prescan_add[2];
+  int thresh[2];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+  }
+  __m256i threshold[2];
+  threshold[0] = _mm256_set1_epi32(thresh[0]);
+  threshold[1] = _mm256_set1_epi32(thresh[1]);
+  threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+
+  // Setup global values.
+  load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                     dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff = load_coefficients_avx2(coeff_ptr);
+  qcoeff = _mm256_abs_epi16(coeff);
+  update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0);
+  __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin);
+  zbin = _mm256_unpackhi_epi64(zbin, zbin);
+  cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8);
+  update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+  threshold[0] = threshold[1];
+  if (_mm256_movemask_epi8(cmp_mask) == 0) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+    round = _mm256_unpackhi_epi64(round, round);
+    quant = _mm256_unpackhi_epi64(quant, quant);
+    shift = _mm256_unpackhi_epi64(shift, shift);
+    dequant = _mm256_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift);
+    round = _mm256_unpackhi_epi64(round, round);
+    quant = _mm256_unpackhi_epi64(quant, quant);
+    shift = _mm256_unpackhi_epi64(shift, shift);
+    // Reinsert signs
+    qcoeff = _mm256_sign_epi16(qcoeff, coeff);
+    // Mask out zbin threshold coeffs
+    qcoeff = _mm256_and_si256(qcoeff, temp0);
+    store_coefficients_avx2(qcoeff, qcoeff_ptr);
+    coeff = calculate_dqcoeff_avx2(qcoeff, dequant);
+    dequant = _mm256_unpackhi_epi64(dequant, dequant);
+    store_coefficients_avx2(coeff, dqcoeff_ptr);
+  }
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff = load_coefficients_avx2(coeff_ptr + index);
+    qcoeff = _mm256_abs_epi16(coeff);
+    update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0);
+    temp0 = _mm256_cmpgt_epi16(qcoeff, zbin);
+    cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8);
+    update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+    if (_mm256_movemask_epi8(cmp_mask) == 0) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+      index += 16;
+      continue;
+    }
+    calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift);
+    qcoeff = _mm256_sign_epi16(qcoeff, coeff);
+    qcoeff = _mm256_and_si256(qcoeff, temp0);
+    store_coefficients_avx2(qcoeff, qcoeff_ptr + index);
+    coeff = calculate_dqcoeff_avx2(qcoeff, dequant);
+    store_coefficients_avx2(coeff, dqcoeff_ptr + index);
+    index += 16;
+  }
+  if (is_found0) {
+    temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+                               _mm256_extracti128_si256(mask0, 1));
+    non_zero_count = calculate_non_zero_count(temp_mask0);
+  }
+  if (is_found1) {
+    temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+                               _mm256_extracti128_si256(mask1, 1));
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+  }
+
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
+
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff0 = qcoeff_ptr[rc];
+    if (qcoeff0) {
+      first = i;
+      break;
+    }
+  }
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const int coeff0 = coeff_ptr[rc] * wt;
+      const int coeff_sign = AOMSIGN(coeff0);
+      const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff <
+          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
diff --git a/libaom/aom_dsp/x86/adaptive_quantize_sse2.c b/libaom/aom_dsp/x86/adaptive_quantize_sse2.c
index 3822c27..584cd67 100644
--- a/libaom/aom_dsp/x86/adaptive_quantize_sse2.c
+++ b/libaom/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -22,41 +22,31 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  const __m128i zero = _mm_setzero_si128();
   int index = 16;
-  int non_zero_count = (int)n_coeffs;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m128i zero = _mm_setzero_si128();
   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
   __m128i qcoeff0, qcoeff1;
   __m128i cmp_mask0, cmp_mask1;
-  __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 0),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 0) };
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
 
   int prescan_add[2];
-  for (int i = 0; i < 2; ++i)
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
     prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
-
-  // max buffer is of size 256 as this functions calls with
-  // maximum n_coeffs as 256
-  int16_t prescan[256];
-  memset(prescan, -1, n_coeffs * sizeof(int16_t));
-
-  // TODO(Aniket): Experiment the following loop with intrinsic
-  for (int i = (int)n_coeffs - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const qm_val_t wt = 1 << AOM_QM_BITS;
-    const int coeff = coeff_ptr[rc] * wt;
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int prescan_add_val = prescan_add[rc != 0];
-    if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
-      prescan[rc] = 0;
-      non_zero_count--;
-    } else {
-      break;
-    }
+    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
   }
+  thresh[2] = thresh[3] = thresh[1];
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
 #if SKIP_EOB_FACTOR_ADJUST
   int first = -1;
 #endif
@@ -74,13 +64,15 @@
   qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
   qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-  prescan0 = _mm_loadu_si128((const __m128i *)prescan);
-  prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
 
-  cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
   zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-  cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
 
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];
   all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
   if (_mm_movemask_epi8(all_zero) == 0) {
     _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
@@ -121,13 +113,9 @@
 
     store_coefficients(coeff0, dqcoeff_ptr);
     store_coefficients(coeff1, dqcoeff_ptr + 8);
-
-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
-  // TODO(Aniket): Reduce the processing of coeff quatization
-  // based on eob logic
   while (index < n_coeffs) {
     coeff0 = load_coefficients(coeff_ptr + index);
     coeff1 = load_coefficients(coeff_ptr + index + 8);
@@ -137,11 +125,13 @@
     qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
     qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-    prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
-    prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+                 &mask0);
 
-    cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
-    cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
 
     all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
     if (_mm_movemask_epi8(all_zero) == 0) {
@@ -174,14 +164,27 @@
     store_coefficients(coeff0, dqcoeff_ptr + index);
     store_coefficients(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
-    eob = _mm_max_epi16(eob, eob0);
     index += 16;
   }
+  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+  if (is_found1)
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
 
-  *eob_ptr = accumulate_eob(eob);
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
 
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
 #if SKIP_EOB_FACTOR_ADJUST
   // TODO(Aniket): Experiment the following loop with intrinsic by combining
   // with the quantization loop above
@@ -196,14 +199,14 @@
   if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
     const int rc = scan[(*eob_ptr - 1)];
     if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
-      const qm_val_t wt = (1 << AOM_QM_BITS);
       const int coeff = coeff_ptr[rc] * wt;
-      const int coeff_sign = (coeff >> 31);
+      const int coeff_sign = AOMSIGN(coeff);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
       const int prescan_add_val =
           ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
-      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+      if (abs_coeff <
+          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
         qcoeff_ptr[rc] = 0;
         dqcoeff_ptr[rc] = 0;
         *eob_ptr = 0;
@@ -220,8 +223,11 @@
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
   int index = 16;
-  int non_zero_count = (int)n_coeffs;
   const int log_scale = 1;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
@@ -229,34 +235,23 @@
   __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
   __m128i qcoeff0, qcoeff1;
   __m128i cmp_mask0, cmp_mask1;
-  __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
+
   const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                          ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
-
   int prescan_add[2];
-  for (int i = 0; i < 2; ++i)
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
     prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
-
-  // max buffer is of size 1024 as this functions calls with
-  // maximum n_coeffs as 1024
-  int16_t prescan[1024];
-  memset(prescan, -1, n_coeffs * sizeof(int16_t));
-
-  // TODO(Aniket): Experiment the following loop with intrinsic
-  for (int i = (int)n_coeffs - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const qm_val_t wt = 1 << AOM_QM_BITS;
-    const int coeff = coeff_ptr[rc] * wt;
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int prescan_add_val = prescan_add[rc != 0];
-    if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
-      prescan[rc] = 0;
-      non_zero_count--;
-    } else {
-      break;
-    }
+    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
   }
+  thresh[2] = thresh[3] = thresh[1];
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
 #if SKIP_EOB_FACTOR_ADJUST
   int first = -1;
 #endif
@@ -273,6 +268,7 @@
   zbin = _mm_srli_epi16(zbin, log_scale);
   round = _mm_srli_epi16(round, log_scale);
   zbin = _mm_sub_epi16(zbin, one);
+
   // Do DC and first 15 AC.
   coeff0 = load_coefficients(coeff_ptr);
   coeff1 = load_coefficients(coeff_ptr + 8);
@@ -282,13 +278,15 @@
   qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
   qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-  prescan0 = _mm_loadu_si128((const __m128i *)prescan);
-  prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
 
-  cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
   zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-  cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
 
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];
   all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
   if (_mm_movemask_epi8(all_zero) == 0) {
     _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
@@ -305,11 +303,9 @@
     dequant = _mm_unpackhi_epi64(dequant, dequant);
   } else {
     calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
-
     round = _mm_unpackhi_epi64(round, round);
     quant = _mm_unpackhi_epi64(quant, quant);
     shift = _mm_unpackhi_epi64(shift, shift);
-
     calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
 
     // Reinsert signs
@@ -328,14 +324,9 @@
     dequant = _mm_unpackhi_epi64(dequant, dequant);
     calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
                                           dqcoeff_ptr + 8, &log_scale);
-
-    eob =
-        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
-  // TODO(Aniket): Reduce the processing of coeff quatization
-  // based on eob logic
   while (index < n_coeffs) {
     coeff0 = load_coefficients(coeff_ptr + index);
     coeff1 = load_coefficients(coeff_ptr + index + 8);
@@ -345,11 +336,13 @@
     qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
     qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-    prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
-    prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+                 &mask0);
 
-    cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
-    cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
 
     all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
     if (_mm_movemask_epi8(all_zero) == 0) {
@@ -380,15 +373,27 @@
                                           dqcoeff_ptr + index, &log_scale);
     calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
                                           dqcoeff_ptr + index + 8, &log_scale);
-
-    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
-    eob = _mm_max_epi16(eob, eob0);
     index += 16;
   }
+  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+  if (is_found1)
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
 
-  *eob_ptr = accumulate_eob(eob);
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
 
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
 #if SKIP_EOB_FACTOR_ADJUST
   // TODO(Aniket): Experiment the following loop with intrinsic by combining
   // with the quantization loop above
@@ -403,9 +408,216 @@
   if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
     const int rc = scan[(*eob_ptr - 1)];
     if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
-      const qm_val_t wt = (1 << AOM_QM_BITS);
       const int coeff = coeff_ptr[rc] * wt;
-      const int coeff_sign = (coeff >> 31);
+      const int coeff_sign = AOMSIGN(coeff);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
+
+void aom_quantize_b_64x64_adaptive_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 16;
+  const int log_scale = 2;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
+
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  int prescan_add[2];
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+  }
+  thresh[2] = thresh[3] = thresh[1];
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+  // Setup global values.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // Shift with rounding.
+  zbin = _mm_add_epi16(zbin, log_scale_vec);
+  round = _mm_add_epi16(round, log_scale_vec);
+  zbin = _mm_srli_epi16(zbin, log_scale);
+  round = _mm_srli_epi16(round, log_scale);
+  zbin = _mm_sub_epi16(zbin, one);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+    // Reinsert signs
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr);
+    store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+                                          &log_scale);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+                                          dqcoeff_ptr + 8, &log_scale);
+  }
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_coefficients(coeff_ptr + index);
+    coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+                 &mask0);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+      index += 16;
+      continue;
+    }
+    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr + index);
+    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+                                          dqcoeff_ptr + index, &log_scale);
+    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+                                          dqcoeff_ptr + index + 8, &log_scale);
+    index += 16;
+  }
+  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+  if (is_found1)
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
+
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    if (qcoeff) {
+      first = i;
+      break;
+    }
+  }
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const int coeff = coeff_ptr[rc] * wt;
+      const int coeff_sign = AOMSIGN(coeff);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
       const int prescan_add_val =
diff --git a/libaom/aom_dsp/x86/aom_asm_stubs.c b/libaom/aom_dsp/x86/aom_asm_stubs.c
index 3bf7b55..ce8285e 100644
--- a/libaom/aom_dsp/x86/aom_asm_stubs.c
+++ b/libaom/aom_dsp/x86/aom_asm_stubs.c
@@ -49,6 +49,7 @@
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
@@ -90,5 +91,5 @@
 //                                     int w, int h, int bd);
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-
+#endif
 #endif  // HAVE_SSE2
diff --git a/libaom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/libaom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 79324f5..f64b821 100644
--- a/libaom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/libaom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -20,30 +20,30 @@
 #include "aom_ports/emmintrin_compat.h"
 
 // filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 0, 1, 1, 2, 2, 3,
+                                                              3, 4, 2, 3, 3, 4,
+                                                              4, 5, 5, 6 };
 
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 4, 5, 5, 6, 6, 7,
+                                                              7, 8, 6, 7, 7, 8,
+                                                              8, 9, 9, 10 };
 
 // filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
+DECLARE_ALIGNED(16, static const uint8_t,
+                filt1_global[16]) = { 0, 1, 1, 2, 2, 3, 3, 4,
+                                      4, 5, 5, 6, 6, 7, 7, 8 };
 
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
+DECLARE_ALIGNED(16, static const uint8_t,
+                filt2_global[16]) = { 2, 3, 3, 4, 4, 5, 5, 6,
+                                      6, 7, 7, 8, 8, 9, 9, 10 };
 
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
+DECLARE_ALIGNED(16, static const uint8_t,
+                filt3_global[16]) = { 4, 5, 5, 6,  6,  7,  7,  8,
+                                      8, 9, 9, 10, 10, 11, 11, 12 };
 
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
+DECLARE_ALIGNED(16, static const uint8_t,
+                filt4_global[16]) = { 6,  7,  7,  8,  8,  9,  9,  10,
+                                      10, 11, 11, 12, 12, 13, 13, 14 };
 
 DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
   0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
diff --git a/libaom/aom_dsp/x86/avg_intrin_avx2.c b/libaom/aom_dsp/x86/avg_intrin_avx2.c
index e0ba8d5..3bbffbd 100644
--- a/libaom/aom_dsp/x86/avg_intrin_avx2.c
+++ b/libaom/aom_dsp/x86/avg_intrin_avx2.c
@@ -181,6 +181,38 @@
   hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
 }
 
+void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  int16_t *t_coeff = coeff;
+  for (int idx = 0; idx < 2; ++idx) {
+    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
+  }
+
+  for (int idx = 0; idx < 64; idx += 16) {
+    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi16(b0, 1);
+    b1 = _mm256_srai_epi16(b1, 1);
+    b2 = _mm256_srai_epi16(b2, 1);
+    b3 = _mm256_srai_epi16(b3, 1);
+    _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2));
+    _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3));
+    _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2));
+    _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3));
+    coeff += 16;
+    t_coeff += 16;
+  }
+}
+
 void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                              tran_low_t *coeff) {
   // For high bitdepths, it is unnecessary to store_tran_low
@@ -224,13 +256,236 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+  __m256i a0 = in[0];
+  __m256i a1 = in[1];
+  __m256i a2 = in[2];
+  __m256i a3 = in[3];
+  __m256i a4 = in[4];
+  __m256i a5 = in[5];
+  __m256i a6 = in[6];
+  __m256i a7 = in[7];
+
+  __m256i b0 = _mm256_add_epi32(a0, a1);
+  __m256i b1 = _mm256_sub_epi32(a0, a1);
+  __m256i b2 = _mm256_add_epi32(a2, a3);
+  __m256i b3 = _mm256_sub_epi32(a2, a3);
+  __m256i b4 = _mm256_add_epi32(a4, a5);
+  __m256i b5 = _mm256_sub_epi32(a4, a5);
+  __m256i b6 = _mm256_add_epi32(a6, a7);
+  __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+  a0 = _mm256_add_epi32(b0, b2);
+  a1 = _mm256_add_epi32(b1, b3);
+  a2 = _mm256_sub_epi32(b0, b2);
+  a3 = _mm256_sub_epi32(b1, b3);
+  a4 = _mm256_add_epi32(b4, b6);
+  a5 = _mm256_add_epi32(b5, b7);
+  a6 = _mm256_sub_epi32(b4, b6);
+  a7 = _mm256_sub_epi32(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm256_add_epi32(a0, a4);
+    b7 = _mm256_add_epi32(a1, a5);
+    b3 = _mm256_add_epi32(a2, a6);
+    b4 = _mm256_add_epi32(a3, a7);
+    b2 = _mm256_sub_epi32(a0, a4);
+    b6 = _mm256_sub_epi32(a1, a5);
+    b1 = _mm256_sub_epi32(a2, a6);
+    b5 = _mm256_sub_epi32(a3, a7);
+
+    a0 = _mm256_unpacklo_epi32(b0, b1);
+    a1 = _mm256_unpacklo_epi32(b2, b3);
+    a2 = _mm256_unpackhi_epi32(b0, b1);
+    a3 = _mm256_unpackhi_epi32(b2, b3);
+    a4 = _mm256_unpacklo_epi32(b4, b5);
+    a5 = _mm256_unpacklo_epi32(b6, b7);
+    a6 = _mm256_unpackhi_epi32(b4, b5);
+    a7 = _mm256_unpackhi_epi32(b6, b7);
+
+    b0 = _mm256_unpacklo_epi64(a0, a1);
+    b1 = _mm256_unpacklo_epi64(a4, a5);
+    b2 = _mm256_unpackhi_epi64(a0, a1);
+    b3 = _mm256_unpackhi_epi64(a4, a5);
+    b4 = _mm256_unpacklo_epi64(a2, a3);
+    b5 = _mm256_unpacklo_epi64(a6, a7);
+    b6 = _mm256_unpackhi_epi64(a2, a3);
+    b7 = _mm256_unpackhi_epi64(a6, a7);
+
+    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+  } else {
+    in[0] = _mm256_add_epi32(a0, a4);
+    in[7] = _mm256_add_epi32(a1, a5);
+    in[3] = _mm256_add_epi32(a2, a6);
+    in[4] = _mm256_add_epi32(a3, a7);
+    in[2] = _mm256_sub_epi32(a0, a4);
+    in[6] = _mm256_sub_epi32(a1, a5);
+    in[1] = _mm256_sub_epi32(a2, a6);
+    in[5] = _mm256_sub_epi32(a3, a7);
+  }
+}
+
+void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  __m128i src16[8];
+  __m256i src32[8];
+
+  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+
+  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+  src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+  highbd_hadamard_col8_avx2(src32, 0);
+  highbd_hadamard_col8_avx2(src32, 1);
+
+  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 1);
+    b1 = _mm256_srai_epi32(b1, 1);
+    b2 = _mm256_srai_epi32(b2, 1);
+    b3 = _mm256_srai_epi32(b3, 1);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
+
+void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+  }
+
+  for (idx = 0; idx < 256; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 2);
+    b1 = _mm256_srai_epi32(b1, 2);
+    b2 = _mm256_srai_epi32(b2, 2);
+    b3 = _mm256_srai_epi32(b3, 2);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 int aom_satd_avx2(const tran_low_t *coeff, int length) {
-  const __m256i one = _mm256_set1_epi16(1);
   __m256i accum = _mm256_setzero_si256();
   int i;
 
-  for (i = 0; i < length; i += 16) {
-    const __m256i src_line = load_tran_low(coeff);
+  for (i = 0; i < length; i += 8, coeff += 8) {
+    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i abs = _mm256_abs_epi32(src_line);
+    accum = _mm256_add_epi32(accum, abs);
+  }
+
+  {  // 32 bit horizontal add
+    const __m256i a = _mm256_srli_si256(accum, 8);
+    const __m256i b = _mm256_add_epi32(accum, a);
+    const __m256i c = _mm256_srli_epi64(b, 32);
+    const __m256i d = _mm256_add_epi32(b, c);
+    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+                                            _mm256_extractf128_si256(d, 1));
+    return _mm_cvtsi128_si32(accum_128);
+  }
+}
+
+int aom_satd_lp_avx2(const int16_t *coeff, int length) {
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i accum = _mm256_setzero_si256();
+
+  for (int i = 0; i < length; i += 16) {
+    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
     const __m256i abs = _mm256_abs_epi16(src_line);
     const __m256i sum = _mm256_madd_epi16(abs, one);
     accum = _mm256_add_epi32(accum, sum);
diff --git a/libaom/aom_dsp/x86/avg_intrin_sse2.c b/libaom/aom_dsp/x86/avg_intrin_sse2.c
index 0c20261..260ca2a 100644
--- a/libaom/aom_dsp/x86/avg_intrin_sse2.c
+++ b/libaom/aom_dsp/x86/avg_intrin_sse2.c
@@ -139,7 +139,7 @@
   return (avg + 8) >> 4;
 }
 
-static void hadamard_col8_sse2(__m128i *in, int iter) {
+static INLINE void hadamard_col8_sse2(__m128i *in, int iter) {
   __m128i a0 = in[0];
   __m128i a1 = in[1];
   __m128i a2 = in[2];
@@ -272,6 +272,38 @@
   hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
 }
 
+void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                              int16_t *coeff) {
+  __m128i src[8];
+  src[0] = _mm_load_si128((const __m128i *)src_diff);
+  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+  hadamard_col8_sse2(src, 0);
+  hadamard_col8_sse2(src, 1);
+
+  _mm_store_si128((__m128i *)coeff, src[0]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[1]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[2]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[3]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[4]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[5]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[6]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[7]);
+}
+
 static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                        ptrdiff_t src_stride, tran_low_t *coeff,
                                        int is_final) {
@@ -406,3 +438,75 @@
 
   return _mm_cvtsi128_si32(accum);
 }
+
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+                          const int ref_stride, const int height) {
+  int idx = 1;
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+  __m128i t0, t1;
+  int height_1 = height - 1;
+  ref += ref_stride;
+  do {
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+    idx += 2;
+  } while (idx < height_1);
+
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  t0 = _mm_unpacklo_epi8(src_line, zero);
+  t1 = _mm_unpackhi_epi8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, t0);
+  s1 = _mm_adds_epu16(s1, t1);
+  if (height == 128) {
+    s0 = _mm_srai_epi16(s0, 6);
+    s1 = _mm_srai_epi16(s1, 6);
+  } else if (height == 64) {
+    s0 = _mm_srai_epi16(s0, 5);
+    s1 = _mm_srai_epi16(s1, 5);
+  } else if (height == 32) {
+    s0 = _mm_srai_epi16(s0, 4);
+    s1 = _mm_srai_epi16(s1, 4);
+  } else {
+    assert(height == 16);
+    s0 = _mm_srai_epi16(s0, 3);
+    s1 = _mm_srai_epi16(s1, 3);
+  }
+
+  _mm_storeu_si128((__m128i *)hbuf, s0);
+  hbuf += 8;
+  _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
+int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  int i;
+
+  for (i = 16; i < width; i += 16) {
+    ref += 16;
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    s1 = _mm_sad_epu8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, s1);
+  }
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
diff --git a/libaom/aom_dsp/x86/blend_a64_hmask_sse4.c b/libaom/aom_dsp/x86/blend_a64_hmask_sse4.c
index 4f5e3f8..e0289ab 100644
--- a/libaom/aom_dsp/x86/blend_a64_hmask_sse4.c
+++ b/libaom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -24,6 +24,7 @@
                             src1_stride, mask, 0, w, h, 0, 0);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_blend_a64_hmask_sse4_1(
     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
@@ -32,3 +33,4 @@
                                    src1_8, src1_stride, mask, 0, w, h, 0, 0,
                                    bd);
 }
+#endif
diff --git a/libaom/aom_dsp/x86/blend_a64_mask_avx2.c b/libaom/aom_dsp/x86/blend_a64_mask_avx2.c
index 057f615..95383d2 100644
--- a/libaom/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/libaom/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -870,7 +870,7 @@
                              const uint8_t *src0, uint32_t src0_stride,
                              const uint8_t *src1, uint32_t src1_stride,
                              const uint8_t *mask, uint32_t mask_stride, int w,
-                             int h, int subx, int suby) {
+                             int h, int subw, int subh) {
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
@@ -881,15 +881,15 @@
 
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, w, h, subx, suby);
+                         mask, mask_stride, w, h, subw, subh);
   } else {
-    if (subx & suby) {
+    if (subw & subh) {
       blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, w, h);
-    } else if (subx) {
+    } else if (subw) {
       blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, w, h);
-    } else if (suby) {
+    } else if (subh) {
       blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, w, h);
     } else {
@@ -899,6 +899,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // aom_highbd_blend_a64_d16_mask_avx2()
 //////////////////////////////////////////////////////////////////////////////
@@ -1370,3 +1371,4 @@
                                     subh, conv_params, bd);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/x86/blend_a64_mask_sse4.c b/libaom/aom_dsp/x86/blend_a64_mask_sse4.c
index b7a2468..4a368ef 100644
--- a/libaom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/libaom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -339,8 +339,8 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_zmask_b =
+      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
   const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
   const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
@@ -386,7 +386,7 @@
                                const uint8_t *src0, uint32_t src0_stride,
                                const uint8_t *src1, uint32_t src1_stride,
                                const uint8_t *mask, uint32_t mask_stride, int w,
-                               int h, int subx, int suby) {
+                               int h, int subw, int subh) {
   typedef void (*blend_fn)(
       uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
       uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
@@ -415,14 +415,15 @@
 
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, w, h, subx, suby);
+                         mask, mask_stride, w, h, subw, subh);
   } else {
-    blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
+    blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
                                               src0_stride, src1, src1_stride,
                                               mask, mask_stride, w, h);
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
@@ -518,8 +519,8 @@
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_zmask_b =
+      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -565,8 +566,8 @@
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_zmask_b =
+      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -710,8 +711,8 @@
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_zmask_b =
+      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -762,8 +763,8 @@
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_zmask_b =
+      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -812,20 +813,19 @@
 //////////////////////////////////////////////////////////////////////////////
 // Dispatch
 //////////////////////////////////////////////////////////////////////////////
-
 void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                       const uint8_t *src0_8,
                                       uint32_t src0_stride,
                                       const uint8_t *src1_8,
                                       uint32_t src1_stride, const uint8_t *mask,
                                       uint32_t mask_stride, int w, int h,
-                                      int subx, int suby, int bd) {
+                                      int subw, int subh, int bd) {
   typedef void (*blend_fn)(
       uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
       uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
       const uint8_t *mask, uint32_t mask_stride, int w, int h);
 
-  // Dimensions are: bd_index X width_index X subx X suby
+  // Dimensions are: bd_index X width_index X subw X subh
   static const blend_fn blend[2][2][2][2] = {
     {   // bd == 8 or 10
       { // w % 8 == 0
@@ -858,18 +858,19 @@
   assert(bd == 8 || bd == 10 || bd == 12);
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                src1_stride, mask, mask_stride, w, h, subx,
-                                suby, bd);
+                                src1_stride, mask, mask_stride, w, h, subw,
+                                subh, bd);
   } else {
     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
     const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
     const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
 
-    blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
+    blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
         dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
         mask_stride, w, h);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static INLINE void blend_a64_d16_mask_w16_sse41(
     uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
@@ -1111,7 +1112,7 @@
 //////////////////////////////////////////////////////////////////////////////
 // aom_highbd_blend_a64_d16_mask_sse4_1()
 //////////////////////////////////////////////////////////////////////////////
-
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1(
     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
     const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
@@ -1556,3 +1557,4 @@
                                     subh, conv_params, bd);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/x86/blend_a64_vmask_sse4.c b/libaom/aom_dsp/x86/blend_a64_vmask_sse4.c
index 0649102..75fb1c5 100644
--- a/libaom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/libaom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -143,6 +143,7 @@
                  h);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // Implementation - No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
@@ -281,3 +282,4 @@
                                   src1_stride, mask, w, h);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/x86/blk_sse_sum_avx2.c b/libaom/aom_dsp/x86/blk_sse_sum_avx2.c
new file mode 100644
index 0000000..f7c0eb0
--- /dev/null
+++ b/libaom/aom_dsp/x86/blk_sse_sum_avx2.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum,
+                                      int *x_sum, int64_t *x2_sum) {
+  __m256i sum_buffer, sse_buffer;
+  __m128i out_buffer;
+
+  // Accumulate the various elements of the register into the first element.
+  sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1);
+  regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+  regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8));
+  regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4));
+
+  sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1);
+  regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+  regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8));
+
+  out_buffer = _mm256_castsi256_si128(regx_sum);
+  *x_sum += _mm_cvtsi128_si32(out_buffer);
+  out_buffer = _mm256_castsi256_si128(regx2_sum);
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(out_buffer);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, out_buffer);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  __m128i row1, row2, row3;
+  __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+      temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+  const int16_t *data_tmp = data;
+  __m256i one = _mm256_set1_epi16(1);
+  regx_sum = _mm256_setzero_si256();
+  regx2_sum = regx_sum;
+  sum_buffer = _mm256_setzero_si256();
+  sse_buffer = sum_buffer;
+
+  for (int j = 0; j < (bh >> 2); ++j) {
+    // Load 4 rows at a time.
+    row1 = _mm_loadl_epi64((__m128i const *)(data_tmp));
+    row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+    row1 = _mm_unpacklo_epi64(row1, row2);
+    row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride));
+    row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride));
+    row2 = _mm_unpacklo_epi64(row2, row3);
+    load_pixels =
+        _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1);
+
+    row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+    row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+    sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+    sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+    data_tmp += 4 * stride;
+  }
+
+  // To prevent 32-bit variable overflow, unpack the elements to 64-bit.
+  temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+  temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+  sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+  regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+  regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+  accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  __m128i load_128bit, load_next_128bit;
+  __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+      temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+  const int16_t *data_tmp = data;
+  __m256i one = _mm256_set1_epi16(1);
+  regx_sum = _mm256_setzero_si256();
+  regx2_sum = regx_sum;
+  sum_buffer = _mm256_setzero_si256();
+  sse_buffer = sum_buffer;
+
+  for (int j = 0; j < (bh >> 1); ++j) {
+    // Load 2 rows at a time.
+    load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp));
+    load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride));
+    load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit),
+                                          load_next_128bit, 1);
+
+    row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+    row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+    sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+    sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+    data_tmp += 2 * stride;
+  }
+
+  temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+  temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+  sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+  regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+  regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+  accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh,
+                                     int *x_sum, int64_t *x2_sum,
+                                     int loop_count) {
+  __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+      temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+  const int16_t *data_tmp = data;
+  __m256i one = _mm256_set1_epi16(1);
+  regx_sum = _mm256_setzero_si256();
+  regx2_sum = regx_sum;
+  sum_buffer = _mm256_setzero_si256();
+  sse_buffer = sum_buffer;
+
+  for (int i = 0; i < loop_count; ++i) {
+    data_tmp = data + 16 * i;
+    for (int j = 0; j < bh; ++j) {
+      load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp));
+
+      row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+      row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+      sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+      sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+      data_tmp += stride;
+    }
+  }
+
+  temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+  temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+  sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+  regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+  regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+  accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+  *x_sum = 0;
+  *x2_sum = 0;
+
+  if ((bh & 3) == 0) {
+    switch (bw) {
+        // For smaller block widths, compute multiple rows simultaneously.
+      case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break;
+      case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break;
+      case 16:
+      case 32:
+        sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4);
+        break;
+      case 64:
+        // The 32-bit accumulators would overflow when summing 64 rows at
+        // once, so compute 32 rows at a time.
+        if (bh <= 32) {
+          sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4);
+        } else {
+          sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4);
+          sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum,
+                            bw >> 4);
+        }
+        break;
+
+      default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+    }
+  } else {
+    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+  }
+}
diff --git a/libaom/aom_dsp/x86/blk_sse_sum_sse2.c b/libaom/aom_dsp/x86/blk_sse_sum_sse2.c
new file mode 100644
index 0000000..ef0a024
--- /dev/null
+++ b/libaom/aom_dsp/x86/blk_sse_sum_sse2.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  const int16_t *data_tmp = data;
+  __m128i temp_buffer1, temp_buffer2;
+  __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer;
+  __m128i one = _mm_set1_epi16(1);
+  __m128i regx_sum = _mm_setzero_si128();
+  __m128i regx2_sum = regx_sum;
+
+  for (int j = 0; j < (bh >> 1); ++j) {
+    // Load 2 rows (8 pixels) at a time.
+    load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp));
+    load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+    load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi);
+    sum_buffer = _mm_madd_epi16(load_pixels_low, one);
+    sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low);
+    regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+    regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+    data_tmp += 2 * stride;
+  }
+
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+  *x_sum = _mm_cvtsi128_si32(regx_sum);
+  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum,
+                                    int loop_cycles) {
+  const int16_t *data_tmp;
+  __m128i temp_buffer1, temp_buffer2;
+  __m128i one = _mm_set1_epi16(1);
+  __m128i regx_sum = _mm_setzero_si128();
+  __m128i regx2_sum = regx_sum;
+  __m128i load_pixels, sum_buffer, sse_buffer;
+
+  for (int i = 0; i < loop_cycles; ++i) {
+    data_tmp = data + (8 * i);
+    for (int j = 0; j < bh; ++j) {
+      // Load 1 row (8-pixels) at a time.
+      load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp));
+      sum_buffer = _mm_madd_epi16(load_pixels, one);
+      sse_buffer = _mm_madd_epi16(load_pixels, load_pixels);
+      regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+      regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+      data_tmp += stride;
+    }
+  }
+
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+  *x_sum += _mm_cvtsi128_si32(regx_sum);
+  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+// This function adds SSE2 support for the function 'aom_get_blk_sse_sum_c'.
+void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+  *x_sum = 0;
+  *x2_sum = 0;
+
+  if ((bh & 3) == 0) {
+    switch (bw) {
+      case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break;
+      case 8:
+      case 16:
+        sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+        break;
+        // For widths 32 and 64, the registers may overflow, so compute
+        // partial widths at a time.
+      case 32:
+        if (bh <= 32) {
+          sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+          break;
+        } else {
+          sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3);
+          sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum,
+                           bw >> 3);
+          break;
+        }
+
+      case 64:
+        if (bh <= 16) {
+          sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+          break;
+        } else {
+          for (int i = 0; i < bh; i += 16)
+            sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum,
+                             bw >> 3);
+          break;
+        }
+
+      default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+    }
+  } else {
+    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+  }
+}
diff --git a/libaom/aom_dsp/x86/convolve.h b/libaom/aom_dsp/x86/convolve.h
index ff3224e..b4ff697 100644
--- a/libaom/aom_dsp/x86/convolve.h
+++ b/libaom/aom_dsp/x86/convolve.h
@@ -107,6 +107,7 @@
     }                                                                        \
   }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        const ptrdiff_t src_pitch,
                                        uint16_t *output_ptr,
@@ -197,5 +198,6 @@
           dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
     }                                                                      \
   }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #endif  // AOM_AOM_DSP_X86_CONVOLVE_H_
diff --git a/libaom/aom_dsp/x86/convolve_avx2.h b/libaom/aom_dsp/x86/convolve_avx2.h
index 4a1068e..d516de5 100644
--- a/libaom/aom_dsp/x86/convolve_avx2.h
+++ b/libaom/aom_dsp/x86/convolve_avx2.h
@@ -34,6 +34,31 @@
   2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
 };
 
+DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
+  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
+  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
+};
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+                                           6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+                                           3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                filt2_global_avx2[32]) = { 2, 3, 3, 4, 4,  5, 5, 6, 6, 7, 7,
+                                           8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+                                           5, 6, 6, 7, 7,  8, 8, 9, 9, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
 #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP                                     \
   for (i = 0; i < (im_h - 2); i += 2) {                                        \
     __m256i data = _mm256_castsi128_si256(                                     \
@@ -113,8 +138,8 @@
       xx_storel_32(p_0, res_0);                                               \
       xx_storel_32(p_1, res_1);                                               \
     } else {                                                                  \
-      *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);                            \
-      *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);                            \
+      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                  \
+      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                  \
     }                                                                         \
                                                                               \
     s[0] = s[1];                                                              \
@@ -256,7 +281,7 @@
   // Since all filter co-efficients are even, this change will not affect the
   // end result
   assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
-                            _mm_set1_epi16(0xffff)));
+                            _mm_set1_epi16((short)0xffff)));
 
   const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
 
diff --git a/libaom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/libaom/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 1e3d13e..89fe189 100644
--- a/libaom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/libaom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -30,6 +30,206 @@
 #define SUB_EPI16 _mm_sub_epi16
 #endif
 
+static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
+                              __m128i *in1) {
+  // Constants
+  // These are the coefficients used for the multiplies.
+  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+  // where cospi_N_64 = cos(N pi /64)
+  const __m128i k__cospi_A =
+      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_B =
+      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_C =
+      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+                     cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_D =
+      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+                     cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_E =
+      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_F =
+      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_G =
+      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+                     -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_H =
+      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+                     -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
+
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // This second rounding constant saves doing some extra adds at the end
+  const __m128i k__DCT_CONST_ROUNDING2 =
+      _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
+  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+
+  // Load inputs.
+  *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  *in1 = _mm_unpacklo_epi64(
+      *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+  *in0 = _mm_unpacklo_epi64(
+      *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+  // in0 = [i0 i1 i2 i3 iC iD iE iF]
+  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+  // multiply by 16 to give some extra precision
+  *in0 = _mm_slli_epi16(*in0, 4);
+  *in1 = _mm_slli_epi16(*in1, 4);
+  // if (i == 0 && input[0]) input[0] += 1;
+  // add 1 to the upper left pixel if it is non-zero, which helps reduce
+  // the round-trip error
+  {
+    // The mask will only contain whether the first value is zero, all
+    // other comparison will fail as something shifted by 4 (above << 4)
+    // can never be equal to one. To increment in the non-zero case, we
+    // add the mask and one for the first element:
+    //   - if zero, mask = -1, v = v - 1 + 1 = v
+    //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+    __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a);
+    *in0 = _mm_add_epi16(*in0, mask);
+    *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b);
+  }
+  // There are 4 total stages, alternating between an add/subtract stage
+  // and a multiply-and-add stage.
+  {
+    // Stage 1: Add/subtract
+
+    // in0 = [i0 i1 i2 i3 iC iD iE iF]
+    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+    const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1);
+    const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1);
+    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+    // r1 = [iC i8 iD i9 iE iA iF iB]
+    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+    // r3 = [iC i8 iD i9 iF iB iE iA]
+
+    const __m128i t0 = _mm_add_epi16(r2, r3);
+    const __m128i t1 = _mm_sub_epi16(r2, r3);
+    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+    // t1 = [aC a8 aD a9 aF aB aE aA]
+
+    // Stage 2: multiply by constants (which gets us into 32 bits).
+    // The constants needed here are:
+    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+    // Then add and right-shift to get back to 16-bit range
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+    // w0 = [b0 b1 b7 b6]
+    // w1 = [b8 b9 bF bE]
+    // w2 = [b4 b5 b3 b2]
+    // w3 = [bC bD bB bA]
+    const __m128i x0 = _mm_packs_epi32(w0, w1);
+    const __m128i x1 = _mm_packs_epi32(w2, w3);
+
+    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+    // x1 = [b4 b5 b3 b2 bC bD bB bA]
+    *in0 = _mm_shuffle_epi32(x0, 0xD8);
+    *in1 = _mm_shuffle_epi32(x1, 0x8D);
+    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+    // in1 = [b3 b2 bB bA b4 b5 bC bD]
+  }
+  {
+    // vertical DCTs finished. Now we do the horizontal DCTs.
+    // Stage 3: Add/subtract
+
+    const __m128i t0 = ADD_EPI16(*in0, *in1);
+    const __m128i t1 = SUB_EPI16(*in0, *in1);
+
+    // Stage 4: multiply by constants (which gets us into 32 bits).
+    {
+      // The constants needed here are:
+      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+      // Then add and right-shift to get back to 16-bit range
+      // but this combines the final right-shift as well to save operations
+      // This unusual rounding operation is to maintain bit-accurate
+      // compatibility with the C version of this function, which has two
+      // rounding steps in a row.
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+      // w0 = [o0 o4 o8 oC]
+      // w1 = [o2 o6 oA oE]
+      // w2 = [o1 o5 o9 oD]
+      // w3 = [o3 o7 oB oF]
+      // remember the o's are numbered according to the correct output location
+      const __m128i x0 = _mm_packs_epi32(w0, w1);
+      const __m128i x1 = _mm_packs_epi32(w2, w3);
+      {
+        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+        // y1 = [o2 o3 o6 o7 oA oB oE oF]
+        *in0 = _mm_unpacklo_epi32(y0, y1);
+        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+        *in1 = _mm_unpackhi_epi32(y0, y1);
+        // in1 = [o8 o9 oA oB oC oD oE oF]
+      }
+    }
+  }
+}
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+  // This 2D transform implements 4 vertical 1D transforms followed
+  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
+  // by Chen, Smith and Fralick ('77).  The commands for moving the data
+  // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order), intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations.  The outputs, o0 through oF are labeled according to the
+  // output locations.
+  __m128i in0, in1;
+  FDCT4x4_2D_HELPER(input, stride, &in0, &in1);
+
+  // Post-condition (v + 1) >> 2 is now incorporated into previous
+  // add and right-shift commands.  Only 2 store instructions needed
+  // because we are using the fact that 1/3 are stored just after 0/2.
+  storeu_output(&in0, output + 0 * 4);
+  storeu_output(&in1, output + 2 * 4);
+}
+
+void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) {
+  __m128i in0, in1;
+  FDCT4x4_2D_HELPER(input, stride, &in0, &in1);
+  _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
+  _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
+}
+
 void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   int pass;
   // Constants
diff --git a/libaom/aom_dsp/x86/fwd_txfm_sse2.c b/libaom/aom_dsp/x86/fwd_txfm_sse2.c
index 6b7c11b..0e4fb80 100644
--- a/libaom/aom_dsp/x86/fwd_txfm_sse2.c
+++ b/libaom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -18,12 +18,22 @@
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 
 #define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D_HELPER fdct4x4_helper
+#define FDCT4x4_2D aom_fdct4x4_sse2
+#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2
 #define FDCT8x8_2D aom_fdct8x8_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef FDCT4x4_2D_HELPER
+#undef FDCT4x4_2D
+#undef FDCT4x4_2D_LP
 #undef FDCT8x8_2D
 
+#if CONFIG_AV1_HIGHBITDEPTH
+
 #undef DCT_HIGH_BIT_DEPTH
 #define DCT_HIGH_BIT_DEPTH 1
 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
 #undef FDCT8x8_2D
+
+#endif
diff --git a/libaom/aom_dsp/x86/fwd_txfm_sse2.h b/libaom/aom_dsp/x86/fwd_txfm_sse2.h
index 260d8dd..ab3cd91 100644
--- a/libaom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/libaom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -136,16 +136,21 @@
 }
 
 static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m128i zero = _mm_setzero_si128();
-    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-    _mm_store_si128((__m128i *)(dst_ptr), out0);
-    _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-  } else {
-    _mm_store_si128((__m128i *)(dst_ptr), *poutput);
-  }
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+  _mm_store_si128((__m128i *)(dst_ptr), out0);
+  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
 }
 
 #ifdef __cplusplus
diff --git a/libaom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/libaom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
new file mode 100644
index 0000000..c500b0a
--- /dev/null
+++ b/libaom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+#include "av1/encoder/av1_quantize.h"
+
+static INLINE void highbd_load_b_values_avx2(
+    const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+    __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+    const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+    __m256i *shift) {
+  *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
+  *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1));
+  *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
+  *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
+  *dequant =
+      _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
+  *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr));
+}
+
+static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask,
+                                            const int16_t *iscan_ptr,
+                                            int *is_found, __m256i *mask) {
+  __m256i temp_mask = _mm256_setzero_si256();
+  if (_mm256_movemask_epi8(*cmp_mask)) {
+    __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
+    temp_mask = _mm256_and_si256(*cmp_mask, iscan);
+    *is_found = 1;
+  }
+  *mask = _mm256_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1,
+                                            __m256i *threshold,
+                                            const int16_t *iscan_ptr,
+                                            int *is_found, __m256i *mask) {
+  __m256i coeff[2], cmp_mask0, cmp_mask1;
+  coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS);
+  cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
+  coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS);
+  cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
+  cmp_mask0 =
+      _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
+  highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y,
+                                         __m256i *p, const int shift) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+
+  prod_lo = _mm256_srli_epi64(prod_lo, shift);
+  prod_hi = _mm256_srli_epi64(prod_hi, shift);
+
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);
+  *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa);
+}
+
+static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff,
+                                                const __m256i *round,
+                                                const __m256i *quant,
+                                                const __m256i *shift,
+                                                const int *log_scale) {
+  __m256i tmp, qcoeff;
+  qcoeff = _mm256_add_epi32(*coeff, *round);
+  highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16);
+  qcoeff = _mm256_add_epi32(tmp, qcoeff);
+  highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale);
+}
+
+static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff,
+                                                    __m256i dequant) {
+  return _mm256_mullo_epi32(qcoeff, dequant);
+}
+
+static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2(
+    __m256i qcoeff, __m256i dequant, const int log_scale) {
+  __m256i abs_coeff = _mm256_abs_epi32(qcoeff);
+  highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale);
+  return _mm256_sign_epi32(abs_coeff, qcoeff);
+}
+
+static INLINE void highbd_store_coefficients_avx2(__m256i coeff0,
+                                                  __m256i coeff1,
+                                                  tran_low_t *coeff_ptr) {
+  _mm256_store_si256((__m256i *)(coeff_ptr), coeff0);
+  _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1);
+}
+
+void aom_highbd_quantize_b_adaptive_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 16;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i zbin, round, quant, dequant, shift;
+  __m256i coeff0, qcoeff0, coeff1, qcoeff1;
+  __m256i cmp_mask, mask0 = zero, mask1 = zero;
+  __m128i temp_mask0, temp_mask1;
+  int prescan_add[2];
+  int thresh[2];
+  const int log_scale = 0;
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+  }
+  __m256i threshold[2];
+  threshold[0] = _mm256_set1_epi32(thresh[0]);
+  threshold[1] = _mm256_set1_epi32(thresh[1]);
+  threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+
+  // Setup global values.
+  highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr,
+                            &quant, dequant_ptr, &dequant, quant_shift_ptr,
+                            &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
+  qcoeff0 = _mm256_abs_epi32(coeff0);
+  coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+  qcoeff1 = _mm256_abs_epi32(coeff1);
+  highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
+                           &mask0);
+  __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+  zbin = _mm256_unpackhi_epi64(zbin, zbin);
+  __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+  cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+  highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+  threshold[0] = threshold[1];
+  if (_mm256_movemask_epi8(cmp_mask) == 0) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+    round = _mm256_unpackhi_epi64(round, round);
+    quant = _mm256_unpackhi_epi64(quant, quant);
+    shift = _mm256_unpackhi_epi64(shift, shift);
+    dequant = _mm256_unpackhi_epi64(dequant, dequant);
+  } else {
+    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+    round = _mm256_unpackhi_epi64(round, round);
+    quant = _mm256_unpackhi_epi64(quant, quant);
+    shift = _mm256_unpackhi_epi64(shift, shift);
+    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+    // Reinsert signs
+    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
+    coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
+    dequant = _mm256_unpackhi_epi64(dequant, dequant);
+    coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
+    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
+  }
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
+    qcoeff0 = _mm256_abs_epi32(coeff0);
+    coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
+    qcoeff1 = _mm256_abs_epi32(coeff1);
+    highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
+                             &is_found0, &mask0);
+    temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+    temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+    cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+    highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+    if (_mm256_movemask_epi8(cmp_mask) == 0) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+      index += 16;
+      continue;
+    }
+    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
+    coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
+    coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
+    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
+    index += 16;
+  }
+  if (is_found0) {
+    temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+                               _mm256_extracti128_si256(mask0, 1));
+    non_zero_count = calculate_non_zero_count(temp_mask0);
+  }
+  if (is_found1) {
+    temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+                               _mm256_extracti128_si256(mask1, 1));
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+  }
+
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
+
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    if (qcoeff) {
+      first = i;
+      break;
+    }
+  }
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const int coeff = coeff_ptr[rc] * wt;
+      const int coeff_sign = AOMSIGN(coeff);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff <
+          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 16;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const int log_scale = 1;
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i zbin, round, quant, dequant, shift;
+  __m256i coeff0, qcoeff0, coeff1, qcoeff1;
+  __m256i cmp_mask, mask0 = zero, mask1 = zero;
+  __m128i temp_mask0, temp_mask1;
+  const __m256i one = _mm256_set1_epi32(1);
+  const __m256i log_scale_vec = _mm256_set1_epi32(log_scale);
+  int prescan_add[2];
+  int thresh[2];
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+  }
+  __m256i threshold[2];
+  threshold[0] = _mm256_set1_epi32(thresh[0]);
+  threshold[1] = _mm256_set1_epi32(thresh[1]);
+  threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+
+  // Setup global values.
+  zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
+  round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
+  quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
+  dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
+  shift =
+      _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr));
+
+  // Shift with rounding.
+  zbin = _mm256_add_epi32(zbin, log_scale_vec);
+  round = _mm256_add_epi32(round, log_scale_vec);
+  zbin = _mm256_srli_epi32(zbin, log_scale);
+  round = _mm256_srli_epi32(round, log_scale);
+  zbin = _mm256_sub_epi32(zbin, one);
+
+  // Do DC and first 15 AC.
+  coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
+  qcoeff0 = _mm256_abs_epi32(coeff0);
+  coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+  qcoeff1 = _mm256_abs_epi32(coeff1);
+  highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
+                           &mask0);
+  __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+  zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11);
+  __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+  cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+  highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+  threshold[0] = threshold[1];
+  if (_mm256_movemask_epi8(cmp_mask) == 0) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+    round = _mm256_permute2x128_si256(round, round, 0x11);
+    quant = _mm256_permute2x128_si256(quant, quant, 0x11);
+    shift = _mm256_permute2x128_si256(shift, shift, 0x11);
+    dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
+  } else {
+    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+    round = _mm256_permute2x128_si256(round, round, 0x11);
+    quant = _mm256_permute2x128_si256(quant, quant, 0x11);
+    shift = _mm256_permute2x128_si256(shift, shift, 0x11);
+    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+    // Reinsert signs
+    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
+    coeff0 =
+        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
+    dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
+    coeff1 =
+        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
+    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
+  }
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
+    qcoeff0 = _mm256_abs_epi32(coeff0);
+    coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
+    qcoeff1 = _mm256_abs_epi32(coeff1);
+    highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
+                             &is_found0, &mask0);
+    temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+    temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+    cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+    highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+    if (_mm256_movemask_epi8(cmp_mask) == 0) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+      index += 16;
+      continue;
+    }
+    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
+    coeff0 =
+        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
+    coeff1 =
+        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
+    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
+    index += 16;
+  }
+  if (is_found0) {
+    temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+                               _mm256_extracti128_si256(mask0, 1));
+    non_zero_count = calculate_non_zero_count(temp_mask0);
+  }
+  if (is_found1) {
+    temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+                               _mm256_extracti128_si256(mask1, 1));
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+  }
+
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
+
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    if (qcoeff) {
+      first = i;
+      break;
+    }
+  }
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const int coeff = coeff_ptr[rc] * wt;
+      const int coeff_sign = AOMSIGN(coeff);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
diff --git a/libaom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/libaom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
new file mode 100644
index 0000000..8f31f35
--- /dev/null
+++ b/libaom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+#include "av1/encoder/av1_quantize.h"
+
+static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi64(a, sign);
+}
+
+static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y,
+                                         __m128i *p, const int shift) {
+  __m128i sign = _mm_srai_epi32(*y, 31);
+  __m128i sign_lo = _mm_unpacklo_epi32(sign, sign);
+  __m128i sign_hi = _mm_unpackhi_epi32(sign, sign);
+  __m128i abs_y = invert_sign_32_sse2(*y, sign);
+  __m128i prod_lo = _mm_mul_epu32(*x, abs_y);
+  __m128i prod_hi = _mm_srli_epi64(*x, 32);
+  const __m128i mult_hi = _mm_srli_epi64(abs_y, 32);
+  prod_hi = _mm_mul_epu32(prod_hi, mult_hi);
+  prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo);
+  prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi);
+
+  prod_lo = _mm_srli_epi64(prod_lo, shift);
+  const __m128i mask = _mm_set_epi32(0, -1, 0, -1);
+  prod_lo = _mm_and_si128(prod_lo, mask);
+  prod_hi = _mm_srli_epi64(prod_hi, shift);
+
+  prod_hi = _mm_slli_epi64(prod_hi, 32);
+  *p = _mm_or_si128(prod_lo, prod_hi);
+}
+
+static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round,
+                                           const __m128i *quant,
+                                           const __m128i *shift,
+                                           const int *log_scale) {
+  __m128i tmp, qcoeff;
+  qcoeff = _mm_add_epi32(*coeff, *round);
+  highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16);
+  qcoeff = _mm_add_epi32(tmp, qcoeff);
+  highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale);
+}
+
+static INLINE void highbd_update_mask1(__m128i *cmp_mask0,
+                                       const int16_t *iscan_ptr, int *is_found,
+                                       __m128i *mask) {
+  __m128i temp_mask = _mm_setzero_si128();
+  if (_mm_movemask_epi8(*cmp_mask0)) {
+    __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+    __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+    temp_mask = mask0;
+    *is_found = 1;
+  }
+  *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+                                       __m128i *threshold,
+                                       const int16_t *iscan_ptr, int *is_found,
+                                       __m128i *mask) {
+  __m128i coeff[2], cmp_mask0, cmp_mask1;
+
+  coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS);
+  cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+  coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS);
+  cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+
+  cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+
+  highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant,
+                                               const int log_scale) {
+  __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31);
+  __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign);
+  highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale);
+  return invert_sign_32_sse2(abs_coeff, coeff_sign);
+}
+
+void aom_highbd_quantize_b_adaptive_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 8;
+  const int log_scale = 0;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi32(1);
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1, cmp_mask;
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
+
+  int prescan_add[2];
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+  }
+  thresh[2] = thresh[3] = thresh[1];
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+  // Setup global values.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+  __m128i round_sign = _mm_srai_epi16(round, 15);
+  __m128i quant_sign = _mm_srai_epi16(quant, 15);
+  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+  __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+  round = _mm_unpacklo_epi16(round, round_sign);
+  quant = _mm_unpacklo_epi16(quant, quant_sign);
+  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+  shift = _mm_unpacklo_epi16(shift, shift_sign);
+  zbin = _mm_sub_epi32(zbin, one);
+
+  // Do DC and first 15 AC.
+  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+  coeff0_sign = _mm_srai_epi32(coeff0, 31);
+  coeff1_sign = _mm_srai_epi32(coeff1, 31);
+  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+    // Reinsert signs
+    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+  }
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+    coeff0_sign = _mm_srai_epi32(coeff0, 31);
+    coeff1_sign = _mm_srai_epi32(coeff1, 31);
+    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+                        &is_found0, &mask0);
+
+    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      index += 8;
+      continue;
+    }
+    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+    index += 8;
+  }
+  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+  if (is_found1)
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
+
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    if (qcoeff) {
+      first = i;
+      break;
+    }
+  }
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const int coeff = coeff_ptr[rc] * wt;
+      const int coeff_sign = AOMSIGN(coeff);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff <
+          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
+
+// Adaptive quantization, SSE2 path, for 32x32 transforms (log_scale = 1).
+// Processes 8 coefficients per iteration in 32-bit lanes: applies the
+// zero-bin test, quantizes/dequantizes the survivors, and maintains two
+// iscan-based masks (prescan threshold vs. plain zbin) that are used after
+// the loop to trim the eob.
+// NOTE(review): helpers (highbd_update_mask0/1, invert_sign_32_sse2,
+// highbd_calculate_qcoeff/dqcoeff, calculate_non_zero_count) are defined
+// elsewhere in this file; comments on them below are inferred - confirm.
+void aom_highbd_quantize_b_32x32_adaptive_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 8;  // coeffs 0..7 are handled before the AC-only loop
+  const int log_scale = 1;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi32(1);
+  const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1, cmp_mask;
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
+
+  // Zero-bin values scaled down for the 32x32 case.
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  int prescan_add[2];
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  // Prescan threshold: zbin (in AOM_QM_BITS precision) plus a dequant-based
+  // margin; minus one so a strict '>' compare implements '>='.
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+  }
+  thresh[2] = thresh[3] = thresh[1];
+  // threshold[0]: lane 0 = DC, lanes 1..3 = AC; threshold[1]: all lanes AC.
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+  // Setup global values.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // Sign-extend the 16-bit table entries into 32-bit lanes.
+  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+  __m128i round_sign = _mm_srai_epi16(round, 15);
+  __m128i quant_sign = _mm_srai_epi16(quant, 15);
+  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+  __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+  round = _mm_unpacklo_epi16(round, round_sign);
+  quant = _mm_unpacklo_epi16(quant, quant_sign);
+  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+  shift = _mm_unpacklo_epi16(shift, shift_sign);
+
+  // Shift with rounding.
+  zbin = _mm_add_epi32(zbin, log_scale_vec);
+  round = _mm_add_epi32(round, log_scale_vec);
+  zbin = _mm_srli_epi32(zbin, log_scale);
+  round = _mm_srli_epi32(round, log_scale);
+  zbin = _mm_sub_epi32(zbin, one);  // so cmpgt(x, zbin - 1) == (x >= zbin)
+
+  // Do DC and the first 7 AC coefficients (two 4-lane 32-bit loads).
+  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+  coeff0_sign = _mm_srai_epi32(coeff0, 31);
+  coeff1_sign = _mm_srai_epi32(coeff1, 31);
+  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);  // abs: (x ^ s) - s
+  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+  // mask0 tracks coeffs above the prescan threshold; mask1 those >= zbin.
+  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];  // DC lane done; AC threshold from now on
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    // Entire group below the zero-bin: emit zeros but still advance the
+    // per-lane tables from their DC to their AC values.
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+    // Reinsert signs
+    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+  }
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+    coeff0_sign = _mm_srai_epi32(coeff0, 31);
+    coeff1_sign = _mm_srai_epi32(coeff1, 31);
+    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+                        &is_found0, &mask0);
+
+    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      // Whole group below the zero-bin: store zeros and skip quantization.
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      index += 8;
+      continue;
+    }
+    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+    index += 8;
+  }
+  // Convert the running masks into scan-order counts.
+  // NOTE(review): calculate_non_zero_count presumably derives the count from
+  // the largest iscan index recorded in the mask - confirm its definition.
+  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+  if (is_found1)
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+  // Zero coeffs that passed zbin but not the stricter prescan threshold.
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
+
+  // Locate the last remaining nonzero coefficient in scan order.
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  // Locate the first nonzero coefficient in scan order.
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    if (qcoeff) {
+      first = i;
+      break;
+    }
+  }
+  // If only a single +/-1 coefficient survives at the eob position, drop it
+  // when its source magnitude falls below an adjusted skip threshold.
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const int coeff = coeff_ptr[rc] * wt;
+      const int coeff_sign = AOMSIGN(coeff);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
+
+// Adaptive quantization, SSE2 path, for 64x64 transforms (log_scale = 2).
+// Identical structure to the 32x32 variant above: 8 coefficients per
+// iteration in 32-bit lanes, zero-bin test, quantize/dequantize survivors,
+// and two iscan-based masks (prescan threshold vs. plain zbin) used after
+// the loop to trim the eob.
+// NOTE(review): helpers (highbd_update_mask0/1, invert_sign_32_sse2,
+// highbd_calculate_qcoeff/dqcoeff, calculate_non_zero_count) are defined
+// elsewhere in this file; comments on them below are inferred - confirm.
+void aom_highbd_quantize_b_64x64_adaptive_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 8;  // coeffs 0..7 are handled before the AC-only loop
+  const int log_scale = 2;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi32(1);
+  const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1, cmp_mask;
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
+
+  // Zero-bin values scaled down for the 64x64 case.
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  int prescan_add[2];
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  // Prescan threshold: zbin (in AOM_QM_BITS precision) plus a dequant-based
+  // margin; minus one so a strict '>' compare implements '>='.
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+  }
+  thresh[2] = thresh[3] = thresh[1];
+  // threshold[0]: lane 0 = DC, lanes 1..3 = AC; threshold[1]: all lanes AC.
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+  // Setup global values.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // Sign-extend the 16-bit table entries into 32-bit lanes.
+  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+  __m128i round_sign = _mm_srai_epi16(round, 15);
+  __m128i quant_sign = _mm_srai_epi16(quant, 15);
+  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+  __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+  round = _mm_unpacklo_epi16(round, round_sign);
+  quant = _mm_unpacklo_epi16(quant, quant_sign);
+  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+  shift = _mm_unpacklo_epi16(shift, shift_sign);
+
+  // Shift with rounding.
+  zbin = _mm_add_epi32(zbin, log_scale_vec);
+  round = _mm_add_epi32(round, log_scale_vec);
+  zbin = _mm_srli_epi32(zbin, log_scale);
+  round = _mm_srli_epi32(round, log_scale);
+  zbin = _mm_sub_epi32(zbin, one);  // so cmpgt(x, zbin - 1) == (x >= zbin)
+
+  // Do DC and the first 7 AC coefficients (two 4-lane 32-bit loads).
+  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+  coeff0_sign = _mm_srai_epi32(coeff0, 31);
+  coeff1_sign = _mm_srai_epi32(coeff1, 31);
+  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);  // abs: (x ^ s) - s
+  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+  // mask0 tracks coeffs above the prescan threshold; mask1 those >= zbin.
+  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];  // DC lane done; AC threshold from now on
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    // Entire group below the zero-bin: emit zeros but still advance the
+    // per-lane tables from their DC to their AC values.
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+    // Reinsert signs
+    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+  }
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+    coeff0_sign = _mm_srai_epi32(coeff0, 31);
+    coeff1_sign = _mm_srai_epi32(coeff1, 31);
+    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+                        &is_found0, &mask0);
+
+    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      // Whole group below the zero-bin: store zeros and skip quantization.
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      index += 8;
+      continue;
+    }
+    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+    index += 8;
+  }
+  // Convert the running masks into scan-order counts.
+  // NOTE(review): calculate_non_zero_count presumably derives the count from
+  // the largest iscan index recorded in the mask - confirm its definition.
+  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+  if (is_found1)
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+  // Zero coeffs that passed zbin but not the stricter prescan threshold.
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
+
+  // Locate the last remaining nonzero coefficient in scan order.
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  // Locate the first nonzero coefficient in scan order.
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    if (qcoeff) {
+      first = i;
+      break;
+    }
+  }
+  // If only a single +/-1 coefficient survives at the eob position, drop it
+  // when its source magnitude falls below an adjusted skip threshold.
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const int coeff = coeff_ptr[rc] * wt;
+      const int coeff_sign = AOMSIGN(coeff);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
diff --git a/libaom/aom_dsp/x86/highbd_convolve_avx2.c b/libaom/aom_dsp/x86/highbd_convolve_avx2.c
index ebcb5ac..b43a7d7 100644
--- a/libaom/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/libaom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -115,13 +115,13 @@
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   const int subpel_x_qn, const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
   (void)conv_params;
 
   assert(conv_params->round_0 <= FILTER_BITS);
@@ -138,7 +138,7 @@
       _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m256i zero = _mm256_setzero_si256();
 
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
 
   for (j = 0; j < w; j += 8) {
     const uint16_t *data = &src_ptr[j];
@@ -264,12 +264,12 @@
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   const int subpel_x_qn, const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
   int i, j;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
   (void)filter_params_y;
 
   // Check that, even with 12-bit input, the intermediate values will fit
@@ -293,7 +293,7 @@
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
 
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
diff --git a/libaom/aom_dsp/x86/highbd_convolve_ssse3.c b/libaom/aom_dsp/x86/highbd_convolve_ssse3.c
index e7b33d1..a79350f 100644
--- a/libaom/aom_dsp/x86/highbd_convolve_ssse3.c
+++ b/libaom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -20,14 +20,14 @@
                                     uint16_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
+                                    const int subpel_x_qn,
+                                    const int subpel_y_qn,
                                     ConvolveParams *conv_params, int bd) {
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
   (void)conv_params;
 
   assert(conv_params->round_0 <= FILTER_BITS);
@@ -44,7 +44,7 @@
       _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m128i zero = _mm_setzero_si128();
 
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
 
   for (j = 0; j < w; j += 8) {
     const uint16_t *data = &src_ptr[j];
@@ -168,13 +168,13 @@
                                     uint16_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
+                                    const int subpel_x_qn,
+                                    const int subpel_y_qn,
                                     ConvolveParams *conv_params, int bd) {
   int i, j;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
   (void)filter_params_y;
 
   // Check that, even with 12-bit input, the intermediate values will fit
@@ -195,7 +195,7 @@
       _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m128i zero = _mm_setzero_si128();
 
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
 
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
diff --git a/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c b/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 70b91c6..ea7dc6a 100644
--- a/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -90,7 +90,7 @@
 
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
+  const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
 
   __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
   max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
@@ -112,7 +112,7 @@
                                                  __m128i *hev, __m128i *mask) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
+  const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
   __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
   __m128i max, max01, h;
 
@@ -497,8 +497,9 @@
 }
 
 void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
-                                       const uint8_t *blt, const uint8_t *lt,
-                                       const uint8_t *thr, int bd) {
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh, int bd) {
   __m128i p[7], q[7], pq[7];
   int i;
 
@@ -507,7 +508,7 @@
     q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
   }
 
-  highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
+  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
 
   for (i = 0; i < 6; i++) {
     _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
diff --git a/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 2f4ffd3..1764a49 100644
--- a/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -134,7 +134,7 @@
   for (i = 0; i < idx; i++) {
     const int rc = idx_arr[i];
     const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
+    const int coeff_sign = AOMSIGN(coeff);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
     const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
@@ -192,7 +192,7 @@
   for (i = 0; i < idx; i++) {
     const int rc = idx_arr[i];
     const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
+    const int coeff_sign = AOMSIGN(coeff);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
     const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
diff --git a/libaom/aom_dsp/x86/highbd_sad_sse2.asm b/libaom/aom_dsp/x86/highbd_sad_sse2.asm
index 3398d8a..09e64d5 100644
--- a/libaom/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/libaom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -372,3 +372,71 @@
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
 HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
+
+; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+; High-bitdepth SAD for a 4-wide block, %1 rows tall (16-bit pixels);
+; %2 == 1 selects the comp-avg variant that averages ref with second_pred.
+; NOTE(review): HIGH_SAD_FN (defined earlier in this file) presumably sets up
+; srcq/refq/strides/second_predq and the n_rows counter - confirm.
+%macro HIGH_SAD4XN 1-2 0
+  HIGH_SAD_FN 4, %1, 7, %2
+  mov              n_rowsd, %1/4       ; 4 rows processed per iteration
+  pxor                  m0, m0         ; dword SAD accumulator
+  pxor                  m6, m6         ; constant zero for widening unpacks
+
+.loop:
+  ; Load 4 ref rows of 4 words each (strides are in elements; x2 for bytes).
+  movq                  m1, [refq]
+  movq                  m2, [refq+ref_strideq*2]
+  movq                  m3, [refq+ref_strideq*4]
+  movq                  m4, [refq+ref_stride3q*2]
+  punpcklwd             m1, m3         ; interleave rows 0 and 2
+  punpcklwd             m2, m4         ; interleave rows 1 and 3
+%if %2 == 1
+  ; Comp-avg: average ref with second_pred rows interleaved the same way.
+  movq                  m3, [second_predq+8*0]
+  movq                  m5, [second_predq+8*2]
+  punpcklwd             m3, m5
+  movq                  m4, [second_predq+8*1]
+  movq                  m5, [second_predq+8*3]
+  punpcklwd             m4, m5
+  lea         second_predq, [second_predq+8*4]
+  pavgw                 m1, m3
+  pavgw                 m2, m4
+%endif
+  movq                  m5, [srcq]
+  movq                  m3, [srcq+src_strideq*4]
+  punpcklwd             m5, m3
+  ; |ref - src| per word: saturating subtract in both directions, then OR.
+  movdqa                m3, m1
+  psubusw               m1, m5
+  psubusw               m5, m3
+  por                   m1, m5
+  movq                  m5, [srcq+src_strideq*2]
+  movq                  m4, [srcq+src_stride3q*2]
+  punpcklwd             m5, m4
+  movdqa                m4, m2
+  psubusw               m2, m5
+  psubusw               m5, m4
+  por                   m2, m5
+  paddw                 m1, m2
+  ; Widen the word sums to dwords and add into the accumulator.
+  movdqa                m2, m1
+  punpcklwd             m1, m6
+  punpckhwd             m2, m6
+  lea                 refq, [refq+ref_strideq*8]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*8]
+  paddd                 m0, m2
+  dec              n_rowsd
+  jg .loop
+
+  ; Horizontal reduction of the four dword partial sums into eax.
+  movhlps               m1, m0
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
+HIGH_SAD4XN  8 ; highbd_sad4x8_sse2
+HIGH_SAD4XN  4 ; highbd_sad4x4_sse2
+HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
+HIGH_SAD4XN  8, 1 ; highbd_sad4x8_avg_sse2
+HIGH_SAD4XN  4, 1 ; highbd_sad4x4_avg_sse2
diff --git a/libaom/aom_dsp/x86/highbd_variance_sse2.c b/libaom/aom_dsp/x86/highbd_variance_sse2.c
index fc5678d..b7d15f9 100644
--- a/libaom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_variance_sse2.c
@@ -20,9 +20,10 @@
 
 #include "aom_ports/mem.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
 
 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
@@ -192,7 +193,6 @@
 VAR_FN(16, 8, 8, 7);
 VAR_FN(8, 16, 8, 7);
 VAR_FN(8, 8, 8, 6);
-VAR_FN(16, 4, 16, 6);
 VAR_FN(8, 32, 8, 8);
 VAR_FN(32, 8, 8, 8);
 VAR_FN(16, 64, 16, 10);
@@ -303,19 +303,19 @@
       sse += sse2;                                                             \
       if (w > wf) {                                                            \
         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+            src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
         if (w > wf * 2) {                                                      \
           se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,  \
-              h, &sse2, NULL, NULL);                                           \
+              src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,      \
+              dst_stride, h, &sse2, NULL, NULL);                               \
           se += se2;                                                           \
           sse += sse2;                                                         \
           se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,  \
-              h, &sse2, NULL, NULL);                                           \
+              src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,      \
+              dst_stride, h, &sse2, NULL, NULL);                               \
           se += se2;                                                           \
           sse += sse2;                                                         \
         }                                                                      \
@@ -346,19 +346,19 @@
       if (w > wf) {                                                            \
         uint32_t sse2;                                                         \
         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+            src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
         long_sse += sse2;                                                      \
         if (w > wf * 2) {                                                      \
           se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,  \
-              h, &sse2, NULL, NULL);                                           \
+              src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,      \
+              dst_stride, h, &sse2, NULL, NULL);                               \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
           se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,  \
-              h, &sse2, NULL, NULL);                                           \
+              src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,      \
+              dst_stride, h, &sse2, NULL, NULL);                               \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
         }                                                                      \
@@ -397,19 +397,19 @@
         long_sse += sse2;                                                      \
         if (w > wf) {                                                          \
           se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src_tmp + 16, src_stride, x_offset, y_offset, dst_tmp + 16,      \
+              src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf,      \
               dst_stride, height, &sse2, NULL, NULL);                          \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
           if (w > wf * 2) {                                                    \
             se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
-                src_tmp + 32, src_stride, x_offset, y_offset, dst_tmp + 32,    \
-                dst_stride, height, &sse2, NULL, NULL);                        \
+                src_tmp + 2 * wf, src_stride, x_offset, y_offset,              \
+                dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL);      \
             se += se2;                                                         \
             long_sse += sse2;                                                  \
             se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
-                src_tmp + 48, src_stride, x_offset, y_offset, dst_tmp + 48,    \
-                dst_stride, height, &sse2, NULL, NULL);                        \
+                src_tmp + 3 * wf, src_stride, x_offset, y_offset,              \
+                dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL);      \
             se += se2;                                                         \
             long_sse += sse2;                                                  \
           }                                                                    \
@@ -479,19 +479,19 @@
     if (w > wf) {                                                              \
       uint32_t sse2;                                                           \
       int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
+          src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride,      \
+          sec + wf, w, h, &sse2, NULL, NULL);                                  \
       se += se2;                                                               \
       sse += sse2;                                                             \
       if (w > wf * 2) {                                                        \
         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
+            src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,        \
+            dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL);                \
         se += se2;                                                             \
         sse += sse2;                                                           \
         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
+            src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,        \
+            dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL);                \
         se += se2;                                                             \
         sse += sse2;                                                           \
       }                                                                        \
@@ -515,19 +515,19 @@
     if (w > wf) {                                                              \
       uint32_t sse2;                                                           \
       int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
+          src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride,      \
+          sec + wf, w, h, &sse2, NULL, NULL);                                  \
       se += se2;                                                               \
       sse += sse2;                                                             \
       if (w > wf * 2) {                                                        \
         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
+            src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,        \
+            dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL);                \
         se += se2;                                                             \
         sse += sse2;                                                           \
         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
+            src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,        \
+            dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL);                \
         se += se2;                                                             \
         sse += sse2;                                                           \
       }                                                                        \
@@ -562,22 +562,22 @@
       long_sse += sse2;                                                        \
       if (w > wf) {                                                            \
         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
-            y_offset, dst + 16 + (start_row * dst_stride), dst_stride,         \
-            sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
+            src + wf + (start_row * src_stride), src_stride, x_offset,         \
+            y_offset, dst + wf + (start_row * dst_stride), dst_stride,         \
+            sec + wf + (start_row * w), w, height, &sse2, NULL, NULL);         \
         se += se2;                                                             \
         long_sse += sse2;                                                      \
         if (w > wf * 2) {                                                      \
           se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
-              sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
+              src + 2 * wf + (start_row * src_stride), src_stride, x_offset,   \
+              y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride,   \
+              sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL);   \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
           se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
-              sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
+              src + 3 * wf + (start_row * src_stride), src_stride, x_offset,   \
+              y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride,   \
+              sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL);   \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
         }                                                                      \
@@ -630,72 +630,24 @@
     const int is_scaled = av1_is_scaled(sf);
 
     if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
       const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
       const struct buf_2d *const dst_buf = &pd->dst;
       const struct buf_2d *const pre_buf =
           is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
 
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
           av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
       return;
     }
   }
diff --git a/libaom/aom_dsp/x86/intrapred_avx2.c b/libaom/aom_dsp/x86/intrapred_avx2.c
index 17f35a0..546ee74 100644
--- a/libaom/aom_dsp/x86/intrapred_avx2.c
+++ b/libaom/aom_dsp/x86/intrapred_avx2.c
@@ -12,6 +12,7 @@
 #include <immintrin.h>
 
 #include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/intrapred_x86.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
 
 static INLINE __m256i dc_sum_64(const uint8_t *ref) {
@@ -64,6 +65,74 @@
   }
 }
 
+static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+};
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
+  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
+  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
+  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
+  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
+};
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
+  { 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
+    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
+    0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+  { 0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
+    0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
+  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
+    0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
+    0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
+    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
+    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
+    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
+};
+
+static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
+    0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
+    0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
+    0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
+};
+
 static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
   __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
 
@@ -351,28 +420,6 @@
 
 // -----------------------------------------------------------------------------
 // Rectangle
-
-// TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
-// Use a header file, intrapred_common_x86.h
-static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
-  __m128i x = _mm_load_si128((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_sad_epu8(x, zero);
-  const __m128i high = _mm_unpackhi_epi64(x, x);
-  return _mm_add_epi16(x, high);
-}
-
-static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
 void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const __m128i top_sum = dc_sum_32_sse2(above);
@@ -707,7 +754,7 @@
   __m128i x = _mm_loadl_epi64((const __m128i *)left);
   const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
   const __m256i top = get_top_vector(above);
 
@@ -731,7 +778,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   const __m256i l = get_left_vector(left);
   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
   const __m256i top = get_top_vector(above);
 
@@ -750,7 +797,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   __m256i l = get_left_vector(left);
   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
   const __m256i top = get_top_vector(above);
 
@@ -765,7 +812,7 @@
   }
 
   l = get_left_vector(left + 16);
-  rep = _mm256_set1_epi16(0x8000);
+  rep = _mm256_set1_epi16((short)0x8000);
   for (i = 0; i < 16; ++i) {
     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
@@ -784,7 +831,7 @@
 
   for (int j = 0; j < 4; ++j) {
     const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
+    __m256i rep = _mm256_set1_epi16((short)0x8000);
     for (int i = 0; i < 16; ++i) {
       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
       const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
@@ -817,7 +864,7 @@
   const __m256i t0 = get_top_vector(above);
   const __m256i t1 = get_top_vector(above + 16);
   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
 
   int i;
@@ -839,7 +886,7 @@
   const __m256i t0 = get_top_vector(above);
   const __m256i t1 = get_top_vector(above + 16);
   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
 
   int i;
@@ -857,7 +904,7 @@
   }
 
   l = get_left_vector(left + 16);
-  rep = _mm256_set1_epi16(0x8000);
+  rep = _mm256_set1_epi16((short)0x8000);
   for (i = 0; i < 16; ++i) {
     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 
@@ -882,7 +929,7 @@
   int i, j;
   for (j = 0; j < 4; ++j) {
     const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
+    __m256i rep = _mm256_set1_epi16((short)0x8000);
     for (i = 0; i < 16; ++i) {
       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 
@@ -910,7 +957,7 @@
   int i, j;
   for (j = 0; j < 2; ++j) {
     const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
+    __m256i rep = _mm256_set1_epi16((short)0x8000);
     for (i = 0; i < 16; ++i) {
       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 
@@ -942,7 +989,7 @@
   int i, j;
   for (j = 0; j < 4; ++j) {
     const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
+    __m256i rep = _mm256_set1_epi16((short)0x8000);
     for (i = 0; i < 16; ++i) {
       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 
@@ -973,7 +1020,7 @@
 
   int i;
   const __m256i l = get_left_vector(left);
-  __m256i rep = _mm256_set1_epi16(0x8000);
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
   for (i = 0; i < 16; ++i) {
     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 
@@ -999,13 +1046,83 @@
     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
   const int frac_bits = 6 - upsample_above;
   const int max_base_x = ((N + 4) - 1) << upsample_above;
-  int x;
-  // a assert(dx > 0);
+
+  assert(dx > 0);
   // pre-filter above pixels
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a1, a32, a16;
+  __m256i diff, c3f;
+  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
+  __m128i a0_128, a1_128;
+  a16 = _mm256_set1_epi16(16);
+  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
+  max_base_x128 = _mm_set1_epi16(max_base_x);
+  c3f = _mm256_set1_epi16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i res1;
+
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        dst[i] = a_mbase_x;  // save 4 values
+      }
+      return;
+    }
+
+    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
+    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+    if (upsample_above) {
+      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
+      a1_128 = _mm_srli_si128(a0_128, 8);
+
+      base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
+                                   base + 10, base + 12, base + 14);
+      shift = _mm256_srli_epi16(
+          _mm256_and_si256(
+              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
+              _mm256_set1_epi16(0x3f)),
+          1);
+    } else {
+      base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
+                                   base + 5, base + 6, base + 7);
+      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+    }
+    a0 = _mm256_castsi128_si256(a0_128);
+    a1 = _mm256_castsi128_si256(a1_128);
+    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi16(diff, shift);
+    res = _mm256_add_epi16(a32, b);
+    res = _mm256_srli_epi16(res, 5);
+    res1 = _mm256_castsi256_si128(res);
+
+    mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
+    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
+    x += dx;
+  }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
+    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((N + 4) - 1) << upsample_above;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m256i a0, a1, a32, a16;
   __m256i diff;
@@ -1015,7 +1132,7 @@
   a_mbase_x = _mm_set1_epi16(above[max_base_x]);
   max_base_x128 = _mm_set1_epi32(max_base_x);
 
-  x = dx;
+  int x = dx;
   for (int r = 0; r < N; r++) {
     __m256i b, res, shift;
     __m128i res1;
@@ -1068,28 +1185,32 @@
 static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
-                                             int upsample_above, int dx) {
+                                             int upsample_above, int dx,
+                                             int bd) {
   __m128i dstvec[16];
-
-  highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
-                                            dx);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
+                                              dx);
+  } else {
+    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
+                                                    upsample_above, dx);
+  }
   for (int i = 0; i < N; i++) {
     _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
   }
 }
 
-static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
   const int frac_bits = 6 - upsample_above;
   const int max_base_x = ((8 + N) - 1) << upsample_above;
 
-  int x;
-  // a assert(dx > 0);
+  assert(dx > 0);
   // pre-filter above pixels
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m256i a0, a1, a0_1, a1_1, a32, a16;
   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
@@ -1098,7 +1219,7 @@
   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   max_base_x256 = _mm256_set1_epi32(max_base_x);
 
-  x = dx;
+  int x = dx;
   for (int r = 0; r < N; r++) {
     __m256i b, res, res1, shift;
 
@@ -1162,22 +1283,108 @@
   }
 }
 
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
+    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a1, a32, a16, c3f;
+  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+  __m128i a0_x128, a1_x128;
+
+  a16 = _mm256_set1_epi16(16);
+  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+  max_base_x256 = _mm256_set1_epi16(max_base_x);
+  c3f = _mm256_set1_epi16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, res1, shift;
+
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
+      }
+      return;
+    }
+
+    a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
+    if (upsample_above) {
+      __m128i mask, atmp0, atmp1, atmp2, atmp3;
+      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
+      atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
+      atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
+      atmp2 =
+          _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
+      atmp3 =
+          _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
+      mask =
+          _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
+      a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+      mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
+                            _mm_set1_epi8(15));
+      a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+
+      base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
+                                      base + 8, base + 10, base + 12, base + 14,
+                                      0, 0, 0, 0, 0, 0, 0, 0);
+      shift = _mm256_srli_epi16(
+          _mm256_and_si256(
+              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+          1);
+    } else {
+      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+      base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+                                      base + 4, base + 5, base + 6, base + 7, 0,
+                                      0, 0, 0, 0, 0, 0, 0);
+      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+    }
+    a0 = _mm256_castsi128_si256(a0_x128);
+    a1 = _mm256_castsi128_si256(a1_x128);
+
+    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi16(diff, shift);
+    res = _mm256_add_epi16(a32, b);
+    res = _mm256_srli_epi16(res, 5);
+
+    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+    res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+    dst[r] = _mm256_castsi256_si128(res1);
+    x += dx;
+  }
+}
+
 static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
-                                             int upsample_above, int dx) {
+                                             int upsample_above, int dx,
+                                             int bd) {
   __m128i dstvec[32];
-
-  highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
-                                            dx);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
+                                              dx);
+  } else {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
+                                                    upsample_above, dx);
+  }
   for (int i = 0; i < N; i++) {
     _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
   }
 }
 
-static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
-  int x;
   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   (void)upsample_above;
   const int frac_bits = 6;
@@ -1187,7 +1394,7 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m256i a0, a0_1, a1, a1_1, a32, a16;
   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
@@ -1196,7 +1403,7 @@
   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   max_base_x256 = _mm256_set1_epi16(max_base_x);
 
-  x = dx;
+  int x = dx;
   for (int r = 0; r < N; r++) {
     __m256i b, res[2], res1;
 
@@ -1255,21 +1462,82 @@
   }
 }
 
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
+    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((16 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a1, a32, a16, c3f;
+  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+  a16 = _mm256_set1_epi16(16);
+  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+  max_base_x256 = _mm256_set1_epi16(max_base_x);
+  c3f = _mm256_set1_epi16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res;
+
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        dstvec[i] = a_mbase_x;  // save 16 values
+      }
+      return;
+    }
+    __m256i shift =
+        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+    a0 = _mm256_loadu_si256((__m256i *)(above + base));
+    a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
+
+    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
+    b = _mm256_mullo_epi16(diff, shift);
+
+    res = _mm256_add_epi16(a32, b);
+    res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
+
+    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+                                    base + 4, base + 5, base + 6, base + 7,
+                                    base + 8, base + 9, base + 10, base + 11,
+                                    base + 12, base + 13, base + 14, base + 15);
+    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+    x += dx;
+  }
+}
+
 static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
                                               ptrdiff_t stride,
                                               const uint16_t *above,
-                                              int upsample_above, int dx) {
+                                              int upsample_above, int dx,
+                                              int bd) {
   __m256i dstvec[64];
-  highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
-                                             dx);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
+                                               dx);
+  } else {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
+                                                     upsample_above, dx);
+  }
   for (int i = 0; i < N; i++) {
     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
   }
 }
 
-static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
-  int x;
   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   (void)upsample_above;
   const int frac_bits = 6;
@@ -1279,16 +1547,17 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0, a0_1, a1, a1_1, a32, a16;
+  __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 
   a16 = _mm256_set1_epi32(16);
   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   max_base_x256 = _mm256_set1_epi16(max_base_x);
+  c3f = _mm256_set1_epi16(0x3f);
 
-  x = dx;
+  int x = dx;
   for (int r = 0; r < N; r++) {
     __m256i b, res[2], res1;
 
@@ -1301,8 +1570,8 @@
       return;
     }
 
-    __m256i shift = _mm256_srli_epi32(
-        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+    __m256i shift =
+        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
 
     for (int j = 0; j < 32; j += 16) {
       int mdif = max_base_x - (base + j);
@@ -1354,10 +1623,83 @@
         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
       }
-      if (!j)
+      if (!j) {
         dstvec[r] = res1;
-      else
+      } else {
         dstvec[r + N] = res1;
+      }
+    }
+    x += dx;
+  }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
+    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((32 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a1, a32, a16, c3f;
+  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+  a16 = _mm256_set1_epi16(16);
+  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+  max_base_x256 = _mm256_set1_epi16(max_base_x);
+  c3f = _mm256_set1_epi16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res;
+
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        dstvec[i] = a_mbase_x;  // save 32 values
+        dstvec[i + N] = a_mbase_x;
+      }
+      return;
+    }
+
+    __m256i shift =
+        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+    for (int j = 0; j < 32; j += 16) {
+      int mdif = max_base_x - (base + j);
+      if (mdif <= 0) {
+        res = a_mbase_x;
+      } else {
+        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
+        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
+
+        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
+        b = _mm256_mullo_epi16(diff, shift);
+
+        res = _mm256_add_epi16(a32, b);
+        res = _mm256_srli_epi16(res, 5);
+
+        base_inc256 = _mm256_setr_epi16(
+            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+            base + j + 13, base + j + 14, base + j + 15);
+
+        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+      }
+      if (!j) {
+        dstvec[r] = res;
+      } else {
+        dstvec[r + N] = res;
+      }
     }
     x += dx;
   }
@@ -1366,23 +1708,27 @@
 static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
                                               ptrdiff_t stride,
                                               const uint16_t *above,
-                                              int upsample_above, int dx) {
+                                              int upsample_above, int dx,
+                                              int bd) {
   __m256i dstvec[128];
-
-  highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
-                                             dx);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
+                                               dx);
+  } else {
+    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
+                                                     upsample_above, dx);
+  }
   for (int i = 0; i < N; i++) {
     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
     _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
   }
 }
 
-static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
-                                              ptrdiff_t stride,
-                                              const uint16_t *above,
-                                              int upsample_above, int dx) {
-  int x;
-
+static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
+                                                    ptrdiff_t stride,
+                                                    const uint16_t *above,
+                                                    int upsample_above,
+                                                    int dx) {
   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   (void)upsample_above;
   const int frac_bits = 6;
@@ -1392,7 +1738,7 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m256i a0, a0_1, a1, a1_1, a32, a16;
   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
@@ -1401,7 +1747,7 @@
   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   max_base_x256 = _mm256_set1_epi16(max_base_x);
 
-  x = dx;
+  int x = dx;
   for (int r = 0; r < N; r++, dst += stride) {
     __m256i b, res[2], res1;
 
@@ -1477,6 +1823,79 @@
   }
 }
 
+static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
+                                              ptrdiff_t stride,
+                                              const uint16_t *above,
+                                              int upsample_above, int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((64 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a1, a32, a16, c3f;
+  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+  a16 = _mm256_set1_epi16(16);
+  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+  max_base_x256 = _mm256_set1_epi16(max_base_x);
+  c3f = _mm256_set1_epi16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < N; r++, dst += stride) {
+    __m256i b, res;
+
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
+        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
+        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
+        dst += stride;
+      }
+      return;
+    }
+
+    __m256i shift =
+        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+    for (int j = 0; j < 64; j += 16) {
+      int mdif = max_base_x - (base + j);
+      if (mdif <= 0) {
+        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
+      } else {
+        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
+        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
+
+        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
+        b = _mm256_mullo_epi16(diff, shift);
+
+        res = _mm256_add_epi16(a32, b);
+        res = _mm256_srli_epi16(res, 5);
+
+        base_inc256 = _mm256_setr_epi16(
+            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+            base + j + 13, base + j + 14, base + j + 15);
+
+        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16-bit values
+      }
+    }
+    x += dx;
+  }
+}
+
 // Directional prediction, zone 1: 0 < angle < 90
 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint16_t *above,
@@ -1484,182 +1903,61 @@
                                       int dx, int dy, int bd) {
   (void)left;
   (void)dy;
-  (void)bd;
 
   switch (bw) {
     case 4:
       highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
-                                       dx);
+                                       dx, bd);
       break;
     case 8:
       highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
-                                       dx);
+                                       dx, bd);
       break;
     case 16:
       highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
-                                        dx);
+                                        dx, bd);
       break;
     case 32:
       highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
-                                        dx);
+                                        dx, bd);
       break;
     case 64:
-      highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above,
-                                        dx);
+      if (bd < 12) {
+        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
+                                          upsample_above, dx);
+      } else {
+        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
+                                                upsample_above, dx);
+      }
       break;
     default: break;
   }
   return;
 }
 
-static void highbd_transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
-                                    uint16_t *dst, ptrdiff_t pitchDst) {
-  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_Lo, r1_Lo, r2_Lo, r3_Lo, r4_Lo,
-      r5_Lo, r6_Lo;
-  r0 = _mm_load_si128(
-      (__m128i *)(src + 0 * pitchSrc));  // 07,06,05,04,03,02,01,00
-  r1 = _mm_load_si128(
-      (__m128i *)(src + 1 * pitchSrc));  // 17,16,15,14,13,12,11,10
-  r2 = _mm_load_si128(
-      (__m128i *)(src + 2 * pitchSrc));  // 27,26,25,24,23,22,21,20
-  r3 = _mm_load_si128(
-      (__m128i *)(src + 3 * pitchSrc));  // 37,36,35,34,33,32,31,30
-  r4 = _mm_load_si128(
-      (__m128i *)(src + 4 * pitchSrc));  // 47,46,45,44,43,42,41,40
-  r5 = _mm_load_si128(
-      (__m128i *)(src + 5 * pitchSrc));  // 57,56,55,54,53,52,51,50
-  r6 = _mm_load_si128(
-      (__m128i *)(src + 6 * pitchSrc));  // 67,66,65,64,63,62,61,60
-  r7 = _mm_load_si128(
-      (__m128i *)(src + 7 * pitchSrc));  // 77,76,75,74,73,72,71,70
-
-  r0_Lo = _mm_unpacklo_epi16(r0, r1);
-  r2_Lo = _mm_unpacklo_epi16(r2, r3);
-  r4_Lo = _mm_unpacklo_epi16(r4, r5);
-  r6_Lo = _mm_unpacklo_epi16(r6, r7);
-
-  r1_Lo = r0_Lo;
-  r0_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
-  r1_Lo = _mm_unpackhi_epi32(r1_Lo, r2_Lo);
-  r5_Lo = r4_Lo;
-  r4_Lo = _mm_unpacklo_epi32(r4_Lo, r6_Lo);
-  r5_Lo = _mm_unpackhi_epi32(r5_Lo, r6_Lo);
-  r2_Lo = r0_Lo;
-  r0_Lo = _mm_unpacklo_epi64(r0_Lo, r4_Lo);  // 64
-  r2_Lo = _mm_unpackhi_epi64(r2_Lo, r4_Lo);
-  r3_Lo = r1_Lo;
-  r1_Lo = _mm_unpacklo_epi64(r1_Lo, r5_Lo);
-  r3_Lo = _mm_unpackhi_epi64(r3_Lo, r5_Lo);
-
-  _mm_storeu_si128((__m128i *)(dst + 0 * pitchDst), r0_Lo);
-  _mm_storeu_si128((__m128i *)(dst + 1 * pitchDst), r2_Lo);
-  _mm_storeu_si128((__m128i *)(dst + 2 * pitchDst), r1_Lo);
-  _mm_storeu_si128((__m128i *)(dst + 3 * pitchDst), r3_Lo);
-
-  r0 = _mm_unpackhi_epi16(r0, r1);
-  r2 = _mm_unpackhi_epi16(r2, r3);
-  r4 = _mm_unpackhi_epi16(r4, r5);
-  r6 = _mm_unpackhi_epi16(r6, r7);
-
-  r1 = r0;
-  r0 = _mm_unpacklo_epi32(r0, r2);
-  r1 = _mm_unpackhi_epi32(r1, r2);
-  r5 = r4;
-  r4 = _mm_unpacklo_epi32(r4, r6);
-  r5 = _mm_unpackhi_epi32(r5, r6);
-  r2 = r0;
-  r0 = _mm_unpacklo_epi64(r0, r4);
-  r2 = _mm_unpackhi_epi64(r2, r4);
-  r3 = r1;
-  r1 = _mm_unpacklo_epi64(r1, r5);
-  r3 = _mm_unpackhi_epi64(r3, r5);
-
-  _mm_storeu_si128((__m128i *)(dst + 4 * pitchDst), r0);
-  _mm_storeu_si128((__m128i *)(dst + 5 * pitchDst), r2);
-  _mm_storeu_si128((__m128i *)(dst + 6 * pitchDst), r1);
-  _mm_storeu_si128((__m128i *)(dst + 7 * pitchDst), r3);
+static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
+                                      uint16_t *dst, ptrdiff_t pitchDst) {
+  __m256i r[16];
+  __m256i d[16];
+  for (int j = 0; j < 16; j++) {
+    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
+  }
+  highbd_transpose16x16_avx2(r, d);
+  for (int j = 0; j < 16; j++) {
+    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
+  }
 }
 
-static uint8_t HighbdLoadMaskx[8][16] = {
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
-  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
-};
+static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
+                             uint16_t *dst, ptrdiff_t pitchDst, int width,
+                             int height) {
+  for (int j = 0; j < height; j += 16)
+    for (int i = 0; i < width; i += 16)
+      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+                                dst + j * pitchDst + i, pitchDst);
+}
 
-static uint8_t HighbdEvenOddMaskx4[8][16] = {
-  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14,
-    15 },  // 0=0,1, 1=2,3, 2=4,5, 3=6,7, 4=8,9, 5=10,11, 6=12,13, 7=14,15,
-           // >7=0,1
-  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
-  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
-  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 0, 1, 0, 1 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 0, 1 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 0, 1 },
-  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15 }
-};
-
-static uint16_t HighbdEvenOddMaskx8_2[8][16] = {
-  { 0, 2, 4, 6, 8, 10, 12, 14 },      { 2, 2, 4, 6, 8, 10, 12, 14 },
-  { 4, 4, 4, 6, 8, 10, 12, 14 },      { 6, 6, 6, 6, 8, 10, 12, 14 },
-  { 8, 8, 8, 8, 8, 10, 12, 14 },      { 10, 10, 10, 10, 10, 10, 12, 14 },
-  { 12, 12, 12, 12, 12, 12, 12, 14 }, { 14, 14, 14, 14, 14, 14, 14, 14 },
-};
-
-static uint16_t HighbdBaseMask[17][16] = {
-  {
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-      0,
-  },
-  { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
-    0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
-    0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
-    0, 0, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
-    0xffff, 0, 0, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
-    0xffff, 0xffff, 0, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
-    0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
-    0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
-    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
-    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
-  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
-    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
-};
-
-static void highbd_dr_prediction_z2_Nx4_avx2(
+static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
     const uint16_t *left, int upsample_above, int upsample_left, int dx,
     int dy) {
@@ -1668,12 +1966,12 @@
   const int frac_bits_x = 6 - upsample_above;
   const int frac_bits_y = 6 - upsample_left;
 
-  // a assert(dx > 0);
+  assert(dx > 0);
   // pre-filter above pixels
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m256i a0_x, a1_x, a32, a16;
   __m256i diff;
@@ -1691,7 +1989,7 @@
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+      base_shift = (min_base_x - base_x) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -1785,6 +2083,130 @@
   }
 }
 
+static void highbd_dr_prediction_z2_Nx4_avx2(
+    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+    const uint16_t *left, int upsample_above, int upsample_left, int dx,
+    int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a32, a16;
+  __m256i diff;
+  __m128i c3f, min_base_y128;
+
+  a16 = _mm256_set1_epi16(16);
+  c3f = _mm_set1_epi16(0x3f);
+  min_base_y128 = _mm_set1_epi16(min_base_y);
+
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy, resxy;
+    __m128i a0_x128, a1_x128;
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 4) {
+      base_min_diff = 4;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 3) {
+      a0_x = _mm256_setzero_si256();
+      a1_x = _mm256_setzero_si256();
+      shift = _mm256_setzero_si256();
+    } else {
+      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      if (upsample_above) {
+        a0_x128 = _mm_shuffle_epi8(a0_x128,
+                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+        a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi16(
+            _mm_and_si128(
+                _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+                                              (2 << 6) - y * dx,
+                                              (3 << 6) - y * dx, 0, 0, 0, 0),
+                               upsample_above),
+                c3f),
+            1));
+      } else {
+        a0_x128 =
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+        a1_x128 = _mm_srli_si128(a0_x128, 2);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi16(
+            _mm_and_si128(
+                _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+                               (3 << 6) - y * dx, 0, 0, 0, 0),
+                c3f),
+            1));
+      }
+      a0_x = _mm256_castsi128_si256(a0_x128);
+      a1_x = _mm256_castsi128_si256(a1_x128);
+    }
+    // y calc
+    __m128i a0_y, a1_y, shifty;
+    if (base_x < min_base_x) {
+      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+      r6 = _mm_set1_epi16(r << 6);
+      dy128 = _mm_set1_epi16(dy);
+      c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
+      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+                            left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
+                            0, 0);
+
+      if (upsample_left) {
+        shifty = _mm_srli_epi16(
+            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+      } else {
+        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+      }
+      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+      shift = _mm256_inserti128_si256(shift, shifty, 1);
+    }
+
+    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
+    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi16(diff, shift);
+    res = _mm256_add_epi16(a32, b);
+    res = _mm256_srli_epi16(res, 5);
+
+    resx = _mm256_castsi256_si128(res);
+    resy = _mm256_extracti128_si256(res, 1);
+    resxy =
+        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+    _mm_storel_epi64((__m128i *)(dst), resxy);
+    dst += stride;
+  }
+}
+
 static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
     const uint16_t *left, int upsample_above, int upsample_left, int dx,
@@ -1798,7 +2220,7 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
   __m256i diff;
@@ -1815,7 +2237,7 @@
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+      base_shift = (min_base_x - base_x) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -1828,25 +2250,24 @@
     if (base_shift > 7) {
       resx = _mm_setzero_si128();
     } else {
+      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
       if (upsample_above) {
-        a0_x128 = _mm_setr_epi16(
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
-        a1_x128 = _mm_setr_epi16(
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
+        __m128i mask, atmp0, atmp1, atmp2, atmp3;
+        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
+        atmp0 = _mm_shuffle_epi8(a0_x128,
+                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+        atmp1 = _mm_shuffle_epi8(a1_x128,
+                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+        atmp2 = _mm_shuffle_epi8(
+            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+        atmp3 = _mm_shuffle_epi8(
+            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
+                              _mm_set1_epi8(15));
+        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
+                              _mm_set1_epi8(15));
+        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
         shift = _mm256_srli_epi32(
             _mm256_and_si256(
                 _mm256_slli_epi32(
@@ -1858,7 +2279,6 @@
                 c3f),
             1);
       } else {
-        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
         a0_x128 =
             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
@@ -1874,7 +2294,6 @@
                 c3f),
             1);
       }
-
       a0_x = _mm256_cvtepu16_epi32(a0_x128);
       a1_x = _mm256_cvtepu16_epi32(a1_x128);
 
@@ -1951,7 +2370,7 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m128i c3f, min_base_y128;
   __m256i a0_x, a1_x, diff, a32, a16;
@@ -1968,7 +2387,7 @@
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+      base_shift = (min_base_x - base_x) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -1983,25 +2402,25 @@
       a1_x = _mm256_setzero_si256();
       shift = _mm256_setzero_si256();
     } else {
+      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
       if (upsample_above) {
-        a0_x128 = _mm_setr_epi16(
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
-            above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
-        a1_x128 = _mm_setr_epi16(
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
-            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
+        __m128i mask, atmp0, atmp1, atmp2, atmp3;
+        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
+        atmp0 = _mm_shuffle_epi8(a0_x128,
+                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+        atmp1 = _mm_shuffle_epi8(a1_x128,
+                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+        atmp2 = _mm_shuffle_epi8(
+            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+        atmp3 = _mm_shuffle_epi8(
+            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
+                              _mm_set1_epi8(15));
+        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
+                              _mm_set1_epi8(15));
+        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+
         shift = _mm256_castsi128_si256(_mm_srli_epi16(
             _mm_and_si128(
                 _mm_slli_epi16(
@@ -2013,7 +2432,6 @@
                 c3f),
             1));
       } else {
-        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
         a0_x128 =
             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
@@ -2035,7 +2453,7 @@
     // y calc
     __m128i a0_y, a1_y, shifty;
     if (base_x < min_base_x) {
-      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
       r6 = _mm_set1_epi16(r << 6);
       dy128 = _mm_set1_epi16(dy);
@@ -2101,28 +2519,37 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16;
-  __m256i diff, min_base_y256, c3f;
+  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
+  __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
   __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+  DECLARE_ALIGNED(32, int, base_y_c[16]);
 
   a16 = _mm256_set1_epi32(16);
+  c1 = _mm256_srli_epi32(a16, 4);
+  c8 = _mm256_srli_epi32(a16, 1);
   min_base_y256 = _mm256_set1_epi16(min_base_y);
   c3f = _mm256_set1_epi32(0x3f);
+  dy256 = _mm256_set1_epi32(dy);
+  c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  c1234 = _mm256_add_epi32(c0123, c1);
 
   for (int r = 0; r < H; r++) {
-    __m256i b, res, shift;
+    __m256i b, res, shift, ydx;
     __m256i resx[2], resy[2];
-    __m256i resxy;
+    __m256i resxy, j256, r6;
     for (int j = 0; j < W; j += 16) {
+      j256 = _mm256_set1_epi32(j);
       int y = r + 1;
-      int base_x = (-y * dx) >> frac_bits_x;
+      ydx = _mm256_set1_epi32(y * dx);
+
+      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
       int base_shift = 0;
-      if ((base_x + j) < (min_base_x - 1)) {
-        base_shift = (min_base_x - (base_x + j) - 1);
+      if ((base_x) < (min_base_x - 1)) {
+        base_shift = (min_base_x - base_x - 1);
       }
-      int base_min_diff = (min_base_x - base_x - j);
+      int base_min_diff = (min_base_x - base_x);
       if (base_min_diff > 16) {
         base_min_diff = 16;
       } else {
@@ -2132,9 +2559,8 @@
       if (base_shift > 7) {
         resx[0] = _mm256_setzero_si256();
       } else {
-        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
-        a1_x128 =
-            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
         a0_x128 =
             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
         a1_x128 =
@@ -2143,15 +2569,9 @@
         a0_x = _mm256_cvtepu16_epi32(a0_x128);
         a1_x = _mm256_cvtepu16_epi32(a1_x128);
 
+        r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
         shift = _mm256_srli_epi32(
-            _mm256_and_si256(
-                _mm256_setr_epi32(
-                    ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
-                    ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
-                    ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
-                    ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
-                c3f),
-            1);
+            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
 
         diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
         a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
@@ -2165,16 +2585,16 @@
             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
       }
       int base_shift8 = 0;
-      if ((base_x + j + 8) < (min_base_x - 1)) {
-        base_shift8 = (min_base_x - (base_x + j + 8) - 1);
+      if ((base_x + 8) < (min_base_x - 1)) {
+        base_shift8 = (min_base_x - (base_x + 8) - 1);
       }
       if (base_shift8 > 7) {
         resx[1] = _mm256_setzero_si256();
       } else {
         a0_1_x128 =
-            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8 + j));
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
         a1_1_x128 =
-            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9 + j));
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
@@ -2183,15 +2603,10 @@
         a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
         a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
 
+        r6 = _mm256_slli_epi32(
+            _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
         shift = _mm256_srli_epi32(
-            _mm256_and_si256(
-                _mm256_setr_epi32(
-                    ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
-                    ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
-                    ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
-                    ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
-                c3f),
-            1);
+            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
 
         diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
         a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
@@ -2209,20 +2624,17 @@
                                   1);  // 16 16bit values
 
       // y calc
+      resy[0] = _mm256_setzero_si256();
       if ((base_x < min_base_x)) {
-        DECLARE_ALIGNED(32, int, base_y_c[16]);
-        __m256i r6, c256, dy256, y_c256, y_c_1_256, base_y_c256, mask256;
+        __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
         r6 = _mm256_set1_epi32(r << 6);
-        dy256 = _mm256_set1_epi32(dy);
-        c256 = _mm256_setr_epi32(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
-                                 7 + j, 8 + j);
+        c256 = _mm256_add_epi32(j256, c1234);
         y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
         base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
-        c256 = _mm256_setr_epi32(9 + j, 10 + j, 11 + j, 12 + j, 13 + j, 14 + j,
-                                 15 + j, 16 + j);
+        c256 = _mm256_add_epi32(c256, c8);
         y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
         base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
@@ -2276,9 +2688,8 @@
         resy[0] =
             _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
                                     1);  // 16 16bit values
-      } else {
-        resy[0] = resx[0];
       }
+
       resxy = _mm256_blendv_epi8(resx[0], resy[0],
                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
@@ -2304,43 +2715,46 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0_x, a1_x, a32, a16, c3f;
-  __m256i diff, min_base_y256;
+  __m256i a0_x, a1_x, a32, a16, c3f, c1;
+  __m256i diff, min_base_y256, dy256, c1234, c0123;
+  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
 
   a16 = _mm256_set1_epi16(16);
+  c1 = _mm256_srli_epi16(a16, 4);
   min_base_y256 = _mm256_set1_epi16(min_base_y);
   c3f = _mm256_set1_epi16(0x3f);
+  dy256 = _mm256_set1_epi16(dy);
+  c0123 =
+      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  c1234 = _mm256_add_epi16(c0123, c1);
 
   for (int r = 0; r < H; r++) {
     __m256i b, res, shift;
-    __m256i resx, resy;
-    __m256i resxy;
-    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, shiftx;
+    __m256i resx, resy, ydx;
+    __m256i resxy, j256, r6;
+    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+    int y = r + 1;
+    ydx = _mm256_set1_epi16((short)(y * dx));
 
     for (int j = 0; j < W; j += 16) {
-      int y = r + 1;
-      int base_x = (-y * dx) >> frac_bits_x;
+      j256 = _mm256_set1_epi16(j);
+      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
       int base_shift = 0;
-      if ((base_x + j) < (min_base_x - 1)) {
-        base_shift = (min_base_x - (base_x + j) - 1);
+      if ((base_x) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x)-1);
       }
-      int base_min_diff = (min_base_x - base_x - j);
+      int base_min_diff = (min_base_x - base_x);
       if (base_min_diff > 16) {
         base_min_diff = 16;
       } else {
         if (base_min_diff < 0) base_min_diff = 0;
       }
 
-      if (base_shift > 7) {
-        a0_x = _mm256_setzero_si256();
-        a1_x = _mm256_setzero_si256();
-        shift = _mm256_setzero_si256();
-      } else {
-        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
-        a1_x128 =
-            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+      if (base_shift < 8) {
+        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
         a0_x128 =
             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
         a1_x128 =
@@ -2348,45 +2762,31 @@
 
         a0_x = _mm256_castsi128_si256(a0_x128);
         a1_x = _mm256_castsi128_si256(a1_x128);
-
-        shift = _mm256_castsi128_si256(_mm_srli_epi16(
-            _mm_and_si128(_mm_setr_epi16(
-                              ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
-                              ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
-                              ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
-                              ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
-                          _mm256_castsi256_si128(c3f)),
-            1));
+      } else {
+        a0_x = _mm256_setzero_si256();
+        a1_x = _mm256_setzero_si256();
       }
 
-      base_shift = 0;
-      if ((base_x + j + 8) < (min_base_x - 1)) {
-        base_shift = (min_base_x - (base_x + j + 8) - 1);
+      int base_shift1 = 0;
+      if (base_shift > 8) {
+        base_shift1 = base_shift - 8;
       }
-      if (base_shift <= 7) {
+      if (base_shift1 < 8) {
         a0_1_x128 =
-            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
         a1_1_x128 =
-            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
-                                     *(__m128i *)HighbdLoadMaskx[base_shift]);
+                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
-                                     *(__m128i *)HighbdLoadMaskx[base_shift]);
-
-        shiftx = _mm_srli_epi16(
-            _mm_and_si128(
-                _mm_setr_epi16(
-                    ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
-                    ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
-                    ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
-                    ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
-                _mm256_castsi256_si128(c3f)),
-            1);
+                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
 
         a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
         a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
-        shift = _mm256_inserti128_si256(shift, shiftx, 1);
       }
+      r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
+      shift = _mm256_srli_epi16(
+          _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
 
       diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
       a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
@@ -2397,15 +2797,12 @@
       resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
 
       // y calc
+      resy = _mm256_setzero_si256();
       __m256i a0_y, a1_y, shifty;
       if ((base_x < min_base_x)) {
-        DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
-        __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
+        __m256i c256, y_c256, base_y_c256, mask256, mul16;
         r6 = _mm256_set1_epi16(r << 6);
-        dy256 = _mm256_set1_epi16(dy);
-        c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
-                                 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
-                                 13 + j, 14 + j, 15 + j, 16 + j);
+        c256 = _mm256_add_epi16(j256, c1234);
         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
                                  _mm256_srli_epi16(min_base_y256, 1));
         y_c256 = _mm256_sub_epi16(r6, mul16);
@@ -2421,14 +2818,16 @@
             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
             left[base_y_c[15]]);
+        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
+        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
         a1_y = _mm256_setr_epi16(
-            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
-            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
-            left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1],
-            left[base_y_c[9] + 1], left[base_y_c[10] + 1],
-            left[base_y_c[11] + 1], left[base_y_c[12] + 1],
-            left[base_y_c[13] + 1], left[base_y_c[14] + 1],
-            left[base_y_c[15] + 1]);
+            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+            left[base_y_c[15]]);
 
         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
 
@@ -2439,8 +2838,6 @@
         b = _mm256_mullo_epi16(diff, shifty);
         res = _mm256_add_epi16(a32, b);
         resy = _mm256_srli_epi16(res, 5);
-      } else {
-        resy = _mm256_setzero_si256();
       }
 
       resxy = _mm256_blendv_epi8(resx, resy,
@@ -2462,8 +2859,14 @@
   assert(dy > 0);
   switch (bw) {
     case 4:
-      highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
-                                       upsample_above, upsample_left, dx, dy);
+      if (bd < 12) {
+        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
+                                         upsample_above, upsample_left, dx, dy);
+      } else {
+        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
+                                               upsample_above, upsample_left,
+                                               dx, dy);
+      }
       break;
     case 8:
       if (bd < 12) {
@@ -2488,21 +2891,19 @@
   }
 }
 
-static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
-                             uint16_t *dst, ptrdiff_t pitchDst, int width,
-                             int height) {
-  for (int j = 0; j < height; j += 8)
-    for (int i = 0; i < width; i += 8)
-      highbd_transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc,
-                              dst + j * pitchDst + i, pitchDst);
-}
-
+//  Directional prediction, zone 3 functions
 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
-                                             int upsample_left, int dy) {
+                                             int upsample_left, int dy,
+                                             int bd) {
   __m128i dstvec[4], d[4];
-
-  highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
+                                              dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
+                                                    upsample_left, dy);
+  }
   highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
                                    &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
@@ -2514,10 +2915,16 @@
 
 static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
-                                             int upsample_left, int dy) {
+                                             int upsample_left, int dy,
+                                             int bd) {
   __m128i dstvec[8], d[8];
-
-  highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
+                                              dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
+                                                    upsample_left, dy);
+  }
   highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                            &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
                            &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
@@ -2529,10 +2936,17 @@
 
 static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
-                                             int upsample_left, int dy) {
+                                             int upsample_left, int dy,
+                                             int bd) {
   __m128i dstvec[4], d[8];
+  if (bd < 12) {
+    highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
+                                              dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
+                                                    upsample_left, dy);
+  }
 
-  highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
   highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                                &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
                                &d[7]);
@@ -2543,10 +2957,17 @@
 
 static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
-                                             int upsample_left, int dy) {
+                                             int upsample_left, int dy,
+                                             int bd) {
   __m128i dstvec[8], d[4];
+  if (bd < 12) {
+    highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
+                                              dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
+                                                    upsample_left, dy);
+  }
 
-  highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
   highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                                &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
                                &d[0], &d[1], &d[2], &d[3]);
@@ -2558,11 +2979,16 @@
 
 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
-                                              int upsample_left, int dy) {
+                                              int upsample_left, int dy,
+                                              int bd) {
   __m256i dstvec[8], d[8];
-
-  highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
-                                             dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
+                                               dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
+                                                     upsample_left, dy);
+  }
   highbd_transpose8x16_16x8_avx2(dstvec, d);
   for (int i = 0; i < 8; i++) {
     _mm_storeu_si128((__m128i *)(dst + i * stride),
@@ -2576,11 +3002,16 @@
 
 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
-                                              int upsample_left, int dy) {
+                                              int upsample_left, int dy,
+                                              int bd) {
   __m128i dstvec[16], d[16];
-
-  highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
-                                            dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
+                                              dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
+                                                    upsample_left, dy);
+  }
   for (int i = 0; i < 16; i += 8) {
     highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
                              &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
@@ -2596,11 +3027,16 @@
 
 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
-                                              int upsample_left, int dy) {
+                                              int upsample_left, int dy,
+                                              int bd) {
   __m256i dstvec[4], d[4], d1;
-
-  highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
-                                             dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
+                                               dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
+                                                     upsample_left, dy);
+  }
   highbd_transpose4x16_avx2(dstvec, d);
   for (int i = 0; i < 4; i++) {
     _mm_storel_epi64((__m128i *)(dst + i * stride),
@@ -2617,11 +3053,16 @@
 
 static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
-                                              int upsample_left, int dy) {
+                                              int upsample_left, int dy,
+                                              int bd) {
   __m128i dstvec[16], d[8];
-
-  highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
-                                            dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
+                                              dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
+                                                    upsample_left, dy);
+  }
   highbd_transpose16x4_8x8_sse2(dstvec, d);
 
   _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
@@ -2636,11 +3077,17 @@
 
 static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
-                                              int upsample_left, int dy) {
+                                              int upsample_left, int dy,
+                                              int bd) {
   __m256i dstvec[16], d[16];
+  if (bd < 12) {
+    highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
+                                               dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
+                                                     upsample_left, dy);
+  }
 
-  highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
-                                             dy);
   for (int i = 0; i < 16; i += 8) {
     highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
   }
@@ -2665,11 +3112,17 @@
 
 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
-                                              int upsample_left, int dy) {
+                                              int upsample_left, int dy,
+                                              int bd) {
   __m128i dstvec[32], d[32];
+  if (bd < 12) {
+    highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
+                                              dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
+                                                    upsample_left, dy);
+  }
 
-  highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
-                                            dy);
   for (int i = 0; i < 32; i += 8) {
     highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
                              &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
@@ -2687,11 +3140,17 @@
 
 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   __m256i dstvec[16], d[16];
+  if (bd < 12) {
+    highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
+                                               dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
+                                                     upsample_left, dy);
+  }
 
-  highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
-                                             dy);
   highbd_transpose16x16_avx2(dstvec, d);
 
   for (int i = 0; i < 16; i++) {
@@ -2701,12 +3160,16 @@
 
 static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   __m256i dstvec[64], d[16];
-
-  highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
-                                             dy);
-
+  if (bd < 12) {
+    highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
+                                               dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
+                                                     upsample_left, dy);
+  }
   highbd_transpose16x16_avx2(dstvec, d);
   for (int j = 0; j < 16; j++) {
     _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
@@ -2727,19 +3190,30 @@
 
 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
-  highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
+                                            dy);
+  }
   highbd_transpose(dstT, 64, dst, stride, 64, 64);
 }
 
 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   __m256i dstvec[32], d[32];
-
-  highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
-                                             dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
+                                               dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
+                                                     upsample_left, dy);
+  }
   for (int i = 0; i < 32; i += 8) {
     highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
   }
@@ -2764,11 +3238,16 @@
 
 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   __m256i dstvec[32], d[16];
-
-  highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
-                                             dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
+                                               dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
+                                                     upsample_left, dy);
+  }
   for (int i = 0; i < 32; i += 16) {
     highbd_transpose16x16_avx2((dstvec + i), d);
     for (int j = 0; j < 16; j++) {
@@ -2779,36 +3258,54 @@
 
 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   uint16_t dstT[64 * 32];
-  highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
+                                            dy);
+  }
   highbd_transpose(dstT, 64, dst, stride, 32, 64);
 }
 
 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
-  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
+  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
   highbd_transpose(dstT, 32, dst, stride, 64, 32);
   return;
 }
 
 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
-  highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
+                                            dy);
+  }
   highbd_transpose(dstT, 64, dst, stride, 16, 64);
 }
 
 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
-                                               int upsample_left, int dy) {
+                                               int upsample_left, int dy,
+                                               int bd) {
   __m256i dstvec[64], d[16];
-
-  highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
-                                             dy);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
+                                               dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
+                                                     upsample_left, dy);
+  }
   for (int i = 0; i < 64; i += 16) {
     highbd_transpose16x16_avx2((dstvec + i), d);
     for (int j = 0; j < 16; j++) {
@@ -2823,28 +3320,30 @@
                                       int dx, int dy, int bd) {
   (void)above;
   (void)dx;
-  (void)bd;
+
   assert(dx == 1);
   assert(dy > 0);
   if (bw == bh) {
     switch (bw) {
       case 4:
-        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
+        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
+                                         bd);
         break;
       case 8:
-        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
+        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
+                                         bd);
         break;
       case 16:
-        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left,
-                                           dy);
+        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
+                                           bd);
         break;
       case 32:
-        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left,
-                                           dy);
+        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
+                                           bd);
         break;
       case 64:
-        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left,
-                                           dy);
+        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
+                                           bd);
         break;
     }
   } else {
@@ -2853,34 +3352,34 @@
         switch (bw) {
           case 4:
             highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
-                                             dy);
+                                             dy, bd);
             break;
           case 8:
             highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
-                                              dy);
+                                              dy, bd);
             break;
           case 16:
             highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
-                                               dy);
+                                               dy, bd);
             break;
           case 32:
             highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
-                                               dy);
+                                               dy, bd);
             break;
         }
       } else {
         switch (bw) {
           case 4:
             highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
-                                              dy);
+                                              dy, bd);
             break;
           case 8:
             highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
-                                              dy);
+                                              dy, bd);
             break;
           case 16:
             highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
-                                               dy);
+                                               dy, bd);
             break;
         }
       }
@@ -2889,34 +3388,34 @@
         switch (bh) {
           case 4:
             highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
-                                             dy);
+                                             dy, bd);
             break;
           case 8:
             highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
-                                              dy);
+                                              dy, bd);
             break;
           case 16:
             highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
-                                               dy);
+                                               dy, bd);
             break;
           case 32:
             highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
-                                               dy);
+                                               dy, bd);
             break;
         }
       } else {
         switch (bh) {
           case 4:
             highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
-                                              dy);
+                                              dy, bd);
             break;
           case 8:
             highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
-                                              dy);
+                                              dy, bd);
             break;
           case 16:
             highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
-                                               dy);
+                                               dy, bd);
             break;
         }
       }
@@ -2926,7 +3425,7 @@
 }
 
 // Low bit depth functions
-static uint8_t BaseMask[33][32] = {
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -3018,17 +3517,59 @@
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
 };
 
-static AOM_FORCE_INLINE void dr_prediction_z1_4xN_internal_avx2(
-    int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
+  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+};
+
+static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
+  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
+  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
+  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
+  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
+  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
+  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
+  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
+};
+/* clang-format off */
+static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
+  { -1,  0,  0,  0,  0,  0,  0,  0},
+  { -1, -1,  0,  0,  0,  0,  0,  0},
+  { -1, -1, -1,  0,  0,  0,  0,  0},
+  { -1, -1, -1, -1,  0,  0,  0,  0},
+  { -1, -1, -1, -1, -1,  0,  0,  0},
+  { -1, -1, -1, -1, -1, -1,  0,  0},
+  { -1, -1, -1, -1, -1, -1, -1,  0},
+  { -1, -1, -1, -1, -1, -1, -1, -1},
+};
+/* clang-format on */
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
+    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
+    int dx) {
   const int frac_bits = 6 - upsample_above;
-  const int max_base_x = ((N + 4) - 1) << upsample_above;
-  int x;
-  // a assert(dx > 0);
+  const int max_base_x = ((W + H) - 1) << upsample_above;
+
+  assert(dx > 0);
   // pre-filter above pixels
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m256i a0, a1, a32, a16;
   __m256i diff, c3f;
@@ -3038,28 +3579,26 @@
   a_mbase_x = _mm_set1_epi8(above[max_base_x]);
   c3f = _mm256_set1_epi16(0x3f);
 
-  x = dx;
-  for (int r = 0; r < N; r++) {
+  int x = dx;
+  for (int r = 0; r < W; r++) {
     __m256i b, res, shift;
     __m128i res1, a0_128, a1_128;
 
     int base = x >> frac_bits;
     int base_max_diff = (max_base_x - base) >> upsample_above;
     if (base_max_diff <= 0) {
-      for (int i = r; i < N; ++i) {
+      for (int i = r; i < W; ++i) {
         dst[i] = a_mbase_x;  // save 4 values
       }
       return;
     }
-    if (base_max_diff > 4) base_max_diff = 4;
+    if (base_max_diff > H) base_max_diff = H;
     a0_128 = _mm_loadu_si128((__m128i *)(above + base));
-    a1_128 = _mm_srli_si128(a0_128, 1);
+    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
 
     if (upsample_above) {
-      a0_128 = _mm_shuffle_epi8(
-          a0_128,
-          _mm_setr_epi8(0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15));
-      a1_128 = _mm_srli_si128(a0_128, 4);
+      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
+      a1_128 = _mm_srli_si128(a0_128, 8);
 
       shift = _mm256_srli_epi16(
           _mm256_and_si256(
@@ -3079,8 +3618,10 @@
     res = _mm256_add_epi16(a32, b);
     res = _mm256_srli_epi16(res, 5);
 
-    res1 = _mm256_castsi256_si128(res);
-    res1 = _mm_packus_epi16(res1, res1);
+    res = _mm256_packus_epi16(
+        res, _mm256_castsi128_si256(
+                 _mm256_extracti128_si256(res, 1)));  // goto 8 bit
+    res1 = _mm256_castsi256_si128(res);               // 16 8bit values
 
     dst[r] =
         _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
@@ -3093,191 +3634,29 @@
                                       int dx) {
   __m128i dstvec[16];
 
-  dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
   for (int i = 0; i < N; i++) {
     *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
   }
 }
 
-static AOM_FORCE_INLINE void dr_prediction_z1_8xN_internal_avx2(
-    int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
-  const int frac_bits = 6 - upsample_above;
-  const int max_base_x = ((8 + N) - 1) << upsample_above;
-
-  int x;
-  // pre-filter above pixels
-  // store in temp buffers:
-  //   above[x] * 32 + 16
-  //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
-  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0, a1, a0_1, a1_1, a32, a16, diff, c3f;
-  __m128i a_mbase_x;
-
-  a16 = _mm256_set1_epi32(16);
-  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
-  c3f = _mm256_set1_epi32(0x3f);
-
-  x = dx;
-  for (int r = 0; r < N; r++) {
-    __m256i b, res, res1, shift;
-    __m128i res128;
-
-    int base = x >> frac_bits;
-    int base_max_diff = (max_base_x - base) >> upsample_above;
-    if (base_max_diff <= 0) {
-      for (int i = r; i < N; ++i) {
-        dst[i] = a_mbase_x;  // save 16 values, 8 to be used furter
-      }
-      return;
-    }
-    if (base_max_diff > 8) base_max_diff = 8;
-
-    a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
-    a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
-
-    if (upsample_above) {
-      a0 = _mm256_permutevar8x32_epi32(
-          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
-      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
-
-      a0_1 =
-          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
-      a0_1 = _mm256_permutevar8x32_epi32(
-          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
-      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
-
-      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
-      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
-
-      shift = _mm256_srli_epi32(
-          _mm256_and_si256(
-              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), c3f),
-          1);
-    } else {
-      shift = _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
-    }
-
-    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
-    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
-    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
-
-    b = _mm256_mullo_epi32(diff, shift);
-    res = _mm256_add_epi32(a32, b);
-    res = _mm256_srli_epi32(res, 5);
-
-    res1 = _mm256_packus_epi32(
-        res, _mm256_castsi128_si256(
-                 _mm256_extracti128_si256(res, 1)));  // goto 16 bit
-
-    res128 = _mm_packus_epi16(_mm256_castsi256_si128(res1),
-                              _mm256_castsi256_si128(res1));  // goto 8 bit
-
-    res128 =
-        _mm_blendv_epi8(a_mbase_x, res128, *(__m128i *)BaseMask[base_max_diff]);
-    dst[r] = res128;
-    x += dx;
-  }
-}
-
 static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
   __m128i dstvec[32];
 
-  dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+  dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
   for (int i = 0; i < N; i++) {
     _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
   }
 }
 
-static AOM_FORCE_INLINE void dr_prediction_z1_16xN_internal_avx2(
-    int N, __m128i *dstvec, const uint8_t *above, int upsample_above, int dx) {
-  int x;
-  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
-  (void)upsample_above;
-  const int frac_bits = 6;
-  const int max_base_x = ((16 + N) - 1);
-
-  // pre-filter above pixels
-  // store in temp buffers:
-  //   above[x] * 32 + 16
-  //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
-  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0, a0_1, a1, a1_1, diff, a32, a16, c3f;
-  __m128i a_mbase_x;
-
-  a16 = _mm256_set1_epi32(16);
-  a_mbase_x = _mm_set1_epi8((uint8_t)above[max_base_x]);
-  c3f = _mm256_set1_epi32(0x3f);
-
-  x = dx;
-  for (int r = 0; r < N; r++) {
-    __m256i b, res[2];
-    __m128i res128[2];
-    int base = x >> frac_bits;
-    int base_max_diff = (max_base_x - base);
-    if (base_max_diff <= 0) {
-      for (int i = r; i < N; ++i) {
-        dstvec[i] = a_mbase_x;  // save 16 values
-      }
-      return;
-    }
-    __m256i shift =
-        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
-
-    a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
-    a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
-
-    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
-    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
-    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
-    b = _mm256_mullo_epi32(diff, shift);
-
-    res[0] = _mm256_add_epi32(a32, b);
-    res[0] = _mm256_srli_epi32(res[0], 5);
-    res[0] = _mm256_packus_epi32(
-        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
-    res128[0] = _mm_packus_epi16(_mm256_castsi256_si128(res[0]),
-                                 _mm256_castsi256_si128(res[0]));  // goto 8 bit
-
-    if (base_max_diff > 8) {
-      if (base_max_diff > 16) base_max_diff = 16;
-      a0_1 =
-          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
-      a1_1 =
-          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
-
-      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
-      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
-      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
-      b = _mm256_mullo_epi32(diff, shift);
-
-      res[1] = _mm256_add_epi32(a32, b);
-      res[1] = _mm256_srli_epi32(res[1], 5);
-      res[1] = _mm256_packus_epi32(
-          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
-      res128[1] =
-          _mm_packus_epi16(_mm256_castsi256_si128(res[1]),
-                           _mm256_castsi256_si128(res[1]));  // goto 8 bit
-
-    } else {
-      res128[1] = a_mbase_x;
-    }
-    res128[0] = _mm_unpacklo_epi64(res128[0], res128[1]);  // 16 8bit values
-
-    dstvec[r] = _mm_blendv_epi8(a_mbase_x, res128[0],
-                                *(__m128i *)BaseMask[base_max_diff]);
-    x += dx;
-  }
-}
 static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above, int upsample_above,
                                        int dx) {
   __m128i dstvec[64];
 
-  dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+  dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
   for (int i = 0; i < N; i++) {
     _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
   }
@@ -3285,7 +3664,6 @@
 
 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
     int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
-  int x;
   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   (void)upsample_above;
   const int frac_bits = 6;
@@ -3295,18 +3673,19 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0, a0_1, a1, a1_1, a32, a16;
+  __m256i a0, a1, a32, a16;
   __m256i a_mbase_x, diff, c3f;
 
-  a16 = _mm256_set1_epi32(16);
+  a16 = _mm256_set1_epi16(16);
   a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
-  c3f = _mm256_set1_epi32(0x3f);
+  c3f = _mm256_set1_epi16(0x3f);
 
-  x = dx;
+  int x = dx;
   for (int r = 0; r < N; r++) {
-    __m256i b, res[2], res16[2];
+    __m256i b, res, res16[2];
+    __m128i a0_128, a1_128;
 
     int base = x >> frac_bits;
     int base_max_diff = (max_base_x - base);
@@ -3318,54 +3697,28 @@
     }
     if (base_max_diff > 32) base_max_diff = 32;
     __m256i shift =
-        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 
     for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
       int mdiff = base_max_diff - j;
       if (mdiff <= 0) {
         res16[jj] = a_mbase_x;
       } else {
-        a0 = _mm256_cvtepu8_epi32(
-            _mm_loadu_si128((__m128i *)(above + base + j)));
-        a1 = _mm256_cvtepu8_epi32(
-            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
+        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+        a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
+        a0 = _mm256_cvtepu8_epi16(a0_128);
+        a1 = _mm256_cvtepu8_epi16(a1_128);
 
-        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
-        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
-        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
-        b = _mm256_mullo_epi32(diff, shift);
+        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
+        b = _mm256_mullo_epi16(diff, shift);
 
-        res[0] = _mm256_add_epi32(a32, b);
-        res[0] = _mm256_srli_epi32(res[0], 5);
-        res[0] = _mm256_packus_epi32(
-            res[0],
-            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
-
-        // goto 8 bit
-        res[0] = _mm256_packus_epi16(res[0], res[0]);
-
-        if (mdiff > 8) {
-          a0_1 = _mm256_cvtepu8_epi32(
-              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
-          a1_1 = _mm256_cvtepu8_epi32(
-              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
-
-          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
-          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
-          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
-          b = _mm256_mullo_epi32(diff, shift);
-
-          res[1] = _mm256_add_epi32(a32, b);
-          res[1] = _mm256_srli_epi32(res[1], 5);
-          res[1] = _mm256_packus_epi32(
-              res[1],
-              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
-          res[1] = _mm256_packus_epi16(res[1], res[1]);
-          // goto 8 bit
-        } else {
-          res[1] = a_mbase_x;
-        }
-        res16[jj] = _mm256_unpacklo_epi64(res[0], res[1]);  // 16 8bit values
+        res = _mm256_add_epi16(a32, b);
+        res = _mm256_srli_epi16(res, 5);
+        res16[jj] = _mm256_packus_epi16(
+            res, _mm256_castsi128_si256(
+                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
       }
     }
     res16[1] =
@@ -3392,7 +3745,6 @@
 static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above, int upsample_above,
                                        int dx) {
-  int x;
   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   (void)upsample_above;
   const int frac_bits = 6;
@@ -3402,22 +3754,20 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0, a0_1, a1, a1_1, a32, a16;
+  __m256i a0, a1, a32, a16;
   __m256i a_mbase_x, diff, c3f;
   __m128i max_base_x128, base_inc128, mask128;
 
-  a16 = _mm256_set1_epi32(16);
+  a16 = _mm256_set1_epi16(16);
   a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
   max_base_x128 = _mm_set1_epi8(max_base_x);
-  c3f = _mm256_set1_epi32(0x3f);
+  c3f = _mm256_set1_epi16(0x3f);
 
-  x = dx;
+  int x = dx;
   for (int r = 0; r < N; r++, dst += stride) {
-    __m256i b, res[2];
-    __m128i res1;
-
+    __m256i b, res;
     int base = x >> frac_bits;
     if (base >= max_base_x) {
       for (int i = r; i < N; ++i) {
@@ -3429,9 +3779,9 @@
     }
 
     __m256i shift =
-        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 
-    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
+    __m128i a0_128, a1_128, res128;
     for (int j = 0; j < 64; j += 16) {
       int mdif = max_base_x - (base + j);
       if (mdif <= 0) {
@@ -3440,58 +3790,35 @@
       } else {
         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
         a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
-        a0 = _mm256_cvtepu8_epi32(a0_128);
-        a1 = _mm256_cvtepu8_epi32(a1_128);
+        a0 = _mm256_cvtepu8_epi16(a0_128);
+        a1 = _mm256_cvtepu8_epi16(a1_128);
 
-        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
-        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
-        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
-        b = _mm256_mullo_epi32(diff, shift);
+        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
+        b = _mm256_mullo_epi16(diff, shift);
 
-        res[0] = _mm256_add_epi32(a32, b);
-        res[0] = _mm256_srli_epi32(res[0], 5);
-        res[0] = _mm256_packus_epi32(
-            res[0],
-            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
-        // goto 8 bit
-        res[0] = _mm256_packus_epi16(res[0], res[0]);
+        res = _mm256_add_epi16(a32, b);
+        res = _mm256_srli_epi16(res, 5);
+        res = _mm256_packus_epi16(
+            res, _mm256_castsi128_si256(
+                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
 
-        if (mdif > 8) {
-          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
-          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
-          a0_1 = _mm256_cvtepu8_epi32(a0_1_128);
-          a1_1 = _mm256_cvtepu8_epi32(a1_1_128);
-
-          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
-          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
-          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
-          b = _mm256_mullo_epi32(diff, shift);
-
-          res[1] = _mm256_add_epi32(a32, b);
-          res[1] = _mm256_srli_epi32(res[1], 5);
-          res[1] = _mm256_packus_epi32(
-              res[1],
-              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
-          res[1] = _mm256_packus_epi16(res[1], res[1]);
-
-        } else {
-          res[1] = a_mbase_x;
-        }
-        res1 = _mm_unpacklo_epi64(
-            _mm256_castsi256_si128(res[0]),
-            _mm256_castsi256_si128(res[1]));  // 16 8bit values
-
-        base_inc128 = _mm_setr_epi8(
-            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
-            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
-            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
-            base + j + 13, base + j + 14, base + j + 15);
+        base_inc128 =
+            _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1),
+                          (uint8_t)(base + j + 2), (uint8_t)(base + j + 3),
+                          (uint8_t)(base + j + 4), (uint8_t)(base + j + 5),
+                          (uint8_t)(base + j + 6), (uint8_t)(base + j + 7),
+                          (uint8_t)(base + j + 8), (uint8_t)(base + j + 9),
+                          (uint8_t)(base + j + 10), (uint8_t)(base + j + 11),
+                          (uint8_t)(base + j + 12), (uint8_t)(base + j + 13),
+                          (uint8_t)(base + j + 14), (uint8_t)(base + j + 15));
 
         mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
                                  _mm_setzero_si128());
-        res1 =
-            _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), res1, mask128);
-        _mm_storeu_si128((__m128i *)(dst + j), res1);
+        res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
+                                 _mm256_castsi256_si128(res), mask128);
+        _mm_storeu_si128((__m128i *)(dst + j), res128);
       }
     }
     x += dx;
@@ -3525,39 +3852,6 @@
   return;
 }
 
-static uint8_t LoadMaskx[8][16] = {
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
-  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
-  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
-  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
-  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
-  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
-};
-
-static uint8_t EvenOddMaskx4[8][16] = {
-  { 0, 2, 4, 6, 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 1, 3, 5, 7, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 2, 4, 6, 8, 3, 5, 7, 9, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 3, 5, 7, 9, 4, 6, 8, 10, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 4, 6, 8, 10, 5, 7, 9, 11, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 0, 5, 7, 9, 11, 6, 8, 10, 12, 0, 0, 0 },
-  { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 7, 9, 11, 13, 0, 0 },
-  { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 8, 10, 12, 14, 0 }
-};
-
-static uint8_t EvenOddMaskx[8][16] = {
-  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 0, 0, 0, 0 },
-  { 0, 1, 3, 5, 7, 9, 11, 13, 15, 2, 4, 6, 8, 0, 0, 0 },
-  { 0, 0, 2, 4, 6, 8, 10, 12, 14, 3, 5, 7, 9, 0, 0, 0 },
-  { 0, 0, 0, 3, 5, 7, 9, 11, 13, 15, 4, 6, 8, 10, 0 },
-  { 0, 0, 0, 0, 4, 6, 8, 10, 12, 14, 5, 7, 9, 11, 0, 0 },
-  { 0, 0, 0, 0, 0, 5, 7, 9, 11, 13, 15, 6, 8, 10, 12, 0 },
-  { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 14, 7, 9, 11, 13, 0 },
-  { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 15, 8, 10, 12, 14 }
-};
-
 static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, const uint8_t *left,
                                       int upsample_above, int upsample_left,
@@ -3567,22 +3861,24 @@
   const int frac_bits_x = 6 - upsample_above;
   const int frac_bits_y = 6 - upsample_left;
 
-  // a assert(dx > 0);
+  assert(dx > 0);
   // pre-filter above pixels
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0_x, a1_x, a32, a16, diff;
-  __m128i c3f, min_base_y128;
+  __m128i a0_x, a1_x, a32, a16, diff;
+  __m128i c3f, min_base_y128, c1234, dy128;
 
-  a16 = _mm256_set1_epi32(16);
-  c3f = _mm_set1_epi32(0x3f);
-  min_base_y128 = _mm_set1_epi32(min_base_y);
+  a16 = _mm_set1_epi16(16);
+  c3f = _mm_set1_epi16(0x3f);
+  min_base_y128 = _mm_set1_epi16(min_base_y);
+  c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
+  dy128 = _mm_set1_epi16(dy);
 
   for (int r = 0; r < N; r++) {
-    __m256i b, res, shift;
+    __m128i b, res, shift, r6, ydx;
     __m128i resx, resy, resxy;
     __m128i a0_x128, a1_x128;
     int y = r + 1;
@@ -3600,82 +3896,73 @@
     }
 
     if (base_shift > 3) {
-      a0_x = _mm256_setzero_si256();
-      a1_x = _mm256_setzero_si256();
-      shift = _mm256_setzero_si256();
+      a0_x = _mm_setzero_si128();
+      a1_x = _mm_setzero_si128();
+      shift = _mm_setzero_si128();
     } else {
       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      ydx = _mm_set1_epi16(y * dx);
+      r6 = _mm_slli_epi16(c1234, 6);
+
       if (upsample_above) {
         a0_x128 =
-            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx4[base_shift]);
-        a1_x128 = _mm_srli_si128(a0_x128, 4);
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+        a1_x128 = _mm_srli_si128(a0_x128, 8);
 
-        shift = _mm256_castsi128_si256(_mm_srli_epi32(
+        shift = _mm_srli_epi16(
             _mm_and_si128(
-                _mm_slli_epi32(
-                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
-                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
-                    upsample_above),
-                c3f),
-            1));
+                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+            1);
       } else {
         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
         a1_x128 = _mm_srli_si128(a0_x128, 1);
 
-        shift = _mm256_castsi128_si256(_mm_srli_epi32(
-            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
-                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
-                          c3f),
-            1));
+        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
       }
-      a0_x = _mm256_cvtepu8_epi32(a0_x128);
-      a1_x = _mm256_cvtepu8_epi32(a1_x128);
+      a0_x = _mm_cvtepu8_epi16(a0_x128);
+      a1_x = _mm_cvtepu8_epi16(a1_x128);
     }
     // y calc
     __m128i a0_y, a1_y, shifty;
     if (base_x < min_base_x) {
-      DECLARE_ALIGNED(32, int, base_y_c[4]);
-      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
-      r6 = _mm_set1_epi32(r << 6);
-      dy128 = _mm_set1_epi32(dy);
-      c1234 = _mm_setr_epi32(1, 2, 3, 4);
-      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
-      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
-      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
+      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+      __m128i y_c128, base_y_c128, mask128, c1234_;
+      c1234_ = _mm_srli_si128(c1234, 2);
+      r6 = _mm_set1_epi16(r << 6);
+      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
+      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 
-      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
-                            left[base_y_c[2]], left[base_y_c[3]]);
-      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
-                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
+      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+      base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
+      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
 
       if (upsample_left) {
-        shifty = _mm_srli_epi32(
-            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
+        shifty = _mm_srli_epi16(
+            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
       } else {
-        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
+        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
       }
-      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
-      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
-      shift = _mm256_inserti128_si256(shift, shifty, 1);
+      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
+      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
+      shift = _mm_unpacklo_epi64(shift, shifty);
     }
 
-    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
-    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
-    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
+    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
 
-    b = _mm256_mullo_epi32(diff, shift);
-    res = _mm256_add_epi32(a32, b);
-    res = _mm256_srli_epi32(res, 5);
+    b = _mm_mullo_epi16(diff, shift);
+    res = _mm_add_epi16(a32, b);
+    res = _mm_srli_epi16(res, 5);
 
-    resx = _mm256_castsi256_si128(res);
-    resx = _mm_packus_epi32(resx, resx);
-    resx = _mm_packus_epi16(resx, resx);
-
-    resy = _mm256_extracti128_si256(res, 1);
-    resy = _mm_packus_epi32(resy, resy);
-    resy = _mm_packus_epi16(resy, resy);
+    resx = _mm_packus_epi16(res, res);
+    resy = _mm_srli_si128(resx, 4);
 
     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
     *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy);
@@ -3696,19 +3983,22 @@
   // store in temp buffers:
   //   above[x] * 32 + 16
   //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
+  // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   __m256i diff, a32, a16;
   __m256i a0_x, a1_x;
   __m128i a0_x128, a1_x128, min_base_y128, c3f;
+  __m128i c1234, dy128;
 
   a16 = _mm256_set1_epi16(16);
   c3f = _mm_set1_epi16(0x3f);
   min_base_y128 = _mm_set1_epi16(min_base_y);
+  dy128 = _mm_set1_epi16(dy);
+  c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 
   for (int r = 0; r < N; r++) {
     __m256i b, res, shift;
-    __m128i resx, resy, resxy;
+    __m128i resx, resy, resxy, r6, ydx;
 
     int y = r + 1;
     int base_x = (-y * dx) >> frac_bits_x;
@@ -3730,34 +4020,24 @@
       shift = _mm256_setzero_si256();
     } else {
       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
-      a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+      ydx = _mm_set1_epi16(y * dx);
+      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
       if (upsample_above) {
         a0_x128 =
             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
-        a1_x128 =
-            _mm_shuffle_epi8(a1_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+        a1_x128 = _mm_srli_si128(a0_x128, 8);
 
         shift = _mm256_castsi128_si256(_mm_srli_epi16(
             _mm_and_si128(
-                _mm_slli_epi16(
-                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
-                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
-                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
-                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
-                    upsample_above),
-                c3f),
+                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
             1));
       } else {
+        a1_x128 = _mm_srli_si128(a0_x128, 1);
         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
 
-        shift = _mm256_castsi128_si256(_mm_srli_epi16(
-            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
-                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
-                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
-                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
-                          c3f),
-            1));
+        shift = _mm256_castsi128_si256(
+            _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
       }
       a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
       a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
@@ -3767,10 +4047,8 @@
     __m128i a0_y, a1_y, shifty;
     if (base_x < min_base_x) {
       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
-      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+      __m128i y_c128, base_y_c128, mask128;
       r6 = _mm_set1_epi16(r << 6);
-      dy128 = _mm_set1_epi16(dy);
-      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
@@ -3781,10 +4059,14 @@
                             left[base_y_c[2]], left[base_y_c[3]],
                             left[base_y_c[4]], left[base_y_c[5]],
                             left[base_y_c[6]], left[base_y_c[7]]);
-      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
-                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
-                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
-                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
+      base_y_c128 = _mm_add_epi16(
+          base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
+      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]],
+                            left[base_y_c[4]], left[base_y_c[5]],
+                            left[base_y_c[6]], left[base_y_c[7]]);
 
       if (upsample_left) {
         shifty = _mm_srli_epi16(
@@ -3830,28 +4112,30 @@
   const int frac_bits_x = 6;
   const int frac_bits_y = 6;
 
-  // pre-filter above pixels
-  // store in temp buffers:
-  //   above[x] * 32 + 16
-  //   above[x+1] - above[x]
-  // final pixels will be caluculated as:
-  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16;
-  __m256i diff, min_base_y256, c3f, shifty;
-  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, a0_1_x, a1_1_x, shiftx;
+  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
+  __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
+  __m128i a0_x128, a1_x128;
 
+  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
   a16 = _mm256_set1_epi16(16);
+  c1 = _mm256_srli_epi16(a16, 4);
   min_base_y256 = _mm256_set1_epi16(min_base_y);
   c3f = _mm256_set1_epi16(0x3f);
+  dy256 = _mm256_set1_epi16(dy);
+  c0123 =
+      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  c1234 = _mm256_add_epi16(c0123, c1);
 
   for (int r = 0; r < H; r++) {
-    __m256i b, res, shift;
+    __m256i b, res, shift, j256, r6, ydx;
     __m128i resx, resy;
     __m128i resxy;
-    for (int j = 0; j < W; j += 16) {
-      int y = r + 1;
-      int base_x = (-y * dx) >> frac_bits_x;
+    int y = r + 1;
+    ydx = _mm256_set1_epi16((uint16_t)(y * dx));
 
+    int base_x = (-y * dx) >> frac_bits_x;
+    for (int j = 0; j < W; j += 16) {
+      j256 = _mm256_set1_epi16(j);
       int base_shift = 0;
       if ((base_x + j) < (min_base_x - 1)) {
         base_shift = (min_base_x - (base_x + j) - 1);
@@ -3862,11 +4146,8 @@
       } else {
         if (base_min_diff < 0) base_min_diff = 0;
       }
-      if (base_shift > 7) {
-        a0_x = _mm256_setzero_si256();
-        a1_x = _mm256_setzero_si256();
-        shift = _mm256_setzero_si256();
-      } else {
+
+      if (base_shift < 16) {
         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
         a1_x128 =
             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
@@ -3876,92 +4157,81 @@
         a0_x = _mm256_cvtepu8_epi16(a0_x128);
         a1_x = _mm256_cvtepu8_epi16(a1_x128);
 
-        shift = _mm256_castsi128_si256(_mm_srli_epi16(
-            _mm_and_si128(_mm_setr_epi16(
-                              ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
-                              ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
-                              ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
-                              ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
-                          _mm256_castsi256_si128(c3f)),
-            1));
+        r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
+        shift = _mm256_srli_epi16(
+            _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
+
+        diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+        a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
+        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm256_mullo_epi16(diff, shift);
+        res = _mm256_add_epi16(a32, b);
+        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
+        resx = _mm256_castsi256_si128(_mm256_packus_epi16(
+            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+      } else {
+        resx = _mm_setzero_si128();
       }
 
-      base_shift = 0;
-      if ((base_x + j + 8) < (min_base_x - 1)) {
-        base_shift = (min_base_x - (base_x + j + 8) - 1);
-      }
-      if (base_shift <= 7) {
-        a0_1_x128 =
-            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
-        a1_1_x128 =
-            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
-        a0_1_x128 =
-            _mm_shuffle_epi8(a0_1_x128, *(__m128i *)LoadMaskx[base_shift]);
-        a1_1_x128 =
-            _mm_shuffle_epi8(a1_1_x128, *(__m128i *)LoadMaskx[base_shift]);
-
-        a0_1_x = _mm_cvtepu8_epi16(a0_1_x128);
-        a1_1_x = _mm_cvtepu8_epi16(a1_1_x128);
-
-        shiftx = _mm_srli_epi16(
-            _mm_and_si128(
-                _mm_setr_epi16(
-                    ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
-                    ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
-                    ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
-                    ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
-                _mm256_castsi256_si128(c3f)),
-            1);
-
-        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x, 1);
-        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x, 1);
-        shift = _mm256_inserti128_si256(shift, shiftx, 1);
-      }
-
-      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
-      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
-      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
-
-      b = _mm256_mullo_epi16(diff, shift);
-      res = _mm256_add_epi16(a32, b);
-      res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
-      resx = _mm256_castsi256_si128(_mm256_packus_epi16(
-          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
-
       // y calc
-      if ((base_x < min_base_x)) {
-        DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
-        __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
+      if (base_x < min_base_x) {
+        __m256i c256, y_c256, base_y_c256, mask256, mul16;
         r6 = _mm256_set1_epi16(r << 6);
-        dy256 = _mm256_set1_epi16(dy);
-        c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
-                                 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
-                                 13 + j, 14 + j, 15 + j, 16 + j);
+        c256 = _mm256_add_epi16(j256, c1234);
         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
                                  _mm256_srli_epi16(min_base_y256, 1));
         y_c256 = _mm256_sub_epi16(r6, mul16);
 
         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
-        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
-        _mm256_store_si256((__m256i *)base_y_c, base_y_c256); /**/
 
-        a0_y = _mm256_setr_epi16(
-            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
-            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
-            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
-            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
-            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
-            left[base_y_c[15]]);
-        a1_y = _mm256_setr_epi16(
-            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
-            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
-            left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1],
-            left[base_y_c[9] + 1], left[base_y_c[10] + 1],
-            left[base_y_c[11] + 1], left[base_y_c[12] + 1],
-            left[base_y_c[13] + 1], left[base_y_c[14] + 1],
-            left[base_y_c[15] + 1]);
+        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
+        int16_t min_y = (int16_t)_mm_extract_epi16(
+            _mm256_extracti128_si256(base_y_c256, 1), 7);
+        int16_t max_y =
+            (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
+        int16_t offset_diff = max_y - min_y;
 
+        if (offset_diff < 16) {
+          __m256i min_y256 = _mm256_set1_epi16(min_y);
+
+          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
+          __m128i base_y_offset128 =
+              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
+                              _mm256_extracti128_si256(base_y_offset, 1));
+
+          __m128i a0_y128 = _mm_maskload_epi32(
+              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
+          __m128i a1_y128 =
+              _mm_maskload_epi32((int *)(left + min_y + 1),
+                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
+          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
+          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
+          a0_y = _mm256_cvtepu8_epi16(a0_y128);
+          a1_y = _mm256_cvtepu8_epi16(a1_y128);
+        } else {
+          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+          a0_y = _mm256_setr_epi16(
+              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+              left[base_y_c[15]]);
+          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
+          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+          a1_y = _mm256_setr_epi16(
+              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+              left[base_y_c[15]]);
+        }
         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
 
         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
@@ -3973,7 +4243,6 @@
         res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
         resy = _mm256_castsi256_si128(_mm256_packus_epi16(
             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
-
       } else {
         resy = _mm_setzero_si128();
       }
@@ -4227,38 +4496,25 @@
   d[15] = _mm_unpackhi_epi64(w7, w15);
 }
 
-static void transpose_TX_8X8(const uint8_t *src, ptrdiff_t pitchSrc,
-                             uint8_t *dst, ptrdiff_t pitchDst) {
-  __m128i r0, r1, r2, r3, r4, r5, r6, r7;
-  __m128i d0d1, d2d3, d4d5, d6d7;
-  r0 = _mm_loadl_epi64((__m128i *)(src + 0 * pitchSrc));
-  r1 = _mm_loadl_epi64((__m128i *)(src + 1 * pitchSrc));
-  r2 = _mm_loadl_epi64((__m128i *)(src + 2 * pitchSrc));
-  r3 = _mm_loadl_epi64((__m128i *)(src + 3 * pitchSrc));
-  r4 = _mm_loadl_epi64((__m128i *)(src + 4 * pitchSrc));
-  r5 = _mm_loadl_epi64((__m128i *)(src + 5 * pitchSrc));
-  r6 = _mm_loadl_epi64((__m128i *)(src + 6 * pitchSrc));
-  r7 = _mm_loadl_epi64((__m128i *)(src + 7 * pitchSrc));
-
-  transpose8x8_sse2(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &d0d1, &d2d3, &d4d5,
-                    &d6d7);
-
-  _mm_storel_epi64((__m128i *)(dst + 0 * pitchDst), d0d1);
-  _mm_storel_epi64((__m128i *)(dst + 1 * pitchDst), _mm_srli_si128(d0d1, 8));
-  _mm_storel_epi64((__m128i *)(dst + 2 * pitchDst), d2d3);
-  _mm_storel_epi64((__m128i *)(dst + 3 * pitchDst), _mm_srli_si128(d2d3, 8));
-  _mm_storel_epi64((__m128i *)(dst + 4 * pitchDst), d4d5);
-  _mm_storel_epi64((__m128i *)(dst + 5 * pitchDst), _mm_srli_si128(d4d5, 8));
-  _mm_storel_epi64((__m128i *)(dst + 6 * pitchDst), d6d7);
-  _mm_storel_epi64((__m128i *)(dst + 7 * pitchDst), _mm_srli_si128(d6d7, 8));
+static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
+                               uint8_t *dst, ptrdiff_t pitchDst) {
+  __m128i r[16];
+  __m128i d[16];
+  for (int j = 0; j < 16; j++) {
+    r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
+  }
+  transpose16x16_sse2(r, d);
+  for (int j = 0; j < 16; j++) {
+    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
+  }
 }
 
 static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
                       ptrdiff_t pitchDst, int width, int height) {
-  for (int j = 0; j < height; j += 8)
-    for (int i = 0; i < width; i += 8)
-      transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i,
-                       pitchDst);
+  for (int j = 0; j < height; j += 16)
+    for (int i = 0; i < width; i += 16)
+      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+                         dst + j * pitchDst + i, pitchDst);
 }
 
 static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
@@ -4266,7 +4522,7 @@
                                       int dy) {
   __m128i dstvec[4], d[4];
 
-  dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
   transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                             &d[0], &d[1], &d[2], &d[3]);
 
@@ -4282,7 +4538,7 @@
                                       int dy) {
   __m128i dstvec[8], d[8];
 
-  dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
   transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
                     &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
                     &d[3]);
@@ -4302,7 +4558,7 @@
                                       int dy) {
   __m128i dstvec[4], d[8];
 
-  dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
   transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
                         &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
   for (int i = 0; i < 8; i++) {
@@ -4315,7 +4571,7 @@
                                       int dy) {
   __m128i dstvec[8], d[4];
 
-  dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
   transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                         &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
                         &d[1], &d[2], &d[3]);
@@ -4330,7 +4586,7 @@
                                        int dy) {
   __m128i dstvec[8], d[8];
 
-  dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
   transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
                           dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
                           d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
@@ -4346,7 +4602,7 @@
                                        int dy) {
   __m128i dstvec[16], d[16];
 
-  dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
   transpose16x8_8x16_sse2(
       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
@@ -4363,7 +4619,7 @@
                                        int dy) {
   __m128i dstvec[4], d[16];
 
-  dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
   transpose4x16_sse2(dstvec, d);
   for (int i = 0; i < 16; i++) {
     *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
@@ -4375,7 +4631,7 @@
                                        int dy) {
   __m128i dstvec[16], d[8];
 
-  dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
   for (int i = 4; i < 8; i++) {
     d[i] = _mm_setzero_si128();
   }
@@ -4416,7 +4672,7 @@
                                        int dy) {
   __m128i dstvec[32], d[16];
 
-  dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
 
   transpose16x8_8x16_sse2(
       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
@@ -4442,7 +4698,7 @@
                                         int dy) {
   __m128i dstvec[16], d[16];
 
-  dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
   transpose16x16_sse2(dstvec, d);
 
   for (int i = 0; i < 16; i++) {
@@ -4501,7 +4757,7 @@
                                         int dy) {
   __m128i dstvec[32], d[16];
 
-  dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
   for (int i = 0; i < 32; i += 16) {
     transpose16x16_sse2((dstvec + i), d);
     for (int j = 0; j < 16; j++) {
@@ -4540,7 +4796,7 @@
                                         int dy) {
   __m128i dstvec[64], d[16];
 
-  dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, dy);
+  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
   for (int i = 0; i < 64; i += 16) {
     transpose16x16_sse2((dstvec + i), d);
     for (int j = 0; j < 16; j++) {
diff --git a/libaom/aom_dsp/x86/intrapred_sse2.c b/libaom/aom_dsp/x86/intrapred_sse2.c
index 5b2452c..5afef68 100644
--- a/libaom/aom_dsp/x86/intrapred_sse2.c
+++ b/libaom/aom_dsp/x86/intrapred_sse2.c
@@ -10,7 +10,7 @@
  */
 
 #include <emmintrin.h>
-
+#include "aom_dsp/x86/intrapred_x86.h"
 #include "config/aom_dsp_rtcd.h"
 
 static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
@@ -75,25 +75,6 @@
   return _mm_sad_epu8(x, zero);
 }
 
-static INLINE __m128i dc_sum_16(const uint8_t *ref) {
-  __m128i x = _mm_load_si128((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_sad_epu8(x, zero);
-  const __m128i high = _mm_unpackhi_epi64(x, x);
-  return _mm_add_epi16(x, high);
-}
-
-static INLINE __m128i dc_sum_32(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
 static INLINE __m128i dc_sum_64(const uint8_t *ref) {
   __m128i x0 = _mm_load_si128((__m128i const *)ref);
   __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
@@ -142,7 +123,7 @@
 
 void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_16(left);
+  const __m128i sum_left = dc_sum_16_sse2(left);
   __m128i sum_above = dc_sum_4(above);
   sum_above = _mm_add_epi16(sum_left, sum_above);
 
@@ -171,7 +152,7 @@
 
 void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_16(left);
+  const __m128i sum_left = dc_sum_16_sse2(left);
   __m128i sum_above = dc_sum_8(above);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
@@ -184,7 +165,7 @@
 
 void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_32(left);
+  const __m128i sum_left = dc_sum_32_sse2(left);
   __m128i sum_above = dc_sum_8(above);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
@@ -198,7 +179,7 @@
 void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m128i sum_left = dc_sum_4(left);
-  __m128i sum_above = dc_sum_16(above);
+  __m128i sum_above = dc_sum_16_sse2(above);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
@@ -211,7 +192,7 @@
 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_16(above);
+  __m128i sum_above = dc_sum_16_sse2(above);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
@@ -223,8 +204,8 @@
 
 void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_32(left);
-  __m128i sum_above = dc_sum_16(above);
+  const __m128i sum_left = dc_sum_32_sse2(left);
+  __m128i sum_above = dc_sum_16_sse2(above);
   sum_above = _mm_add_epi16(sum_left, sum_above);
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
@@ -237,7 +218,7 @@
 void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const __m128i sum_left = dc_sum_64(left);
-  __m128i sum_above = dc_sum_16(above);
+  __m128i sum_above = dc_sum_16_sse2(above);
   sum_above = _mm_add_epi16(sum_left, sum_above);
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
@@ -249,7 +230,7 @@
 
 void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
+  __m128i sum_above = dc_sum_32_sse2(above);
   const __m128i sum_left = dc_sum_8(left);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
@@ -262,8 +243,8 @@
 
 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sum_left = dc_sum_16(left);
+  __m128i sum_above = dc_sum_32_sse2(above);
+  const __m128i sum_left = dc_sum_16_sse2(left);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
@@ -275,7 +256,7 @@
 
 void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
+  __m128i sum_above = dc_sum_32_sse2(above);
   const __m128i sum_left = dc_sum_64(left);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
@@ -302,7 +283,7 @@
 void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_32(left);
+  const __m128i sum_left = dc_sum_32_sse2(left);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
@@ -315,7 +296,7 @@
 void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_16(left);
+  const __m128i sum_left = dc_sum_16_sse2(left);
   sum_above = _mm_add_epi16(sum_above, sum_left);
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
@@ -395,7 +376,7 @@
 void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
-  __m128i sum_above = dc_sum_16(above);
+  __m128i sum_above = dc_sum_16_sse2(above);
   const __m128i eight = _mm_set1_epi16((uint16_t)8);
   sum_above = _mm_add_epi16(sum_above, eight);
   sum_above = _mm_srai_epi16(sum_above, 4);
@@ -408,7 +389,7 @@
 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
-  __m128i sum_above = dc_sum_16(above);
+  __m128i sum_above = dc_sum_16_sse2(above);
   const __m128i eight = _mm_set1_epi16((uint16_t)8);
   sum_above = _mm_add_epi16(sum_above, eight);
   sum_above = _mm_srai_epi16(sum_above, 4);
@@ -422,7 +403,7 @@
                                      const uint8_t *above,
                                      const uint8_t *left) {
   (void)left;
-  __m128i sum_above = dc_sum_16(above);
+  __m128i sum_above = dc_sum_16_sse2(above);
   const __m128i eight = _mm_set1_epi16((uint16_t)8);
   sum_above = _mm_add_epi16(sum_above, eight);
   sum_above = _mm_srai_epi16(sum_above, 4);
@@ -436,7 +417,7 @@
                                      const uint8_t *above,
                                      const uint8_t *left) {
   (void)left;
-  __m128i sum_above = dc_sum_16(above);
+  __m128i sum_above = dc_sum_16_sse2(above);
   const __m128i eight = _mm_set1_epi16((uint16_t)8);
   sum_above = _mm_add_epi16(sum_above, eight);
   sum_above = _mm_srai_epi16(sum_above, 4);
@@ -449,7 +430,7 @@
 void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
-  __m128i sum_above = dc_sum_32(above);
+  __m128i sum_above = dc_sum_32_sse2(above);
   const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
   sum_above = _mm_add_epi16(sum_above, sixteen);
   sum_above = _mm_srai_epi16(sum_above, 5);
@@ -463,7 +444,7 @@
                                      const uint8_t *above,
                                      const uint8_t *left) {
   (void)left;
-  __m128i sum_above = dc_sum_32(above);
+  __m128i sum_above = dc_sum_32_sse2(above);
   const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
   sum_above = _mm_add_epi16(sum_above, sixteen);
   sum_above = _mm_srai_epi16(sum_above, 5);
@@ -477,7 +458,7 @@
                                      const uint8_t *above,
                                      const uint8_t *left) {
   (void)left;
-  __m128i sum_above = dc_sum_32(above);
+  __m128i sum_above = dc_sum_32_sse2(above);
   const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
   sum_above = _mm_add_epi16(sum_above, sixteen);
   sum_above = _mm_srai_epi16(sum_above, 5);
@@ -550,7 +531,7 @@
                                      const uint8_t *above,
                                      const uint8_t *left) {
   (void)above;
-  __m128i sum_left = dc_sum_16(left);
+  __m128i sum_left = dc_sum_16_sse2(left);
   const __m128i eight = _mm_set1_epi16((uint16_t)8);
   sum_left = _mm_add_epi16(sum_left, eight);
   sum_left = _mm_srai_epi16(sum_left, 4);
@@ -577,7 +558,7 @@
                                      const uint8_t *above,
                                      const uint8_t *left) {
   (void)above;
-  __m128i sum_left = dc_sum_16(left);
+  __m128i sum_left = dc_sum_16_sse2(left);
   const __m128i eight = _mm_set1_epi16((uint16_t)8);
   sum_left = _mm_add_epi16(sum_left, eight);
   sum_left = _mm_srai_epi16(sum_left, 4);
@@ -590,7 +571,7 @@
                                      const uint8_t *above,
                                      const uint8_t *left) {
   (void)above;
-  __m128i sum_left = dc_sum_32(left);
+  __m128i sum_left = dc_sum_32_sse2(left);
   const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
   sum_left = _mm_add_epi16(sum_left, sixteen);
   sum_left = _mm_srai_epi16(sum_left, 5);
@@ -631,7 +612,7 @@
                                       const uint8_t *above,
                                       const uint8_t *left) {
   (void)above;
-  __m128i sum_left = dc_sum_32(left);
+  __m128i sum_left = dc_sum_32_sse2(left);
   const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
   sum_left = _mm_add_epi16(sum_left, sixteen);
   sum_left = _mm_srai_epi16(sum_left, 5);
@@ -673,7 +654,7 @@
                                       const uint8_t *above,
                                       const uint8_t *left) {
   (void)above;
-  __m128i sum_left = dc_sum_16(left);
+  __m128i sum_left = dc_sum_16_sse2(left);
   const __m128i eight = _mm_set1_epi16((uint16_t)8);
   sum_left = _mm_add_epi16(sum_left, eight);
   sum_left = _mm_srai_epi16(sum_left, 4);
@@ -715,7 +696,7 @@
                                       const uint8_t *above,
                                       const uint8_t *left) {
   (void)above;
-  __m128i sum_left = dc_sum_32(left);
+  __m128i sum_left = dc_sum_32_sse2(left);
   const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
   sum_left = _mm_add_epi16(sum_left, sixteen);
   sum_left = _mm_srai_epi16(sum_left, 5);
@@ -729,7 +710,7 @@
                                       const uint8_t *above,
                                       const uint8_t *left) {
   (void)above;
-  __m128i sum_left = dc_sum_16(left);
+  __m128i sum_left = dc_sum_16_sse2(left);
   const __m128i eight = _mm_set1_epi16((uint16_t)8);
   sum_left = _mm_add_epi16(sum_left, eight);
   sum_left = _mm_srai_epi16(sum_left, 4);
diff --git a/libaom/aom_dsp/x86/intrapred_ssse3.c b/libaom/aom_dsp/x86/intrapred_ssse3.c
index 807ed17..5a34ea0 100644
--- a/libaom/aom_dsp/x86/intrapred_ssse3.c
+++ b/libaom/aom_dsp/x86/intrapred_ssse3.c
@@ -48,7 +48,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   int i;
@@ -69,7 +69,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   int i;
@@ -90,7 +90,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   for (int i = 0; i < 16; ++i) {
@@ -110,7 +110,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   int i;
@@ -131,7 +131,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   int i;
@@ -152,7 +152,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   int i;
@@ -176,7 +176,7 @@
 
   for (int j = 0; j < 2; ++j) {
     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
+    __m128i rep = _mm_set1_epi16((short)0x8000);
     for (int i = 0; i < 16; ++i) {
       const __m128i l16 = _mm_shuffle_epi8(l, rep);
       const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
@@ -205,7 +205,7 @@
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   for (int i = 0; i < 4; ++i) {
@@ -226,7 +226,7 @@
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   int i;
@@ -249,7 +249,7 @@
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
   int i;
@@ -272,7 +272,7 @@
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l16;
 
@@ -287,7 +287,7 @@
   }
 
   l = _mm_load_si128((const __m128i *)(left + 16));
-  rep = _mm_set1_epi16(0x8000);
+  rep = _mm_set1_epi16((short)0x8000);
   for (i = 0; i < 16; ++i) {
     l16 = _mm_shuffle_epi8(l, rep);
     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
@@ -310,7 +310,7 @@
 
   for (int j = 0; j < 4; ++j) {
     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
+    __m128i rep = _mm_set1_epi16((short)0x8000);
     for (int i = 0; i < 16; ++i) {
       const __m128i l16 = _mm_shuffle_epi8(l, rep);
       const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
@@ -332,7 +332,7 @@
   const __m128i bh = _mm_unpackhi_epi8(b, zero);
 
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
   const __m128i l = _mm_loadl_epi64((const __m128i *)left);
   __m128i l16;
@@ -361,7 +361,7 @@
   const __m128i bh = _mm_unpackhi_epi8(b, zero);
 
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l = _mm_load_si128((const __m128i *)left);
   __m128i l16;
@@ -391,7 +391,7 @@
   const __m128i bh = _mm_unpackhi_epi8(b, zero);
 
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l = _mm_load_si128((const __m128i *)left);
   __m128i l16;
@@ -408,7 +408,7 @@
     rep = _mm_add_epi16(rep, one);
   }
 
-  rep = _mm_set1_epi16(0x8000);
+  rep = _mm_set1_epi16((short)0x8000);
   l = _mm_load_si128((const __m128i *)(left + 16));
   for (i = 0; i < 16; ++i) {
     l16 = _mm_shuffle_epi8(l, rep);
@@ -440,7 +440,7 @@
   int i, j;
   for (j = 0; j < 4; ++j) {
     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
+    __m128i rep = _mm_set1_epi16((short)0x8000);
     for (i = 0; i < 16; ++i) {
       l16 = _mm_shuffle_epi8(l, rep);
       const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
@@ -478,7 +478,7 @@
   int i, j;
   for (j = 0; j < 2; ++j) {
     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
+    __m128i rep = _mm_set1_epi16((short)0x8000);
     for (i = 0; i < 16; ++i) {
       l16 = _mm_shuffle_epi8(l, rep);
       const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
@@ -520,7 +520,7 @@
   int i, j;
   for (j = 0; j < 4; ++j) {
     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
+    __m128i rep = _mm_set1_epi16((short)0x8000);
     for (i = 0; i < 16; ++i) {
       l16 = _mm_shuffle_epi8(l, rep);
       const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
@@ -561,7 +561,7 @@
 
   int i;
   const __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
   for (i = 0; i < 16; ++i) {
     l16 = _mm_shuffle_epi8(l, rep);
     const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
@@ -636,7 +636,8 @@
   const __m128i one = _mm_set1_epi16(1);
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
+  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
+                            : _mm_set1_epi16((short)0x8000);
   __m128i d = _mm_set1_epi16(0x100);
 
   for (int i = 0; i < h; ++i) {
@@ -792,7 +793,8 @@
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
 
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
+  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
+                            : _mm_set1_epi16((short)0x8000);
   __m128i d = _mm_set1_epi16(0x100);
 
   int i;
@@ -1400,7 +1402,7 @@
   const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = _mm_set1_epi16((short)0x8000);
 
   for (int i = 0; i < h; ++i) {
     __m128i b = _mm_shuffle_epi8(pixel[0], rep);
@@ -1499,7 +1501,8 @@
   const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
+  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
+                            : _mm_set1_epi16((short)0x8000);
 
   for (int i = 0; i < h; ++i) {
     __m128i b = _mm_shuffle_epi8(pixels[0], rep);
diff --git a/libaom/aom_dsp/x86/intrapred_x86.h b/libaom/aom_dsp/x86/intrapred_x86.h
new file mode 100644
index 0000000..b13f575
--- /dev/null
+++ b/libaom/aom_dsp/x86/intrapred_x86.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_
+#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_
+
+#include <emmintrin.h>  // SSE2
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
+  __m128i x = _mm_load_si128((__m128i const *)ref);
+  const __m128i zero = _mm_setzero_si128();
+  x = _mm_sad_epu8(x, zero);
+  const __m128i high = _mm_unpackhi_epi64(x, x);
+  return _mm_add_epi16(x, high);
+}
+
+static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
+  __m128i x0 = _mm_load_si128((__m128i const *)ref);
+  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+  const __m128i zero = _mm_setzero_si128();
+  x0 = _mm_sad_epu8(x0, zero);
+  x1 = _mm_sad_epu8(x1, zero);
+  x0 = _mm_add_epi16(x0, x1);
+  const __m128i high = _mm_unpackhi_epi64(x0, x0);
+  return _mm_add_epi16(x0, high);
+}
+
+#endif  // AOM_AOM_DSP_X86_INTRAPRED_X86_H_
diff --git a/libaom/aom_dsp/x86/loopfilter_sse2.c b/libaom/aom_dsp/x86/loopfilter_sse2.c
index c021f50..d534683 100644
--- a/libaom/aom_dsp/x86/loopfilter_sse2.c
+++ b/libaom/aom_dsp/x86/loopfilter_sse2.c
@@ -146,7 +146,7 @@
   __m128i hev1;
   const __m128i t3t4 =
       _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
-  const __m128i t80 = _mm_set1_epi8(0x80);
+  const __m128i t80 = _mm_set1_epi8((char)0x80);
   const __m128i ff = _mm_cmpeq_epi8(t80, t80);
 
   ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
@@ -195,7 +195,7 @@
                                                __m128i *ps1ps0) {
   const __m128i t3t4 =
       _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
-  const __m128i t80 = _mm_set1_epi8(0x80);
+  const __m128i t80 = _mm_set1_epi8((char)0x80);
   __m128i filter, filter2filter1, work;
   __m128i ps1ps0_work, qs1qs0_work;
   __m128i hev1;
@@ -410,7 +410,7 @@
     __m128i fe, ff, work;
     abs_p1p0 = abs_diff(*q1p1, *q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-    fe = _mm_set1_epi8(0xfe);
+    fe = _mm_set1_epi8((char)0xfe);
     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
     abs_p0q0 = abs_diff(p1p0, q1q0);
     abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
@@ -714,7 +714,7 @@
     __m128i abs_p1q1, abs_p0q0, abs_q1q0;
     abs_p1p0 = abs_diff(*q1p1, *q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-    fe = _mm_set1_epi8(0xfe);
+    fe = _mm_set1_epi8((char)0xfe);
     ff = _mm_cmpeq_epi8(fe, fe);
     abs_p0q0 = abs_diff(p1p0, q1q0);
     abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
@@ -1003,7 +1003,7 @@
   *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
 
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
+  const __m128i fe = _mm_set1_epi8((char)0xfe);
   const __m128i ff = _mm_cmpeq_epi8(fe, fe);
 
   {
@@ -1132,7 +1132,7 @@
   *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
 
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
+  const __m128i fe = _mm_set1_epi8((char)0xfe);
   const __m128i ff = _mm_cmpeq_epi8(fe, fe);
   {
     // filter_mask and hev_mask
@@ -1337,7 +1337,7 @@
   // otherwise - not
 
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
+  const __m128i fe = _mm_set1_epi8((char)0xfe);
   const __m128i ff = _mm_cmpeq_epi8(fe, fe);
   __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
 
@@ -1492,7 +1492,7 @@
     // otherwise - not
 
     const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i fe = _mm_set1_epi8((char)0xfe);
     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
 
diff --git a/libaom/aom_dsp/x86/masked_sad4d_ssse3.c b/libaom/aom_dsp/x86/masked_sad4d_ssse3.c
new file mode 100644
index 0000000..8ef7ee0
--- /dev/null
+++ b/libaom/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+#define MASK_SAD16XH_ONE_REF(idx)                             \
+  a = _mm_loadu_si128((const __m128i *)&ref##idx[x]);         \
+  data_l = _mm_unpacklo_epi8(a, b);                           \
+  mask_l = _mm_unpacklo_epi8(m, m_inv);                       \
+  pred_l = _mm_maddubs_epi16(data_l, mask_l);                 \
+  pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+                                                              \
+  data_r = _mm_unpackhi_epi8(a, b);                           \
+  mask_r = _mm_unpackhi_epi8(m, m_inv);                       \
+  pred_r = _mm_maddubs_epi16(data_r, mask_r);                 \
+  pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+                                                              \
+  pred = _mm_packus_epi16(pred_l, pred_r);                    \
+  res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *a_ptr[], int a_stride,
+                                       const uint8_t *b_ptr, int b_stride,
+                                       const uint8_t *m_ptr, int m_stride,
+                                       int width, int height, int inv_mask,
+                                       unsigned sad_array[]) {
+  int x, y;
+  __m128i a;
+  __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred;
+  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  __m128i res0 = _mm_setzero_si128();
+  __m128i res1 = _mm_setzero_si128();
+  __m128i res2 = _mm_setzero_si128();
+  __m128i res3 = _mm_setzero_si128();
+  const uint8_t *ref0 = a_ptr[0];
+  const uint8_t *ref1 = a_ptr[1];
+  const uint8_t *ref2 = a_ptr[2];
+  const uint8_t *ref3 = a_ptr[3];
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 16) {
+      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+      const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+      __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+      __m128i m = inv_mask ? m_inv : m_copy;
+      m_inv = inv_mask ? m_copy : m_inv;
+
+      MASK_SAD16XH_ONE_REF(0)
+      MASK_SAD16XH_ONE_REF(1)
+      MASK_SAD16XH_ONE_REF(2)
+      MASK_SAD16XH_ONE_REF(3)
+    }
+
+    src_ptr += src_stride;
+    ref0 += a_stride;
+    ref1 += a_stride;
+    ref2 += a_stride;
+    ref3 += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1),
+                       _mm_unpackhi_epi32(res0, res1));
+  res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3),
+                       _mm_unpackhi_epi32(res2, res3));
+
+  res0 = _mm_unpacklo_epi64(res0, res2);
+  _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASK_SAD8XH_ONE_REF(idx)                                               \
+  const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx);              \
+  const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \
+  data_l = _mm_unpacklo_epi8(a##idx##0, b0);                                   \
+  mask_l = _mm_unpacklo_epi8(m, m_inv);                                        \
+  pred_l = _mm_maddubs_epi16(data_l, mask_l);                                  \
+  pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);                  \
+                                                                               \
+  data_r = _mm_unpacklo_epi8(a##idx##1, b1);                                   \
+  mask_r = _mm_unpackhi_epi8(m, m_inv);                                        \
+  pred_r = _mm_maddubs_epi16(data_r, mask_r);                                  \
+  pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);                  \
+                                                                               \
+  pred = _mm_packus_epi16(pred_l, pred_r);                                     \
+  res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_array[], int a_stride,
+                                const uint8_t *b_ptr, int b_stride,
+                                const uint8_t *m_ptr, int m_stride, int height,
+                                int inv_mask, unsigned sad_array[]) {
+  const uint8_t *ref0 = ref_array[0];
+  const uint8_t *ref1 = ref_array[1];
+  const uint8_t *ref2 = ref_array[2];
+  const uint8_t *ref3 = ref_array[3];
+  __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred;
+  __m128i res0 = _mm_setzero_si128();
+  __m128i res1 = _mm_setzero_si128();
+  __m128i res2 = _mm_setzero_si128();
+  __m128i res3 = _mm_setzero_si128();
+  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+  for (int y = 0; y < height; y += 2) {
+    const __m128i src = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((const __m128i *)src_ptr),
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)));
+    const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr);
+    const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride));
+    const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr);
+    const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride));
+    __m128i m_copy = _mm_unpacklo_epi64(m0, m1);
+    __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+    __m128i m = inv_mask ? m_inv : m_copy;
+    m_inv = inv_mask ? m_copy : m_inv;
+
+    MASK_SAD8XH_ONE_REF(0)
+    MASK_SAD8XH_ONE_REF(1)
+    MASK_SAD8XH_ONE_REF(2)
+    MASK_SAD8XH_ONE_REF(3)
+
+    ref0 += 2 * a_stride;
+    ref1 += 2 * a_stride;
+    ref2 += 2 * a_stride;
+    ref3 += 2 * a_stride;
+    src_ptr += 2 * src_stride;
+    b_ptr += 2 * b_stride;
+    m_ptr += 2 * m_stride;
+  }
+  res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1),
+                       _mm_unpackhi_epi32(res0, res1));
+  res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3),
+                       _mm_unpackhi_epi32(res2, res3));
+  res0 = _mm_unpacklo_epi64(res0, res2);
+  _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASK_SAD4XH_ONE_REF(idx)                                               \
+  a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)ref##idx),             \
+                         _mm_cvtsi32_si128(*(uint32_t *)&ref##idx[a_stride])); \
+  data = _mm_unpacklo_epi8(a, b);                                              \
+  mask = _mm_unpacklo_epi8(m, m_inv);                                          \
+  pred = _mm_maddubs_epi16(data, mask);                                        \
+  pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS);                      \
+                                                                               \
+  pred = _mm_packus_epi16(pred, _mm_setzero_si128());                          \
+  res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_array[], int a_stride,
+                                const uint8_t *b_ptr, int b_stride,
+                                const uint8_t *m_ptr, int m_stride, int height,
+                                int inv_mask, unsigned sad_array[]) {
+  const uint8_t *ref0 = ref_array[0];
+  const uint8_t *ref1 = ref_array[1];
+  const uint8_t *ref2 = ref_array[2];
+  const uint8_t *ref3 = ref_array[3];
+  __m128i data, pred, mask;
+  __m128i res0 = _mm_setzero_si128();
+  __m128i res1 = _mm_setzero_si128();
+  __m128i res2 = _mm_setzero_si128();
+  __m128i res3 = _mm_setzero_si128();
+  __m128i a;
+  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+  for (int y = 0; y < height; y += 2) {
+    const __m128i src = _mm_unpacklo_epi32(
+        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
+        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
+    const __m128i b =
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
+                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
+    const __m128i m_copy =
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
+                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
+
+    __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+    __m128i m = inv_mask ? m_inv : m_copy;
+    m_inv = inv_mask ? m_copy : m_inv;
+
+    MASK_SAD4XH_ONE_REF(0)
+    MASK_SAD4XH_ONE_REF(1)
+    MASK_SAD4XH_ONE_REF(2)
+    MASK_SAD4XH_ONE_REF(3)
+
+    ref0 += 2 * a_stride;
+    ref1 += 2 * a_stride;
+    ref2 += 2 * a_stride;
+    ref3 += 2 * a_stride;
+    src_ptr += 2 * src_stride;
+    b_ptr += 2 * b_stride;
+    m_ptr += 2 * m_stride;
+  }
+  res0 = _mm_unpacklo_epi32(res0, res1);
+  res2 = _mm_unpacklo_epi32(res2, res3);
+  res0 = _mm_unpacklo_epi64(res0, res2);
+  _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASKSADMXN_SSSE3(m, n)                                                 \
+  void aom_masked_sad##m##x##n##x4d_ssse3(                                     \
+      const uint8_t *src, int src_stride, const uint8_t *ref[],                \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
+      int msk_stride, int inv_mask, unsigned sad_array[]) {                    \
+    masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+                        msk_stride, m, n, inv_mask, sad_array);                \
+  }
+
+#define MASKSAD8XN_SSSE3(n)                                                   \
+  void aom_masked_sad8x##n##x4d_ssse3(                                        \
+      const uint8_t *src, int src_stride, const uint8_t *ref[],               \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
+      int msk_stride, int inv_mask, unsigned sad_array[]) {                   \
+    aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+                               8, msk, msk_stride, n, inv_mask, sad_array);   \
+  }
+
+#define MASKSAD4XN_SSSE3(n)                                                   \
+  void aom_masked_sad4x##n##x4d_ssse3(                                        \
+      const uint8_t *src, int src_stride, const uint8_t *ref[],               \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
+      int msk_stride, int inv_mask, unsigned sad_array[]) {                   \
+    aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+                               4, msk, msk_stride, n, inv_mask, sad_array);   \
+  }
+
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
diff --git a/libaom/aom_dsp/x86/masked_sad_intrin_avx2.c b/libaom/aom_dsp/x86/masked_sad_intrin_avx2.c
index 584b5e7..60f0ab3 100644
--- a/libaom/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/libaom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -17,7 +17,7 @@
 #include "aom_dsp/blend.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
 
 static INLINE unsigned int masked_sad32xh_avx2(
     const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
@@ -64,7 +64,7 @@
   res = _mm256_hadd_epi32(res, res);
   res = _mm256_hadd_epi32(res, res);
   int32_t sad = _mm256_extract_epi32(res, 0);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
@@ -117,7 +117,7 @@
   res = _mm256_hadd_epi32(res, res);
   res = _mm256_hadd_epi32(res, res);
   int32_t sad = _mm256_extract_epi32(res, 0);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 static INLINE unsigned int aom_masked_sad_avx2(
@@ -253,7 +253,7 @@
   res = _mm256_hadd_epi32(res, res);
   res = _mm256_hadd_epi32(res, res);
   int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 static INLINE unsigned int highbd_masked_sad16xh_avx2(
@@ -311,7 +311,7 @@
   res = _mm256_hadd_epi32(res, res);
   res = _mm256_hadd_epi32(res, res);
   int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 static INLINE unsigned int aom_highbd_masked_sad_avx2(
diff --git a/libaom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/libaom/aom_dsp/x86/masked_sad_intrin_ssse3.c
index 493f9bd..7168277 100644
--- a/libaom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/libaom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -19,7 +19,7 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
 
-#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
 
 // For width a multiple of 16
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
@@ -134,7 +134,7 @@
   // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
   int32_t sad =
       _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
@@ -179,7 +179,7 @@
   }
   int32_t sad =
       _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
@@ -223,7 +223,7 @@
   }
   // At this point, the SAD is stored in lane 0 of 'res'
   int32_t sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 // For width a multiple of 8
@@ -338,7 +338,7 @@
   res = _mm_hadd_epi32(res, res);
   res = _mm_hadd_epi32(res, res);
   int sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
@@ -398,5 +398,5 @@
   res = _mm_hadd_epi32(res, res);
   res = _mm_hadd_epi32(res, res);
   int sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
+  return sad;
 }
diff --git a/libaom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/libaom/aom_dsp/x86/masked_variance_intrin_ssse3.c
index ebf4631..fa93f0d 100644
--- a/libaom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/libaom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -517,6 +517,7 @@
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 // For width a multiple of 8
 static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
                                    int xoffset, int yoffset, uint16_t *dst,
@@ -1025,6 +1026,7 @@
   *sum_ = _mm_cvtsi128_si32(sum);
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
                               int width, int height, const uint8_t *ref,
diff --git a/libaom/aom_dsp/x86/obmc_variance_sse4.c b/libaom/aom_dsp/x86/obmc_variance_sse4.c
index 72eda0e..aa73c39 100644
--- a/libaom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/libaom/aom_dsp/x86/obmc_variance_sse4.c
@@ -166,7 +166,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
-
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void hbd_obmc_variance_w4(
     const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
     const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
@@ -378,3 +378,4 @@
 HBD_OBMCVARWXH(32, 8)
 HBD_OBMCVARWXH(16, 64)
 HBD_OBMCVARWXH(64, 16)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/x86/quantize_x86.h b/libaom/aom_dsp/x86/quantize_x86.h
index b2de01b..5b040a2 100644
--- a/libaom/aom_dsp/x86/quantize_x86.h
+++ b/libaom/aom_dsp/x86/quantize_x86.h
@@ -143,3 +143,60 @@
   _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
   _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
 }
+
+static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1,
+                                const int16_t *iscan_ptr, int *is_found,
+                                __m128i *mask) {
+  __m128i all_zero;
+  __m128i temp_mask = _mm_setzero_si128();
+  all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1);
+  if (_mm_movemask_epi8(all_zero)) {
+    __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+    __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+    __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+    __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1);
+    temp_mask = _mm_max_epi16(mask0, mask1);
+    *is_found = 1;
+  }
+  *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+                                __m128i *threshold, const int16_t *iscan_ptr,
+                                int *is_found, __m128i *mask) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3;
+
+  coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero);
+  coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero);
+  coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero);
+  coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero);
+
+  coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS);
+  cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+  coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS);
+  cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+  coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS);
+  cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]);
+  coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS);
+  cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]);
+
+  cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+  cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3);
+
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask);
+}
+
+static INLINE int calculate_non_zero_count(__m128i mask) {
+  __m128i mask0, mask1;
+  int non_zero_count = 0;
+  mask0 = _mm_unpackhi_epi64(mask, mask);
+  mask1 = _mm_max_epi16(mask0, mask);
+  mask0 = _mm_shuffle_epi32(mask1, 1);
+  mask0 = _mm_max_epi16(mask0, mask1);
+  mask1 = _mm_srli_epi32(mask0, 16);
+  mask0 = _mm_max_epi16(mask0, mask1);
+  non_zero_count = _mm_extract_epi16(mask0, 0) + 1;
+
+  return non_zero_count;
+}
diff --git a/libaom/aom_dsp/x86/sad4d_avx2.c b/libaom/aom_dsp/x86/sad4d_avx2.c
index f662b62..0771252 100644
--- a/libaom/aom_dsp/x86/sad4d_avx2.c
+++ b/libaom/aom_dsp/x86/sad4d_avx2.c
@@ -14,41 +14,43 @@
 
 #include "aom/aom_integer.h"
 
-void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
+void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride,
+                        const uint8_t *const ref[4], int ref_stride,
+                        uint32_t res[4]) {
   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
-  __m256i sum_mlow, sum_mhigh;
-  int i;
+  int i, j;
   const uint8_t *ref0, *ref1, *ref2, *ref3;
 
   ref0 = ref[0];
   ref1 = ref[1];
   ref2 = ref[2];
   ref3 = ref[3];
-  sum_ref0 = _mm256_set1_epi16(0);
-  sum_ref1 = _mm256_set1_epi16(0);
-  sum_ref2 = _mm256_set1_epi16(0);
-  sum_ref3 = _mm256_set1_epi16(0);
-  for (i = 0; i < 32; i++) {
-    // load src and all refs
-    src_reg = _mm256_loadu_si256((const __m256i *)src);
-    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
-    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
-    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
-    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
-    // sum of the absolute differences between every ref-i to src
-    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
-    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
-    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
-    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
-    // sum every ref-i
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+  sum_ref0 = _mm256_setzero_si256();
+  sum_ref2 = _mm256_setzero_si256();
+  sum_ref1 = _mm256_setzero_si256();
+  sum_ref3 = _mm256_setzero_si256();
 
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j += 32) {
+      // load src and all refs
+      src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+      ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+      ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+      ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+      ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j));
+
+      // sum of the absolute differences between every ref-i to src
+      ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+      ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+      ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+      ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+      // sum every ref-i
+      sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+      sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+      sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+      sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+    }
     src += src_stride;
     ref0 += ref_stride;
     ref1 += ref_stride;
@@ -57,6 +59,7 @@
   }
   {
     __m128i sum;
+    __m256i sum_mlow, sum_mhigh;
     // in sum_ref-i the result is saved in the first 4 bytes
     // the other 4 bytes are zeroed.
     // sum_ref1 and sum_ref3 are shifted left by 4 bytes
@@ -80,139 +83,24 @@
 
     _mm_storeu_si128((__m128i *)(res), sum);
   }
-  _mm256_zeroupper();
 }
 
-void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
-  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
-  __m256i ref3_reg, ref3next_reg;
-  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
-  __m256i sum_mlow, sum_mhigh;
-  int i;
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-  sum_ref0 = _mm256_set1_epi16(0);
-  sum_ref1 = _mm256_set1_epi16(0);
-  sum_ref2 = _mm256_set1_epi16(0);
-  sum_ref3 = _mm256_set1_epi16(0);
-  for (i = 0; i < 64; i++) {
-    // load 64 bytes from src and all refs
-    src_reg = _mm256_loadu_si256((const __m256i *)src);
-    srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
-    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
-    ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
-    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
-    ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
-    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
-    ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
-    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
-    ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
-    // sum of the absolute differences between every ref-i to src
-    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
-    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
-    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
-    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
-    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
-    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
-    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
-    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
-
-    // sum every ref-i
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
+#define sadMxN_avx2(m, n)                                                      \
+  void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride,          \
+                                  const uint8_t *const ref[4], int ref_stride, \
+                                  uint32_t res[4]) {                           \
+    aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res);           \
   }
-  {
-    __m128i sum;
 
-    // in sum_ref-i the result is saved in the first 4 bytes
-    // the other 4 bytes are zeroed.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
-    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
-    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+sadMxN_avx2(32, 8);
+sadMxN_avx2(32, 16);
+sadMxN_avx2(32, 32);
+sadMxN_avx2(32, 64);
 
-    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
-    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
-    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+sadMxN_avx2(64, 16);
+sadMxN_avx2(64, 32);
+sadMxN_avx2(64, 64);
+sadMxN_avx2(64, 128);
 
-    // merge every 64 bit from each sum_ref-i
-    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
-    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
-    // add the low 64 bit to the high 64 bit
-    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
-    // add the low 128 bit to the high 128 bit
-    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
-                        _mm256_extractf128_si256(sum_mlow, 1));
-
-    _mm_storeu_si128((__m128i *)(res), sum);
-  }
-  _mm256_zeroupper();
-}
-
-void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += src_stride << 5;
-  rf[0] += ref_stride << 5;
-  rf[1] += ref_stride << 5;
-  rf[2] += ref_stride << 5;
-  rf[3] += ref_stride << 5;
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
-
-void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-  unsigned int half_width = 32;
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += half_width;
-  rf[0] += half_width;
-  rf[1] += half_width;
-  rf[2] += half_width;
-  rf[3] += half_width;
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
+sadMxN_avx2(128, 64);
+sadMxN_avx2(128, 128);
diff --git a/libaom/aom_dsp/x86/sad4d_sse2.asm b/libaom/aom_dsp/x86/sad4d_sse2.asm
index 55a8569..a904374 100644
--- a/libaom/aom_dsp/x86/sad4d_sse2.asm
+++ b/libaom/aom_dsp/x86/sad4d_sse2.asm
@@ -15,15 +15,85 @@
 
 SECTION .text
 
-; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_4x2x4 5-6 0
-  movd                  m0, [srcq +%2]
+%macro AVG_4x2x4 2
+  movh                  m2, [second_predq]
+  movlhps               m2, m2
+  pavgb                 %1, m2
+  pavgb                 %2, m2
+  lea                   second_predq, [second_predq+8]
+%endmacro
+; 'mflag' affect a lot how the code works.
+;
+; When 'mflag' is false, the 'src_strideq' resides in register,
+; [srcq + src_strideq + offset] is allowed, so we can simply
+; use such form to access src memory and don't bother to update
+; 'srcq' at each line. We only update 'srcq' each two-lines using
+; a compact LEA instruction like [srcq+src_strideq*2].
+;
+; When 'mflag' is true, the 'src_strideq' resides in memory.
+; we cannot use above form to access memory, we have to update
+; 'srcq' at each line break. As we process two parts (first,second)
+; together in each macro function, the second part may also sit
+; in the next line, which means we also need to possibly add
+; one 'src_strideq' to 'srcq' before processing second part.
+
+%macro HANDLE_FIRST_OFFSET 2
+  %define first_offset %2
+  %if mflag == 0 && %1 == 1
+    %define first_offset (src_strideq + %2)
+  %endif
+%endmacro
+
+; first_extraline, second_extraline, in_line_offset
+%macro HANDLE_SECOND_OFFSET 3
+  %define second_offset %3
+  %if mflag && %1 == 0 && %2 == 1
+    add srcq, src_strideq
+  %endif
+  %if mflag == 0 && %2 == 1
+    %define second_offset (src_strideq + %3)
+  %endif
+%endmacro
+
+; Notes for line_ending:
+; 0 -- not a line ending
+; 1 -- line ending of a odd line [line numbers starts from one]
+; 2 -- line ending of a even line
+; This is specically designed to handle when src_strideq is a
+; memory position, under such case, we can not accomplish
+; complex address calculation using LEA, and fall back to
+; using simple ADD instruction at each line ending.
+%macro ADVANCE_END_OF_LINE 1
+  %if mflag
+    add srcq, src_strideq
+  %endif
+  %if mflag == 0 && %1 == 2
+    lea                 srcq, [srcq +src_strideq*2]
+  %endif
+
+  %if %1 == 2
+    lea                ref1q, [ref1q+ref_strideq*2]
+    lea                ref2q, [ref2q+ref_strideq*2]
+    lea                ref3q, [ref3q+ref_strideq*2]
+    lea                ref4q, [ref4q+ref_strideq*2]
+  %endif
+%endmacro
+
+; Please note that the second_offset of src is for in_line_offset,
+; so it is less than src_stride.
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;               {first, second}_extraline, line_ending
+%macro PROCESS_4x2x4 9
+  HANDLE_FIRST_OFFSET   %7, %2
+  movd                  m0, [srcq + first_offset]
+  HANDLE_SECOND_OFFSET  %7, %8, %4
 %if %1 == 1
   movd                  m6, [ref1q+%3]
   movd                  m4, [ref2q+%3]
   movd                  m7, [ref3q+%3]
   movd                  m5, [ref4q+%3]
-  movd                  m1, [srcq +%4]
+
+  movd                  m1, [srcq + second_offset]
   movd                  m2, [ref1q+%5]
   punpckldq             m0, m1
   punpckldq             m6, m2
@@ -36,6 +106,9 @@
   movlhps               m0, m0
   movlhps               m6, m4
   movlhps               m7, m5
+%if %6 == 1
+  AVG_4x2x4             m6, m7
+%endif
   psadbw                m6, m0
   psadbw                m7, m0
 %else
@@ -51,38 +124,48 @@
   movd                  m4, [ref4q+%3]
   movd                  m5, [ref4q+%5]
   punpckldq             m4, m5
-  movd                  m5, [srcq +%4]
+  movd                  m5, [srcq + second_offset]
   punpckldq             m0, m5
   movlhps               m0, m0
   movlhps               m1, m2
   movlhps               m3, m4
+%if %6 == 1
+  AVG_4x2x4             m1, m3
+%endif
   psadbw                m1, m0
   psadbw                m3, m0
   paddd                 m6, m1
   paddd                 m7, m3
 %endif
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
+%if %9 > 0
+  ADVANCE_END_OF_LINE %9
 %endif
 %endmacro
 
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_8x2x4 5-6 0
-  movh                  m0, [srcq +%2]
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;               {first,second}_extraline, line_ending
+%macro PROCESS_8x2x4 9
+  HANDLE_FIRST_OFFSET   %7, %2
+  movh                  m0, [srcq + first_offset]
+  HANDLE_SECOND_OFFSET  %7, %8, %4
 %if %1 == 1
   movh                  m4, [ref1q+%3]
   movh                  m5, [ref2q+%3]
   movh                  m6, [ref3q+%3]
   movh                  m7, [ref4q+%3]
-  movhps                m0, [srcq +%4]
+  movhps                m0, [srcq + second_offset]
   movhps                m4, [ref1q+%5]
   movhps                m5, [ref2q+%5]
   movhps                m6, [ref3q+%5]
   movhps                m7, [ref4q+%5]
+%if %6 == 1
+  movu                  m3, [second_predq]
+  pavgb                 m4, m3
+  pavgb                 m5, m3
+  pavgb                 m6, m3
+  pavgb                 m7, m3
+  lea                   second_predq, [second_predq+mmsize]
+%endif
   psadbw                m4, m0
   psadbw                m5, m0
   psadbw                m6, m0
@@ -90,105 +173,148 @@
 %else
   movh                  m1, [ref1q+%3]
   movh                  m2, [ref2q+%3]
-  movh                  m3, [ref3q+%3]
-  movhps                m0, [srcq +%4]
+  movhps                m0, [srcq + second_offset]
   movhps                m1, [ref1q+%5]
   movhps                m2, [ref2q+%5]
-  movhps                m3, [ref3q+%5]
+%if %6 == 1
+  movu                  m3, [second_predq]
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+%endif
   psadbw                m1, m0
   psadbw                m2, m0
-  psadbw                m3, m0
   paddd                 m4, m1
-  movh                  m1, [ref4q+%3]
-  movhps                m1, [ref4q+%5]
   paddd                 m5, m2
-  paddd                 m6, m3
-  psadbw                m1, m0
-  paddd                 m7, m1
-%endif
+
+  movh                  m1, [ref3q+%3]
+  movhps                m1, [ref3q+%5]
+  movh                  m2, [ref4q+%3]
+  movhps                m2, [ref4q+%5]
 %if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+  lea                   second_predq, [second_predq+mmsize]
+%endif
+  psadbw                m1, m0
+  psadbw                m2, m0
+  paddd                 m6, m1
+  paddd                 m7, m2
+%endif
+%if %9 > 0
+  ADVANCE_END_OF_LINE %9
 %endif
 %endmacro
 
-; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_16x2x4 5-6 0
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;                {first,second}_extraline, line_ending
+%macro PROCESS_16x2x4 9
   ; 1st 16 px
-  mova                  m0, [srcq +%2]
+  HANDLE_FIRST_OFFSET   %7, %2
+  mova                  m0, [srcq + first_offset]
+  HANDLE_SECOND_OFFSET  %7, %8, %4
 %if %1 == 1
   movu                  m4, [ref1q+%3]
   movu                  m5, [ref2q+%3]
   movu                  m6, [ref3q+%3]
   movu                  m7, [ref4q+%3]
+%if %6 == 1
+  movu                  m3, [second_predq]
+  pavgb                 m4, m3
+  pavgb                 m5, m3
+  pavgb                 m6, m3
+  pavgb                 m7, m3
+  lea                   second_predq, [second_predq+mmsize]
+%endif
   psadbw                m4, m0
   psadbw                m5, m0
   psadbw                m6, m0
   psadbw                m7, m0
-%else
+%else ; %1 == 1
   movu                  m1, [ref1q+%3]
   movu                  m2, [ref2q+%3]
-  movu                  m3, [ref3q+%3]
+%if %6 == 1
+  movu                  m3, [second_predq]
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+%endif
   psadbw                m1, m0
   psadbw                m2, m0
-  psadbw                m3, m0
   paddd                 m4, m1
-  movu                  m1, [ref4q+%3]
   paddd                 m5, m2
-  paddd                 m6, m3
-  psadbw                m1, m0
-  paddd                 m7, m1
+
+  movu                  m1, [ref3q+%3]
+  movu                  m2, [ref4q+%3]
+%if %6 == 1
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+  lea                   second_predq, [second_predq+mmsize]
 %endif
+  psadbw                m1, m0
+  psadbw                m2, m0
+  paddd                 m6, m1
+  paddd                 m7, m2
+%endif ; %1 == 1
 
   ; 2nd 16 px
-  mova                  m0, [srcq +%4]
+  mova                  m0, [srcq + second_offset]
   movu                  m1, [ref1q+%5]
   movu                  m2, [ref2q+%5]
-  movu                  m3, [ref3q+%5]
-  psadbw                m1, m0
-  psadbw                m2, m0
-  psadbw                m3, m0
-  paddd                 m4, m1
-  movu                  m1, [ref4q+%5]
-  paddd                 m5, m2
-  paddd                 m6, m3
+
 %if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
+  movu                  m3, [second_predq]
+  pavgb                 m1, m3
+  pavgb                 m2, m3
 %endif
   psadbw                m1, m0
-  paddd                 m7, m1
+  psadbw                m2, m0
+  paddd                 m4, m1
+  paddd                 m5, m2
+
+  movu                  m1, [ref3q+%5]
+  movu                  m2, [ref4q+%5]
+
+%if %9 > 0
+  ADVANCE_END_OF_LINE %9
+%endif
+
+%if %6 == 1
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+  lea                   second_predq, [second_predq+mmsize]
+%endif
+  psadbw                m1, m0
+  psadbw                m2, m0
+  paddd                 m6, m1
+  paddd                 m7, m2
 %endmacro
 
-; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_32x2x4 5-6 0
-  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
-  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;                {first,second}_extraline, line_ending
+%macro PROCESS_32x2x4 9
+  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16, %6, %7, %7, %8 - %7
+  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6, %8, %8, %9
 %endmacro
 
-; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_64x2x4 5-6 0
-  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
-  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;                {first,second}_extraline, line_ending
+%macro PROCESS_64x2x4 9
+  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32, %6, %7, %7, %8 - %7
+  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6, %8, %8, %9
 %endmacro
 
-; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_128x2x4 5-6 0
-  PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
-  PROCESS_64x2x4  0, %4, %5, %4 + 64, %5 + 64, %6
+; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;                 {first,second}_extraline, line_ending
+%macro PROCESS_128x2x4 9
+  PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64, %6, %7, %7, %8 - %7
+  PROCESS_64x2x4  0, %4, %5, %4 + 64, %5 + 64, %6, %8, %8, %9
 %endmacro
 
 ; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2
+%macro SADNXN4D 2-3 0
+%if %3 == 0
 %if UNIX64
 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
@@ -196,18 +322,41 @@
 cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                               ref2, ref3, ref4
 %endif
+%else ; avg
+
+%if UNIX64
+cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \
+                                  second_pred, res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
+                                  second_pred, ref2, ref3
+  %define src_strideq r1mp
+  %define src_strided r1mp
+%endif
+%endif
+
+  %define mflag ((1 - UNIX64) & %3)
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
+
   mov                ref2q, [ref1q+gprsize*1]
   mov                ref3q, [ref1q+gprsize*2]
   mov                ref4q, [ref1q+gprsize*3]
   mov                ref1q, [ref1q+gprsize*0]
 
-  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+  PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2
 %rep (%2-4)/2
-  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+  PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
 %endrep
-  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
+
+%if %3 == 0
+  %define resultq r4
+  %define resultmp r4mp
+%else
+  %define resultq r5
+  %define resultmp r5mp
+%endif
 
 %if %1 > 4
   pslldq                m5, 4
@@ -218,16 +367,16 @@
   mova                  m7, m6
   punpcklqdq            m4, m6
   punpckhqdq            m5, m7
-  movifnidn             r4, r4mp
   paddd                 m4, m5
-  movu                [r4], m4
+  movifnidn             resultq, resultmp
+  movu                [resultq], m4
   RET
 %else
-  movifnidn             r4, r4mp
   pshufd            m6, m6, 0x08
   pshufd            m7, m7, 0x08
-  movq              [r4+0], m6
-  movq              [r4+8], m7
+  movifnidn             resultq, resultmp
+  movq              [resultq+0], m6
+  movq              [resultq+8], m7
   RET
 %endif
 %endmacro
@@ -255,3 +404,25 @@
 SADNXN4D 32,  8
 SADNXN4D 16, 64
 SADNXN4D 64, 16
+SADNXN4D 128, 128, 1
+SADNXN4D 128, 64, 1
+SADNXN4D 64,  128, 1
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16,  8, 1
+SADNXN4D  8, 16, 1
+SADNXN4D  8,  8, 1
+SADNXN4D  8,  4, 1
+SADNXN4D  4,  8, 1
+SADNXN4D  4,  4, 1
+SADNXN4D  4, 16, 1
+SADNXN4D 16,  4, 1
+SADNXN4D  8, 32, 1
+SADNXN4D 32,  8, 1
+SADNXN4D 16, 64, 1
+SADNXN4D 64, 16, 1
diff --git a/libaom/aom_dsp/x86/sad_highbd_avx2.c b/libaom/aom_dsp/x86/sad_highbd_avx2.c
index b506d46..2cff2e6 100644
--- a/libaom/aom_dsp/x86/sad_highbd_avx2.c
+++ b/libaom/aom_dsp/x86/sad_highbd_avx2.c
@@ -37,487 +37,257 @@
   return (unsigned int)_mm_cvtsi128_si32(lo128);
 }
 
-unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride,
-                                     const uint8_t *ref, int ref_stride) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
-
-  // first 4 rows
-  __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-  __m256i u0 = _mm256_sub_epi16(s0, r0);
-  __m256i u1 = _mm256_sub_epi16(s1, r1);
-  __m256i u2 = _mm256_sub_epi16(s2, r2);
-  __m256i u3 = _mm256_sub_epi16(s3, r3);
-  __m256i zero = _mm256_setzero_si256();
-  __m256i sum0, sum1;
-
-  u0 = _mm256_abs_epi16(u0);
-  u1 = _mm256_abs_epi16(u1);
-  u2 = _mm256_abs_epi16(u2);
-  u3 = _mm256_abs_epi16(u3);
-
-  sum0 = _mm256_add_epi16(u0, u1);
-  sum0 = _mm256_add_epi16(sum0, u2);
-  sum0 = _mm256_add_epi16(sum0, u3);
-
-  // second 4 rows
-  src_ptr += src_stride << 2;
-  ref_ptr += ref_stride << 2;
-  s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-  u0 = _mm256_sub_epi16(s0, r0);
-  u1 = _mm256_sub_epi16(s1, r1);
-  u2 = _mm256_sub_epi16(s2, r2);
-  u3 = _mm256_sub_epi16(s3, r3);
-
-  u0 = _mm256_abs_epi16(u0);
-  u1 = _mm256_abs_epi16(u1);
-  u2 = _mm256_abs_epi16(u2);
-  u3 = _mm256_abs_epi16(u3);
-
-  sum1 = _mm256_add_epi16(u0, u1);
-  sum1 = _mm256_add_epi16(sum1, u2);
-  sum1 = _mm256_add_epi16(sum1, u3);
-
-  // find out the SAD
-  s0 = _mm256_unpacklo_epi16(sum0, zero);
-  s1 = _mm256_unpackhi_epi16(sum0, zero);
-  r0 = _mm256_unpacklo_epi16(sum1, zero);
-  r1 = _mm256_unpackhi_epi16(sum1, zero);
-  s0 = _mm256_add_epi32(s0, s1);
-  r0 = _mm256_add_epi32(r0, r1);
-  sum0 = _mm256_add_epi32(s0, r0);
-  // 8 32-bit summation
-
-  return (unsigned int)get_sad_from_mm256_epi32(&sum0);
-}
-
-unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3;
-  __m256i sum0;
-  __m256i sum = _mm256_setzero_si256();
+static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r,
+                                            __m256i *sad_acc) {
   const __m256i zero = _mm256_setzero_si256();
-  int row = 0;
-
-  // Loop for every 4 rows
-  while (row < 16) {
-    s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-    s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-    s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-    s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-    r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-    r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-    r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-    r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-    u0 = _mm256_sub_epi16(s0, r0);
-    u1 = _mm256_sub_epi16(s1, r1);
-    u2 = _mm256_sub_epi16(s2, r2);
-    u3 = _mm256_sub_epi16(s3, r3);
-
-    u0 = _mm256_abs_epi16(u0);
-    u1 = _mm256_abs_epi16(u1);
-    u2 = _mm256_abs_epi16(u2);
-    u3 = _mm256_abs_epi16(u3);
-
-    sum0 = _mm256_add_epi16(u0, u1);
-    sum0 = _mm256_add_epi16(sum0, u2);
-    sum0 = _mm256_add_epi16(sum0, u3);
-
-    s0 = _mm256_unpacklo_epi16(sum0, zero);
-    s1 = _mm256_unpackhi_epi16(sum0, zero);
-    sum = _mm256_add_epi32(sum, s0);
-    sum = _mm256_add_epi32(sum, s1);
-    // 8 32-bit summation
-
-    row += 4;
-    src_ptr += src_stride << 2;
-    ref_ptr += ref_stride << 2;
+  int i;
+  for (i = 0; i < 4; i++) {
+    s[i] = _mm256_sub_epi16(s[i], r[i]);
+    s[i] = _mm256_abs_epi16(s[i]);
   }
-  return get_sad_from_mm256_epi32(&sum);
-}
-
-static void sad32x4(const uint16_t *src_ptr, int src_stride,
-                    const uint16_t *ref_ptr, int ref_stride,
-                    const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3;
-  const __m256i zero = _mm256_setzero_si256();
-  int row_sections = 0;
-
-  while (row_sections < 2) {
-    s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-    s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-    s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-    s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
-
-    r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-    r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-    r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-    r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
-
-    if (sec_ptr) {
-      r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
-      r1 = _mm256_avg_epu16(
-          r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-      r2 = _mm256_avg_epu16(
-          r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-      r3 = _mm256_avg_epu16(
-          r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    }
-    s0 = _mm256_sub_epi16(s0, r0);
-    s1 = _mm256_sub_epi16(s1, r1);
-    s2 = _mm256_sub_epi16(s2, r2);
-    s3 = _mm256_sub_epi16(s3, r3);
-
-    s0 = _mm256_abs_epi16(s0);
-    s1 = _mm256_abs_epi16(s1);
-    s2 = _mm256_abs_epi16(s2);
-    s3 = _mm256_abs_epi16(s3);
-
-    s0 = _mm256_add_epi16(s0, s1);
-    s0 = _mm256_add_epi16(s0, s2);
-    s0 = _mm256_add_epi16(s0, s3);
-
-    r0 = _mm256_unpacklo_epi16(s0, zero);
-    r1 = _mm256_unpackhi_epi16(s0, zero);
-
-    r0 = _mm256_add_epi32(r0, r1);
-    *sad_acc = _mm256_add_epi32(*sad_acc, r0);
-
-    row_sections += 1;
-    src_ptr += src_stride << 1;
-    ref_ptr += ref_stride << 1;
-    if (sec_ptr) sec_ptr += 32 << 1;
-  }
-}
-
-unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  const int left_shift = 2;
-  int row_section = 0;
-
-  while (row_section < 4) {
-    sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 4;
-  ref += ref_stride << 4;
-  sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 4;
-  ref += ref_stride << 4;
-  sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 5;
-  ref += ref_stride << 5;
-  sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-static void sad64x2(const uint16_t *src_ptr, int src_stride,
-                    const uint16_t *ref_ptr, int ref_stride,
-                    const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s[8], r[8];
-  const __m256i zero = _mm256_setzero_si256();
-
-  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
-  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
-  s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
-  s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32));
-  s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48));
-
-  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
-  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
-  r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
-  r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32));
-  r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48));
-
-  if (sec_ptr) {
-    r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r[1] = _mm256_avg_epu16(
-        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r[2] = _mm256_avg_epu16(
-        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r[3] = _mm256_avg_epu16(
-        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    r[4] = _mm256_avg_epu16(
-        r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
-    r[5] = _mm256_avg_epu16(
-        r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
-    r[6] = _mm256_avg_epu16(
-        r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
-    r[7] = _mm256_avg_epu16(
-        r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
-  }
-
-  s[0] = _mm256_sub_epi16(s[0], r[0]);
-  s[1] = _mm256_sub_epi16(s[1], r[1]);
-  s[2] = _mm256_sub_epi16(s[2], r[2]);
-  s[3] = _mm256_sub_epi16(s[3], r[3]);
-  s[4] = _mm256_sub_epi16(s[4], r[4]);
-  s[5] = _mm256_sub_epi16(s[5], r[5]);
-  s[6] = _mm256_sub_epi16(s[6], r[6]);
-  s[7] = _mm256_sub_epi16(s[7], r[7]);
-
-  s[0] = _mm256_abs_epi16(s[0]);
-  s[1] = _mm256_abs_epi16(s[1]);
-  s[2] = _mm256_abs_epi16(s[2]);
-  s[3] = _mm256_abs_epi16(s[3]);
-  s[4] = _mm256_abs_epi16(s[4]);
-  s[5] = _mm256_abs_epi16(s[5]);
-  s[6] = _mm256_abs_epi16(s[6]);
-  s[7] = _mm256_abs_epi16(s[7]);
 
   s[0] = _mm256_add_epi16(s[0], s[1]);
   s[0] = _mm256_add_epi16(s[0], s[2]);
   s[0] = _mm256_add_epi16(s[0], s[3]);
 
-  s[4] = _mm256_add_epi16(s[4], s[5]);
-  s[4] = _mm256_add_epi16(s[4], s[6]);
-  s[4] = _mm256_add_epi16(s[4], s[7]);
-
   r[0] = _mm256_unpacklo_epi16(s[0], zero);
   r[1] = _mm256_unpackhi_epi16(s[0], zero);
-  r[2] = _mm256_unpacklo_epi16(s[4], zero);
-  r[3] = _mm256_unpackhi_epi16(s[4], zero);
 
   r[0] = _mm256_add_epi32(r[0], r[1]);
-  r[0] = _mm256_add_epi32(r[0], r[2]);
-  r[0] = _mm256_add_epi32(r[0], r[3]);
   *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
 }
 
-unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  const int left_shift = 1;
-  int row_section = 0;
-
-  while (row_section < 16) {
-    sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 5;
-  ref += ref_stride << 5;
-  sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
-                     const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s[8], r[8];
-  const __m256i zero = _mm256_setzero_si256();
-
-  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
-  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
-  s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64));
-  s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80));
-  s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96));
-  s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112));
-
-  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
-  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
-  r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64));
-  r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80));
-  r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96));
-  r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112));
-
-  if (sec_ptr) {
-    r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r[1] = _mm256_avg_epu16(
-        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r[2] = _mm256_avg_epu16(
-        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r[3] = _mm256_avg_epu16(
-        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    r[4] = _mm256_avg_epu16(
-        r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
-    r[5] = _mm256_avg_epu16(
-        r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
-    r[6] = _mm256_avg_epu16(
-        r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
-    r[7] = _mm256_avg_epu16(
-        r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
-  }
-
-  s[0] = _mm256_sub_epi16(s[0], r[0]);
-  s[1] = _mm256_sub_epi16(s[1], r[1]);
-  s[2] = _mm256_sub_epi16(s[2], r[2]);
-  s[3] = _mm256_sub_epi16(s[3], r[3]);
-  s[4] = _mm256_sub_epi16(s[4], r[4]);
-  s[5] = _mm256_sub_epi16(s[5], r[5]);
-  s[6] = _mm256_sub_epi16(s[6], r[6]);
-  s[7] = _mm256_sub_epi16(s[7], r[7]);
-
-  s[0] = _mm256_abs_epi16(s[0]);
-  s[1] = _mm256_abs_epi16(s[1]);
-  s[2] = _mm256_abs_epi16(s[2]);
-  s[3] = _mm256_abs_epi16(s[3]);
-  s[4] = _mm256_abs_epi16(s[4]);
-  s[5] = _mm256_abs_epi16(s[5]);
-  s[6] = _mm256_abs_epi16(s[6]);
-  s[7] = _mm256_abs_epi16(s[7]);
-
-  s[0] = _mm256_add_epi16(s[0], s[1]);
-  s[0] = _mm256_add_epi16(s[0], s[2]);
-  s[0] = _mm256_add_epi16(s[0], s[3]);
-
-  s[4] = _mm256_add_epi16(s[4], s[5]);
-  s[4] = _mm256_add_epi16(s[4], s[6]);
-  s[4] = _mm256_add_epi16(s[4], s[7]);
-
-  r[0] = _mm256_unpacklo_epi16(s[0], zero);
-  r[1] = _mm256_unpackhi_epi16(s[0], zero);
-  r[2] = _mm256_unpacklo_epi16(s[4], zero);
-  r[3] = _mm256_unpackhi_epi16(s[4], zero);
-
-  r[0] = _mm256_add_epi32(r[0], r[1]);
-  r[0] = _mm256_add_epi32(r[0], r[2]);
-  r[0] = _mm256_add_epi32(r[0], r[3]);
-  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
-}
-
-unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  int row = 0;
-  while (row < 64) {
-    sad128x1(srcp, refp, NULL, &sad);
-    srcp += src_stride;
-    refp += ref_stride;
-    row += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 6;
-  ref += ref_stride << 6;
-  sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
-                                        const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 6;
-  ref += ref_stride << 6;
-  sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
 // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
 static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
                            const uint16_t *ref_ptr, int ref_stride,
                            const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3;
-  const __m256i zero = _mm256_setzero_si256();
+  __m256i s[4], r[4];
+  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
 
-  s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
 
   if (sec_ptr) {
-    r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r1 = _mm256_avg_epu16(r1,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r2 = _mm256_avg_epu16(r2,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r3 = _mm256_avg_epu16(r3,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+    r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+    r[1] = _mm256_avg_epu16(
+        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+    r[2] = _mm256_avg_epu16(
+        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+    r[3] = _mm256_avg_epu16(
+        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+  }
+  highbd_sad16x4_core_avx2(s, r, sad_acc);
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N,
+                                                             const uint8_t *src,
+                                                             int src_stride,
+                                                             const uint8_t *ref,
+                                                             int ref_stride) {
+  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  int i;
+  __m256i sad = _mm256_setzero_si256();
+  for (i = 0; i < N; i += 4) {
+    sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad);
+    src_ptr += src_stride << 2;
+    ref_ptr += ref_stride << 2;
+  }
+  return (unsigned int)get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad32x4(const uint16_t *src_ptr, int src_stride,
+                    const uint16_t *ref_ptr, int ref_stride,
+                    const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int row_sections = 0;
+
+  while (row_sections < 2) {
+    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+
+    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
+
+    if (sec_ptr) {
+      r[0] =
+          _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+      r[1] = _mm256_avg_epu16(
+          r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+      r[2] = _mm256_avg_epu16(
+          r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+      r[3] = _mm256_avg_epu16(
+          r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+      sec_ptr += 32 << 1;
+    }
+    highbd_sad16x4_core_avx2(s, r, sad_acc);
+
+    row_sections += 1;
+    src_ptr += src_stride << 1;
+    ref_ptr += ref_stride << 1;
+  }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N,
+                                                             const uint8_t *src,
+                                                             int src_stride,
+                                                             const uint8_t *ref,
+                                                             int ref_stride) {
+  __m256i sad = _mm256_setzero_si256();
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+  const int left_shift = 2;
+  int i;
+
+  for (i = 0; i < N; i += 4) {
+    sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
+    srcp += src_stride << left_shift;
+    refp += ref_stride << left_shift;
+  }
+  return get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad64x2(const uint16_t *src_ptr, int src_stride,
+                    const uint16_t *ref_ptr, int ref_stride,
+                    const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int i;
+  for (i = 0; i < 2; i++) {
+    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+
+    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+    if (sec_ptr) {
+      r[0] =
+          _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+      r[1] = _mm256_avg_epu16(
+          r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+      r[2] = _mm256_avg_epu16(
+          r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+      r[3] = _mm256_avg_epu16(
+          r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+      sec_ptr += 64;
+    }
+    highbd_sad16x4_core_avx2(s, r, sad_acc);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N,
+                                                             const uint8_t *src,
+                                                             int src_stride,
+                                                             const uint8_t *ref,
+                                                             int ref_stride) {
+  __m256i sad = _mm256_setzero_si256();
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+  const int left_shift = 1;
+  int i;
+  for (i = 0; i < N; i += 2) {
+    sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
+    srcp += src_stride << left_shift;
+    refp += ref_stride << left_shift;
+  }
+  return get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
+                     const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int i;
+  for (i = 0; i < 2; i++) {
+    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+    if (sec_ptr) {
+      r[0] =
+          _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+      r[1] = _mm256_avg_epu16(
+          r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+      r[2] = _mm256_avg_epu16(
+          r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+      r[3] = _mm256_avg_epu16(
+          r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+      sec_ptr += 64;
+    }
+    highbd_sad16x4_core_avx2(s, r, sad_acc);
+    src_ptr += 64;
+    ref_ptr += 64;
+  }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2(
+    int N, const uint8_t *src, int src_stride, const uint8_t *ref,
+    int ref_stride) {
+  __m256i sad = _mm256_setzero_si256();
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+  int row = 0;
+  while (row < N) {
+    sad128x1(srcp, refp, NULL, &sad);
+    srcp += src_stride;
+    refp += ref_stride;
+    row++;
+  }
+  return get_sad_from_mm256_epi32(&sad);
+}
+
+#define highbd_sadMxN_avx2(m, n)                                            \
+  unsigned int aom_highbd_sad##m##x##n##_avx2(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride) {                                                     \
+    return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \
   }
 
-  s0 = _mm256_sub_epi16(s0, r0);
-  s1 = _mm256_sub_epi16(s1, r1);
-  s2 = _mm256_sub_epi16(s2, r2);
-  s3 = _mm256_sub_epi16(s3, r3);
+highbd_sadMxN_avx2(16, 4);
+highbd_sadMxN_avx2(16, 8);
+highbd_sadMxN_avx2(16, 16);
+highbd_sadMxN_avx2(16, 32);
+highbd_sadMxN_avx2(16, 64);
 
-  s0 = _mm256_abs_epi16(s0);
-  s1 = _mm256_abs_epi16(s1);
-  s2 = _mm256_abs_epi16(s2);
-  s3 = _mm256_abs_epi16(s3);
+highbd_sadMxN_avx2(32, 8);
+highbd_sadMxN_avx2(32, 16);
+highbd_sadMxN_avx2(32, 32);
+highbd_sadMxN_avx2(32, 64);
 
-  s0 = _mm256_add_epi16(s0, s1);
-  s0 = _mm256_add_epi16(s0, s2);
-  s0 = _mm256_add_epi16(s0, s3);
+highbd_sadMxN_avx2(64, 16);
+highbd_sadMxN_avx2(64, 32);
+highbd_sadMxN_avx2(64, 64);
+highbd_sadMxN_avx2(64, 128);
 
-  r0 = _mm256_unpacklo_epi16(s0, zero);
-  r1 = _mm256_unpackhi_epi16(s0, zero);
+highbd_sadMxN_avx2(128, 64);
+highbd_sadMxN_avx2(128, 128);
 
-  r0 = _mm256_add_epi32(r0, r1);
-  *sad_acc = _mm256_add_epi32(*sad_acc, r0);
+unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         const uint8_t *second_pred) {
+  __m256i sad = _mm256_setzero_si256();
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+  sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+
+  return get_sad_from_mm256_epi32(&sad);
 }
 
 unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
@@ -566,6 +336,40 @@
   return sum;
 }
 
+unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int left_shift = 5;
+  uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
+                                              second_pred);
+  src += src_stride << left_shift;
+  ref += ref_stride << left_shift;
+  second_pred += 16 << left_shift;
+  sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
+                                      second_pred);
+  return sum;
+}
+
+unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         const uint8_t *second_pred) {
+  __m256i sad = _mm256_setzero_si256();
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+  const int left_shift = 2;
+  int row_section = 0;
+
+  while (row_section < 2) {
+    sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+    srcp += src_stride << left_shift;
+    refp += ref_stride << left_shift;
+    secp += 32 << left_shift;
+    row_section += 1;
+  }
+  return get_sad_from_mm256_epi32(&sad);
+}
+
 unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           const uint8_t *second_pred) {
@@ -614,6 +418,26 @@
   return sum;
 }
 
+unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  __m256i sad = _mm256_setzero_si256();
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+  const int left_shift = 1;
+  int row_section = 0;
+
+  while (row_section < 8) {
+    sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
+    srcp += src_stride << left_shift;
+    refp += ref_stride << left_shift;
+    secp += 64 << left_shift;
+    row_section += 1;
+  }
+  return get_sad_from_mm256_epi32(&sad);
+}
+
 unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           const uint8_t *second_pred) {
@@ -697,7 +521,7 @@
 }
 
 // SAD 4D
-// Combine 4 __m256i vectors to uint32_t result[4]
+// Combine the 4 __m256i input vectors v into uint32_t result[4]
 static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
                                                uint32_t *res) {
   __m256i u0, u1, u2, u3;
@@ -752,287 +576,124 @@
   s[3] = _mm256_setzero_si256();
 }
 
-void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride,
-                                const uint8_t *const ref_array[],
-                                int ref_stride, uint32_t *sad_array) {
+static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2(
+    int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
+    int ref_stride, uint32_t *sad_array) {
   __m256i sad_vec[4];
   const uint16_t *refp[4];
   const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
   const uint16_t *srcp;
   const int shift_for_4_rows = 2;
-  int i;
+  int i, j;
 
   init_sad(sad_vec);
   convert_pointers(ref_array, refp);
 
   for (i = 0; i < 4; ++i) {
     srcp = keep;
-    sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-    srcp += src_stride << shift_for_4_rows;
-    refp[i] += ref_stride << shift_for_4_rows;
-    sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first8rows[4];
-  uint32_t second8rows[4];
-  const uint8_t *ref[4];
-  const int shift_for_8_rows = 3;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows);
-  src += src_stride << shift_for_8_rows;
-  ref[0] += ref_stride << shift_for_8_rows;
-  ref[1] += ref_stride << shift_for_8_rows;
-  ref[2] += ref_stride << shift_for_8_rows;
-  ref[3] += ref_stride << shift_for_8_rows;
-  aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows);
-  sad_array[0] = first8rows[0] + second8rows[0];
-  sad_array[1] = first8rows[1] + second8rows[1];
-  sad_array[2] = first8rows[2] + second8rows[2];
-  sad_array[3] = first8rows[3] + second8rows[3];
-}
-
-void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 4;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_4_rows = 2;
-  int i;
-  int rows_section;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    rows_section = 0;
-    while (rows_section < 4) {
-      sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+    for (j = 0; j < N; j += 4) {
+      sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
       srcp += src_stride << shift_for_4_rows;
       refp[i] += ref_stride << shift_for_4_rows;
-      rows_section++;
     }
   }
   get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
 }
 
-void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 4;
+static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2(
+    int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
+    int ref_stride, uint32_t *sad_array) {
+  __m256i sad_vec[4];
+  const uint16_t *refp[4];
+  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *srcp;
+  const int shift_for_4_rows = 2;
+  int i, r;
 
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
+  init_sad(sad_vec);
+  convert_pointers(ref_array, refp);
 
-  aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
+  for (i = 0; i < 4; ++i) {
+    srcp = keep;
+    for (r = 0; r < N; r += 4) {
+      sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+      srcp += src_stride << shift_for_4_rows;
+      refp[i] += ref_stride << shift_for_4_rows;
+    }
+  }
+  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
 }
 
-void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 5;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
+static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2(
+    int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
+    int ref_stride, uint32_t *sad_array) {
   __m256i sad_vec[4];
   const uint16_t *refp[4];
   const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
   const uint16_t *srcp;
   const int shift_for_rows = 1;
-  int i;
-  int rows_section;
+  int i, r;
 
   init_sad(sad_vec);
   convert_pointers(ref_array, refp);
 
   for (i = 0; i < 4; ++i) {
     srcp = keep;
-    rows_section = 0;
-    while (rows_section < 16) {
+    for (r = 0; r < N; r += 2) {
       sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
       srcp += src_stride << shift_for_rows;
       refp[i] += ref_stride << shift_for_rows;
-      rows_section++;
     }
   }
   get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
 }
 
-void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 5;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
-                                  const uint8_t *const ref_array[],
-                                  int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 6;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
-                                  const uint8_t *const ref_array[],
-                                  int ref_stride, uint32_t *sad_array) {
+static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2(
+    int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
+    int ref_stride, uint32_t *sad_array) {
   __m256i sad_vec[4];
   const uint16_t *refp[4];
   const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
   const uint16_t *srcp;
-  int i;
-  int rows_section;
+  int i, r;
 
   init_sad(sad_vec);
   convert_pointers(ref_array, refp);
 
   for (i = 0; i < 4; ++i) {
     srcp = keep;
-    rows_section = 0;
-    while (rows_section < 64) {
+    for (r = 0; r < N; r++) {
       sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
       srcp += src_stride;
       refp[i] += ref_stride;
-      rows_section++;
     }
   }
   get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
 }
 
-void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref_array[],
-                                   int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 6;
+#define highbd_sadMxNx4d_avx2(m, n)                                          \
+  void aom_highbd_sad##m##x##n##x4d_avx2(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],  \
+      int ref_stride, uint32_t *sad_array) {                                 \
+    aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \
+                                  sad_array);                                \
+  }
 
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
+highbd_sadMxNx4d_avx2(16, 4);
+highbd_sadMxNx4d_avx2(16, 8);
+highbd_sadMxNx4d_avx2(16, 16);
+highbd_sadMxNx4d_avx2(16, 32);
+highbd_sadMxNx4d_avx2(16, 64);
 
-  aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
+highbd_sadMxNx4d_avx2(32, 8);
+highbd_sadMxNx4d_avx2(32, 16);
+highbd_sadMxNx4d_avx2(32, 32);
+highbd_sadMxNx4d_avx2(32, 64);
+
+highbd_sadMxNx4d_avx2(64, 16);
+highbd_sadMxNx4d_avx2(64, 32);
+highbd_sadMxNx4d_avx2(64, 64);
+highbd_sadMxNx4d_avx2(64, 128);
+
+highbd_sadMxNx4d_avx2(128, 64);
+highbd_sadMxNx4d_avx2(128, 128);
diff --git a/libaom/aom_dsp/x86/sad_impl_avx2.c b/libaom/aom_dsp/x86/sad_impl_avx2.c
index c6fd62c..f77a585 100644
--- a/libaom/aom_dsp/x86/sad_impl_avx2.c
+++ b/libaom/aom_dsp/x86/sad_impl_avx2.c
@@ -84,81 +84,6 @@
   return sum;
 }
 
-static void sad64x64x4d(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
-                        __m128i *res) {
-  uint32_t sum[4];
-  aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum);
-  *res = _mm_loadu_si128((const __m128i *)sum);
-}
-
-void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
-                           const uint8_t *const ref[4], int ref_stride,
-                           uint32_t res[4]) {
-  __m128i sum0, sum1;
-  const uint8_t *rf[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
-  src += src_stride << 6;
-  rf[0] += ref_stride << 6;
-  rf[1] += ref_stride << 6;
-  rf[2] += ref_stride << 6;
-  rf[3] += ref_stride << 6;
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
-  sum0 = _mm_add_epi32(sum0, sum1);
-  _mm_storeu_si128((__m128i *)res, sum0);
-}
-
-void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
-                           const uint8_t *const ref[4], int ref_stride,
-                           uint32_t res[4]) {
-  __m128i sum0, sum1;
-  unsigned int half_width = 64;
-  const uint8_t *rf[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
-  src += half_width;
-  rf[0] += half_width;
-  rf[1] += half_width;
-  rf[2] += half_width;
-  rf[3] += half_width;
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
-  sum0 = _mm_add_epi32(sum0, sum1);
-  _mm_storeu_si128((__m128i *)res, sum0);
-}
-
-void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
-                            const uint8_t *const ref[4], int ref_stride,
-                            uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += src_stride << 6;
-  rf[0] += ref_stride << 6;
-  rf[1] += ref_stride << 6;
-  rf[2] += ref_stride << 6;
-  rf[3] += ref_stride << 6;
-  aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
-
 static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      const int h, const uint8_t *second_pred,
diff --git a/libaom/aom_dsp/x86/sse_avx2.c b/libaom/aom_dsp/x86/sse_avx2.c
index 42df981..e6ee2fc 100644
--- a/libaom/aom_dsp/x86/sse_avx2.c
+++ b/libaom/aom_dsp/x86/sse_avx2.c
@@ -45,6 +45,7 @@
   return sum;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
   const __m256i sum0_4x64 =
       _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
@@ -63,6 +64,7 @@
   xx_storel_64(&sum, sum_1x64);
   return sum;
 }
+#endif
 
 static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride, __m256i *sum) {
@@ -211,6 +213,7 @@
   return sse;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
                                        const uint16_t *b) {
   const __m256i v_a_w = yy_loadu_256(a);
@@ -378,3 +381,4 @@
   }
   return sse;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/x86/sse_sse4.c b/libaom/aom_dsp/x86/sse_sse4.c
index 0d45003..5f95eb9 100644
--- a/libaom/aom_dsp/x86/sse_sse4.c
+++ b/libaom/aom_dsp/x86/sse_sse4.c
@@ -28,12 +28,14 @@
   return sum;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
   const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
   const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
   *sum64 = _mm_add_epi64(sum0, *sum64);
   *sum64 = _mm_add_epi64(sum1, *sum64);
 }
+#endif
 
 static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
                                   const uint8_t *b) {
@@ -49,9 +51,8 @@
   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
 }
 
-static INLINE void aom_sse4x2_sse4_1(const uint8_t *a, int a_stride,
-                                     const uint8_t *b, int b_stride,
-                                     __m128i *sum) {
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m128i *sum) {
   const __m128i v_a0 = xx_loadl_32(a);
   const __m128i v_a1 = xx_loadl_32(a + a_stride);
   const __m128i v_b0 = xx_loadl_32(b);
@@ -61,8 +62,8 @@
   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
 }
-static INLINE void aom_sse8_sse4_1(const uint8_t *a, const uint8_t *b,
-                                   __m128i *sum) {
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+                               __m128i *sum) {
   const __m128i v_a0 = xx_loadl_64(a);
   const __m128i v_b0 = xx_loadl_64(b);
   const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
@@ -79,7 +80,7 @@
   switch (width) {
     case 4:
       do {
-        aom_sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+        sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
         a += a_stride << 1;
         b += b_stride << 1;
         y += 2;
@@ -88,7 +89,7 @@
       break;
     case 8:
       do {
-        aom_sse8_sse4_1(a, b, &sum);
+        sse8_sse4_1(a, b, &sum);
         a += a_stride;
         b += b_stride;
         y += 1;
@@ -147,11 +148,11 @@
         do {
           int i = 0;
           do {
-            aom_sse8_sse4_1(a + i, b + i, &sum);
-            aom_sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
+            sse8_sse4_1(a + i, b + i, &sum);
+            sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
             i += 8;
           } while (i + 4 < width);
-          aom_sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
+          sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
           a += (a_stride << 1);
           b += (b_stride << 1);
           y += 2;
@@ -160,7 +161,7 @@
         do {
           int i = 0;
           do {
-            aom_sse8_sse4_1(a + i, b + i, &sum);
+            sse8_sse4_1(a + i, b + i, &sum);
             i += 8;
           } while (i < width);
           a += a_stride;
@@ -175,6 +176,7 @@
   return sse;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
                                           int a_stride, const uint16_t *b,
                                           int b_stride) {
@@ -348,3 +350,4 @@
   }
   return sse;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/aom_dsp/x86/subtract_avx2.c b/libaom/aom_dsp/x86/subtract_avx2.c
index 4389d12..4083160 100644
--- a/libaom/aom_dsp/x86/subtract_avx2.c
+++ b/libaom/aom_dsp/x86/subtract_avx2.c
@@ -26,7 +26,7 @@
   _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
 }
 
-static INLINE void aom_subtract_block_16xn_avx2(
+static INLINE void subtract_block_16xn_avx2(
     int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
     ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
   for (int32_t j = 0; j < rows; ++j) {
@@ -42,7 +42,7 @@
   }
 }
 
-static INLINE void aom_subtract_block_32xn_avx2(
+static INLINE void subtract_block_32xn_avx2(
     int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
     ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
   for (int32_t j = 0; j < rows; ++j) {
@@ -53,7 +53,7 @@
   }
 }
 
-static INLINE void aom_subtract_block_64xn_avx2(
+static INLINE void subtract_block_64xn_avx2(
     int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
     ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
   for (int32_t j = 0; j < rows; ++j) {
@@ -65,7 +65,7 @@
   }
 }
 
-static INLINE void aom_subtract_block_128xn_avx2(
+static INLINE void subtract_block_128xn_avx2(
     int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
     ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
   for (int32_t j = 0; j < rows; ++j) {
@@ -85,20 +85,20 @@
                              ptrdiff_t pred_stride) {
   switch (cols) {
     case 16:
-      aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
+      subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
       break;
     case 32:
-      aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
+      subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
       break;
     case 64:
-      aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
+      subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
       break;
     case 128:
-      aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                    src_stride, pred_ptr, pred_stride);
+      subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                src_stride, pred_ptr, pred_stride);
       break;
     default:
       aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
diff --git a/libaom/aom_dsp/x86/sum_squares_avx2.c b/libaom/aom_dsp/x86/sum_squares_avx2.c
index 0af44e3..97d78b6 100644
--- a/libaom/aom_dsp/x86/sum_squares_avx2.c
+++ b/libaom/aom_dsp/x86/sum_squares_avx2.c
@@ -77,3 +77,172 @@
     return aom_sum_squares_2d_i16_c(src, stride, width, height);
   }
 }
+
+// Accumulate sum of 16-bit elements in the vector
+static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) {
+  __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
+  __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
+  vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+  vtmp2 = _mm_srli_si128(vtmp1, 8);
+  vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+  vtmp2 = _mm_srli_si128(vtmp1, 4);
+  vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+  vtmp2 = _mm_srli_si128(vtmp1, 2);
+  vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+  return _mm_extract_epi16(vtmp1, 0);
+}
+
+// Accumulate sum of 32-bit elements in the vector
+static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) {
+  __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
+  __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
+  vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+  vtmp2 = _mm_srli_si128(vtmp1, 8);
+  vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+  vtmp2 = _mm_srli_si128(vtmp1, 4);
+  vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+  return _mm_cvtsi128_si32(vtmp1);
+}
+
+uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width,
+                            int height) {
+  uint8_t *srcp;
+  uint64_t s = 0, ss = 0;
+  __m256i vzero = _mm256_setzero_si256();
+  __m256i v_acc_sum = vzero;
+  __m256i v_acc_sqs = vzero;
+  int i, j;
+
+  // Process 32 elements in a row
+  for (i = 0; i < width - 31; i += 32) {
+    srcp = src + i;
+    // Process 8 columns at a time
+    for (j = 0; j < height - 7; j += 8) {
+      __m256i vsrc[8];
+      for (int k = 0; k < 8; k++) {
+        vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
+        srcp += src_stride;
+      }
+      for (int k = 0; k < 8; k++) {
+        __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero);
+        __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero);
+        v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0);
+        v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1);
+
+        __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);
+        __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);
+        v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+        v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1);
+      }
+
+      // Update total sum and clear the vectors
+      s += mm256_accumulate_epi16(v_acc_sum);
+      ss += mm256_accumulate_epi32(v_acc_sqs);
+      v_acc_sum = vzero;
+      v_acc_sqs = vzero;
+    }
+
+    // Process remaining rows (height not a multiple of 8)
+    for (; j < height; j++) {
+      __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
+      __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero);
+      __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero);
+      v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0);
+      v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1);
+
+      __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);
+      __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);
+      v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+      v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1);
+
+      srcp += src_stride;
+    }
+
+    // Update total sum and clear the vectors
+    s += mm256_accumulate_epi16(v_acc_sum);
+    ss += mm256_accumulate_epi32(v_acc_sqs);
+    v_acc_sum = vzero;
+    v_acc_sqs = vzero;
+  }
+
+  // Process the remaining area using C
+  srcp = src;
+  for (int k = 0; k < height; k++) {
+    for (int m = i; m < width; m++) {
+      uint8_t val = srcp[m];
+      s += val;
+      ss += val * val;
+    }
+    srcp += src_stride;
+  }
+  return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width,
+                             int height) {
+  uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
+  uint64_t s = 0, ss = 0;
+  __m256i vzero = _mm256_setzero_si256();
+  __m256i v_acc_sum = vzero;
+  __m256i v_acc_sqs = vzero;
+  int i, j;
+
+  // Process 16 elements in a row
+  for (i = 0; i < width - 15; i += 16) {
+    srcp = srcp1 + i;
+    // Process 8 rows at a time
+    for (j = 0; j < height - 8; j += 8) {
+      __m256i vsrc[8];
+      for (int k = 0; k < 8; k++) {
+        vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
+        srcp += src_stride;
+      }
+      for (int k = 0; k < 8; k++) {
+        __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero);
+        __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero);
+        v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
+        v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);
+
+        __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]);
+        v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+      }
+
+      // Update total sum and clear the vectors
+      s += mm256_accumulate_epi32(v_acc_sum);
+      ss += mm256_accumulate_epi32(v_acc_sqs);
+      v_acc_sum = vzero;
+      v_acc_sqs = vzero;
+    }
+
+    // Process remaining rows (height not a multiple of 8)
+    for (; j < height; j++) {
+      __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
+      __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero);
+      __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero);
+      v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
+      v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);
+
+      __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc);
+      v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+      srcp += src_stride;
+    }
+
+    // Update total sum and clear the vectors
+    s += mm256_accumulate_epi32(v_acc_sum);
+    ss += mm256_accumulate_epi32(v_acc_sqs);
+    v_acc_sum = vzero;
+    v_acc_sqs = vzero;
+  }
+
+  // Process the remaining area using C
+  srcp = srcp1;
+  for (int k = 0; k < height; k++) {
+    for (int m = i; m < width; m++) {
+      uint16_t val = srcp[m];
+      s += val;
+      ss += val * val;
+    }
+    srcp += src_stride;
+  }
+  return (ss - s * s / (width * height));
+}
diff --git a/libaom/aom_dsp/x86/sum_squares_sse2.c b/libaom/aom_dsp/x86/sum_squares_sse2.c
index 22d7739..85b301a 100644
--- a/libaom/aom_dsp/x86/sum_squares_sse2.c
+++ b/libaom/aom_dsp/x86/sum_squares_sse2.c
@@ -201,3 +201,166 @@
     return aom_sum_squares_i16_c(src, n);
   }
 }
+
// Horizontal add of the eight 16-bit lanes of a vector.  The total must fit
// in 16 bits (callers keep partial sums <= 32640), since the final extract
// zero-extends a single 16-bit lane.
static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) {
  // Fold halves together: 64-bit halves, then 32-bit pairs, then the last
  // 16-bit pair, leaving the full sum (mod 2^16) in lane 0.
  __m128i folded = _mm_add_epi16(vec_a, _mm_shuffle_epi32(vec_a, 0x4e));
  folded = _mm_add_epi16(folded, _mm_shuffle_epi32(folded, 0xb1));
  folded = _mm_add_epi16(folded, _mm_shufflelo_epi16(folded, 0xb1));
  return _mm_extract_epi16(folded, 0);
}
+
// Horizontal add of the four 32-bit lanes of a vector.
static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) {
  // Fold the 64-bit halves together, then the remaining 32-bit pair; lane 0
  // then holds the sum of all four lanes.
  __m128i folded = _mm_add_epi32(vec_a, _mm_shuffle_epi32(vec_a, 0x4e));
  folded = _mm_add_epi32(folded, _mm_shuffle_epi32(folded, 0xb1));
  return _mm_cvtsi128_si32(folded);
}
+
// Returns the sum of squared deviations, i.e. N * variance with
// N = width * height, of a width x height block of 8-bit samples:
//   sum(x^2) - sum(x)^2 / N   (integer division).
// Full 16-pixel-wide column groups are handled with SSE2; leftover columns
// fall through to the scalar tail at the bottom.
uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width,
                            int height) {
  uint8_t *srcp;
  uint64_t s = 0, ss = 0;  // running sum and running sum of squares
  __m128i vzero = _mm_setzero_si128();
  __m128i v_acc_sum = vzero;  // 8 x 16-bit partial sums
  __m128i v_acc_sqs = vzero;  // 4 x 32-bit partial sums of squares
  int i, j;

  // Process 16 elements in a row
  for (i = 0; i < width - 15; i += 16) {
    srcp = src + i;
    // Process 8 rows at a time.  Each 16-bit sum lane accumulates at most
    // 2 samples/row * 8 rows * 255 = 4080, and the final horizontal sum is
    // at most 8 * 4080 = 32640, so the 16-bit accumulator cannot overflow.
    for (j = 0; j < height - 7; j += 8) {
      __m128i vsrc[8];
      for (int k = 0; k < 8; k++) {
        vsrc[k] = _mm_loadu_si128((__m128i *)srcp);
        srcp += src_stride;
      }
      for (int k = 0; k < 8; k++) {
        // Zero-extend bytes to 16 bits before summing / squaring.
        __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero);
        __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero);
        v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0);
        v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1);

        // madd(v, v) on zero-extended bytes yields pairwise sums of squares
        // (each 32-bit lane <= 2 * 255^2), safely accumulated in 32 bits.
        __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0);
        __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1);
        v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
        v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1);
      }

      // Update total sum and clear the vectors
      s += mm_accumulate_epi16(v_acc_sum);
      ss += mm_accumulate_epi32(v_acc_sqs);
      v_acc_sum = vzero;
      v_acc_sqs = vzero;
    }

    // Process remaining rows (height not a multiple of 8)
    for (; j < height; j++) {
      __m128i vsrc = _mm_loadu_si128((__m128i *)srcp);
      __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero);
      __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero);
      v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0);
      v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1);

      __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0);
      __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1);
      v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
      v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1);

      srcp += src_stride;
    }

    // Update total sum and clear the vectors
    s += mm_accumulate_epi16(v_acc_sum);
    ss += mm_accumulate_epi32(v_acc_sqs);
    v_acc_sum = vzero;
    v_acc_sqs = vzero;
  }

  // Process the remaining (width % 16) columns using plain C.
  srcp = src;
  for (int k = 0; k < height; k++) {
    for (int m = i; m < width; m++) {
      uint8_t val = srcp[m];
      s += val;
      ss += val * val;
    }
    srcp += src_stride;
  }
  // sum(x^2) - sum(x)^2 / N, with truncating integer division.
  return (ss - s * s / (width * height));
}
+
// Returns the sum of squared deviations, i.e. N * variance with
// N = width * height, of a width x height block of 16-bit samples.
// `src` is a CONVERT_TO_SHORTPTR-style high-bitdepth pointer.
// NOTE(review): _mm_madd_epi16() squares the samples as *signed* 16-bit
// values, so the SIMD path assumes samples fit in 15 bits (true for
// 8/10/12-bit video) -- confirm callers never pass full-range 16-bit data.
uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width,
                             int height) {
  uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
  uint64_t s = 0, ss = 0;  // running sum and running sum of squares
  __m128i vzero = _mm_setzero_si128();
  __m128i v_acc_sum = vzero;  // 4 x 32-bit partial sums
  __m128i v_acc_sqs = vzero;  // 4 x 32-bit partial sums of squares
  int i, j;

  // Process 8 elements in a row.  (Bound tightened from `width - 8`, which
  // needlessly sent exact multiples of 8 to the scalar tail; `width - 7`
  // matches the u8 version's tight bound and yields identical results.)
  for (i = 0; i < width - 7; i += 8) {
    srcp = srcp1 + i;
    // Process 8 rows at a time (same bound tightening as above).
    for (j = 0; j < height - 7; j += 8) {
      __m128i vsrc[8];
      for (int k = 0; k < 8; k++) {
        vsrc[k] = _mm_loadu_si128((__m128i *)srcp);
        srcp += src_stride;
      }
      for (int k = 0; k < 8; k++) {
        // Zero-extend 16-bit samples to 32 bits and accumulate the sum.
        __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero);
        __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero);
        v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
        v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);

        // madd(v, v) gives pairwise sums of squares in 32-bit lanes.
        __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]);
        v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
      }

      // Flush the 8-row group into the 64-bit totals and reset the vectors
      // so the 32-bit lanes can never overflow.
      s += mm_accumulate_epi32(v_acc_sum);
      ss += mm_accumulate_epi32(v_acc_sqs);
      v_acc_sum = vzero;
      v_acc_sqs = vzero;
    }

    // Process remaining rows (height not a multiple of 8)
    for (; j < height; j++) {
      __m128i vsrc = _mm_loadu_si128((__m128i *)srcp);
      __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero);
      __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero);
      v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
      v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);

      __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc);
      v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
      srcp += src_stride;
    }

    // Update total sum and clear the vectors
    s += mm_accumulate_epi32(v_acc_sum);
    ss += mm_accumulate_epi32(v_acc_sqs);
    v_acc_sum = vzero;
    v_acc_sqs = vzero;
  }

  // Process the remaining (width % 8) columns using plain C.
  srcp = srcp1;
  for (int k = 0; k < height; k++) {
    for (int m = i; m < width; m++) {
      uint16_t val = srcp[m];
      s += val;
      // Widen before multiplying: val * val in (promoted) int overflows --
      // undefined behavior -- for samples above sqrt(INT_MAX) ~= 46341.
      ss += (uint64_t)val * val;
    }
    srcp += src_stride;
  }
  // sum(x^2) - sum(x)^2 / N, with truncating integer division.
  return (ss - s * s / (width * height));
}
diff --git a/libaom/aom_dsp/x86/transpose_sse2.h b/libaom/aom_dsp/x86/transpose_sse2.h
index d0d1ee6..7ac692c 100644
--- a/libaom/aom_dsp/x86/transpose_sse2.h
+++ b/libaom/aom_dsp/x86/transpose_sse2.h
@@ -17,7 +17,7 @@
 #include "config/aom_config.h"
 
 static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
-  // Unpack 16 bit elements. Goes from:
+  // Unpack 8 bit elements. Goes from:
   // in[0]: 00 01 02 03
   // in[1]: 10 11 12 13
   // in[2]: 20 21 22 23
@@ -28,7 +28,7 @@
   const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
   const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
 
-  // Unpack 32 bit elements resulting in:
+  // Unpack 16 bit elements resulting in:
   // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
   return _mm_unpacklo_epi16(a0, a1);
 }
diff --git a/libaom/aom_dsp/x86/txfm_common_avx2.h b/libaom/aom_dsp/x86/txfm_common_avx2.h
index 06a77e7..ea57c9f 100644
--- a/libaom/aom_dsp/x86/txfm_common_avx2.h
+++ b/libaom/aom_dsp/x86/txfm_common_avx2.h
@@ -114,58 +114,85 @@
   }
 }
 
// Transposes two 8x8 tiles of 16-bit elements in parallel.  Every unpack
// intrinsic below operates independently on each 128-bit lane, so the low
// lanes of in[0..7] form one 8x8 tile and the high lanes another; each tile
// is transposed within its own lane.  (Only indices 0..7 of t[]/u[] are
// used; the arrays are simply over-sized.)
static INLINE void transpose2_8x8_avx2(const __m256i *const in,
                                       __m256i *const out) {
  __m256i t[16], u[16];
  // Stage 1: interleave 16-bit elements of adjacent row pairs.
  // (1st, 2nd) ==> (lo, hi)
  //   (0, 1)   ==>  (0, 1)
  //   (2, 3)   ==>  (2, 3)
  //   (4, 5)   ==>  (4, 5)
  //   (6, 7)   ==>  (6, 7)
  for (int i = 0; i < 4; i++) {
    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
  }

  // Stage 2: interleave 32-bit groups two rows apart.
  // (1st, 2nd) ==> (lo, hi)
  //   (0, 2)   ==>  (0, 2)
  //   (1, 3)   ==>  (1, 3)
  //   (4, 6)   ==>  (4, 6)
  //   (5, 7)   ==>  (5, 7)
  for (int i = 0; i < 2; i++) {
    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);

    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
  }

  // Stage 3: interleave 64-bit groups four rows apart, producing the
  // transposed rows in their final order.
  // (1st, 2nd) ==> (lo, hi)
  //   (0, 4)   ==>  (0, 1)
  //   (1, 5)   ==>  (4, 5)
  //   (2, 6)   ==>  (2, 3)
  //   (3, 7)   ==>  (6, 7)
  for (int i = 0; i < 2; i++) {
    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);

    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
  }
}
+
 static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
                                               __m256i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  08 09 0a 0b  04 05 06 07  0c 0d 0e 0f
-  // in[1]: 10 11 12 13  18 19 1a 1b  14 15 16 17  1c 1d 1e 1f
-  // in[2]: 20 21 22 23  28 29 2a 2b  24 25 26 27  2c 2d 2e 2f
-  // in[3]: 30 31 32 33  38 39 3a 3b  34 35 36 37  3c 3d 3e 3f
-  // in[4]: 40 41 42 43  48 49 4a 4b  44 45 46 47  4c 4d 4e 4f
-  // in[5]: 50 51 52 53  58 59 5a 5b  54 55 56 57  5c 5d 5e 5f
-  // in[6]: 60 61 62 63  68 69 6a 6b  64 65 66 67  6c 6d 6e 6f
-  // in[7]: 70 71 72 73  78 79 7a 7b  74 75 76 77  7c 7d 7e 7f
-  // in[8]: 80 81 82 83  88 89 8a 8b  84 85 86 87  8c 8d 8e 8f
-  // to:
-  // a0:    00 10 01 11  02 12 03 13  04 14 05 15  06 16 07 17
-  // a1:    20 30 21 31  22 32 23 33  24 34 25 35  26 36 27 37
-  // a2:    40 50 41 51  42 52 43 53  44 54 45 55  46 56 47 57
-  // a3:    60 70 61 71  62 72 63 73  64 74 65 75  66 76 67 77
-  // ...
-  __m256i a[16];
-  for (int i = 0; i < 16; i += 2) {
-    a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
-    a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
-  }
-  __m256i b[16];
-  for (int i = 0; i < 16; i += 2) {
-    b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
-    b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
-  }
-  __m256i c[16];
-  for (int i = 0; i < 16; i += 2) {
-    c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
-    c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
-  }
-  out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
-  out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
-  out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
-  out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
+  __m256i t[16];
 
-  out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
-  out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
-  out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
-  out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
+#define LOADL(idx)                                                            \
+  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+  t[idx] = _mm256_inserti128_si256(                                           \
+      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
 
-  out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
-  out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
-  out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
-  out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
+#define LOADR(idx)                                                           \
+  t[8 + idx] =                                                               \
+      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+  t[8 + idx] = _mm256_inserti128_si256(                                      \
+      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
 
-  out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
-  out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
-  out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
-  out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
+  // load left 8x16
+  LOADL(0)
+  LOADL(1)
+  LOADL(2)
+  LOADL(3)
+  LOADL(4)
+  LOADL(5)
+  LOADL(6)
+  LOADL(7)
+
+  // load right 8x16
+  LOADR(0)
+  LOADR(1)
+  LOADR(2)
+  LOADR(3)
+  LOADR(4)
+  LOADR(5)
+  LOADR(6)
+  LOADR(7)
+
+  // get the top 16x8 result
+  transpose2_8x8_avx2(t, out);
+  // get the bottom 16x8 result
+  transpose2_8x8_avx2(&t[8], &out[8]);
 }
 
 static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in,
diff --git a/libaom/aom_dsp/x86/txfm_common_sse2.h b/libaom/aom_dsp/x86/txfm_common_sse2.h
index ed82eee..9c99eb9 100644
--- a/libaom/aom_dsp/x86/txfm_common_sse2.h
+++ b/libaom/aom_dsp/x86/txfm_common_sse2.h
@@ -26,4 +26,8 @@
   return _mm_shuffle_epi32(b, 0x4e);
 }
 
+#define octa_set_epi16(a, b, c, d, e, f, g, h)                           \
+  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
 #endif  // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/libaom/aom_dsp/x86/variance_avx2.c b/libaom/aom_dsp/x86/variance_avx2.c
index 800aef1..c4919ba 100644
--- a/libaom/aom_dsp/x86/variance_avx2.c
+++ b/libaom/aom_dsp/x86/variance_avx2.c
@@ -28,7 +28,7 @@
 static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
                                         __m256i *const sse,
                                         __m256i *const sum) {
-  const __m256i adj_sub = _mm256_set1_epi16(0xff01);  // (1,-1)
+  const __m256i adj_sub = _mm256_set1_epi16((short)0xff01);  // (1,-1)
 
   // unpack into pairs of source and reference values
   const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
@@ -234,6 +234,10 @@
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
                                              int height, unsigned int *sse);
+unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height, unsigned int *sse);
 
 unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     const uint8_t *src, int src_stride, int x_offset, int y_offset,
@@ -276,6 +280,11 @@
 AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
 AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
 AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
+AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6);
+AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5);
+AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4);
+AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3);
+AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2);
 
 #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
   unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
diff --git a/libaom/aom_dsp/x86/variance_impl_avx2.c b/libaom/aom_dsp/x86/variance_impl_avx2.c
index 88e27ae..f779270 100644
--- a/libaom/aom_dsp/x86/variance_impl_avx2.c
+++ b/libaom/aom_dsp/x86/variance_impl_avx2.c
@@ -104,6 +104,65 @@
   sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
         _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
 
// Functions related to sub pixel variance width 16.
// These helpers pack two consecutive image rows into one 256-bit register
// (row j in the low 128-bit lane, row j+1 in the high lane) so the width-16
// kernels can process two rows per iteration.  `src`, `dst`, `src_stride`
// and `dst_stride` are expected in the caller's scope.

// Load rows j and j+1 of both source and destination into src_reg/dst_reg.
#define LOAD_SRC_DST_INSERT(src_stride, dst_stride)              \
  /* load source and destination of 2 rows and insert*/          \
  src_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);        \
  dst_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

// Load the two rows one stride below src_reg and average them into src_reg
// (half-pel vertical interpolation).
#define AVG_NEXT_SRC_INSERT(src_reg, size_stride)                              \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1);              \
  /* average between current and next stride source */                         \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

// Load the two rows one `size_stride` below and interleave them with
// src_reg (via MERGE_WITH_SRC) for bilinear filtering.
#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride)                            \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1);      \
  MERGE_WITH_SRC(src_reg, src_next_reg)

// Load rows j/j+1 into src_reg and the same rows shifted right by one pixel
// into src_next_reg (for horizontal half-pel / bilinear paths).
#define LOAD_SRC_NEXT_BYTE_INSERT                                    \
  /* load source and another source from next row   */               \
  src_reg = _mm256_inserti128_si256(                                 \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))),     \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);            \
  /* load source and next row source from 1 byte onwards   */        \
  src_next_reg = _mm256_inserti128_si256(                            \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
      _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);

// Load destination rows j and j+1 into dst_reg.
#define LOAD_DST_INSERT                                          \
  dst_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

// 128-bit single-row variant used for the final row of a block: load the
// row and its 1-pixel-right neighbour, interleave them, and narrow the
// 256-bit filter/rounding constants to 128 bits.
#define LOAD_SRC_MERGE_128BIT(filter)                        \
  __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));     \
  __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
  __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);  \
  __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);  \
  __m128i filter_128bit = _mm256_castsi256_si128(filter);    \
  __m128i pw8_128bit = _mm256_castsi256_si128(pw8);

// 128-bit bilinear filter: maddubs with the filter taps, add the rounding
// constant 8, then shift right by 4 (i.e. (a*f0 + b*f1 + 8) >> 4).
#define FILTER_SRC_128BIT(filter)             \
  /* filter the source */                     \
  src_lo = _mm_maddubs_epi16(src_lo, filter); \
  src_hi = _mm_maddubs_epi16(src_hi, filter); \
                                              \
  /* add 8 to source */                       \
  src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
  src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
                                              \
  /* divide source by 16 */                   \
  src_lo = _mm_srai_epi16(src_lo, 4);         \
  src_hi = _mm_srai_epi16(src_hi, 4);
+
 unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
@@ -127,8 +186,8 @@
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = 0 and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = 0 and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i src_next_reg;
       for (i = 0; i < height; i++) {
         LOAD_SRC_DST
@@ -156,8 +215,8 @@
         dst += dst_stride;
       }
     }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
+    // x_offset = 4  and y_offset = 0
+  } else if (x_offset == 4) {
     if (y_offset == 0) {
       __m256i src_next_reg;
       for (i = 0; i < height; i++) {
@@ -169,8 +228,8 @@
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = 4  and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i src_next_reg, src_avg;
       // load source and another source starting from the next
       // following byte
@@ -189,7 +248,7 @@
         CALC_SUM_SSE_INSIDE_LOOP
         dst += dst_stride;
       }
-      // x_offset = 8  and y_offset = bilin interpolation
+      // x_offset = 4  and y_offset = bilin interpolation
     } else {
       __m256i filter, pw8, src_next_reg, src_avg;
       y_offset <<= 5;
@@ -228,8 +287,8 @@
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = bilin interpolation and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i filter, pw8, src_next_reg, src_pack;
       x_offset <<= 5;
       filter = _mm256_load_si256(
@@ -292,6 +351,244 @@
   return sum;
 }
 
// Computes *sse and returns the signed pixel-difference sum for a 16-wide,
// `height`-tall block of `src` bilinearly interpolated at (x_offset,
// y_offset) and compared against `dst`.  An offset of 0 means no filtering
// in that direction and 4 takes the pairwise-average (half-pel) fast path;
// any other value indexes bilinear_filters_avx2 (offsets presumably are
// 1/8-pel units 0..7 -- TODO confirm against callers).  Two rows are
// processed per iteration by packing row pairs into the two 128-bit lanes
// of a 256-bit register (see the *_INSERT macros above).
unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0: no interpolation at all, plain compare.
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 0 and y_offset = 4: vertical half-pel, average with the
      // row below.
    } else if (y_offset == 4) {
      __m256i src_next_reg;
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        AVG_NEXT_SRC_INSERT(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;
      // Each filter-table entry is 32 bytes, hence the << 5 index scaling.
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        MERGE_NEXT_SRC_INSERT(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
    }
    // x_offset = 4  and y_offset = 0: horizontal half-pel only.
  } else if (x_offset == 4) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        /* average between current and next stride source */
        src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 4  and y_offset = 4: half-pel both ways.  The previous
      // iteration's horizontal average (src_avg) is carried across loop
      // iterations so each row pair is loaded only once.
    } else if (y_offset == 4) {
      __m256i src_next_reg, src_avg, src_temp;
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        // Shift lanes so each row is paired with the row directly below it.
        src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
        src_temp = _mm256_avg_epu8(src_avg, src_temp);
        LOAD_DST_INSERT
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_temp, zero_reg)
        // save current source average
        src_avg = src_next_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride << 1;
        src += src_stride << 1;
      }
      // last 2 rows processing happens here
      __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
      __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
      src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
      src_next_reg = _mm256_permute2x128_si256(
          src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
      LOAD_DST_INSERT
      src_avg = _mm256_avg_epu8(src_avg, src_next_reg);
      MERGE_WITH_SRC(src_avg, zero_reg)
      CALC_SUM_SSE_INSIDE_LOOP
    } else {
      // x_offset = 4  and y_offset = bilin interpolation
      __m256i filter, pw8, src_next_reg, src_avg, src_temp;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_avg, src_temp)
        // save current source average
        src_avg = src_next_reg;
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride << 1;
        src += src_stride << 1;
      }
      // last 2 rows processing happens here
      __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
      __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
      src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
      src_next_reg = _mm256_permute2x128_si256(
          src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
      LOAD_DST_INSERT
      MERGE_WITH_SRC(src_avg, src_next_reg)
      FILTER_SRC(filter)
      CALC_SUM_SSE_INSIDE_LOOP
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        MERGE_NEXT_SRC_INSERT(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = bilin interpolation and y_offset = 4: filter
      // horizontally, then average the filtered row with the filtered row
      // below (carried across iterations in src_pack).
    } else if (y_offset == 4) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      MERGE_WITH_SRC(src_reg, src_next_reg)
      FILTER_SRC(filter)
      // convert each 16 bit to 8 bit to each low and high lane source
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi)
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_reg, src_next_reg)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
        // average between previous pack to the current
        src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        src += src_stride << 1;
        dst += dst_stride << 1;
      }
      // last 2 rows processing happens here
      LOAD_SRC_MERGE_128BIT(filter)
      LOAD_DST_INSERT
      FILTER_SRC_128BIT(filter_128bit)
      src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
      src_next_reg = _mm256_permute2x128_si256(
          src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
      // average between previous pack to the current
      src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
      MERGE_WITH_SRC(src_pack, zero_reg)
      CALC_SUM_SSE_INSIDE_LOOP
    } else {
      // x_offset = bilin interpolation and y_offset = bilin interpolation:
      // filter horizontally with xfilter, then vertically with yfilter on
      // the packed intermediate rows.
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      MERGE_WITH_SRC(src_reg, src_next_reg)
      FILTER_SRC(xfilter)
      // convert each 16 bit to 8 bit to each low and high lane source
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_reg, src_next_reg)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
        // average between previous pack to the current
        MERGE_WITH_SRC(src_pack, src_next_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride << 1;
        dst += dst_stride << 1;
      }
      // last 2 rows processing happens here
      LOAD_SRC_MERGE_128BIT(xfilter)
      LOAD_DST_INSERT
      FILTER_SRC_128BIT(filter_128bit)
      src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
      src_next_reg = _mm256_permute2x128_si256(
          src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
      MERGE_WITH_SRC(src_pack, src_next_reg)
      FILTER_SRC(yfilter)
      CALC_SUM_SSE_INSIDE_LOOP
    }
  }
  CALC_SUM_AND_SSE
  // Clear the upper AVX state before returning to possibly-SSE callers.
  _mm256_zeroupper();
  return sum;
}
+
 unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     const uint8_t *src, int src_stride, int x_offset, int y_offset,
     const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
diff --git a/libaom/aom_dsp/x86/variance_sse2.c b/libaom/aom_dsp/x86/variance_sse2.c
index f3efc15..4e2b5a1 100644
--- a/libaom/aom_dsp/x86/variance_sse2.c
+++ b/libaom/aom_dsp/x86/variance_sse2.c
@@ -21,9 +21,10 @@
 
 #include "aom_ports/mem.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
 
 unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   __m128i vsum = _mm_setzero_si128();
@@ -144,6 +145,7 @@
                                   __m128i *const sum) {
   assert(h <= 128);  // May overflow for larger height.
   *sum = _mm_setzero_si128();
+  *sse = _mm_setzero_si128();
   for (int i = 0; i < h; i++) {
     const __m128i s = load8_8to16_sse2(src);
     const __m128i r = load8_8to16_sse2(ref);
@@ -236,6 +238,14 @@
   }
 }
 
// Computes the SSE and signed difference sum of an 8x8 block: accumulate in
// vector registers via variance8_sse2(), then reduce to the scalar outputs.
void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *ref_ptr, int ref_stride,
                        unsigned int *sse, int *sum) {
  __m128i sse_acc, sum_acc;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &sse_acc,
                 &sum_acc);
  variance_final_128_pel_sse2(sse_acc, sum_acc, sse, sum);
}
+
 #define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
   unsigned int aom_variance##bw##x##bh##_sse2(                                \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@@ -498,75 +508,24 @@
     const int is_scaled = av1_is_scaled(sf);
 
     if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
       const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
       const struct buf_2d *const dst_buf = &pd->dst;
       const struct buf_2d *const pre_buf =
           is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
 
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
           av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
       return;
     }
   }
diff --git a/libaom/aom_mem/aom_mem.c b/libaom/aom_mem/aom_mem.c
index e603fc5..e977b01 100644
--- a/libaom/aom_mem/aom_mem.c
+++ b/libaom/aom_mem/aom_mem.c
@@ -54,7 +54,7 @@
 #endif
   void *const addr = malloc(aligned_size);
   if (addr) {
-    x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
+    x = aom_align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
     SetActualMallocAddress(x, addr);
   }
   return x;
diff --git a/libaom/aom_mem/aom_mem.cmake b/libaom/aom_mem/aom_mem.cmake
index eaee844..346588d 100644
--- a/libaom/aom_mem/aom_mem.cmake
+++ b/libaom/aom_mem/aom_mem.cmake
@@ -23,4 +23,7 @@
   add_library(aom_mem OBJECT ${AOM_MEM_SOURCES})
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_mem PARENT_SCOPE)
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_mem>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_mem>)
+  endif()
 endfunction()
diff --git a/libaom/aom_mem/aom_mem.h b/libaom/aom_mem/aom_mem.h
index 4b1fa45..bc5d8bc 100644
--- a/libaom/aom_mem/aom_mem.h
+++ b/libaom/aom_mem/aom_mem.h
@@ -38,6 +38,10 @@
 void aom_free(void *memblk);
 void *aom_memset16(void *dest, int val, size_t length);
 
+/*returns an addr aligned to the byte boundary specified by align*/
+#define aom_align_addr(addr, align) \
+  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
+
 #include <string.h>
 
 #ifdef AOM_MEM_PLTFRM
diff --git a/libaom/aom_mem/include/aom_mem_intrnl.h b/libaom/aom_mem/include/aom_mem_intrnl.h
index cbc30a9..2c9819d 100644
--- a/libaom/aom_mem/include/aom_mem_intrnl.h
+++ b/libaom/aom_mem/include/aom_mem_intrnl.h
@@ -26,8 +26,4 @@
 #endif
 #endif
 
-/*returns an addr aligned to the byte boundary specified by align*/
-#define align_addr(addr, align) \
-  (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1))
-
 #endif  // AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
diff --git a/libaom/aom_ports/aom_ports.cmake b/libaom/aom_ports/aom_ports.cmake
index 6272fc0..d579896 100644
--- a/libaom/aom_ports/aom_ports.cmake
+++ b/libaom/aom_ports/aom_ports.cmake
@@ -49,33 +49,44 @@
 # * The libaom target must exist before this function is called.
 function(setup_aom_ports_targets)
   if("${AOM_TARGET_CPU}" MATCHES "^x86")
-    add_asm_library("aom_ports" "AOM_PORTS_ASM_X86" "aom")
+    add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
     set(aom_ports_has_symbols 1)
   elseif("${AOM_TARGET_CPU}" MATCHES "arm")
     add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_ARM})
     set(aom_ports_has_symbols 1)
-    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
   elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
     add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
     set(aom_ports_has_symbols 1)
+  endif()
+
+  if("${AOM_TARGET_CPU}" MATCHES "arm|ppc")
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_ports>)
+    endif()
   endif()
 
   if(aom_ports_has_symbols)
     target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
 
-    if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL
-       "x86_64")
+    if("${AOM_TARGET_CPU}" STREQUAL "x86"
+       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
       target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES_X86})
     endif()
 
     set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
   else()
     target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES})
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES})
+    endif()
 
-    if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL
-       "x86_64")
+    if("${AOM_TARGET_CPU}" STREQUAL "x86"
+       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
       target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES_X86})
+      if(BUILD_SHARED_LIBS)
+        target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES_X86})
+      endif()
     endif()
   endif()
 endfunction()
diff --git a/libaom/aom_ports/ppc_cpudetect.c b/libaom/aom_ports/ppc_cpudetect.c
index 82b4f58..ce4d5ae 100644
--- a/libaom/aom_ports/ppc_cpudetect.c
+++ b/libaom/aom_ports/ppc_cpudetect.c
@@ -45,7 +45,7 @@
   unsigned int i;
   uint64_t buf[64];
 
-  // If VPX_SIMD_CAPS is set then allow only those capabilities.
+  // If AOM_SIMD_CAPS_MASK is set then allow only those capabilities.
   if (!cpu_env_flags(&flags)) {
     return flags;
   }
diff --git a/libaom/aom_ports/x86_abi_support.asm b/libaom/aom_ports/x86_abi_support.asm
index 0e7c262..6448990 100644
--- a/libaom/aom_ports/x86_abi_support.asm
+++ b/libaom/aom_ports/x86_abi_support.asm
@@ -122,6 +122,13 @@
 ;    http://www.tortall.net/projects/yasm/ticket/236
 ;
 %ifdef CHROMIUM
+  %ifdef __NASM_VER__
+    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+      ; nasm < 2.14 does not support :private_extern directive
+      %fatal Must use nasm 2.14 or newer
+    %endif
+  %endif
+
   %ifidn   __OUTPUT_FORMAT__,elf32
     %define PRIVATE :hidden
   %elifidn __OUTPUT_FORMAT__,elf64
diff --git a/libaom/aom_scale/aom_scale.cmake b/libaom/aom_scale/aom_scale.cmake
index 3199733..e832993 100644
--- a/libaom/aom_scale/aom_scale.cmake
+++ b/libaom/aom_scale/aom_scale.cmake
@@ -31,10 +31,13 @@
 
   if(HAVE_DSPR2)
     add_intrinsics_object_library("" "dspr2" "aom_scale"
-                                  "AOM_SCALE_INTRIN_DSPR2" "aom")
+                                  "AOM_SCALE_INTRIN_DSPR2")
   endif()
 
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_scale>)
+  endif()
 
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
diff --git a/libaom/aom_scale/generic/yv12config.c b/libaom/aom_scale/generic/yv12config.c
index a5ad1a7..1f80d7b 100644
--- a/libaom/aom_scale/generic/yv12config.c
+++ b/libaom/aom_scale/generic/yv12config.c
@@ -11,6 +11,7 @@
 
 #include <assert.h>
 
+#include "aom/internal/aom_image_internal.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "aom_scale/yv12config.h"
@@ -23,27 +24,23 @@
 /****************************************************************************
  *
  ****************************************************************************/
-#define yv12_align_addr(addr, align) \
-  (void *)(((size_t)(addr) + ((align)-1)) & (size_t) - (align))
 
 // TODO(jkoleszar): Maybe replace this with struct aom_image
-
 int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
   if (ybf) {
     if (ybf->buffer_alloc_sz > 0) {
       aom_free(ybf->buffer_alloc);
     }
     if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
-
+    aom_remove_metadata_from_frame_buffer(ybf);
     /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
       u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
       all of this so that a freed pointer isn't inadvertently used */
     memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
-  } else {
-    return -1;
+    return 0;
   }
 
-  return 0;
+  return AOM_CODEC_MEM_ERROR;
 }
 
 static int realloc_frame_buffer_aligned(
@@ -69,7 +66,8 @@
     // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
     // pool. Bound the total amount of allocated memory as if these REF_FRAMES
     // frame buffers were allocated in a single allocation.
-    if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return -1;
+    if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES)
+      return AOM_CODEC_MEM_ERROR;
 #endif
 
     if (cb != NULL) {
@@ -78,14 +76,17 @@
 
       assert(fb != NULL);
 
-      if (external_frame_size != (size_t)external_frame_size) return -1;
+      if (external_frame_size != (size_t)external_frame_size)
+        return AOM_CODEC_MEM_ERROR;
 
       // Allocation to hold larger frame, or first allocation.
-      if (cb(cb_priv, (size_t)external_frame_size, fb) < 0) return -1;
+      if (cb(cb_priv, (size_t)external_frame_size, fb) < 0)
+        return AOM_CODEC_MEM_ERROR;
 
-      if (fb->data == NULL || fb->size < external_frame_size) return -1;
+      if (fb->data == NULL || fb->size < external_frame_size)
+        return AOM_CODEC_MEM_ERROR;
 
-      ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
+      ybf->buffer_alloc = (uint8_t *)aom_align_addr(fb->data, 32);
 
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
@@ -101,10 +102,10 @@
       ybf->buffer_alloc = NULL;
       ybf->buffer_alloc_sz = 0;
 
-      if (frame_size != (size_t)frame_size) return -1;
+      if (frame_size != (size_t)frame_size) return AOM_CODEC_MEM_ERROR;
 
       ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size);
-      if (!ybf->buffer_alloc) return -1;
+      if (!ybf->buffer_alloc) return AOM_CODEC_MEM_ERROR;
 
       ybf->buffer_alloc_sz = (size_t)frame_size;
 
@@ -140,22 +141,22 @@
       ybf->flags = 0;
     }
 
-    ybf->y_buffer = (uint8_t *)yv12_align_addr(
+    ybf->y_buffer = (uint8_t *)aom_align_addr(
         buf + (border * y_stride) + border, aom_byte_align);
-    ybf->u_buffer = (uint8_t *)yv12_align_addr(
+    ybf->u_buffer = (uint8_t *)aom_align_addr(
         buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
         aom_byte_align);
     ybf->v_buffer =
-        (uint8_t *)yv12_align_addr(buf + yplane_size + uvplane_size +
-                                       (uv_border_h * uv_stride) + uv_border_w,
-                                   aom_byte_align);
+        (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size +
+                                      (uv_border_h * uv_stride) + uv_border_w,
+                                  aom_byte_align);
 
     ybf->use_external_reference_buffers = 0;
 
     if (use_highbitdepth) {
       if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
       ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size);
-      if (!ybf->y_buffer_8bit) return -1;
+      if (!ybf->y_buffer_8bit) return AOM_CODEC_MEM_ERROR;
     } else {
       if (ybf->y_buffer_8bit) {
         aom_free(ybf->y_buffer_8bit);
@@ -167,7 +168,7 @@
     ybf->corrupted = 0; /* assume not corrupted by errors */
     return 0;
   }
-  return -2;
+  return AOM_CODEC_MEM_ERROR;
 }
 
 static int calc_stride_and_planesize(const int ss_x, const int ss_y,
@@ -182,7 +183,7 @@
    * the start of the chroma rows without introducing an arbitrary gap
    * between planes, which would break the semantics of things like
    * aom_img_set_rect(). */
-  if (border & 0x1f) return -3;
+  if (border & 0x1f) return AOM_CODEC_MEM_ERROR;
   *y_stride = ((aligned_width + 2 * border) + 31) & ~31;
   *yplane_size =
       (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment;
@@ -199,7 +200,8 @@
                              aom_codec_frame_buffer_t *fb,
                              aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
 #if CONFIG_SIZE_LIMIT
-  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
+    return AOM_CODEC_MEM_ERROR;
 #endif
 
   if (ybf) {
@@ -224,55 +226,7 @@
         aligned_width, aligned_height, uv_width, uv_height, uv_stride,
         uv_border_w, uv_border_h);
   }
-  return -2;
-}
-
-// TODO(anyone): This function allocates memory for
-// lookahead buffer considering height and width is
-// aligned to 128. Currently variance calculation of
-// simple_motion_search_get_best_ref() function is done
-// for full sb size (i.e integral multiple of max sb
-// size = 128 or 64). Hence partial sbs need up to 127
-// pixels beyond frame boundary. 128 aligned limitation of
-// lookahead buffer can be removed if variance calculation
-// is adjusted for partial sbs
-
-// NOTE: Chroma width and height need not be aligned to
-// 128 since variance calculation happens only for luma plane
-int aom_realloc_lookahead_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                                 int ss_x, int ss_y, int use_highbitdepth,
-                                 int border, int byte_alignment,
-                                 aom_codec_frame_buffer_t *fb,
-                                 aom_get_frame_buffer_cb_fn_t cb,
-                                 void *cb_priv) {
-  if (ybf) {
-    int y_stride = 0;
-    int uv_stride = 0;
-    uint64_t yplane_size = 0;
-    uint64_t uvplane_size = 0;
-    const int aligned_128_width = (width + 127) & ~127;
-    const int aligned_128_height = (height + 127) & ~127;
-    const int aligned_width = (width + 7) & ~7;
-    const int aligned_height = (height + 7) & ~7;
-    const int uv_64_height = aligned_128_height >> ss_y;
-    const int uv_width = aligned_width >> ss_x;
-    const int uv_height = aligned_height >> ss_y;
-    const int uv_border_w = border >> ss_x;
-    const int uv_border_h = border >> ss_y;
-
-    int error = calc_stride_and_planesize(
-        ss_x, ss_y, aligned_128_width, aligned_128_height, border,
-        byte_alignment, &y_stride, &uv_stride, &yplane_size, &uvplane_size,
-        uv_64_height);
-    if (error) return error;
-
-    return realloc_frame_buffer_aligned(
-        ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
-        byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
-        aligned_width, aligned_height, uv_width, uv_height, uv_stride,
-        uv_border_w, uv_border_h);
-  }
-  return -2;
+  return AOM_CODEC_MEM_ERROR;
 }
 
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
@@ -284,5 +238,32 @@
                                     use_highbitdepth, border, byte_alignment,
                                     NULL, NULL, NULL);
   }
-  return -2;
+  return AOM_CODEC_MEM_ERROR;
+}
+
+void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (ybf && ybf->metadata) {
+    aom_img_metadata_array_free(ybf->metadata);
+    ybf->metadata = NULL;
+  }
+}
+
+int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                      const aom_metadata_array_t *arr) {
+  if (!ybf || !arr || !arr->metadata_array) return -1;
+  aom_remove_metadata_from_frame_buffer(ybf);
+  ybf->metadata = aom_img_metadata_array_alloc(arr->sz);
+  if (!ybf->metadata) return -1;
+  for (size_t i = 0; i < ybf->metadata->sz; i++) {
+    ybf->metadata->metadata_array[i] = aom_img_metadata_alloc(
+        arr->metadata_array[i]->type, arr->metadata_array[i]->payload,
+        arr->metadata_array[i]->sz, arr->metadata_array[i]->insert_flag);
+    if (ybf->metadata->metadata_array[i] == NULL) {
+      aom_img_metadata_array_free(ybf->metadata);
+      ybf->metadata = NULL;
+      return -1;
+    }
+  }
+  ybf->metadata->sz = arr->sz;
+  return 0;
 }
diff --git a/libaom/aom_scale/generic/yv12extend.c b/libaom/aom_scale/generic/yv12extend.c
index 6e9cfff..834a59d 100644
--- a/libaom/aom_scale/generic/yv12extend.c
+++ b/libaom/aom_scale/generic/yv12extend.c
@@ -59,6 +59,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
                               int height, int extend_top, int extend_left,
                               int extend_bottom, int extend_right) {
@@ -99,6 +100,7 @@
     dst_ptr2 += src_stride;
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
                                      const int num_planes) {
@@ -108,6 +110,7 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
+#if CONFIG_AV1_HIGHBITDEPTH
   if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
     for (int plane = 0; plane < num_planes; ++plane) {
       const int is_uv = plane > 0;
@@ -120,6 +123,8 @@
     }
     return;
   }
+#endif
+
   for (int plane = 0; plane < num_planes; ++plane) {
     const int is_uv = plane > 0;
     const int plane_border = ybf->border >> is_uv;
@@ -141,6 +146,7 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
+#if CONFIG_AV1_HIGHBITDEPTH
   if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
     for (int plane = 0; plane < num_planes; ++plane) {
       const int is_uv = plane > 0;
@@ -154,6 +160,8 @@
     }
     return;
   }
+#endif
+
   for (int plane = 0; plane < num_planes; ++plane) {
     const int is_uv = plane > 0;
     const int top = ext_size >> (is_uv ? ss_y : 0);
@@ -184,7 +192,7 @@
   assert(ybf->y_width - ybf->y_crop_width < 16);
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
-
+#if CONFIG_AV1_HIGHBITDEPTH
   if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
     extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
                       ybf->y_crop_height, ext_size, ext_size,
@@ -192,17 +200,20 @@
                       ext_size + ybf->y_width - ybf->y_crop_width);
     return;
   }
+#endif
   extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
                ybf->y_crop_height, ext_size, ext_size,
                ext_size + ybf->y_height - ybf->y_crop_height,
                ext_size + ybf->y_width - ybf->y_crop_width);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   memcpy(dst, src, num * sizeof(uint16_t));
 }
+#endif
 
 // Copies the source image into the destination image and updates the
 // destination's UMV borders.
@@ -217,6 +228,7 @@
   assert(src_bc->y_height == dst_bc->y_height);
 #endif
 
+#if CONFIG_AV1_HIGHBITDEPTH
   assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
          (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH));
 
@@ -235,6 +247,7 @@
     aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
     return;
   }
+#endif
   for (int plane = 0; plane < num_planes; ++plane) {
     const uint8_t *plane_src = src_bc->buffers[plane];
     uint8_t *plane_dst = dst_bc->buffers[plane];
@@ -255,6 +268,7 @@
   const uint8_t *src = src_ybc->y_buffer;
   uint8_t *dst = dst_ybc->y_buffer;
 
+#if CONFIG_AV1_HIGHBITDEPTH
   if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
@@ -265,6 +279,7 @@
     }
     return;
   }
+#endif
 
   for (row = 0; row < src_ybc->y_height; ++row) {
     memcpy(dst, src, src_ybc->y_width);
@@ -278,7 +293,7 @@
   int row;
   const uint8_t *src = src_bc->u_buffer;
   uint8_t *dst = dst_bc->u_buffer;
-
+#if CONFIG_AV1_HIGHBITDEPTH
   if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
@@ -289,7 +304,7 @@
     }
     return;
   }
-
+#endif
   for (row = 0; row < src_bc->uv_height; ++row) {
     memcpy(dst, src, src_bc->uv_width);
     src += src_bc->uv_stride;
@@ -302,7 +317,7 @@
   int row;
   const uint8_t *src = src_bc->v_buffer;
   uint8_t *dst = dst_bc->v_buffer;
-
+#if CONFIG_AV1_HIGHBITDEPTH
   if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
@@ -313,7 +328,7 @@
     }
     return;
   }
-
+#endif
   for (row = 0; row < src_bc->uv_height; ++row) {
     memcpy(dst, src, src_bc->uv_width);
     src += src_bc->uv_stride;
@@ -328,7 +343,7 @@
   int row;
   const uint8_t *src = src_ybc->y_buffer;
   uint8_t *dst = dst_ybc->y_buffer;
-
+#if CONFIG_AV1_HIGHBITDEPTH
   if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 =
         CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1);
@@ -342,6 +357,7 @@
     }
     return;
   }
+#endif
   src = (src + vstart1 * src_ybc->y_stride + hstart1);
   dst = (dst + vstart2 * dst_ybc->y_stride + hstart2);
 
@@ -366,7 +382,7 @@
   int row;
   const uint8_t *src = src_bc->u_buffer;
   uint8_t *dst = dst_bc->u_buffer;
-
+#if CONFIG_AV1_HIGHBITDEPTH
   if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 =
         CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
@@ -379,7 +395,7 @@
     }
     return;
   }
-
+#endif
   src = (src + vstart1 * src_bc->uv_stride + hstart1);
   dst = (dst + vstart2 * dst_bc->uv_stride + hstart2);
 
@@ -404,7 +420,7 @@
   int row;
   const uint8_t *src = src_bc->v_buffer;
   uint8_t *dst = dst_bc->v_buffer;
-
+#if CONFIG_AV1_HIGHBITDEPTH
   if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 =
         CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
@@ -417,7 +433,7 @@
     }
     return;
   }
-
+#endif
   src = (src + vstart1 * src_bc->uv_stride + hstart1);
   dst = (dst + vstart2 * dst_bc->uv_stride + hstart2);
 
diff --git a/libaom/aom_scale/yv12config.h b/libaom/aom_scale/yv12config.h
index 04a1c04..3642bb7 100644
--- a/libaom/aom_scale/yv12config.h
+++ b/libaom/aom_scale/yv12config.h
@@ -21,12 +21,12 @@
 #include "aom/aom_codec.h"
 #include "aom/aom_frame_buffer.h"
 #include "aom/aom_integer.h"
+#include "aom/internal/aom_image_internal.h"
 
 #define AOMINNERBORDERINPIXELS 160
 #define AOM_INTERP_EXTEND 4
 #define AOM_BORDER_IN_PIXELS 288
 #define AOM_ENC_NO_SCALE_BORDER 160
-#define AOM_ENC_LOOKAHEAD_BORDER 64
 #define AOM_DEC_BORDER_IN_PIXELS 64
 
 typedef struct yv12_buffer_config {
@@ -105,6 +105,7 @@
 
   int corrupted;
   int flags;
+  aom_metadata_array_t *metadata;
 } YV12_BUFFER_CONFIG;
 
 #define YV12_FLAG_HIGHBITDEPTH 8
@@ -126,15 +127,31 @@
                              aom_codec_frame_buffer_t *fb,
                              aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
 
-int aom_realloc_lookahead_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                                 int ss_x, int ss_y, int use_highbitdepth,
-                                 int border, int byte_alignment,
-                                 aom_codec_frame_buffer_t *fb,
-                                 aom_get_frame_buffer_cb_fn_t cb,
-                                 void *cb_priv);
-
 int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
+/*!\brief Removes metadata from YUV_BUFFER_CONFIG struct.
+ *
+ * Frees metadata in frame buffer.
+ * Frame buffer metadata pointer will be set to NULL.
+ *
+ * \param[in]    ybf       Frame buffer struct pointer
+ */
+void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+/*!\brief Copy metadata to YUV_BUFFER_CONFIG struct.
+ *
+ * Copies metadata in frame buffer.
+ * Frame buffer will clear any previous metadata and will reallocate the
+ * metadata array to the new metadata size. Then, it will copy the new metadata
+ * array into it.
+ * Returns 0 on success or -1 on failure.
+ *
+ * \param[in]    ybf       Frame buffer struct pointer
+ * \param[in]    arr       Metadata array struct pointer
+ */
+int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                      const aom_metadata_array_t *arr);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libaom/aom_util/aom_thread.c b/libaom/aom_util/aom_thread.c
index 244ac3b..a749a22 100644
--- a/libaom/aom_util/aom_thread.c
+++ b/libaom/aom_util/aom_thread.c
@@ -48,7 +48,7 @@
     // thread_name is too long, pthread_setname_np returns -1 with errno
     // ENAMETOOLONG (63).
     char thread_name[64];
-    strncpy(thread_name, worker->thread_name, sizeof(thread_name));
+    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
     thread_name[sizeof(thread_name) - 1] = '\0';
     pthread_setname_np(thread_name);
   }
@@ -57,7 +57,7 @@
     // Linux and Android require names (with nul) fit in 16 chars, otherwise
     // pthread_setname_np() returns ERANGE (34).
     char thread_name[16];
-    strncpy(thread_name, worker->thread_name, sizeof(thread_name));
+    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
     thread_name[sizeof(thread_name) - 1] = '\0';
     pthread_setname_np(pthread_self(), thread_name);
   }
diff --git a/libaom/aom_util/aom_thread.h b/libaom/aom_util/aom_thread.h
index bda8b75..8d04312 100644
--- a/libaom/aom_util/aom_thread.h
+++ b/libaom/aom_util/aom_thread.h
@@ -23,9 +23,6 @@
 extern "C" {
 #endif
 
-// Set maximum decode threads to be 8 due to the limit of frame buffers
-// and not enough semaphores in the emulation layer on windows.
-#define MAX_DECODE_THREADS 8
 #define MAX_NUM_THREADS 64
 
 #if CONFIG_MULTITHREAD
@@ -37,16 +34,10 @@
 typedef HANDLE pthread_t;
 typedef CRITICAL_SECTION pthread_mutex_t;
 
-#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
-#define USE_WINDOWS_CONDITION_VARIABLE
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+#endif
 typedef CONDITION_VARIABLE pthread_cond_t;
-#else
-typedef struct {
-  HANDLE waiting_sem_;
-  HANDLE received_sem_;
-  HANDLE signal_event_;
-} pthread_cond_t;
-#endif  // _WIN32_WINNT >= 0x600
 
 #ifndef WINAPI_FAMILY_PARTITION
 #define WINAPI_PARTITION_DESKTOP 1
@@ -64,11 +55,6 @@
 #define THREADFN unsigned int __stdcall
 #define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
 
-#if _WIN32_WINNT >= 0x0501  // Windows XP or greater
-#define WaitForSingleObject(obj, timeout) \
-  WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
-#endif
-
 static INLINE int pthread_create(pthread_t *const thread, const void *attr,
                                  unsigned int(__stdcall *start)(void *),
                                  void *arg) {
@@ -91,7 +77,8 @@
 
 static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
   (void)value_ptr;
-  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+  return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) !=
+              WAIT_OBJECT_0 ||
           CloseHandle(thread) == 0);
 }
 
@@ -99,11 +86,7 @@
 static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
                                      void *mutexattr) {
   (void)mutexattr;
-#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
   InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
-#else
-  InitializeCriticalSection(mutex);
-#endif
   return 0;
 }
 
@@ -128,85 +111,31 @@
 
 // Condition
 static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
   (void)condition;
-#else
-  ok &= (CloseHandle(condition->waiting_sem_) != 0);
-  ok &= (CloseHandle(condition->received_sem_) != 0);
-  ok &= (CloseHandle(condition->signal_event_) != 0);
-#endif
-  return !ok;
+  return 0;
 }
 
 static INLINE int pthread_cond_init(pthread_cond_t *const condition,
                                     void *cond_attr) {
   (void)cond_attr;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
   InitializeConditionVariable(condition);
-#else
-  condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
-  condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
-  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
-  if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
-      condition->signal_event_ == NULL) {
-    pthread_cond_destroy(condition);
-    return 1;
-  }
-#endif
   return 0;
 }
 
 static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
   WakeConditionVariable(condition);
-#else
-  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
-    // a thread is waiting in pthread_cond_wait: allow it to be notified
-    ok = SetEvent(condition->signal_event_);
-    // wait until the event is consumed so the signaler cannot consume
-    // the event via its own pthread_cond_wait.
-    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
-           WAIT_OBJECT_0);
-  }
-#endif
-  return !ok;
+  return 0;
 }
 
 static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
   WakeAllConditionVariable(condition);
-#else
-  while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
-    // a thread is waiting in pthread_cond_wait: allow it to be notified
-    ok &= SetEvent(condition->signal_event_);
-    // wait until the event is consumed so the signaler cannot consume
-    // the event via its own pthread_cond_wait.
-    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
-           WAIT_OBJECT_0);
-  }
-#endif
-  return !ok;
+  return 0;
 }
 
 static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
                                     pthread_mutex_t *const mutex) {
   int ok;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
   ok = SleepConditionVariableCS(condition, mutex, INFINITE);
-#else
-  // note that there is a consumer available so the signal isn't dropped in
-  // pthread_cond_signal
-  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
-  // now unlock the mutex so pthread_cond_signal may be issued
-  pthread_mutex_unlock(mutex);
-  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
-        WAIT_OBJECT_0);
-  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
-  pthread_mutex_lock(mutex);
-#endif
   return !ok;
 }
 #elif defined(__OS2__)
diff --git a/libaom/aom_util/aom_util.cmake b/libaom/aom_util/aom_util.cmake
index d4f3bce..1a1bfe1 100644
--- a/libaom/aom_util/aom_util.cmake
+++ b/libaom/aom_util/aom_util.cmake
@@ -25,4 +25,7 @@
   add_library(aom_util OBJECT ${AOM_UTIL_SOURCES})
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_util PARENT_SCOPE)
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_util>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_util>)
+  endif()
 endfunction()
diff --git a/libaom/aom_util/debug_util.c b/libaom/aom_util/debug_util.c
index 468c47e..5762e69 100644
--- a/libaom/aom_util/debug_util.c
+++ b/libaom/aom_util/debug_util.c
@@ -18,13 +18,17 @@
 
 static int frame_idx_r = 0;
 
-void bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; }
+void aom_bitstream_queue_set_frame_write(int frame_idx) {
+  frame_idx_w = frame_idx;
+}
 
-int bitstream_queue_get_frame_write(void) { return frame_idx_w; }
+int aom_bitstream_queue_get_frame_writee(void) { return frame_idx_w; }
 
-void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; }
+void aom_bitstream_queue_set_frame_read(int frame_idx) {
+  frame_idx_r = frame_idx;
+}
 
-int bitstream_queue_get_frame_read(void) { return frame_idx_r; }
+int aom_bitstream_queue_get_frame_read(void) { return frame_idx_r; }
 
 #if CONFIG_BITSTREAM_DEBUG
 #define QUEUE_MAX_SIZE 2000000
diff --git a/libaom/aom_util/debug_util.h b/libaom/aom_util/debug_util.h
index 127a8b4..23cad2a 100644
--- a/libaom/aom_util/debug_util.h
+++ b/libaom/aom_util/debug_util.h
@@ -20,10 +20,10 @@
 extern "C" {
 #endif
 
-void bitstream_queue_set_frame_write(int frame_idx);
-int bitstream_queue_get_frame_write(void);
-void bitstream_queue_set_frame_read(int frame_idx);
-int bitstream_queue_get_frame_read(void);
+void aom_bitstream_queue_set_frame_write(int frame_idx);
+int aom_bitstream_queue_get_frame_writee(void);
+void aom_bitstream_queue_set_frame_read(int frame_idx);
+int aom_bitstream_queue_get_frame_read(void);
 
 #if CONFIG_BITSTREAM_DEBUG
 /* This is a debug tool used to detect bitstream error. On encoder side, it
diff --git a/libaom/apps/aomdec.c b/libaom/apps/aomdec.c
index 549c4da..2591d41 100644
--- a/libaom/apps/aomdec.c
+++ b/libaom/apps/aomdec.c
@@ -76,8 +76,6 @@
     ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames");
 static const arg_def_t skiparg =
     ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
-static const arg_def_t postprocarg =
-    ARG_DEF(NULL, "postproc", 0, "Postprocess decoded frames");
 static const arg_def_t summaryarg =
     ARG_DEF(NULL, "summary", 0, "Show timing summary");
 static const arg_def_t outputfile =
@@ -108,13 +106,11 @@
     ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application");
 
 static const arg_def_t *all_args[] = {
-  &help,           &codecarg,   &use_yv12,      &use_i420,
-  &flipuvarg,      &rawvideo,   &noblitarg,     &progressarg,
-  &limitarg,       &skiparg,    &postprocarg,   &summaryarg,
-  &outputfile,     &threadsarg, &verbosearg,    &scalearg,
-  &fb_arg,         &md5arg,     &framestatsarg, &continuearg,
-  &outbitdeptharg, &isannexb,   &oppointarg,    &outallarg,
-  &skipfilmgrain,  NULL
+  &help,       &codecarg,   &use_yv12,      &use_i420,      &flipuvarg,
+  &rawvideo,   &noblitarg,  &progressarg,   &limitarg,      &skiparg,
+  &summaryarg, &outputfile, &threadsarg,    &verbosearg,    &scalearg,
+  &fb_arg,     &md5arg,     &framestatsarg, &continuearg,   &outbitdeptharg,
+  &isannexb,   &oppointarg, &outallarg,     &skipfilmgrain, NULL
 };
 
 #if CONFIG_LIBYUV
@@ -437,7 +433,7 @@
   FILE *infile;
   int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
   int do_md5 = 0, progress = 0;
-  int stop_after = 0, postproc = 0, summary = 0, quiet = 1;
+  int stop_after = 0, summary = 0, quiet = 1;
   int arg_skip = 0;
   int keep_going = 0;
   const AvxInterface *interface = NULL;
@@ -451,7 +447,7 @@
   int opt_yv12 = 0;
   int opt_i420 = 0;
   int opt_raw = 0;
-  aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
+  aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
   unsigned int fixed_output_bit_depth = 0;
   unsigned int is_annexb = 0;
   int frames_corrupted = 0;
@@ -536,8 +532,6 @@
       stop_after = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &skiparg, argi)) {
       arg_skip = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &postprocarg, argi)) {
-      postproc = 1;
     } else if (arg_match(&arg, &md5arg, argi)) {
       do_md5 = 1;
     } else if (arg_match(&arg, &framestatsarg, argi)) {
@@ -628,6 +622,7 @@
 #if !CONFIG_WEBM_IO
     fprintf(stderr, "aomdec was built without WebM container support.\n");
 #endif
+    free(argv);
     return EXIT_FAILURE;
   }
 
@@ -675,7 +670,7 @@
 
   if (!interface) interface = get_aom_decoder_by_index(0);
 
-  dec_flags = (postproc ? AOM_CODEC_USE_POSTPROC : 0);
+  dec_flags = 0;
   if (aom_codec_dec_init(&decoder, interface->codec_interface(), &cfg,
                          dec_flags)) {
     fprintf(stderr, "Failed to initialize decoder: %s\n",
@@ -685,25 +680,27 @@
 
   if (!quiet) fprintf(stderr, "%s\n", decoder.name);
 
-  if (aom_codec_control(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) {
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) {
     fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder));
     goto fail;
   }
 
-  if (aom_codec_control(&decoder, AV1D_SET_OPERATING_POINT, operating_point)) {
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OPERATING_POINT,
+                                    operating_point)) {
     fprintf(stderr, "Failed to set operating_point: %s\n",
             aom_codec_error(&decoder));
     goto fail;
   }
 
-  if (aom_codec_control(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS,
-                        output_all_layers)) {
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS,
+                                    output_all_layers)) {
     fprintf(stderr, "Failed to set output_all_layers: %s\n",
             aom_codec_error(&decoder));
     goto fail;
   }
 
-  if (aom_codec_control(&decoder, AV1D_SET_SKIP_FILM_GRAIN, skip_film_grain)) {
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_SKIP_FILM_GRAIN,
+                                    skip_film_grain)) {
     fprintf(stderr, "Failed to set skip_film_grain: %s\n",
             aom_codec_error(&decoder));
     goto fail;
@@ -759,7 +756,8 @@
 
         if (framestats_file) {
           int qp;
-          if (aom_codec_control(&decoder, AOMD_GET_LAST_QUANTIZER, &qp)) {
+          if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_LAST_QUANTIZER,
+                                            &qp)) {
             warn("Failed AOMD_GET_LAST_QUANTIZER: %s",
                  aom_codec_error(&decoder));
             if (!keep_going) goto fail;
@@ -793,7 +791,8 @@
       ++frame_out;
       got_data = 1;
 
-      if (aom_codec_control(&decoder, AOMD_GET_FRAME_CORRUPTED, &corrupted)) {
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_FRAME_CORRUPTED,
+                                        &corrupted)) {
         warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder));
         if (!keep_going) goto fail;
       }
@@ -817,8 +816,8 @@
             int render_height = aom_input_ctx.height;
             if (!render_width || !render_height) {
               int render_size[2];
-              if (aom_codec_control(&decoder, AV1D_GET_DISPLAY_SIZE,
-                                    render_size)) {
+              if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_GET_DISPLAY_SIZE,
+                                                render_size)) {
                 // As last resort use size of first frame as display size.
                 render_width = img->d_w;
                 render_height = img->d_h;
diff --git a/libaom/apps/aomenc.c b/libaom/apps/aomenc.c
index 08bf08d..bb57726 100644
--- a/libaom/apps/aomenc.c
+++ b/libaom/apps/aomenc.c
@@ -158,11 +158,7 @@
     ARG_DEF("v", "verbose", 0, "Show encoder parameters");
 static const arg_def_t psnrarg =
     ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line");
-#if CONFIG_FILEOPTIONS
 static const arg_def_t use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use");
-static const arg_def_t ext_partition =
-    ARG_DEF(NULL, "ext-partition", 1, "corresponds to extended partitions");
-#endif
 
 static const struct arg_enum_list test_decode_enum[] = {
   { "off", TEST_DECODE_OFF },
@@ -205,9 +201,7 @@
     NULL, "input-chroma-subsampling-y", 1, "chroma subsampling y value.");
 
 static const arg_def_t *main_args[] = { &help,
-#if CONFIG_FILEOPTIONS
                                         &use_cfg,
-#endif
                                         &debugmode,
                                         &outputfile,
                                         &codecarg,
@@ -240,9 +234,9 @@
 static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width");
 static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height");
 static const arg_def_t forced_max_frame_width = ARG_DEF(
-    NULL, "forced_max_frame_width", 0, "Maximum frame width value to force");
+    NULL, "forced_max_frame_width", 1, "Maximum frame width value to force");
 static const arg_def_t forced_max_frame_height = ARG_DEF(
-    NULL, "forced_max_frame_height", 0, "Maximum frame height value to force");
+    NULL, "forced_max_frame_height", 1, "Maximum frame height value to force");
 #if CONFIG_WEBM_IO
 static const struct arg_enum_list stereo_mode_enum[] = {
   { "mono", STEREO_FORMAT_MONO },
@@ -396,10 +390,9 @@
 static const struct arg_enum_list tuning_enum[] = {
   { "psnr", AOM_TUNE_PSNR },
   { "ssim", AOM_TUNE_SSIM },
-#ifdef CONFIG_DIST_8X8
-  { "cdef-dist", AOM_TUNE_CDEF_DIST },
-  { "daala-dist", AOM_TUNE_DAALA_DIST },
-#endif
+  { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING },
+  { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING },
+  { "vmaf", AOM_TUNE_VMAF_MAX_GAIN },
   { NULL, 0 }
 };
 static const arg_def_t tune_metric =
@@ -411,7 +404,8 @@
 
 #if CONFIG_AV1_ENCODER
 static const arg_def_t cpu_used_av1 =
-    ARG_DEF(NULL, "cpu-used", 1, "CPU Used (0..8)");
+    ARG_DEF(NULL, "cpu-used", 1,
+            "Speed setting (0..6 in good mode, 6..8 in realtime mode)");
 static const arg_def_t rowmtarg =
     ARG_DEF(NULL, "row-mt", 1,
             "Enable row based multi-threading (0: off, 1: on (default))");
@@ -421,7 +415,13 @@
     ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
 static const arg_def_t enable_tpl_model =
     ARG_DEF(NULL, "enable-tpl-model", 1,
-            "RDO modulation based on frame temporal dependency");
+            "RDO based on frame temporal dependency "
+            "(0: off, 1: backward source based). "
+            "This is required for deltaq mode.");
+static const arg_def_t enable_keyframe_filtering =
+    ARG_DEF(NULL, "enable-keyframe-filtering", 1,
+            "Apply temporal filtering on key frame "
+            "(0: false, 1: true (default)");
 static const arg_def_t tile_width =
     ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)");
 static const arg_def_t tile_height =
@@ -432,10 +432,10 @@
     ARG_DEF(NULL, "enable-cdef", 1,
             "Enable the constrained directional enhancement filter (0: false, "
             "1: true (default))");
-static const arg_def_t enable_restoration =
-    ARG_DEF(NULL, "enable-restoration", 1,
-            "Enable the loop restoration filter (0: false, "
-            "1: true (default))");
+static const arg_def_t enable_restoration = ARG_DEF(
+    NULL, "enable-restoration", 1,
+    "Enable the loop restoration filter (0: false (default in Realtime mode), "
+    "1: true (default in Non-realtime mode))");
 static const arg_def_t enable_rect_partitions =
     ARG_DEF(NULL, "enable-rect-partitions", 1,
             "Enable rectangular partitions "
@@ -450,7 +450,9 @@
 static const arg_def_t min_partition_size =
     ARG_DEF(NULL, "min-partition-size", 4,
             "Set min partition size "
-            "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)");
+            "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128). "
+            "On frame with 4k+ resolutions or higher speed settings, the min "
+            "partition size will have a minimum of 8.");
 static const arg_def_t max_partition_size =
     ARG_DEF(NULL, "max-partition-size", 128,
             "Set max partition size "
@@ -459,6 +461,10 @@
     ARG_DEF(NULL, "enable-dual-filter", 1,
             "Enable dual filter "
             "(0: false, 1: true (default))");
+static const arg_def_t enable_chroma_deltaq =
+    ARG_DEF(NULL, "enable-chroma-deltaq", 1,
+            "Enable chroma delta quant "
+            "(0: false (default), 1: true)");
 static const arg_def_t enable_intra_edge_filter =
     ARG_DEF(NULL, "enable-intra-edge-filter", 1,
             "Enable intra edge filtering "
@@ -470,10 +476,6 @@
 static const arg_def_t enable_tx64 =
     ARG_DEF(NULL, "enable-tx64", 1,
             "Enable 64-pt transform (0: false, 1: true (default))");
-static const arg_def_t tx_size_search_method =
-    ARG_DEF(NULL, "tx-size-search-method", 0,
-            "Set transform block size search method "
-            "(0: Full RD (default), 1: Fast RD, 2: use largest allowed)");
 static const arg_def_t enable_flip_idtx =
     ARG_DEF(NULL, "enable-flip-idtx", 1,
             "Enable extended transform type (0: false, 1: true (default)) "
@@ -535,8 +537,14 @@
     ARG_DEF(NULL, "enable-cfl-intra", 1,
             "Enable chroma from luma intra prediction mode "
             "(0: false, 1: true (default))");
+static const arg_def_t force_video_mode =
+    ARG_DEF(NULL, "force-video-mode", 1,
+            "Force video mode (0: false, 1: true (default))");
 static const arg_def_t enable_obmc = ARG_DEF(
     NULL, "enable-obmc", 1, "Enable OBMC (0: false, 1: true (default))");
+static const arg_def_t enable_overlay =
+    ARG_DEF(NULL, "enable-overlay", 1,
+            "Enable coding overlay frames (0: false, 1: true (default))");
 static const arg_def_t enable_palette =
     ARG_DEF(NULL, "enable-palette", 1,
             "Enable palette prediction mode (0: false, 1: true (default))");
@@ -549,8 +557,9 @@
             "Enable intra angle delta (0: false, 1: true (default))");
 static const arg_def_t disable_trellis_quant =
     ARG_DEF(NULL, "disable-trellis-quant", 1,
-            "Disable trellis optimization of quantized coefficients (0: false ("
-            "default) 1: true  2: partial true)");
+            "Disable trellis optimization of quantized coefficients (0: false "
+            "1: true  2: true for rd search 3: true for estimate yrd serch "
+            "(default))");
 static const arg_def_t enable_qm =
     ARG_DEF(NULL, "enable-qm", 1,
             "Enable quantisation matrices (0: false (default), 1: true)");
@@ -577,11 +586,10 @@
     ARG_DEF(NULL, "mode-cost-upd-freq", 1,
             "Update freq for mode costs"
             "0: SB, 1: SB Row per Tile, 2: Tile");
-#if CONFIG_DIST_8X8
-static const arg_def_t enable_dist_8x8 =
-    ARG_DEF(NULL, "enable-dist-8x8", 1,
-            "Enable dist-8x8 (0: false (default), 1: true)");
-#endif  // CONFIG_DIST_8X8
+static const arg_def_t mv_cost_upd_freq =
+    ARG_DEF(NULL, "mv-cost-upd-freq", 1,
+            "Update freq for mv costs"
+            "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off");
 static const arg_def_t num_tg = ARG_DEF(
     NULL, "num-tile-groups", 1, "Maximum number of tile groups, default is 1");
 static const arg_def_t mtu_size =
@@ -599,6 +607,10 @@
                  "Signal timing info in the bitstream (model unly works for no "
                  "hidden frames, no super-res yet):",
                  timing_info_enum);
+#if CONFIG_TUNE_VMAF
+static const arg_def_t vmaf_model_path =
+    ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file");
+#endif
 static const arg_def_t film_grain_test =
     ARG_DEF(NULL, "film-grain-test", 1,
             "Film grain test vectors (0: none (default), 1: test-1  2: test-2, "
@@ -628,9 +640,13 @@
     NULL, "aq-mode", 1,
     "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
     "3: cyclic refresh)");
-static const arg_def_t deltaq_mode = ARG_DEF(
-    NULL, "deltaq-mode", 1,
-    "Delta qindex mode (0: off (default), 1: deltaq 2: deltaq + deltalf)");
+static const arg_def_t deltaq_mode =
+    ARG_DEF(NULL, "deltaq-mode", 1,
+            "Delta qindex mode (0: off, 1: deltaq objective (default), "
+            "2: deltaq perceptual). "
+            "Currently this requires enable-tpl-model as a prerequisite.");
+static const arg_def_t deltalf_mode = ARG_DEF(
+    NULL, "delta-lf-mode", 1, "Enable delta-lf-mode (0: off (default), 1: on)");
 static const arg_def_t frame_periodic_boost =
     ARG_DEF(NULL, "frame-boost", 1,
             "Enable frame periodic boost (0: off (default), 1: on)");
@@ -644,9 +660,12 @@
 static const arg_def_t max_gf_interval = ARG_DEF(
     NULL, "max-gf-interval", 1,
     "max gf/arf frame interval (default 0, indicating in-built behavior)");
+static const arg_def_t gf_min_pyr_height =
+    ARG_DEF(NULL, "gf-min-pyr-height", 1,
+            "Min height for GF group pyramid structure (0 (default) to 5)");
 static const arg_def_t gf_max_pyr_height =
     ARG_DEF(NULL, "gf-max-pyr-height", 1,
-            "maximum height for GF group pyramid structure (0 to 4 (default))");
+            "maximum height for GF group pyramid structure (0 to 5 (default))");
 static const arg_def_t max_reference_frames = ARG_DEF(
     NULL, "max-reference-frames", 1,
     "maximum number of reference frames allowed per frame (3 to 7 (default))");
@@ -663,6 +682,11 @@
             "xy: Target level index for the OP. "
             "E.g. \"0\" means target level index 0 for the 0th OP; "
             "\"1021\" means target level index 21 for the 10th OP.");
+static const arg_def_t set_min_cr =
+    ARG_DEF(NULL, "min-cr", 1,
+            "Set minimum compression ratio. Take integer values. Default is 0. "
+            "If non-zero, encoder will try to keep the compression ratio of "
+            "each frame to be higher than the given value divided by 100.");
 
 static const struct arg_enum_list color_primaries_enum[] = {
   { "bt709", AOM_CICP_CP_BT_709 },
@@ -774,6 +798,22 @@
             "operating points conforms to. "
             "Bit value 0(defualt): Main Tier; 1: High Tier.");
 
+static const arg_def_t use_fixed_qp_offsets =
+    ARG_DEF(NULL, "use-fixed-qp-offsets", 1,
+            "Enable fixed QP offsets for frames at different levels of the "
+            "pyramid. Selected automatically from --cq-level if "
+            "--fixed-qp-offsets is not provided. If this option is not "
+            "specified (default), offsets are adaptively chosen by the "
+            "encoder.");
+
+static const arg_def_t fixed_qp_offsets =
+    ARG_DEF(NULL, "fixed-qp-offsets", 1,
+            "Set fixed QP offsets for frames at different levels of the "
+            "pyramid. Comma-separated list of 5 offsets for keyframe, ALTREF, "
+            "and 3 levels of internal alt-refs. If this option is not "
+            "specified (default), offsets are adaptively chosen by the "
+            "encoder.");
+
 static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                        &auto_altref,
                                        &sharpness,
@@ -782,6 +822,7 @@
                                        &tile_cols,
                                        &tile_rows,
                                        &enable_tpl_model,
+                                       &enable_keyframe_filtering,
                                        &arnr_maxframes,
                                        &arnr_strength,
                                        &tune_metric,
@@ -798,10 +839,10 @@
                                        &min_partition_size,
                                        &max_partition_size,
                                        &enable_dual_filter,
+                                       &enable_chroma_deltaq,
                                        &enable_intra_edge_filter,
                                        &enable_order_hint,
                                        &enable_tx64,
-                                       &tx_size_search_method,
                                        &enable_flip_idtx,
                                        &enable_dist_wtd_comp,
                                        &enable_masked_comp,
@@ -817,7 +858,9 @@
                                        &enable_smooth_intra,
                                        &enable_paeth_intra,
                                        &enable_cfl_intra,
+                                       &force_video_mode,
                                        &enable_obmc,
+                                       &enable_overlay,
                                        &enable_palette,
                                        &enable_intrabc,
                                        &enable_angle_delta,
@@ -832,13 +875,12 @@
                                        &quant_b_adapt,
                                        &coeff_cost_upd_freq,
                                        &mode_cost_upd_freq,
-#if CONFIG_DIST_8X8
-                                       &enable_dist_8x8,
-#endif
+                                       &mv_cost_upd_freq,
                                        &frame_parallel_decoding,
                                        &error_resilient_mode,
                                        &aq_mode,
                                        &deltaq_mode,
+                                       &deltalf_mode,
                                        &frame_periodic_boost,
                                        &noise_sens,
                                        &tune_content,
@@ -849,6 +891,7 @@
                                        &input_chroma_sample_position,
                                        &min_gf_interval,
                                        &max_gf_interval,
+                                       &gf_min_pyr_height,
                                        &gf_max_pyr_height,
                                        &superblock_size,
                                        &num_tg,
@@ -865,6 +908,7 @@
                                        &enable_ref_frame_mvs,
                                        &target_seq_level_idx,
                                        &set_tier_mask,
+                                       &set_min_cr,
                                        &bitdeptharg,
                                        &inbitdeptharg,
                                        &input_chroma_subsampling_x,
@@ -872,6 +916,9 @@
                                        &sframe_dist,
                                        &sframe_mode,
                                        &save_as_annexb,
+#if CONFIG_TUNE_VMAF
+                                       &vmaf_model_path,
+#endif
                                        NULL };
 static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AOME_SET_ENABLEAUTOALTREF,
@@ -881,6 +928,7 @@
                                         AV1E_SET_TILE_COLUMNS,
                                         AV1E_SET_TILE_ROWS,
                                         AV1E_SET_ENABLE_TPL_MODEL,
+                                        AV1E_SET_ENABLE_KEYFRAME_FILTERING,
                                         AOME_SET_ARNR_MAXFRAMES,
                                         AOME_SET_ARNR_STRENGTH,
                                         AOME_SET_TUNING,
@@ -897,10 +945,10 @@
                                         AV1E_SET_MIN_PARTITION_SIZE,
                                         AV1E_SET_MAX_PARTITION_SIZE,
                                         AV1E_SET_ENABLE_DUAL_FILTER,
+                                        AV1E_SET_ENABLE_CHROMA_DELTAQ,
                                         AV1E_SET_ENABLE_INTRA_EDGE_FILTER,
                                         AV1E_SET_ENABLE_ORDER_HINT,
                                         AV1E_SET_ENABLE_TX64,
-                                        AV1E_SET_TX_SIZE_SEARCH_METHOD,
                                         AV1E_SET_ENABLE_FLIP_IDTX,
                                         AV1E_SET_ENABLE_DIST_WTD_COMP,
                                         AV1E_SET_ENABLE_MASKED_COMP,
@@ -916,7 +964,9 @@
                                         AV1E_SET_ENABLE_SMOOTH_INTRA,
                                         AV1E_SET_ENABLE_PAETH_INTRA,
                                         AV1E_SET_ENABLE_CFL_INTRA,
+                                        AV1E_SET_FORCE_VIDEO_MODE,
                                         AV1E_SET_ENABLE_OBMC,
+                                        AV1E_SET_ENABLE_OVERLAY,
                                         AV1E_SET_ENABLE_PALETTE,
                                         AV1E_SET_ENABLE_INTRABC,
                                         AV1E_SET_ENABLE_ANGLE_DELTA,
@@ -931,13 +981,12 @@
                                         AV1E_SET_QUANT_B_ADAPT,
                                         AV1E_SET_COEFF_COST_UPD_FREQ,
                                         AV1E_SET_MODE_COST_UPD_FREQ,
-#if CONFIG_DIST_8X8
-                                        AV1E_SET_ENABLE_DIST_8X8,
-#endif
+                                        AV1E_SET_MV_COST_UPD_FREQ,
                                         AV1E_SET_FRAME_PARALLEL_DECODING,
                                         AV1E_SET_ERROR_RESILIENT_MODE,
                                         AV1E_SET_AQ_MODE,
                                         AV1E_SET_DELTAQ_MODE,
+                                        AV1E_SET_DELTALF_MODE,
                                         AV1E_SET_FRAME_PERIODIC_BOOST,
                                         AV1E_SET_NOISE_SENSITIVITY,
                                         AV1E_SET_TUNE_CONTENT,
@@ -948,6 +997,7 @@
                                         AV1E_SET_CHROMA_SAMPLE_POSITION,
                                         AV1E_SET_MIN_GF_INTERVAL,
                                         AV1E_SET_MAX_GF_INTERVAL,
+                                        AV1E_SET_GF_MIN_PYRAMID_HEIGHT,
                                         AV1E_SET_GF_MAX_PYRAMID_HEIGHT,
                                         AV1E_SET_SUPERBLOCK_SIZE,
                                         AV1E_SET_NUM_TG,
@@ -964,6 +1014,10 @@
                                         AV1E_SET_ENABLE_REF_FRAME_MVS,
                                         AV1E_SET_TARGET_SEQ_LEVEL_IDX,
                                         AV1E_SET_TIER_MASK,
+                                        AV1E_SET_MIN_CR,
+#if CONFIG_TUNE_VMAF
+                                        AV1E_SET_VMAF_MODEL_PATH,
+#endif
                                         0 };
 #endif  // CONFIG_AV1_ENCODER
 
@@ -1038,6 +1092,9 @@
   int write_ivf;
   // whether to use 16bit internal buffers
   int use_16bit_internal;
+#if CONFIG_TUNE_VMAF
+  const char *vmaf_model_path;
+#endif
 };
 
 struct stream_state {
@@ -1076,18 +1133,22 @@
   if (!rat->den) die("Error: %s has zero denominator\n", msg);
 }
 
+static void init_config(cfg_options_t *config) {
+  memset(config, 0, sizeof(cfg_options_t));
+  config->super_block_size = 0;  // Dynamic
+  config->max_partition_size = 128;
+  config->min_partition_size = 4;
+  config->disable_trellis_quant = 3;
+}
+
 /* Parses global config arguments into the AvxEncoderConfig. Note that
  * argv is modified and overwrites all parsed arguments.
  */
-static void parse_global_config(struct AvxEncoderConfig *global, int argc,
-                                char ***argv) {
+static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) {
   char **argi, **argj;
   struct arg arg;
   const int num_encoder = get_aom_encoder_count();
   char **argv_local = (char **)*argv;
-#if CONFIG_FILEOPTIONS
-  int argc_local = argc;
-#endif
   if (num_encoder < 1) die("Error: no valid encoder available\n");
 
   /* Initialize default parameters */
@@ -1097,27 +1158,18 @@
   global->color_type = I420;
   global->csp = AOM_CSP_UNKNOWN;
 
-#if CONFIG_FILEOPTIONS
-  const char *cfg = NULL;
   int cfg_included = 0;
-#endif
+  init_config(&global->encoder_config);
+
   for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
-#if CONFIG_FILEOPTIONS
     if (arg_match(&arg, &use_cfg, argi)) {
       if (cfg_included) continue;
-      cfg = arg.val;
-
-      arg_cfg(&argc_local, &argv_local, cfg);
-
-      *argj = *argi = *argv_local;
-      argj = argi = argv_local;
-      *argv = argv_local;
+      parse_cfg(arg.val, &global->encoder_config);
       cfg_included = 1;
       continue;
     }
-#endif
     if (arg_match(&arg, &help, argi)) {
       show_help(stdout, 0);
       exit(EXIT_SUCCESS);
@@ -1309,6 +1361,8 @@
 
     /* Allows removal of the application version from the EBML tags */
     stream->webm_ctx.debug = global->debug;
+    memcpy(&stream->config.cfg.encoder_cfg, &global->encoder_config,
+           sizeof(stream->config.cfg.encoder_cfg));
   }
 
   /* Output files must be specified for each stream */
@@ -1533,10 +1587,26 @@
     } else if (arg_match(&arg, &tile_height, argi)) {
       config->cfg.tile_height_count =
           arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS);
-#if CONFIG_FILEOPTIONS
-    } else if (arg_match(&arg, &ext_partition, argi)) {
-      config->cfg.cfg.ext_partition = !!arg_parse_uint(&arg) > 0;
+#if CONFIG_TUNE_VMAF
+    } else if (arg_match(&arg, &vmaf_model_path, argi)) {
+      config->vmaf_model_path = arg.val;
 #endif
+    } else if (arg_match(&arg, &use_fixed_qp_offsets, argi)) {
+      config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &fixed_qp_offsets, argi)) {
+      const int fixed_qp_offset_count = arg_parse_list(
+          &arg, config->cfg.fixed_qp_offsets, FIXED_QP_OFFSET_COUNT);
+      if (fixed_qp_offset_count < FIXED_QP_OFFSET_COUNT) {
+        die("Option --fixed_qp_offsets requires %d comma-separated values, but "
+            "only %d values were provided.\n",
+            FIXED_QP_OFFSET_COUNT, fixed_qp_offset_count);
+      }
+      config->cfg.use_fixed_qp_offsets = 1;
+    } else if (global->usage == AOM_USAGE_REALTIME &&
+               arg_match(&arg, &enable_restoration, argi)) {
+      if (arg_parse_uint(&arg) == 1) {
+        warn("non-zero %s option ignored in realtime mode.\n", arg.name);
+      }
     } else {
       int i, match = 0;
       for (i = 0; ctrl_args[i]; i++) {
@@ -1551,7 +1621,7 @@
     }
   }
   config->use_16bit_internal =
-      config->cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH;
+      config->cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING;
   return eos_mark_found;
 }
 
@@ -1695,6 +1765,44 @@
   SHOW(kf_mode);
   SHOW(kf_min_dist);
   SHOW(kf_max_dist);
+
+#define SHOW_PARAMS(field)                    \
+  fprintf(stderr, "    %-28s = %d\n", #field, \
+          stream->config.cfg.encoder_cfg.field)
+  SHOW_PARAMS(super_block_size);
+  SHOW_PARAMS(max_partition_size);
+  SHOW_PARAMS(min_partition_size);
+  SHOW_PARAMS(disable_ab_partition_type);
+  SHOW_PARAMS(disable_rect_partition_type);
+  SHOW_PARAMS(disable_1to4_partition_type);
+  SHOW_PARAMS(disable_flip_idtx);
+  SHOW_PARAMS(disable_cdef);
+  SHOW_PARAMS(disable_lr);
+  SHOW_PARAMS(disable_obmc);
+  SHOW_PARAMS(disable_warp_motion);
+  SHOW_PARAMS(disable_global_motion);
+  SHOW_PARAMS(disable_dist_wtd_comp);
+  SHOW_PARAMS(disable_diff_wtd_comp);
+  SHOW_PARAMS(disable_inter_intra_comp);
+  SHOW_PARAMS(disable_masked_comp);
+  SHOW_PARAMS(disable_one_sided_comp);
+  SHOW_PARAMS(disable_palette);
+  SHOW_PARAMS(disable_intrabc);
+  SHOW_PARAMS(disable_cfl);
+  SHOW_PARAMS(disable_smooth_intra);
+  SHOW_PARAMS(disable_filter_intra);
+  SHOW_PARAMS(disable_dual_filter);
+  SHOW_PARAMS(disable_intra_angle_delta);
+  SHOW_PARAMS(disable_intra_edge_filter);
+  SHOW_PARAMS(disable_tx_64x64);
+  SHOW_PARAMS(disable_smooth_inter_intra);
+  SHOW_PARAMS(disable_inter_inter_wedge);
+  SHOW_PARAMS(disable_inter_intra_wedge);
+  SHOW_PARAMS(disable_paeth_intra);
+  SHOW_PARAMS(disable_trellis_quant);
+  SHOW_PARAMS(disable_ref_frame_mv);
+  SHOW_PARAMS(reduced_reference_set);
+  SHOW_PARAMS(reduced_tx_type_set);
 }
 
 static void open_output_file(struct stream_state *stream,
@@ -1788,42 +1896,48 @@
                      &stream->config.cfg, flags);
   ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder");
 
-  /* Note that we bypass the aom_codec_control wrapper macro because
-   * we're being clever to store the control IDs in an array. Real
-   * applications will want to make use of the enumerations directly
-   */
   for (i = 0; i < stream->config.arg_ctrl_cnt; i++) {
     int ctrl = stream->config.arg_ctrls[i][0];
     int value = stream->config.arg_ctrls[i][1];
-    if (aom_codec_control_(&stream->encoder, ctrl, value))
+    if (aom_codec_control(&stream->encoder, ctrl, value))
       fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value);
 
     ctx_exit_on_error(&stream->encoder, "Failed to control codec");
   }
+
+#if CONFIG_TUNE_VMAF
+  if (stream->config.vmaf_model_path) {
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH,
+                                  stream->config.vmaf_model_path);
+  }
+#endif
+
   if (stream->config.film_grain_filename) {
-    aom_codec_control_(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE,
-                       stream->config.film_grain_filename);
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE,
+                                  stream->config.film_grain_filename);
   }
 
 #if CONFIG_AV1_DECODER
   if (global->test_decode != TEST_DECODE_OFF) {
     const AvxInterface *decoder = get_aom_decoder_by_name(global->codec->name);
-    aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
     aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0);
 
     if (strcmp(global->codec->name, "av1") == 0) {
-      aom_codec_control(&stream->decoder, AV1_SET_TILE_MODE,
-                        stream->config.cfg.large_scale_tile);
+      AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_TILE_MODE,
+                                    stream->config.cfg.large_scale_tile);
       ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode");
 
-      aom_codec_control(&stream->decoder, AV1D_SET_IS_ANNEXB,
-                        stream->config.cfg.save_as_annexb);
+      AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1D_SET_IS_ANNEXB,
+                                    stream->config.cfg.save_as_annexb);
       ctx_exit_on_error(&stream->decoder, "Failed to set is_annexb");
 
-      aom_codec_control(&stream->decoder, AV1_SET_DECODE_TILE_ROW, -1);
+      AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_ROW,
+                                    -1);
       ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row");
 
-      aom_codec_control(&stream->decoder, AV1_SET_DECODE_TILE_COL, -1);
+      AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_COL,
+                                    -1);
       ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col");
     }
   }
@@ -1922,7 +2036,8 @@
   if (stream->config.cfg.g_pass != AOM_RC_FIRST_PASS) {
     int q;
 
-    aom_codec_control(&stream->encoder, AOME_GET_LAST_QUANTIZER_64, &q);
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AOME_GET_LAST_QUANTIZER_64,
+                                  &q);
     ctx_exit_on_error(&stream->encoder, "Failed to read quantizer");
     stream->counts[q]++;
   }
@@ -2049,8 +2164,10 @@
   if (stream->mismatch_seen) return;
 
   /* Get the internal reference frame */
-  aom_codec_control(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img);
-  aom_codec_control(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img);
+  AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE,
+                                &enc_img);
+  AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE,
+                                &dec_img);
 
   if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
       (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
@@ -2146,13 +2263,9 @@
    * codec.
    */
   argv = argv_dup(argc - 1, argv_ + 1);
-  parse_global_config(&global, argc, &argv);
+  parse_global_config(&global, &argv);
 
-#if CONFIG_FILEOPTIONS
   if (argc < 2) usage_exit();
-#else
-  if (argc < 3) usage_exit();
-#endif
 
   switch (global.color_type) {
     case I420: input.fmt = AOM_IMG_FMT_I420; break;
@@ -2291,16 +2404,20 @@
                        input.file_type == FILE_TYPE_Y4M) {
               // Note that here the input file values for chroma subsampling
               // are used instead of those from the command line.
-              aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X,
-                                input.y4m.dst_c_dec_h >> 1);
-              aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y,
-                                input.y4m.dst_c_dec_v >> 1);
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                            input.y4m.dst_c_dec_h >> 1);
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_Y,
+                                            input.y4m.dst_c_dec_v >> 1);
             } else if (input.bit_depth == 12 &&
                        input.file_type == FILE_TYPE_RAW) {
-              aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X,
-                                stream->chroma_subsampling_x);
-              aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y,
-                                stream->chroma_subsampling_y);
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                            stream->chroma_subsampling_x);
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_Y,
+                                            stream->chroma_subsampling_y);
             }
             break;
           default: break;
diff --git a/libaom/apps/aomenc.h b/libaom/apps/aomenc.h
index 5e59c1a..a38258b 100644
--- a/libaom/apps/aomenc.h
+++ b/libaom/apps/aomenc.h
@@ -11,6 +11,7 @@
 #ifndef AOM_APPS_AOMENC_H_
 #define AOM_APPS_AOMENC_H_
 
+#include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
 
 #ifdef __cplusplus
@@ -37,7 +38,7 @@
   const struct AvxInterface *codec;
   int passes;
   int pass;
-  int usage;
+  unsigned int usage;
   ColorInputType color_type;
   int quiet;
   int verbose;
@@ -54,6 +55,7 @@
   int disable_warning_prompt;
   int experimental_bitstream;
   aom_chroma_sample_position_t csp;
+  cfg_options_t encoder_config;
 };
 
 #ifdef __cplusplus
diff --git a/libaom/av1/av1.cmake b/libaom/av1/av1.cmake
index fb9678a..2ab3496 100644
--- a/libaom/av1/av1.cmake
+++ b/libaom/av1/av1.cmake
@@ -17,6 +17,7 @@
             "${AOM_ROOT}/av1/av1_iface_common.h"
             "${AOM_ROOT}/av1/common/alloccommon.c"
             "${AOM_ROOT}/av1/common/alloccommon.h"
+            "${AOM_ROOT}/av1/common/av1_common_int.h"
             "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c"
             "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h"
             "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h"
@@ -57,7 +58,6 @@
             "${AOM_ROOT}/av1/common/obu_util.h"
             "${AOM_ROOT}/av1/common/odintrin.c"
             "${AOM_ROOT}/av1/common/odintrin.h"
-            "${AOM_ROOT}/av1/common/onyxc_int.h"
             "${AOM_ROOT}/av1/common/pred_common.c"
             "${AOM_ROOT}/av1/common/pred_common.h"
             "${AOM_ROOT}/av1/common/quant_common.c"
@@ -88,6 +88,10 @@
             "${AOM_ROOT}/av1/common/warped_motion.c"
             "${AOM_ROOT}/av1/common/warped_motion.h")
 
+if(CONFIG_LPF_MASK)
+  list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/loopfiltermask.c")
+endif()
+
 list(APPEND AOM_AV1_DECODER_SOURCES
             "${AOM_ROOT}/av1/av1_dx_iface.c"
             "${AOM_ROOT}/av1/decoder/decodeframe.c"
@@ -112,6 +116,7 @@
             "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
             "${AOM_ROOT}/av1/encoder/aq_variance.c"
             "${AOM_ROOT}/av1/encoder/aq_variance.h"
+            "${AOM_ROOT}/av1/encoder/enc_enums.h"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h"
@@ -123,6 +128,10 @@
             "${AOM_ROOT}/av1/encoder/bitstream.c"
             "${AOM_ROOT}/av1/encoder/bitstream.h"
             "${AOM_ROOT}/av1/encoder/block.h"
+            "${AOM_ROOT}/av1/encoder/cnn.c"
+            "${AOM_ROOT}/av1/encoder/cnn.h"
+            "${AOM_ROOT}/av1/encoder/compound_type.c"
+            "${AOM_ROOT}/av1/encoder/compound_type.h"
             "${AOM_ROOT}/av1/encoder/context_tree.c"
             "${AOM_ROOT}/av1/encoder/context_tree.h"
             "${AOM_ROOT}/av1/encoder/corner_detect.c"
@@ -160,16 +169,21 @@
             "${AOM_ROOT}/av1/encoder/hash_motion.h"
             "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
             "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+            "${AOM_ROOT}/av1/encoder/interp_search.c"
+            "${AOM_ROOT}/av1/encoder/interp_search.h"
             "${AOM_ROOT}/av1/encoder/level.c"
             "${AOM_ROOT}/av1/encoder/level.h"
             "${AOM_ROOT}/av1/encoder/lookahead.c"
             "${AOM_ROOT}/av1/encoder/lookahead.h"
-            "${AOM_ROOT}/av1/encoder/mbgraph.c"
-            "${AOM_ROOT}/av1/encoder/mbgraph.h"
             "${AOM_ROOT}/av1/encoder/mcomp.c"
             "${AOM_ROOT}/av1/encoder/mcomp.h"
             "${AOM_ROOT}/av1/encoder/ml.c"
             "${AOM_ROOT}/av1/encoder/ml.h"
+            "${AOM_ROOT}/av1/encoder/model_rd.h"
+            "${AOM_ROOT}/av1/encoder/motion_search_facade.c"
+            "${AOM_ROOT}/av1/encoder/motion_search_facade.h"
+            "${AOM_ROOT}/av1/encoder/mv_prec.c"
+            "${AOM_ROOT}/av1/encoder/mv_prec.h"
             "${AOM_ROOT}/av1/encoder/palette.c"
             "${AOM_ROOT}/av1/encoder/palette.h"
             "${AOM_ROOT}/av1/encoder/partition_strategy.h"
@@ -188,19 +202,28 @@
             "${AOM_ROOT}/av1/encoder/rd.c"
             "${AOM_ROOT}/av1/encoder/rd.h"
             "${AOM_ROOT}/av1/encoder/rdopt.c"
+            "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c"
             "${AOM_ROOT}/av1/encoder/rdopt.h"
+            "${AOM_ROOT}/av1/encoder/rdopt_data_defs.h"
+            "${AOM_ROOT}/av1/encoder/rdopt_utils.h"
             "${AOM_ROOT}/av1/encoder/reconinter_enc.c"
             "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
             "${AOM_ROOT}/av1/encoder/segmentation.c"
             "${AOM_ROOT}/av1/encoder/segmentation.h"
             "${AOM_ROOT}/av1/encoder/speed_features.c"
             "${AOM_ROOT}/av1/encoder/speed_features.h"
+            "${AOM_ROOT}/av1/encoder/svc_layercontext.c"
+            "${AOM_ROOT}/av1/encoder/svc_layercontext.h"
             "${AOM_ROOT}/av1/encoder/temporal_filter.c"
             "${AOM_ROOT}/av1/encoder/temporal_filter.h"
             "${AOM_ROOT}/av1/encoder/tokenize.c"
             "${AOM_ROOT}/av1/encoder/tokenize.h"
             "${AOM_ROOT}/av1/encoder/tpl_model.c"
             "${AOM_ROOT}/av1/encoder/tpl_model.h"
+            "${AOM_ROOT}/av1/encoder/tx_search.c"
+            "${AOM_ROOT}/av1/encoder/tx_search.h"
+            "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
+            "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
             "${AOM_ROOT}/av1/encoder/wedge_utils.c"
             "${AOM_ROOT}/av1/encoder/var_based_part.c"
             "${AOM_ROOT}/av1/encoder/var_based_part.h"
@@ -213,6 +236,11 @@
             "${AOM_ROOT}/av1/encoder/dwt.c"
             "${AOM_ROOT}/av1/encoder/dwt.h")
 
+if(CONFIG_TUNE_VMAF)
+  list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_vmaf.c"
+              "${AOM_ROOT}/av1/encoder/tune_vmaf.h")
+endif()
+
 list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
             "${AOM_ROOT}/av1/common/cdef_block_sse2.c"
             "${AOM_ROOT}/av1/common/x86/cfl_sse2.c"
@@ -221,7 +249,13 @@
             "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
-            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h")
+            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
+            "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
+
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE2
+                   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c")
+endif()
 
 list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
             "${AOM_ROOT}/av1/common/cdef_block_ssse3.c"
@@ -233,11 +267,15 @@
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
             "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
 
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSSE3
+                   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c")
+endif()
+
 list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
             "${AOM_ROOT}/av1/common/cdef_block_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c"
-            "${AOM_ROOT}/av1/common/x86/av1_highbd_convolve_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h"
             "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c"
@@ -250,6 +288,12 @@
             "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c"
             "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c")
 
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE4_1
+                   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c"
+                   "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c")
+endif()
+
 list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
             "${AOM_ROOT}/av1/common/cdef_block_avx2.c"
             "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c"
@@ -264,8 +308,14 @@
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
             "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
             "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c"
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
 
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_AVX2
+                   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c")
+endif()
+
 list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
             "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm")
 
@@ -275,8 +325,15 @@
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(
+    REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2
+                "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c")
+endif()
+
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c")
 
 list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64
@@ -292,7 +349,6 @@
             "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c"
-            "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
@@ -307,10 +363,18 @@
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
 
+if(NOT CONFIG_AV1_HIGHBITDEPTH)
+  list(
+    REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2
+                "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c")
+endif()
+
 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
-            "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
+            "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
             "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
@@ -354,44 +418,73 @@
   list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c")
 endif()
 
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES
+                   "${AOM_ROOT}/av1/encoder/cnn.c"
+                   "${AOM_ROOT}/av1/encoder/cnn.h"
+                   "${AOM_ROOT}/av1/encoder/firstpass.c"
+                   "${AOM_ROOT}/av1/encoder/firstpass.h"
+                   "${AOM_ROOT}/av1/encoder/gop_structure.c"
+                   "${AOM_ROOT}/av1/encoder/gop_structure.h"
+                   "${AOM_ROOT}/av1/encoder/misc_model_weights.h"
+                   "${AOM_ROOT}/av1/encoder/partition_cnn_weights.h"
+                   "${AOM_ROOT}/av1/encoder/partition_model_weights.h"
+                   "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
+                   "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+                   "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+                   "${AOM_ROOT}/av1/encoder/temporal_filter_constants.h"
+                   "${AOM_ROOT}/av1/encoder/tpl_model.c"
+                   "${AOM_ROOT}/av1/encoder/tpl_model.h"
+                   "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c")
+endif()
+
 # Setup AV1 common/decoder/encoder targets. The libaom target must exist before
 # this function is called.
 function(setup_av1_targets)
   add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
   list(APPEND AOM_LIB_TARGETS aom_av1_common)
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+  endif()
 
   if(CONFIG_AV1_DECODER)
     add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
     set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+    endif()
   endif()
 
   if(CONFIG_AV1_ENCODER)
     add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES})
     set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+    endif()
   endif()
 
   if(HAVE_SSE2)
     require_compiler_flag_nomsvc("-msse2" NO)
     add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common"
-                                  "AOM_AV1_COMMON_INTRIN_SSE2" "aom")
+                                  "AOM_AV1_COMMON_INTRIN_SSE2")
     if(CONFIG_AV1_DECODER)
       if(AOM_AV1_DECODER_ASM_SSE2)
-        add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2" "aom")
+        add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2")
       endif()
 
       if(AOM_AV1_DECODER_INTRIN_SSE2)
         add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder"
-                                      "AOM_AV1_DECODER_INTRIN_SSE2" "aom")
+                                      "AOM_AV1_DECODER_INTRIN_SSE2")
       endif()
     endif()
 
     if(CONFIG_AV1_ENCODER)
-      add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2" "aom")
+      add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2")
       add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder"
-                                    "AOM_AV1_ENCODER_INTRIN_SSE2" "aom")
+                                    "AOM_AV1_ENCODER_INTRIN_SSE2")
     endif()
   endif()
 
@@ -399,19 +492,19 @@
     require_compiler_flag_nomsvc("-msse3" NO)
     if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("-msse3" "sse3" "aom_av1_encoder"
-                                    "AOM_AV1_ENCODER_INTRIN_SSE3" "aom")
+                                    "AOM_AV1_ENCODER_INTRIN_SSE3")
     endif()
   endif()
 
   if(HAVE_SSSE3)
     require_compiler_flag_nomsvc("-mssse3" NO)
     add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
-                                  "AOM_AV1_COMMON_INTRIN_SSSE3" "aom")
+                                  "AOM_AV1_COMMON_INTRIN_SSSE3")
 
     if(CONFIG_AV1_DECODER)
       if(AOM_AV1_DECODER_INTRIN_SSSE3)
         add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder"
-                                      "AOM_AV1_DECODER_INTRIN_SSSE3" "aom")
+                                      "AOM_AV1_DECODER_INTRIN_SSSE3")
       endif()
     endif()
   endif()
@@ -419,17 +512,17 @@
   if(HAVE_SSE4_1)
     require_compiler_flag_nomsvc("-msse4.1" NO)
     add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common"
-                                  "AOM_AV1_COMMON_INTRIN_SSE4_1" "aom")
+                                  "AOM_AV1_COMMON_INTRIN_SSE4_1")
 
     if(CONFIG_AV1_ENCODER)
       if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
         add_asm_library("aom_av1_encoder_ssse3"
-                        "AOM_AV1_ENCODER_ASM_SSSE3_X86_64" "aom")
+                        "AOM_AV1_ENCODER_ASM_SSSE3_X86_64")
       endif()
 
       if(AOM_AV1_ENCODER_INTRIN_SSE4_1)
         add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder"
-                                      "AOM_AV1_ENCODER_INTRIN_SSE4_1" "aom")
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_1")
       endif()
     endif()
   endif()
@@ -439,7 +532,7 @@
     if(CONFIG_AV1_ENCODER)
       if(AOM_AV1_ENCODER_INTRIN_SSE4_2)
         add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
-                                      "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom")
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_2")
       endif()
     endif()
   endif()
@@ -447,11 +540,11 @@
   if(HAVE_AVX2)
     require_compiler_flag_nomsvc("-mavx2" NO)
     add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
-                                  "AOM_AV1_COMMON_INTRIN_AVX2" "aom")
+                                  "AOM_AV1_COMMON_INTRIN_AVX2")
 
     if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder"
-                                    "AOM_AV1_ENCODER_INTRIN_AVX2" "aom")
+                                    "AOM_AV1_ENCODER_INTRIN_AVX2")
     endif()
   endif()
 
@@ -459,26 +552,26 @@
     if(AOM_AV1_COMMON_INTRIN_NEON)
       add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
                                     "aom_av1_common"
-                                    "AOM_AV1_COMMON_INTRIN_NEON" "aom")
+                                    "AOM_AV1_COMMON_INTRIN_NEON")
     endif()
 
     if(AOM_AV1_ENCODER_INTRIN_NEON)
       add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
                                     "aom_av1_encoder"
-                                    "AOM_AV1_ENCODER_INTRIN_NEON" "aom")
+                                    "AOM_AV1_ENCODER_INTRIN_NEON")
     endif()
   endif()
 
   if(HAVE_VSX)
     if(AOM_AV1_COMMON_INTRIN_VSX)
       add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
-                                    "AOM_AV1_COMMON_INTRIN_VSX" "aom")
+                                    "AOM_AV1_COMMON_INTRIN_VSX")
     endif()
   endif()
 
   if(HAVE_MSA)
     add_intrinsics_object_library("" "msa" "aom_av1_encoder"
-                                  "AOM_AV1_ENCODER_INTRIN_MSA" "aom")
+                                  "AOM_AV1_ENCODER_INTRIN_MSA")
   endif()
 
   # Pass the new lib targets up to the parent scope instance of
diff --git a/libaom/av1/av1_cx_iface.c b/libaom/av1/av1_cx_iface.c
index e8cd508..676eaa0 100644
--- a/libaom/av1/av1_cx_iface.c
+++ b/libaom/av1/av1_cx_iface.c
@@ -27,10 +27,9 @@
 #include "av1/encoder/firstpass.h"
 
 #define MAG_SIZE (4)
-#define MAX_NUM_ENHANCEMENT_LAYERS 3
 
 struct av1_extracfg {
-  int cpu_used;  // available cpu percentage in 1/16
+  int cpu_used;
   unsigned int enable_auto_alt_ref;
   unsigned int enable_auto_bwd_ref;
   unsigned int noise_sensitivity;
@@ -40,12 +39,15 @@
   unsigned int tile_columns;  // log2 number of tile columns
   unsigned int tile_rows;     // log2 number of tile rows
   unsigned int enable_tpl_model;
+  unsigned int enable_keyframe_filtering;
   unsigned int arnr_max_frames;
   unsigned int arnr_strength;
   unsigned int min_gf_interval;
   unsigned int max_gf_interval;
+  unsigned int gf_min_pyr_height;
   unsigned int gf_max_pyr_height;
   aom_tune_metric tuning;
+  const char *vmaf_model_path;
   unsigned int cq_level;  // constrained quality level
   unsigned int rc_max_intra_bitrate_pct;
   unsigned int rc_max_inter_bitrate_pct;
@@ -53,6 +55,7 @@
   unsigned int lossless;
   unsigned int enable_cdef;
   unsigned int enable_restoration;
+  unsigned int force_video_mode;
   unsigned int enable_obmc;
   unsigned int disable_trellis_quant;
   unsigned int enable_qm;
@@ -61,17 +64,16 @@
   unsigned int qm_v;
   unsigned int qm_min;
   unsigned int qm_max;
-#if CONFIG_DIST_8X8
-  unsigned int enable_dist_8x8;
-#endif
   unsigned int num_tg;
   unsigned int mtu_size;
 
   aom_timing_info_type_t timing_info_type;
   unsigned int frame_parallel_decoding_mode;
   int enable_dual_filter;
+  unsigned int enable_chroma_deltaq;
   AQ_MODE aq_mode;
   DELTAQ_MODE deltaq_mode;
+  int deltalf_mode;
   unsigned int frame_periodic_boost;
   aom_bit_depth_t bit_depth;
   aom_tune_content content;
@@ -99,7 +101,6 @@
   int enable_intra_edge_filter;  // enable intra-edge filter for sequence
   int enable_order_hint;         // enable order hint for sequence
   int enable_tx64;               // enable 64-pt transform usage for sequence
-  int tx_size_search_method;     // set transform block size search method
   int enable_flip_idtx;          // enable flip and identity transform types
   int enable_dist_wtd_comp;      // enable dist wtd compound for sequence
   int max_reference_frames;      // maximum number of references per frame
@@ -121,6 +122,7 @@
   int enable_paeth_intra;            // enable Paeth intra mode for sequence
   int enable_cfl_intra;              // enable CFL uv intra mode for sequence
   int enable_superres;
+  int enable_overlay;  // enable overlay for filtered arf frames
   int enable_palette;
   int enable_intrabc;
   int enable_angle_delta;
@@ -140,52 +142,60 @@
   // Bit mask to specify which tier each of the 32 possible operating points
   // conforms to.
   unsigned int tier_mask;
+  // min_cr / 100 is the target minimum compression ratio for each frame.
+  unsigned int min_cr;
   COST_UPDATE_TYPE coeff_cost_upd_freq;
   COST_UPDATE_TYPE mode_cost_upd_freq;
+  COST_UPDATE_TYPE mv_cost_upd_freq;
+  unsigned int ext_tile_debug;
+  unsigned int sb_multipass_unit_test;
 };
 
 static struct av1_extracfg default_extra_cfg = {
-  0,                       // cpu_used
-  1,                       // enable_auto_alt_ref
-  0,                       // enable_auto_bwd_ref
-  0,                       // noise_sensitivity
-  CONFIG_SHARP_SETTINGS,   // sharpness
-  0,                       // static_thresh
-  1,                       // row_mt
-  0,                       // tile_columns
-  0,                       // tile_rows
-  0,                       // enable_tpl_model
-  7,                       // arnr_max_frames
-  5,                       // arnr_strength
-  0,                       // min_gf_interval; 0 -> default decision
-  0,                       // max_gf_interval; 0 -> default decision
-  4,                       // gf_max_pyr_height
-  AOM_TUNE_PSNR,           // tuning
-  10,                      // cq_level
-  0,                       // rc_max_intra_bitrate_pct
-  0,                       // rc_max_inter_bitrate_pct
-  0,                       // gf_cbr_boost_pct
-  0,                       // lossless
-  !CONFIG_SHARP_SETTINGS,  // enable_cdef
-  1,                       // enable_restoration
-  1,                       // enable_obmc
-  0,                       // disable_trellis_quant
-  0,                       // enable_qm
-  DEFAULT_QM_Y,            // qm_y
-  DEFAULT_QM_U,            // qm_u
-  DEFAULT_QM_V,            // qm_v
-  DEFAULT_QM_FIRST,        // qm_min
-  DEFAULT_QM_LAST,         // qm_max
-#if CONFIG_DIST_8X8
-  0,
-#endif
-  1,                            // max number of tile groups
-  0,                            // mtu_size
+  0,              // cpu_used
+  1,              // enable_auto_alt_ref
+  0,              // enable_auto_bwd_ref
+  0,              // noise_sensitivity
+  0,              // sharpness
+  0,              // static_thresh
+  1,              // row_mt
+  0,              // tile_columns
+  0,              // tile_rows
+  1,              // enable_tpl_model
+  1,              // enable_keyframe_filtering
+  7,              // arnr_max_frames
+  5,              // arnr_strength
+  0,              // min_gf_interval; 0 -> default decision
+  0,              // max_gf_interval; 0 -> default decision
+  0,              // gf_min_pyr_height
+  5,              // gf_max_pyr_height
+  AOM_TUNE_PSNR,  // tuning
+  "/usr/local/share/model/vmaf_v0.6.1.pkl",  // VMAF model path
+  10,                                        // cq_level
+  0,                                         // rc_max_intra_bitrate_pct
+  0,                                         // rc_max_inter_bitrate_pct
+  0,                                         // gf_cbr_boost_pct
+  0,                                         // lossless
+  1,                                         // enable_cdef
+  1,                                         // enable_restoration
+  0,                                         // force_video_mode
+  1,                                         // enable_obmc
+  3,                                         // disable_trellis_quant
+  0,                                         // enable_qm
+  DEFAULT_QM_Y,                              // qm_y
+  DEFAULT_QM_U,                              // qm_u
+  DEFAULT_QM_V,                              // qm_v
+  DEFAULT_QM_FIRST,                          // qm_min
+  DEFAULT_QM_LAST,                           // qm_max
+  1,                                         // max number of tile groups
+  0,                                         // mtu_size
   AOM_TIMING_UNSPECIFIED,       // No picture timing signaling in bitstream
   0,                            // frame_parallel_decoding_mode
   1,                            // enable dual filter
+  0,                            // enable delta quant in chroma planes
   NO_AQ,                        // aq_mode
-  NO_DELTA_Q,                   // deltaq_mode
+  DELTA_Q_OBJECTIVE,            // deltaq_mode
+  0,                            // delta lf mode
   0,                            // frame_periodic_delta_q
   AOM_BITS_8,                   // Bit depth
   AOM_CONTENT_DEFAULT,          // content
@@ -212,7 +222,6 @@
   1,                            // enable intra edge filter
   1,                            // frame order hint
   1,                            // enable 64-pt transform usage
-  0,                            // transform block size search method
   1,                            // enable flip and identity transform
   1,                            // dist-wtd compound
   7,                            // max_reference_frames
@@ -234,6 +243,7 @@
   1,                            // enable Paeth intra mode usage for sequence
   1,                            // enable CFL uv intra mode usage for sequence
   1,                            // superres
+  1,                            // enable overlay
   1,                            // enable palette
   !CONFIG_SHARP_SETTINGS,       // enable intrabc
   1,                            // enable angle delta
@@ -249,18 +259,30 @@
   0,  // use_intra_default_tx_only
   0,  // quant_b_adapt
   {
-      31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-      31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
   },            // target_seq_level_idx
   0,            // tier_mask
+  0,            // min_cr
   COST_UPD_SB,  // coeff_cost_upd_freq
   COST_UPD_SB,  // mode_cost_upd_freq
+  COST_UPD_SB,  // mv_cost_upd_freq
+  0,            // ext_tile_debug
+  0,            // sb_multipass_unit_test
 };
 
 struct aom_codec_alg_priv {
   aom_codec_priv_t base;
   aom_codec_enc_cfg_t cfg;
   struct av1_extracfg extra_cfg;
+  aom_rational64_t timestamp_ratio;
+  aom_codec_pts_t pts_offset;
+  unsigned char pts_offset_initialized;
   AV1EncoderConfig oxcf;
   AV1_COMP *cpi;
   unsigned char *cx_data;
@@ -271,13 +293,37 @@
   size_t pending_frame_sizes[8];
   aom_image_t preview_img;
   aom_enc_frame_flags_t next_frame_flags;
-  aom_postproc_cfg_t preview_ppcfg;
   aom_codec_pkt_list_decl(256) pkt_list;
   unsigned int fixed_kf_cntr;
   // BufferPool that holds all reference frames.
   BufferPool *buffer_pool;
+
+  // lookahead instance variables
+  BufferPool *buffer_pool_lap;
+  AV1_COMP *cpi_lap;
+  FIRSTPASS_STATS *frame_stats_buffer;
+  // Number of stats buffers required for look ahead
+  int num_lap_buffers;
+  STATS_BUFFER_CTX stats_buf_context;
 };
 
+static INLINE int gcd(int64_t a, int b) {
+  int remainder;  // remainder
+  while (b > 0) {
+    remainder = (int)(a % b);
+    a = b;
+    b = remainder;
+  }
+
+  return (int)a;
+}
+
+static INLINE void reduce_ratio(aom_rational64_t *ratio) {
+  const int denom = gcd(ratio->num, ratio->den);
+  ratio->num /= denom;
+  ratio->den /= denom;
+}
+
 static aom_codec_err_t update_error_state(
     aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
   const aom_codec_err_t res = error->error_code;
@@ -324,11 +370,11 @@
   RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
   RANGE_CHECK_BOOL(extra_cfg, lossless);
   RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1);
-  RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1);
+  RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1);
+  RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1);
   RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
   RANGE_CHECK_HI(cfg, g_usage, 1);
   RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
-  RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
   RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
   RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
   RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
@@ -336,13 +382,24 @@
   RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
   RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
   RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
+  if (cfg->g_pass == AOM_RC_ONE_PASS) {
+    RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_TOTAL_BUFFERS);
+  } else {
+    RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+  }
   RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
   RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
   if (extra_cfg->max_gf_interval > 0) {
-    RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval),
-                (MAX_LAG_BUFFERS - 1));
+    RANGE_CHECK(extra_cfg, max_gf_interval,
+                AOMMAX(2, extra_cfg->min_gf_interval), (MAX_LAG_BUFFERS - 1));
   }
-  RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 4);
+  RANGE_CHECK_HI(extra_cfg, gf_min_pyr_height, 5);
+  RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 5);
+  if (extra_cfg->gf_min_pyr_height > extra_cfg->gf_max_pyr_height) {
+    ERROR(
+        "gf_min_pyr_height must be less than or equal to "
+        "gf_max_pyramid_height");
+  }
 
   RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1);
   RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR,
@@ -367,6 +424,8 @@
         "or kf_max_dist instead.");
 
   RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2);
+  RANGE_CHECK_HI(extra_cfg, sb_multipass_unit_test, 1);
+  RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1);
   RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1);
   RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
   RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
@@ -396,10 +455,6 @@
   RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
   RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
 
-  // TODO(yaowu): remove this when ssim tuning is implemented for av1
-  if (extra_cfg->tuning == AOM_TUNE_SSIM)
-    ERROR("Option --tune=ssim is not currently supported in AV1.");
-
   if (cfg->g_pass == AOM_RC_LAST_PASS) {
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
@@ -430,6 +485,22 @@
     ERROR("Source bit-depth 12 not supported in profile < 2");
   }
 
+  if (cfg->rc_end_usage == AOM_Q) {
+    RANGE_CHECK_HI(cfg, use_fixed_qp_offsets, 1);
+    for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
+      RANGE_CHECK_HI(cfg, fixed_qp_offsets[i], 63);
+    }
+  } else {
+    if (cfg->use_fixed_qp_offsets > 0) {
+      ERROR("--use_fixed_qp_offsets can only be used with --end-usage=q");
+    }
+    for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
+      if (cfg->fixed_qp_offsets[i] >= 0) {
+        ERROR("--fixed_qp_offsets can only be used with --end-usage=q");
+      }
+    }
+  }
+
   RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709,
               AOM_CICP_CP_EBU_3213);  // Need to check range more precisely to
                                       // check for reserved values?
@@ -439,8 +510,18 @@
               AOM_CICP_MC_ICTCP);
   RANGE_CHECK(extra_cfg, color_range, 0, 1);
 
-#if CONFIG_DIST_8X8
-  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_DAALA_DIST);
+#if !CONFIG_TUNE_VMAF
+  if (extra_cfg->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
+      extra_cfg->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+      extra_cfg->tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+    ERROR(
+        "This error may be related to the wrong configuration options: try to "
+        "set -DCONFIG_TUNE_VMAF=1 at the time CMake is run.");
+  }
+#endif
+
+#if CONFIG_TUNE_VMAF
+  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_VMAF_MAX_GAIN);
 #else
   RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM);
 #endif
@@ -453,10 +534,13 @@
   if (extra_cfg->lossless) {
     if (extra_cfg->aq_mode != 0)
       ERROR("Only --aq_mode=0 can be used with --lossless=1.");
-#if CONFIG_DIST_8X8
-    if (extra_cfg->enable_dist_8x8)
-      ERROR("dist-8x8 cannot be used with lossless compression.");
-#endif
+    if (extra_cfg->enable_chroma_deltaq)
+      ERROR("Only --enable_chroma_deltaq=0 can be used with --lossless=1.");
+  }
+
+  if (cfg->rc_resize_mode != RESIZE_NONE &&
+      extra_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
+    ERROR("--aq_mode=3 is only supported for --resize-mode=0.");
   }
 
   RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7);
@@ -467,16 +551,17 @@
   RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3);
   RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2);
   RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2);
+  RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3);
 
   RANGE_CHECK(extra_cfg, min_partition_size, 4, 128);
   RANGE_CHECK(extra_cfg, max_partition_size, 4, 128);
   RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size);
 
-  RANGE_CHECK(extra_cfg, tx_size_search_method, 0, 2);
-
   for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    if (!is_valid_seq_level_idx(extra_cfg->target_seq_level_idx[i]))
+    const int level_idx = extra_cfg->target_seq_level_idx[i];
+    if (!is_valid_seq_level_idx(level_idx) && level_idx != SEQ_LEVELS) {
       ERROR("Target sequence level index is invalid");
+    }
   }
 
   return AOM_CODEC_OK;
@@ -512,6 +597,10 @@
   if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h)
     ERROR("Image size must match encoder init configuration size");
 
+  if (img->fmt != AOM_IMG_FMT_I420 && !ctx->extra_cfg.enable_tx64) {
+    ERROR("TX64 can only be disabled on I420 images.");
+  }
+
   return AOM_CODEC_OK;
 }
 
@@ -539,14 +628,78 @@
   oxcf->superres_kf_qthresh = 255;
 }
 
-static aom_codec_err_t set_encoder_config(
-    AV1EncoderConfig *oxcf, const aom_codec_enc_cfg_t *cfg,
-    const struct av1_extracfg *extra_cfg) {
+static void update_default_encoder_config(const cfg_options_t *cfg,
+                                          struct av1_extracfg *extra_cfg) {
+  extra_cfg->enable_cdef = (cfg->disable_cdef == 0);
+  extra_cfg->enable_restoration = (cfg->disable_lr == 0);
+  extra_cfg->superblock_size = (cfg->super_block_size == 64)
+                                   ? AOM_SUPERBLOCK_SIZE_64X64
+                                   : (cfg->super_block_size == 128)
+                                         ? AOM_SUPERBLOCK_SIZE_128X128
+                                         : AOM_SUPERBLOCK_SIZE_DYNAMIC;
+  extra_cfg->enable_warped_motion = (cfg->disable_warp_motion == 0);
+  extra_cfg->enable_dist_wtd_comp = (cfg->disable_dist_wtd_comp == 0);
+  extra_cfg->enable_diff_wtd_comp = (cfg->disable_diff_wtd_comp == 0);
+  extra_cfg->enable_dual_filter = (cfg->disable_dual_filter == 0);
+  extra_cfg->enable_angle_delta = (cfg->disable_intra_angle_delta == 0);
+  extra_cfg->enable_rect_partitions = (cfg->disable_rect_partition_type == 0);
+  extra_cfg->enable_ab_partitions = (cfg->disable_ab_partition_type == 0);
+  extra_cfg->enable_1to4_partitions = (cfg->disable_1to4_partition_type == 0);
+  extra_cfg->max_partition_size = cfg->max_partition_size;
+  extra_cfg->min_partition_size = cfg->min_partition_size;
+  extra_cfg->enable_intra_edge_filter = (cfg->disable_intra_edge_filter == 0);
+  extra_cfg->enable_tx64 = (cfg->disable_tx_64x64 == 0);
+  extra_cfg->enable_flip_idtx = (cfg->disable_flip_idtx == 0);
+  extra_cfg->enable_masked_comp = (cfg->disable_masked_comp == 0);
+  extra_cfg->enable_interintra_comp = (cfg->disable_inter_intra_comp == 0);
+  extra_cfg->enable_smooth_interintra = (cfg->disable_smooth_inter_intra == 0);
+  extra_cfg->enable_interinter_wedge = (cfg->disable_inter_inter_wedge == 0);
+  extra_cfg->enable_interintra_wedge = (cfg->disable_inter_intra_wedge == 0);
+  extra_cfg->enable_global_motion = (cfg->disable_global_motion == 0);
+  extra_cfg->enable_filter_intra = (cfg->disable_filter_intra == 0);
+  extra_cfg->enable_smooth_intra = (cfg->disable_smooth_intra == 0);
+  extra_cfg->enable_paeth_intra = (cfg->disable_paeth_intra == 0);
+  extra_cfg->enable_cfl_intra = (cfg->disable_cfl == 0);
+  extra_cfg->enable_obmc = (cfg->disable_obmc == 0);
+  extra_cfg->enable_palette = (cfg->disable_palette == 0);
+  extra_cfg->enable_intrabc = (cfg->disable_intrabc == 0);
+  extra_cfg->disable_trellis_quant = cfg->disable_trellis_quant;
+  extra_cfg->allow_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0);
+  extra_cfg->enable_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0);
+  extra_cfg->enable_onesided_comp = (cfg->disable_one_sided_comp == 0);
+  extra_cfg->enable_reduced_reference_set = cfg->reduced_reference_set;
+  extra_cfg->reduced_tx_type_set = cfg->reduced_tx_type_set;
+}
+
+static double convert_qp_offset(int cq_level, int q_offset, int bit_depth) {
+  const double base_q_val = av1_convert_qindex_to_q(cq_level, bit_depth);
+  const int new_q_index_offset = av1_quantizer_to_qindex(q_offset);
+  const int new_q_index = AOMMAX(cq_level - new_q_index_offset, 0);
+  const double new_q_val = av1_convert_qindex_to_q(new_q_index, bit_depth);
+  return (base_q_val - new_q_val);
+}
+
+static double get_modeled_qp_offset(int cq_level, int level, int bit_depth) {
+  // 80% for keyframe was derived empirically.
+  // 40% similar to rc_pick_q_and_bounds_one_pass_vbr() for Q mode ARF.
+  // Rest derived similar to rc_pick_q_and_bounds_two_pass()
+  static const int percents[FIXED_QP_OFFSET_COUNT] = { 76, 60, 30, 15, 8 };
+  const double q_val = av1_convert_qindex_to_q(cq_level, bit_depth);
+  return q_val * percents[level] / 100;
+}
+
+static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
+                                          const aom_codec_enc_cfg_t *cfg,
+                                          struct av1_extracfg *extra_cfg) {
+  if (cfg->encoder_cfg.init_by_cfg_file) {
+    update_default_encoder_config(&cfg->encoder_cfg, extra_cfg);
+  }
+
   const int is_vbr = cfg->rc_end_usage == AOM_VBR;
   oxcf->profile = cfg->g_profile;
   oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
   oxcf->max_threads = (int)cfg->g_threads;
-  oxcf->mode = (cfg->g_usage == 1) ? REALTIME : GOOD;
+  oxcf->mode = (cfg->g_usage == AOM_USAGE_REALTIME) ? REALTIME : GOOD;
   oxcf->width = cfg->g_w;
   oxcf->height = cfg->g_h;
   oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
@@ -589,7 +742,7 @@
     oxcf->init_framerate = 30;
     oxcf->timing_info_present = 0;
   }
-  oxcf->cfg = &cfg->cfg;
+  oxcf->encoder_cfg = &cfg->encoder_cfg;
 
   switch (cfg->g_pass) {
     case AOM_RC_ONE_PASS: oxcf->pass = 0; break;
@@ -597,8 +750,7 @@
     case AOM_RC_LAST_PASS: oxcf->pass = 2; break;
   }
 
-  oxcf->lag_in_frames =
-      cfg->g_pass == AOM_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames;
+  oxcf->lag_in_frames = clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS);
   oxcf->rc_mode = cfg->rc_end_usage;
 
   // Convert target bandwidth from Kbit/s to Bit/s
@@ -615,12 +767,16 @@
   oxcf->fixed_q = -1;
 
   oxcf->enable_cdef = extra_cfg->enable_cdef;
-  oxcf->enable_restoration = extra_cfg->enable_restoration;
+  oxcf->enable_restoration =
+      (cfg->g_usage == AOM_USAGE_REALTIME) ? 0 : extra_cfg->enable_restoration;
+  oxcf->force_video_mode = extra_cfg->force_video_mode;
   oxcf->enable_obmc = extra_cfg->enable_obmc;
+  oxcf->enable_overlay = extra_cfg->enable_overlay;
   oxcf->enable_palette = extra_cfg->enable_palette;
   oxcf->enable_intrabc = extra_cfg->enable_intrabc;
   oxcf->enable_angle_delta = extra_cfg->enable_angle_delta;
   oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant;
+  oxcf->allow_ref_frame_mvs = extra_cfg->enable_ref_frame_mvs;
   oxcf->using_qm = extra_cfg->enable_qm;
   oxcf->qm_y = extra_cfg->qm_y;
   oxcf->qm_u = extra_cfg->qm_u;
@@ -634,12 +790,7 @@
   oxcf->quant_b_adapt = extra_cfg->quant_b_adapt;
   oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
   oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
-#if CONFIG_DIST_8X8
-  oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8;
-  if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST ||
-      extra_cfg->tuning == AOM_TUNE_DAALA_DIST)
-    oxcf->using_dist_8x8 = 1;
-#endif
+  oxcf->mv_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq;
   oxcf->num_tile_groups = extra_cfg->num_tg;
   // In large-scale tile encoding mode, num_tile_groups is always 1.
   if (cfg->large_scale_tile) oxcf->num_tile_groups = 1;
@@ -683,9 +834,6 @@
     }
   }
 
-  oxcf->enable_tpl_model =
-      extra_cfg->enable_tpl_model && (oxcf->superres_mode == SUPERRES_NONE);
-
   oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
   oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
   oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
@@ -720,15 +868,14 @@
   oxcf->render_width = extra_cfg->render_width;
   oxcf->render_height = extra_cfg->render_height;
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
-  // Adjust g_lag_in_frames down if not needed
-  oxcf->lag_in_frames =
-      AOMMIN(MAX_GF_INTERVAL + oxcf->arnr_max_frames / 2, oxcf->lag_in_frames);
   oxcf->arnr_strength = extra_cfg->arnr_strength;
   oxcf->min_gf_interval = extra_cfg->min_gf_interval;
   oxcf->max_gf_interval = extra_cfg->max_gf_interval;
+  oxcf->gf_min_pyr_height = extra_cfg->gf_min_pyr_height;
   oxcf->gf_max_pyr_height = extra_cfg->gf_max_pyr_height;
 
   oxcf->tuning = extra_cfg->tuning;
+  oxcf->vmaf_model_path = extra_cfg->vmaf_model_path;
   oxcf->content = extra_cfg->content;
   oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
   oxcf->superblock_size = extra_cfg->superblock_size;
@@ -771,7 +918,6 @@
   oxcf->max_partition_size = extra_cfg->max_partition_size;
   oxcf->enable_intra_edge_filter = extra_cfg->enable_intra_edge_filter;
   oxcf->enable_tx64 = extra_cfg->enable_tx64;
-  oxcf->tx_size_search_method = extra_cfg->tx_size_search_method;
   oxcf->enable_flip_idtx = extra_cfg->enable_flip_idtx;
   oxcf->enable_order_hint = extra_cfg->enable_order_hint;
   oxcf->enable_dist_wtd_comp =
@@ -795,7 +941,9 @@
   oxcf->enable_global_motion = extra_cfg->enable_global_motion;
   oxcf->enable_warped_motion = extra_cfg->enable_warped_motion;
   oxcf->allow_warped_motion =
-      extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion;
+      (cfg->g_usage == AOM_USAGE_REALTIME)
+          ? 0
+          : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
   oxcf->enable_filter_intra = extra_cfg->enable_filter_intra;
   oxcf->enable_smooth_intra = extra_cfg->enable_smooth_intra;
   oxcf->enable_paeth_intra = extra_cfg->enable_paeth_intra;
@@ -833,13 +981,22 @@
     oxcf->timing_info_present = 0;
   }
 
+  oxcf->enable_tpl_model = extra_cfg->enable_tpl_model;
+  oxcf->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
+
+  oxcf->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq;
   oxcf->aq_mode = extra_cfg->aq_mode;
   oxcf->deltaq_mode = extra_cfg->deltaq_mode;
 
+  oxcf->deltalf_mode =
+      (oxcf->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode;
+
   oxcf->save_as_annexb = cfg->save_as_annexb;
 
   oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
   oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
+  oxcf->sb_multipass_unit_test = extra_cfg->sb_multipass_unit_test;
+  oxcf->ext_tile_debug = extra_cfg->ext_tile_debug;
 
   oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
   oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
@@ -849,6 +1006,24 @@
   memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx,
          sizeof(oxcf->target_seq_level_idx));
   oxcf->tier_mask = extra_cfg->tier_mask;
+
+  oxcf->use_fixed_qp_offsets =
+      cfg->use_fixed_qp_offsets && (oxcf->rc_mode == AOM_Q);
+  for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
+    if (oxcf->use_fixed_qp_offsets) {
+      if (cfg->fixed_qp_offsets[i] >= 0) {  // user-provided qp offset
+        oxcf->fixed_qp_offsets[i] = convert_qp_offset(
+            oxcf->cq_level, cfg->fixed_qp_offsets[i], oxcf->bit_depth);
+      } else {  // auto-selected qp offset
+        oxcf->fixed_qp_offsets[i] =
+            get_modeled_qp_offset(oxcf->cq_level, i, oxcf->bit_depth);
+      }
+    } else {
+      oxcf->fixed_qp_offsets[i] = -1.0;
+    }
+  }
+
+  oxcf->min_cr = extra_cfg->min_cr;
   return AOM_CODEC_OK;
 }
 
@@ -872,6 +1047,10 @@
   // config.
   if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)
     ERROR("Cannot increase lag_in_frames");
+  // Prevent changing lag_in_frames if Lookahead Processing is enabled
+  if (cfg->g_lag_in_frames != ctx->cfg.g_lag_in_frames &&
+      ctx->num_lap_buffers > 0)
+    ERROR("Cannot change lag_in_frames if LAP is enabled");
 
   res = validate_config(ctx, cfg, &ctx->extra_cfg);
 
@@ -909,7 +1088,7 @@
 }
 
 static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
-                                        const struct av1_extracfg *extra_cfg) {
+                                        struct av1_extracfg *extra_cfg) {
   const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
   if (res == AOM_CODEC_OK) {
     ctx->extra_cfg = *extra_cfg;
@@ -989,6 +1168,14 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_keyframe_filtering(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_keyframe_filtering =
+      CAST(AV1E_SET_ENABLE_KEYFRAME_FILTERING, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1061,6 +1248,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_force_video_mode(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.force_video_mode = CAST(AV1E_SET_FORCE_VIDEO_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1109,14 +1303,7 @@
   extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
-#if CONFIG_DIST_8X8
-static aom_codec_err_t ctrl_set_enable_dist_8x8(aom_codec_alg_priv_t *ctx,
-                                                va_list args) {
-  struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_dist_8x8 = CAST(AV1E_SET_ENABLE_DIST_8X8, args);
-  return update_extra_cfg(ctx, &extra_cfg);
-}
-#endif
+
 static aom_codec_err_t ctrl_set_num_tg(aom_codec_alg_priv_t *ctx,
                                        va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1143,6 +1330,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_chroma_deltaq(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_chroma_deltaq = CAST(AV1E_SET_ENABLE_CHROMA_DELTAQ, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_rect_partitions(
     aom_codec_alg_priv_t *ctx, va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1202,13 +1396,6 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-static aom_codec_err_t ctrl_set_tx_size_search_method(aom_codec_alg_priv_t *ctx,
-                                                      va_list args) {
-  struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.tx_size_search_method = CAST(AV1E_SET_TX_SIZE_SEARCH_METHOD, args);
-  return update_extra_cfg(ctx, &extra_cfg);
-}
-
 static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1361,6 +1548,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_overlay(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_overlay = CAST(AV1E_SET_ENABLE_OVERLAY, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_palette(aom_codec_alg_priv_t *ctx,
                                                va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1468,6 +1662,20 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.mv_cost_upd_freq = CAST(AV1E_SET_MV_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.vmaf_model_path = CAST(AV1E_SET_VMAF_MODEL_PATH, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_film_grain_test_vector(
     aom_codec_alg_priv_t *ctx, va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1483,22 +1691,32 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-#if CONFIG_DENOISE
 static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx,
                                                     va_list args) {
+#if !CONFIG_DENOISE
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.noise_level =
       ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f;
   return update_extra_cfg(ctx, &extra_cfg);
+#endif
 }
 
 static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx,
                                                    va_list args) {
+#if !CONFIG_DENOISE
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
   return update_extra_cfg(ctx, &extra_cfg);
-}
 #endif
+}
 
 static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
@@ -1507,6 +1725,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_deltalf_mode(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.deltalf_mode = CAST(AV1E_SET_DELTALF_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1521,6 +1746,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_gf_min_pyr_height(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.gf_min_pyr_height = CAST(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_gf_max_pyr_height(aom_codec_alg_priv_t *ctx,
                                                   va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1543,6 +1775,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_enable_ext_tile_debug(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.ext_tile_debug = CAST(AV1E_ENABLE_EXT_TILE_DEBUG, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_target_seq_level_idx(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1563,10 +1802,75 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
-                                    aom_codec_priv_enc_mr_cfg_t *data) {
+static aom_codec_err_t ctrl_set_min_cr(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.min_cr = CAST(AV1E_SET_MIN_CR, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_enable_sb_multipass_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.sb_multipass_unit_test =
+      CAST(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer,
+                                           STATS_BUFFER_CTX *stats_buf_context,
+                                           int num_lap_buffers) {
   aom_codec_err_t res = AOM_CODEC_OK;
-  (void)data;
+
+  int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
+  *frame_stats_buffer =
+      (FIRSTPASS_STATS *)aom_calloc(size, sizeof(FIRSTPASS_STATS));
+  if (*frame_stats_buffer == NULL) return AOM_CODEC_MEM_ERROR;
+
+  stats_buf_context->stats_in_start = *frame_stats_buffer;
+  stats_buf_context->stats_in_end = stats_buf_context->stats_in_start;
+  stats_buf_context->stats_in_buf_end =
+      stats_buf_context->stats_in_start + size;
+
+  stats_buf_context->total_left_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS));
+  if (stats_buf_context->total_left_stats == NULL) return AOM_CODEC_MEM_ERROR;
+  av1_twopass_zero_stats(stats_buf_context->total_left_stats);
+  stats_buf_context->total_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS));
+  if (stats_buf_context->total_stats == NULL) return AOM_CODEC_MEM_ERROR;
+  av1_twopass_zero_stats(stats_buf_context->total_stats);
+  return res;
+}
+#endif
+
+static aom_codec_err_t create_context_and_bufferpool(
+    AV1_COMP **p_cpi, BufferPool **p_buffer_pool, AV1EncoderConfig *oxcf,
+    struct aom_codec_pkt_list *pkt_list_head, FIRSTPASS_STATS *frame_stats_buf,
+    COMPRESSOR_STAGE stage, int num_lap_buffers, int lap_lag_in_frames,
+    STATS_BUFFER_CTX *stats_buf_context) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+  if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+  if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) {
+    return AOM_CODEC_MEM_ERROR;
+  }
+#endif
+  *p_cpi = av1_create_compressor(oxcf, *p_buffer_pool, frame_stats_buf, stage,
+                                 num_lap_buffers, lap_lag_in_frames,
+                                 stats_buf_context);
+  if (*p_cpi == NULL)
+    res = AOM_CODEC_MEM_ERROR;
+  else
+    (*p_cpi)->output_pkt_list = pkt_list_head;
+
+  return res;
+}
+
+static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
+  aom_codec_err_t res = AOM_CODEC_OK;
 
   if (ctx->priv == NULL) {
     aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv));
@@ -1574,15 +1878,6 @@
 
     ctx->priv = (aom_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
-    ctx->priv->enc.total_encoders = 1;
-    priv->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
-    if (priv->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
-
-#if CONFIG_MULTITHREAD
-    if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
-      return AOM_CODEC_MEM_ERROR;
-    }
-#endif
 
     if (ctx->config.enc) {
       // Update the reference to the config structure to an internal copy.
@@ -1596,27 +1891,83 @@
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
     if (res == AOM_CODEC_OK) {
+      int *num_lap_buffers = &priv->num_lap_buffers;
+      int lap_lag_in_frames = 0;
+      *num_lap_buffers = 0;
+      priv->timestamp_ratio.den = priv->cfg.g_timebase.den;
+      priv->timestamp_ratio.num =
+          (int64_t)priv->cfg.g_timebase.num * TICKS_PER_SEC;
+      reduce_ratio(&priv->timestamp_ratio);
+
       set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+      if (priv->oxcf.rc_mode == AOM_Q && priv->oxcf.pass == 0 &&
+          priv->oxcf.mode == GOOD) {
+        // Enable look ahead
+        *num_lap_buffers = priv->cfg.g_lag_in_frames;
+        *num_lap_buffers =
+            clamp(*num_lap_buffers, 1,
+                  AOMMIN(MAX_LAP_BUFFERS,
+                         priv->oxcf.key_freq + SCENE_CUT_KEY_TEST_INTERVAL));
+        if ((int)priv->cfg.g_lag_in_frames - (*num_lap_buffers) >=
+            LAP_LAG_IN_FRAMES) {
+          lap_lag_in_frames = LAP_LAG_IN_FRAMES;
+        }
+      }
       priv->oxcf.use_highbitdepth =
           (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
-      priv->cpi = av1_create_compressor(&priv->oxcf, priv->buffer_pool);
-      if (priv->cpi == NULL)
-        res = AOM_CODEC_MEM_ERROR;
-      else
-        priv->cpi->output_pkt_list = &priv->pkt_list.head;
+
+#if !CONFIG_REALTIME_ONLY
+      res = create_stats_buffer(&priv->frame_stats_buffer,
+                                &priv->stats_buf_context, *num_lap_buffers);
+      if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR;
+#endif
+
+      res = create_context_and_bufferpool(
+          &priv->cpi, &priv->buffer_pool, &priv->oxcf, &priv->pkt_list.head,
+          priv->frame_stats_buffer, ENCODE_STAGE, *num_lap_buffers, -1,
+          &priv->stats_buf_context);
+
+      // Create another compressor if look ahead is enabled
+      if (res == AOM_CODEC_OK && *num_lap_buffers) {
+        res = create_context_and_bufferpool(
+            &priv->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf, NULL,
+            priv->frame_stats_buffer, LAP_STAGE, *num_lap_buffers,
+            clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS),
+            &priv->stats_buf_context);
+      }
     }
   }
 
   return res;
 }
 
+static void destroy_context_and_bufferpool(AV1_COMP *cpi,
+                                           BufferPool *buffer_pool) {
+  av1_remove_compressor(cpi);
+#if CONFIG_MULTITHREAD
+  if (buffer_pool) pthread_mutex_destroy(&buffer_pool->pool_mutex);
+#endif
+  aom_free(buffer_pool);
+}
+
+static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context,
+                                 FIRSTPASS_STATS *frame_stats_buffer) {
+  aom_free(stats_buf_context->total_left_stats);
+  aom_free(stats_buf_context->total_stats);
+  aom_free(frame_stats_buffer);
+}
+
 static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
   free(ctx->cx_data);
-  av1_remove_compressor(ctx->cpi);
-#if CONFIG_MULTITHREAD
-  pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
-#endif
-  aom_free(ctx->buffer_pool);
+  destroy_context_and_bufferpool(ctx->cpi, ctx->buffer_pool);
+  if (ctx->cpi_lap) {
+    // As both cpi and cpi_lap have the same lookahead_ctx, it is already freed
+    // when destroy is called on cpi. Thus, setting lookahead_ctx to null here,
+    // so that it doesn't attempt to free it again.
+    ctx->cpi_lap->lookahead = NULL;
+    destroy_context_and_bufferpool(ctx->cpi_lap, ctx->buffer_pool_lap);
+  }
+  destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer);
   aom_free(ctx);
   return AOM_CODEC_OK;
 }
@@ -1635,6 +1986,8 @@
   return flags;
 }
 
+// TODO(Mufaddal): Check feasibility of abstracting functions related to LAP
+// into a separate function.
 static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
                                       const aom_image_t *img,
                                       aom_codec_pts_t pts,
@@ -1643,10 +1996,16 @@
   const size_t kMinCompressedSize = 8192;
   volatile aom_codec_err_t res = AOM_CODEC_OK;
   AV1_COMP *const cpi = ctx->cpi;
-  const aom_rational_t *const timebase = &ctx->cfg.g_timebase;
+  const aom_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
+  volatile aom_codec_pts_t ptsvol = pts;
+  // LAP context
+  AV1_COMP *cpi_lap = ctx->cpi_lap;
 
   if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;
 
+  if (cpi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0)
+    return AOM_CODEC_INVALID_PARAM;
+
   if (img != NULL) {
     res = validate_img(ctx, img);
     // TODO(jzern) the checks related to cpi's validity should be treated as a
@@ -1670,6 +2029,12 @@
     av1_change_config(ctx->cpi, &ctx->oxcf);
   }
 
+  if (!ctx->pts_offset_initialized) {
+    ctx->pts_offset = ptsvol;
+    ctx->pts_offset_initialized = 1;
+  }
+  ptsvol -= ctx->pts_offset;
+
   aom_codec_pkt_list_init(&ctx->pkt_list);
 
   volatile aom_enc_frame_flags_t flags = enc_flags;
@@ -1684,32 +2049,72 @@
     return res;
   }
   cpi->common.error.setjmp = 1;
+  if (cpi_lap != NULL) {
+    if (setjmp(cpi_lap->common.error.jmp)) {
+      cpi_lap->common.error.setjmp = 0;
+      res = update_error_state(ctx, &cpi_lap->common.error);
+      aom_clear_system_state();
+      return res;
+    }
+    cpi_lap->common.error.setjmp = 1;
+  }
 
   // Note(yunqing): While applying encoding flags, always start from enabling
   // all, and then modifying according to the flags. Previous frame's flags are
   // overwritten.
   av1_apply_encoding_flags(cpi, flags);
+  if (cpi_lap != NULL) {
+    av1_apply_encoding_flags(cpi_lap, flags);
+  }
 
   // Handle fixed keyframe intervals
-  if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
-      ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
-    if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
-      flags |= AOM_EFLAG_FORCE_KF;
-      ctx->fixed_kf_cntr = 1;
+  if (is_stat_generation_stage(cpi)) {
+    if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
+        ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+      if (cpi->common.spatial_layer_id == 0 &&
+          ++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+        flags |= AOM_EFLAG_FORCE_KF;
+        ctx->fixed_kf_cntr = 1;
+      }
     }
   }
 
   if (res == AOM_CODEC_OK) {
-    int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
+    int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, ptsvol);
     int64_t dst_end_time_stamp =
-        timebase_units_to_ticks(timebase, pts + duration);
+        timebase_units_to_ticks(timestamp_ratio, ptsvol + duration);
 
     // Set up internal flags
     if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
 
     if (img != NULL) {
       YV12_BUFFER_CONFIG sd;
+      int use_highbitdepth, subsampling_x, subsampling_y;
       res = image2yuvconfig(img, &sd);
+      use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+      subsampling_x = sd.subsampling_x;
+      subsampling_y = sd.subsampling_y;
+
+      if (!cpi->lookahead) {
+        int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.lag_in_frames
+                                            : cpi->oxcf.lag_in_frames;
+
+        cpi->lookahead = av1_lookahead_init(
+            cpi->oxcf.width, cpi->oxcf.height, subsampling_x, subsampling_y,
+            use_highbitdepth, lag_in_frames, cpi->oxcf.border_in_pixels,
+            cpi->common.features.byte_alignment, ctx->num_lap_buffers);
+      }
+      if (!cpi->lookahead)
+        aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate lag buffers");
+
+      av1_check_initial_width(cpi, use_highbitdepth, subsampling_x,
+                              subsampling_y);
+      if (cpi_lap != NULL) {
+        cpi_lap->lookahead = cpi->lookahead;
+        av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x,
+                                subsampling_y);
+      }
 
       // Store the original flags in to the frame buffer. Will extract the
       // key frame flag when we actually encode this frame.
@@ -1746,12 +2151,37 @@
     int is_frame_visible = 0;
     int index_size = 0;
     int has_fwd_keyframe = 0;
+
+    // Call for LAP stage
+    if (cpi_lap != NULL) {
+      int status;
+      aom_rational64_t timestamp_ratio_la = *timestamp_ratio;
+      int64_t dst_time_stamp_la = dst_time_stamp;
+      int64_t dst_end_time_stamp_la = dst_end_time_stamp;
+      status = av1_get_compressed_data(
+          cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la,
+          &dst_end_time_stamp_la, !img, &timestamp_ratio_la);
+      if (status != -1) {
+        if (status != AOM_CODEC_OK) {
+          aom_internal_error(&cpi_lap->common.error, AOM_CODEC_ERROR, NULL);
+        }
+        cpi_lap->seq_params_locked = 1;
+      }
+      lib_flags = 0;
+      frame_size = 0;
+    }
+
     // invisible frames get packed with the next visible frame
     while (cx_data_sz - index_size >= ctx->cx_data_sz / 2 &&
-           !is_frame_visible &&
-           -1 != av1_get_compressed_data(cpi, &lib_flags, &frame_size, cx_data,
-                                         &dst_time_stamp, &dst_end_time_stamp,
-                                         !img, timebase)) {
+           !is_frame_visible) {
+      const int status = av1_get_compressed_data(
+          cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp,
+          &dst_end_time_stamp, !img, timestamp_ratio);
+      if (status == -1) break;
+      if (status != AOM_CODEC_OK) {
+        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+      }
+
       cpi->seq_params_locked = 1;
       if (frame_size) {
         if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
@@ -1772,12 +2202,12 @@
           }
           const uint32_t obu_header_offset = 0;
           obu_header_size = av1_write_obu_header(
-              cpi, OBU_TEMPORAL_DELIMITER, 0,
+              &cpi->level_params, OBU_TEMPORAL_DELIMITER, 0,
               (uint8_t *)(ctx->pending_cx_data + obu_header_offset));
 
           // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
-          if (write_uleb_obu_size(obu_header_size, obu_payload_size,
-                                  ctx->pending_cx_data) != AOM_CODEC_OK) {
+          if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+                                      ctx->pending_cx_data) != AOM_CODEC_OK) {
             aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
           }
 
@@ -1798,7 +2228,7 @@
             const size_t move_offset = length_field_size;
             memmove(cx_data + move_offset, cx_data, frame_size);
           }
-          if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
+          if (av1_write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
               AOM_CODEC_OK) {
             aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
           }
@@ -1832,8 +2262,8 @@
           memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
                   tu_size);
         }
-        if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) !=
-            AOM_CODEC_OK) {
+        if (av1_write_uleb_obu_size(0, (uint32_t)tu_size,
+                                    ctx->pending_cx_data) != AOM_CODEC_OK) {
           aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
         }
         ctx->pending_cx_data_sz += length_field_size;
@@ -1846,7 +2276,9 @@
       pkt.data.frame.partition_id = -1;
       pkt.data.frame.vis_frame_size = frame_size;
 
-      pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp);
+      pkt.data.frame.pts =
+          ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+          ctx->pts_offset;
       pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
       if (has_fwd_keyframe) {
         // If one of the invisible frames in the packet is a keyframe, set
@@ -1854,7 +2286,7 @@
         pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
       }
       pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units(
-          timebase, dst_end_time_stamp - dst_time_stamp);
+          timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
 
       aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
 
@@ -1955,13 +2387,6 @@
   }
 }
 
-static aom_codec_err_t ctrl_set_previewpp(aom_codec_alg_priv_t *ctx,
-                                          va_list args) {
-  (void)ctx;
-  (void)args;
-  return AOM_CODEC_INCAPABLE;
-}
-
 static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) {
   YV12_BUFFER_CONFIG sd;
 
@@ -1977,7 +2402,7 @@
                                           va_list args) {
   const int reference_flag = va_arg(args, int);
 
-  av1_use_as_reference(ctx->cpi, reference_flag);
+  av1_use_as_reference(&ctx->cpi->ext_flags.ref_frame_flags, reference_flag);
   return AOM_CODEC_OK;
 }
 
@@ -2025,9 +2450,9 @@
   aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);
 
   if (mode) {
-    const int res =
-        av1_set_internal_size(ctx->cpi, (AOM_SCALING)mode->h_scaling_mode,
-                              (AOM_SCALING)mode->v_scaling_mode);
+    const int res = av1_set_internal_size(
+        &ctx->cpi->oxcf, &ctx->cpi->resize_pending_params,
+        (AOM_SCALING)mode->h_scaling_mode, (AOM_SCALING)mode->v_scaling_mode);
     return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
   } else {
     return AOM_CODEC_INVALID_PARAM;
@@ -2037,7 +2462,7 @@
 static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
   const int spatial_layer_id = va_arg(args, int);
-  if (spatial_layer_id > MAX_NUM_ENHANCEMENT_LAYERS)
+  if (spatial_layer_id >= MAX_NUM_SPATIAL_LAYERS)
     return AOM_CODEC_INVALID_PARAM;
   ctx->cpi->common.spatial_layer_id = spatial_layer_id;
   return AOM_CODEC_OK;
@@ -2046,12 +2471,70 @@
 static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
                                                       va_list args) {
   const int number_spatial_layers = va_arg(args, int);
-  if (number_spatial_layers > MAX_NUM_ENHANCEMENT_LAYERS)
+  if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
     return AOM_CODEC_INVALID_PARAM;
   ctx->cpi->common.number_spatial_layers = number_spatial_layers;
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  aom_svc_layer_id_t *const data = va_arg(args, aom_svc_layer_id_t *);
+  ctx->cpi->common.spatial_layer_id = data->spatial_layer_id;
+  ctx->cpi->common.temporal_layer_id = data->temporal_layer_id;
+  ctx->cpi->svc.spatial_layer_id = data->spatial_layer_id;
+  ctx->cpi->svc.temporal_layer_id = data->temporal_layer_id;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  AV1_COMP *const cpi = ctx->cpi;
+  aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
+  cpi->common.number_spatial_layers = params->number_spatial_layers;
+  cpi->common.number_temporal_layers = params->number_temporal_layers;
+  cpi->svc.number_spatial_layers = params->number_spatial_layers;
+  cpi->svc.number_temporal_layers = params->number_temporal_layers;
+  if (cpi->common.number_spatial_layers > 1 ||
+      cpi->common.number_temporal_layers > 1) {
+    unsigned int sl, tl;
+    cpi->use_svc = 1;
+    for (sl = 0; sl < cpi->common.number_spatial_layers; ++sl) {
+      for (tl = 0; tl < cpi->common.number_temporal_layers; ++tl) {
+        const int layer =
+            LAYER_IDS_TO_IDX(sl, tl, cpi->common.number_temporal_layers);
+        LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+        lc->max_q = params->max_quantizers[layer];
+        lc->min_q = params->min_quantizers[layer];
+        lc->scaling_factor_num = params->scaling_factor_num[sl];
+        lc->scaling_factor_den = params->scaling_factor_den[sl];
+        lc->layer_target_bitrate = 1000 * params->layer_target_bitrate[layer];
+        lc->framerate_factor = params->framerate_factor[tl];
+      }
+    }
+    if (cpi->common.current_frame.frame_number == 0)
+      av1_init_layer_context(cpi);
+    else
+      av1_update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  AV1_COMP *const cpi = ctx->cpi;
+  aom_svc_ref_frame_config_t *const data =
+      va_arg(args, aom_svc_ref_frame_config_t *);
+  cpi->svc.external_ref_frame_config = 1;
+  for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    cpi->svc.reference[i] = data->reference[i];
+    cpi->svc.ref_idx[i] = data->ref_idx[i];
+  }
+  for (unsigned int i = 0; i < REF_FRAMES; ++i)
+    cpi->svc.refresh[i] = data->refresh[i];
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
                                              va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -2136,8 +2619,10 @@
 static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx,
                                               va_list args) {
   int *const arg = va_arg(args, int *);
+  const AV1_COMP *const cpi = ctx->cpi;
   if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
-  return av1_get_seq_level_idx(ctx->cpi, arg);
+  return av1_get_seq_level_idx(&cpi->common.seq_params, &cpi->level_params,
+                               arg);
 }
 
 static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
@@ -2146,7 +2631,6 @@
 
   // Setters
   { AV1_SET_REFERENCE, ctrl_set_reference },
-  { AOM_SET_POSTPROC, ctrl_set_previewpp },
   { AOME_SET_ROI_MAP, ctrl_set_roi_map },
   { AOME_SET_ACTIVEMAP, ctrl_set_active_map },
   { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
@@ -2160,6 +2644,7 @@
   { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
   { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
   { AV1E_SET_ENABLE_TPL_MODEL, ctrl_set_enable_tpl_model },
+  { AV1E_SET_ENABLE_KEYFRAME_FILTERING, ctrl_set_enable_keyframe_filtering },
   { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
   { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
   { AOME_SET_TUNING, ctrl_set_tuning },
@@ -2171,6 +2656,7 @@
   { AV1E_SET_LOSSLESS, ctrl_set_lossless },
   { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
   { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+  { AV1E_SET_FORCE_VIDEO_MODE, ctrl_set_force_video_mode },
   { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc },
   { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
   { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
@@ -2179,9 +2665,6 @@
   { AV1E_SET_QM_V, ctrl_set_qm_v },
   { AV1E_SET_QM_MIN, ctrl_set_qm_min },
   { AV1E_SET_QM_MAX, ctrl_set_qm_max },
-#if CONFIG_DIST_8X8
-  { AV1E_SET_ENABLE_DIST_8X8, ctrl_set_enable_dist_8x8 },
-#endif
   { AV1E_SET_NUM_TG, ctrl_set_num_tg },
   { AV1E_SET_MTU, ctrl_set_mtu },
   { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type },
@@ -2194,10 +2677,10 @@
   { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size },
   { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size },
   { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter },
+  { AV1E_SET_ENABLE_CHROMA_DELTAQ, ctrl_set_enable_chroma_deltaq },
   { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter },
   { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
   { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
-  { AV1E_SET_TX_SIZE_SEARCH_METHOD, ctrl_set_tx_size_search_method },
   { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx },
   { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
   { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
@@ -2219,6 +2702,7 @@
   { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra },
   { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra },
   { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
+  { AV1E_SET_ENABLE_OVERLAY, ctrl_set_enable_overlay },
   { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette },
   { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc },
   { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta },
@@ -2230,7 +2714,9 @@
   { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt },
   { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq },
   { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq },
+  { AV1E_SET_MV_COST_UPD_FREQ, ctrl_set_mv_cost_upd_freq },
   { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
+  { AV1E_SET_DELTALF_MODE, ctrl_set_deltalf_mode },
   { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
   { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
   { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode },
@@ -2242,19 +2728,25 @@
   { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
   { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
   { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+  { AV1E_SET_GF_MIN_PYRAMID_HEIGHT, ctrl_set_gf_min_pyr_height },
   { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height },
   { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
   { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
   { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
+  { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path },
   { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
   { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
-#if CONFIG_DENOISE
   { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
   { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
-#endif  // CONFIG_FILM_GRAIN
   { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+  { AV1E_ENABLE_EXT_TILE_DEBUG, ctrl_enable_ext_tile_debug },
   { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx },
   { AV1E_SET_TIER_MASK, ctrl_set_tier_mask },
+  { AV1E_SET_MIN_CR, ctrl_set_min_cr },
+  { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id },
+  { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params },
+  { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
+  { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test },
 
   // Getters
   { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
@@ -2269,143 +2761,147 @@
   { -1, NULL },
 };
 
-static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
-  { 0,
-    {
-        // NOLINT
-        0,  // g_usage - non-realtime usage
-        0,  // g_threads
-        0,  // g_profile
+static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
+  {
+      // NOLINT
+      AOM_USAGE_GOOD_QUALITY,  // g_usage - non-realtime usage
+      0,                       // g_threads
+      0,                       // g_profile
 
-        320,         // g_width
-        240,         // g_height
-        0,           // g_limit
-        0,           // g_forced_max_frame_width
-        0,           // g_forced_max_frame_height
-        AOM_BITS_8,  // g_bit_depth
-        8,           // g_input_bit_depth
+      320,         // g_width
+      240,         // g_height
+      0,           // g_limit
+      0,           // g_forced_max_frame_width
+      0,           // g_forced_max_frame_height
+      AOM_BITS_8,  // g_bit_depth
+      8,           // g_input_bit_depth
 
-        { 1, 30 },  // g_timebase
+      { 1, 30 },  // g_timebase
 
-        0,  // g_error_resilient
+      0,  // g_error_resilient
 
-        AOM_RC_ONE_PASS,  // g_pass
+      AOM_RC_ONE_PASS,  // g_pass
 
-        19,  // g_lag_in_frames
+      19,  // g_lag_in_frames
 
-        0,                // rc_dropframe_thresh
-        RESIZE_NONE,      // rc_resize_mode
-        SCALE_NUMERATOR,  // rc_resize_denominator
-        SCALE_NUMERATOR,  // rc_resize_kf_denominator
+      0,                // rc_dropframe_thresh
+      RESIZE_NONE,      // rc_resize_mode
+      SCALE_NUMERATOR,  // rc_resize_denominator
+      SCALE_NUMERATOR,  // rc_resize_kf_denominator
 
-        SUPERRES_NONE,    // rc_superres_mode
-        SCALE_NUMERATOR,  // rc_superres_denominator
-        SCALE_NUMERATOR,  // rc_superres_kf_denominator
-        63,               // rc_superres_qthresh
-        32,               // rc_superres_kf_qthresh
+      SUPERRES_NONE,    // rc_superres_mode
+      SCALE_NUMERATOR,  // rc_superres_denominator
+      SCALE_NUMERATOR,  // rc_superres_kf_denominator
+      63,               // rc_superres_qthresh
+      32,               // rc_superres_kf_qthresh
 
-        AOM_VBR,      // rc_end_usage
-        { NULL, 0 },  // rc_twopass_stats_in
-        { NULL, 0 },  // rc_firstpass_mb_stats_in
-        256,          // rc_target_bandwidth
-        0,            // rc_min_quantizer
-        63,           // rc_max_quantizer
-        25,           // rc_undershoot_pct
-        25,           // rc_overshoot_pct
+      AOM_VBR,      // rc_end_usage
+      { NULL, 0 },  // rc_twopass_stats_in
+      { NULL, 0 },  // rc_firstpass_mb_stats_in
+      256,          // rc_target_bandwidth
+      0,            // rc_min_quantizer
+      63,           // rc_max_quantizer
+      25,           // rc_undershoot_pct
+      25,           // rc_overshoot_pct
 
-        6000,  // rc_max_buffer_size
-        4000,  // rc_buffer_initial_size
-        5000,  // rc_buffer_optimal_size
+      6000,  // rc_max_buffer_size
+      4000,  // rc_buffer_initial_size
+      5000,  // rc_buffer_optimal_size
 
-        50,    // rc_two_pass_vbrbias
-        0,     // rc_two_pass_vbrmin_section
-        2000,  // rc_two_pass_vbrmax_section
+      50,    // rc_two_pass_vbrbias
+      0,     // rc_two_pass_vbrmin_section
+      2000,  // rc_two_pass_vbrmax_section
 
-        // keyframing settings (kf)
-        0,            // fwd_kf_enabled
-        AOM_KF_AUTO,  // g_kfmode
-        0,            // kf_min_dist
-        9999,         // kf_max_dist
-        0,            // sframe_dist
-        1,            // sframe_mode
-        0,            // large_scale_tile
-        0,            // monochrome
-        0,            // full_still_picture_hdr
-        0,            // save_as_annexb
-        0,            // tile_width_count
-        0,            // tile_height_count
-        { 0 },        // tile_widths
-        { 0 },        // tile_heights
-        { 1 },        // config file
-    } },
-  { 1,
-    {
-        // NOLINT
-        1,  // g_usage - real-time usage
-        0,  // g_threads
-        0,  // g_profile
+      // keyframing settings (kf)
+      0,                       // fwd_kf_enabled
+      AOM_KF_AUTO,             // g_kfmode
+      0,                       // kf_min_dist
+      9999,                    // kf_max_dist
+      0,                       // sframe_dist
+      1,                       // sframe_mode
+      0,                       // large_scale_tile
+      0,                       // monochrome
+      0,                       // full_still_picture_hdr
+      0,                       // save_as_annexb
+      0,                       // tile_width_count
+      0,                       // tile_height_count
+      { 0 },                   // tile_widths
+      { 0 },                   // tile_heights
+      0,                       // use_fixed_qp_offsets
+      { -1, -1, -1, -1, -1 },  // fixed_qp_offsets
+      { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
+  },
+  {
+      // NOLINT
+      AOM_USAGE_REALTIME,  // g_usage - real-time usage
+      0,                   // g_threads
+      0,                   // g_profile
 
-        320,         // g_width
-        240,         // g_height
-        0,           // g_limit
-        0,           // g_forced_max_frame_width
-        0,           // g_forced_max_frame_height
-        AOM_BITS_8,  // g_bit_depth
-        8,           // g_input_bit_depth
+      320,         // g_width
+      240,         // g_height
+      0,           // g_limit
+      0,           // g_forced_max_frame_width
+      0,           // g_forced_max_frame_height
+      AOM_BITS_8,  // g_bit_depth
+      8,           // g_input_bit_depth
 
-        { 1, 30 },  // g_timebase
+      { 1, 30 },  // g_timebase
 
-        0,  // g_error_resilient
+      0,  // g_error_resilient
 
-        AOM_RC_ONE_PASS,  // g_pass
+      AOM_RC_ONE_PASS,  // g_pass
 
-        1,  // g_lag_in_frames
+      1,  // g_lag_in_frames
 
-        0,                // rc_dropframe_thresh
-        RESIZE_NONE,      // rc_resize_mode
-        SCALE_NUMERATOR,  // rc_resize_denominator
-        SCALE_NUMERATOR,  // rc_resize_kf_denominator
+      0,                // rc_dropframe_thresh
+      RESIZE_NONE,      // rc_resize_mode
+      SCALE_NUMERATOR,  // rc_resize_denominator
+      SCALE_NUMERATOR,  // rc_resize_kf_denominator
 
-        0,                // rc_superres_mode
-        SCALE_NUMERATOR,  // rc_superres_denominator
-        SCALE_NUMERATOR,  // rc_superres_kf_denominator
-        63,               // rc_superres_qthresh
-        32,               // rc_superres_kf_qthresh
+      0,                // rc_superres_mode
+      SCALE_NUMERATOR,  // rc_superres_denominator
+      SCALE_NUMERATOR,  // rc_superres_kf_denominator
+      63,               // rc_superres_qthresh
+      32,               // rc_superres_kf_qthresh
 
-        AOM_CBR,      // rc_end_usage
-        { NULL, 0 },  // rc_twopass_stats_in
-        { NULL, 0 },  // rc_firstpass_mb_stats_in
-        256,          // rc_target_bandwidth
-        0,            // rc_min_quantizer
-        63,           // rc_max_quantizer
-        25,           // rc_undershoot_pct
-        25,           // rc_overshoot_pct
+      AOM_CBR,      // rc_end_usage
+      { NULL, 0 },  // rc_twopass_stats_in
+      { NULL, 0 },  // rc_firstpass_mb_stats_in
+      256,          // rc_target_bandwidth
+      0,            // rc_min_quantizer
+      63,           // rc_max_quantizer
+      25,           // rc_undershoot_pct
+      25,           // rc_overshoot_pct
 
-        6000,  // rc_max_buffer_size
-        4000,  // rc_buffer_initial_size
-        5000,  // rc_buffer_optimal_size
+      6000,  // rc_max_buffer_size
+      4000,  // rc_buffer_initial_size
+      5000,  // rc_buffer_optimal_size
 
-        50,    // rc_two_pass_vbrbias
-        0,     // rc_two_pass_vbrmin_section
-        2000,  // rc_two_pass_vbrmax_section
+      50,    // rc_two_pass_vbrbias
+      0,     // rc_two_pass_vbrmin_section
+      2000,  // rc_two_pass_vbrmax_section
 
-        // keyframing settings (kf)
-        0,            // fwd_kf_enabled
-        AOM_KF_AUTO,  // g_kfmode
-        0,            // kf_min_dist
-        9999,         // kf_max_dist
-        0,            // sframe_dist
-        1,            // sframe_mode
-        0,            // large_scale_tile
-        0,            // monochrome
-        0,            // full_still_picture_hdr
-        0,            // save_as_annexb
-        0,            // tile_width_count
-        0,            // tile_height_count
-        { 0 },        // tile_widths
-        { 0 },        // tile_heights
-        { 1 },        // config file
-    } },
+      // keyframing settings (kf)
+      0,                       // fwd_kf_enabled
+      AOM_KF_AUTO,             // g_kfmode
+      0,                       // kf_min_dist
+      9999,                    // kf_max_dist
+      0,                       // sframe_dist
+      1,                       // sframe_mode
+      0,                       // large_scale_tile
+      0,                       // monochrome
+      0,                       // full_still_picture_hdr
+      0,                       // save_as_annexb
+      0,                       // tile_width_count
+      0,                       // tile_height_count
+      { 0 },                   // tile_widths
+      { 0 },                   // tile_heights
+      0,                       // use_fixed_qp_offsets
+      { -1, -1, -1, -1, -1 },  // fixed_qp_offsets
+      { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
+  },
 };
 
 #ifndef VERSION_STRING
@@ -2429,13 +2925,12 @@
   },
   {
       // NOLINT
-      2,                           // 2 cfg map
-      encoder_usage_cfg_map,       // aom_codec_enc_cfg_map_t
+      2,                           // 2 cfg
+      encoder_usage_cfg,           // aom_codec_enc_cfg_t
       encoder_encode,              // aom_codec_encode_fn_t
       encoder_get_cxdata,          // aom_codec_get_cx_data_fn_t
       encoder_set_config,          // aom_codec_enc_config_set_fn_t
       encoder_get_global_headers,  // aom_codec_get_global_headers_fn_t
-      encoder_get_preview,         // aom_codec_get_preview_frame_fn_t
-      NULL                         // aom_codec_enc_mr_get_mem_loc_fn_t
+      encoder_get_preview          // aom_codec_get_preview_frame_fn_t
   }
 };
diff --git a/libaom/av1/av1_dx_iface.c b/libaom/av1/av1_dx_iface.c
index ca872d7..d821a52 100644
--- a/libaom/av1/av1_dx_iface.c
+++ b/libaom/av1/av1_dx_iface.c
@@ -16,6 +16,7 @@
 #include "config/aom_version.h"
 
 #include "aom/internal/aom_codec_internal.h"
+#include "aom/internal/aom_image_internal.h"
 #include "aom/aomdx.h"
 #include "aom/aom_decoder.h"
 #include "aom_dsp/bitreader_buffer.h"
@@ -38,8 +39,6 @@
   aom_codec_priv_t base;
   aom_codec_dec_cfg_t cfg;
   aom_codec_stream_info_t si;
-  int postproc_cfg_set;
-  aom_postproc_cfg_t postproc_cfg;
   aom_image_t img;
   int img_avail;
   int flushed;
@@ -58,14 +57,11 @@
   int operating_point;
   int output_all_layers;
 
-  // TODO(wtc): This can be simplified. num_frame_workers is always 1, and
-  // next_output_worker_id is always 0. The frame_workers array of size 1 can
-  // be replaced by a single AVxWorker.
-  AVxWorker *frame_workers;
-  int num_frame_workers;
-  int next_output_worker_id;
+  AVxWorker *frame_worker;
 
-  aom_image_t *image_with_grain[MAX_NUM_SPATIAL_LAYERS];
+  aom_image_t image_with_grain;
+  aom_codec_frame_buffer_t grain_image_frame_buffers[MAX_NUM_SPATIAL_LAYERS];
+  size_t num_grain_image_frame_buffers;
   int need_resync;  // wait for key/intra-only frame
   // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
   BufferPool *buffer_pool;
@@ -81,13 +77,10 @@
 #endif
 };
 
-static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
-                                    aom_codec_priv_enc_mr_cfg_t *data) {
+static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) {
   // This function only allocates space for the aom_codec_alg_priv_t
   // structure. More memory may be required at the time the stream
   // information becomes known.
-  (void)data;
-
   if (!ctx->priv) {
     aom_codec_alg_priv_t *const priv =
         (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv));
@@ -98,14 +91,12 @@
     priv->flushed = 0;
 
     // TODO(tdaede): this should not be exposed to the API
-    priv->cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH;
+    priv->cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
     if (ctx->config.dec) {
       priv->cfg = *ctx->config.dec;
       ctx->config.dec = &priv->cfg;
-      // default values
-      priv->cfg.cfg.ext_partition = 1;
     }
-    av1_zero(priv->image_with_grain);
+    priv->num_grain_image_frame_buffers = 0;
     // Turn row_mt on by default.
     priv->row_mt = 1;
 
@@ -121,35 +112,33 @@
 }
 
 static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
-  if (ctx->frame_workers != NULL) {
-    int i;
-    for (i = 0; i < ctx->num_frame_workers; ++i) {
-      AVxWorker *const worker = &ctx->frame_workers[i];
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      aom_get_worker_interface()->end(worker);
-      aom_free(frame_worker_data->pbi->common.tpl_mvs);
-      frame_worker_data->pbi->common.tpl_mvs = NULL;
-      av1_remove_common(&frame_worker_data->pbi->common);
-      av1_free_restoration_buffers(&frame_worker_data->pbi->common);
-      av1_decoder_remove(frame_worker_data->pbi);
-      aom_free(frame_worker_data);
-    }
+  if (ctx->frame_worker != NULL) {
+    AVxWorker *const worker = ctx->frame_worker;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    aom_get_worker_interface()->end(worker);
+    aom_free(frame_worker_data->pbi->common.tpl_mvs);
+    frame_worker_data->pbi->common.tpl_mvs = NULL;
+    av1_remove_common(&frame_worker_data->pbi->common);
+    av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+    av1_decoder_remove(frame_worker_data->pbi);
+    aom_free(frame_worker_data);
 #if CONFIG_MULTITHREAD
     pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
 #endif
   }
 
   if (ctx->buffer_pool) {
+    for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; i++) {
+      ctx->buffer_pool->release_fb_cb(ctx->buffer_pool->cb_priv,
+                                      &ctx->grain_image_frame_buffers[i]);
+    }
     av1_free_ref_frame_buffers(ctx->buffer_pool);
     av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
   }
 
-  aom_free(ctx->frame_workers);
+  aom_free(ctx->frame_worker);
   aom_free(ctx->buffer_pool);
-  for (int i = 0; i < MAX_NUM_SPATIAL_LAYERS; i++) {
-    if (ctx->image_with_grain[i]) aom_img_free(ctx->image_with_grain[i]);
-  }
+  aom_img_free(&ctx->img);
   aom_free(ctx);
   return AOM_CODEC_OK;
 }
@@ -336,6 +325,8 @@
           if (frame_type == KEY_FRAME) {
             found_keyframe = 1;
             break;  // Stop here as no further OBUs will change the outcome.
+          } else if (frame_type == INTRA_ONLY_FRAME) {
+            intra_only_flag = 1;
           }
         }
       }
@@ -379,42 +370,33 @@
 }
 
 static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
-  int i;
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  AV1Decoder *const pbi = frame_worker_data->pbi;
+  AV1_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
 
-  for (i = 0; i < ctx->num_frame_workers; ++i) {
-    AVxWorker *const worker = &ctx->frame_workers[i];
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    AV1_COMMON *const cm = &frame_worker_data->pbi->common;
-    BufferPool *const pool = cm->buffer_pool;
+  cm->cur_frame = NULL;
+  cm->features.byte_alignment = ctx->byte_alignment;
+  pbi->skip_loop_filter = ctx->skip_loop_filter;
+  pbi->skip_film_grain = ctx->skip_film_grain;
 
-    cm->cur_frame = NULL;
-    cm->byte_alignment = ctx->byte_alignment;
-    cm->skip_loop_filter = ctx->skip_loop_filter;
-    cm->skip_film_grain = ctx->skip_film_grain;
+  if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+    pool->get_fb_cb = ctx->get_ext_fb_cb;
+    pool->release_fb_cb = ctx->release_ext_fb_cb;
+    pool->cb_priv = ctx->ext_priv;
+  } else {
+    pool->get_fb_cb = av1_get_frame_buffer;
+    pool->release_fb_cb = av1_release_frame_buffer;
 
-    if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
-      pool->get_fb_cb = ctx->get_ext_fb_cb;
-      pool->release_fb_cb = ctx->release_ext_fb_cb;
-      pool->cb_priv = ctx->ext_priv;
-    } else {
-      pool->get_fb_cb = av1_get_frame_buffer;
-      pool->release_fb_cb = av1_release_frame_buffer;
+    if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to initialize internal frame buffers");
 
-      if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
-        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                           "Failed to initialize internal frame buffers");
-
-      pool->cb_priv = &pool->int_frame_buffers;
-    }
+    pool->cb_priv = &pool->int_frame_buffers;
   }
 }
 
-static void set_default_ppflags(aom_postproc_cfg_t *cfg) {
-  cfg->post_proc_flag = AOM_DEBLOCK | AOM_DEMACROBLOCK;
-  cfg->deblocking_level = 4;
-  cfg->noise_level = 0;
-}
-
 static int frame_worker_hook(void *arg1, void *arg2) {
   FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
   const uint8_t *data = frame_worker_data->data;
@@ -432,15 +414,10 @@
 }
 
 static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
-  int i;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
   ctx->last_show_frame = NULL;
-  ctx->next_output_worker_id = 0;
   ctx->need_resync = 1;
-  ctx->num_frame_workers = 1;
-  if (ctx->num_frame_workers > MAX_DECODE_THREADS)
-    ctx->num_frame_workers = MAX_DECODE_THREADS;
   ctx->flushed = 0;
 
   ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
@@ -453,60 +430,45 @@
   }
 #endif
 
-  ctx->frame_workers = (AVxWorker *)aom_malloc(ctx->num_frame_workers *
-                                               sizeof(*ctx->frame_workers));
-  if (ctx->frame_workers == NULL) {
-    set_error_detail(ctx, "Failed to allocate frame_workers");
+  ctx->frame_worker = (AVxWorker *)aom_malloc(sizeof(*ctx->frame_worker));
+  if (ctx->frame_worker == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_worker");
     return AOM_CODEC_MEM_ERROR;
   }
 
-  for (i = 0; i < ctx->num_frame_workers; ++i) {
-    AVxWorker *const worker = &ctx->frame_workers[i];
-    FrameWorkerData *frame_worker_data = NULL;
-    winterface->init(worker);
-    worker->thread_name = "aom frameworker";
-    worker->data1 = aom_memalign(32, sizeof(FrameWorkerData));
-    if (worker->data1 == NULL) {
-      set_error_detail(ctx, "Failed to allocate frame_worker_data");
-      return AOM_CODEC_MEM_ERROR;
-    }
-    frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool);
-    if (frame_worker_data->pbi == NULL) {
-      set_error_detail(ctx, "Failed to allocate frame_worker_data");
-      return AOM_CODEC_MEM_ERROR;
-    }
-    frame_worker_data->pbi->common.options = &ctx->cfg.cfg;
-    frame_worker_data->worker_id = i;
-    frame_worker_data->frame_context_ready = 0;
-    frame_worker_data->received_frame = 0;
-    frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth;
-
-    // If decoding in serial mode, FrameWorker thread could create tile worker
-    // thread or loopfilter thread.
-    frame_worker_data->pbi->max_threads = ctx->cfg.threads;
-    frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
-    frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode;
-    frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
-    frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
-    frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
-    frame_worker_data->pbi->operating_point = ctx->operating_point;
-    frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
-    frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
-    frame_worker_data->pbi->row_mt = ctx->row_mt;
-
-    worker->hook = frame_worker_hook;
-    // The main thread acts as Frame Worker 0.
-    if (i != 0 && !winterface->reset(worker)) {
-      set_error_detail(ctx, "Frame Worker thread creation failed");
-      return AOM_CODEC_MEM_ERROR;
-    }
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *frame_worker_data = NULL;
+  winterface->init(worker);
+  worker->thread_name = "aom frameworker";
+  worker->data1 = aom_memalign(32, sizeof(FrameWorkerData));
+  if (worker->data1 == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_worker_data");
+    return AOM_CODEC_MEM_ERROR;
   }
+  frame_worker_data = (FrameWorkerData *)worker->data1;
+  frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool);
+  if (frame_worker_data->pbi == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_worker_data");
+    return AOM_CODEC_MEM_ERROR;
+  }
+  frame_worker_data->frame_context_ready = 0;
+  frame_worker_data->received_frame = 0;
+  frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth;
 
-  // If postprocessing was enabled by the application and a
-  // configuration has not been provided, default it.
-  if (!ctx->postproc_cfg_set && (ctx->base.init_flags & AOM_CODEC_USE_POSTPROC))
-    set_default_ppflags(&ctx->postproc_cfg);
+  // If decoding in serial mode, FrameWorker thread could create tile worker
+  // thread or loopfilter thread.
+  frame_worker_data->pbi->max_threads = ctx->cfg.threads;
+  frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+  frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode;
+  frame_worker_data->pbi->is_annexb = ctx->is_annexb;
+  frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+  frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+  frame_worker_data->pbi->operating_point = ctx->operating_point;
+  frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
+  frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+  frame_worker_data->pbi->row_mt = ctx->row_mt;
+
+  worker->hook = frame_worker_hook;
 
   init_buffer_callbacks(ctx);
 
@@ -539,21 +501,21 @@
     if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR;
   }
 
-  AVxWorker *const worker = ctx->frame_workers;
+  AVxWorker *const worker = ctx->frame_worker;
   FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
   frame_worker_data->data = *data;
   frame_worker_data->data_size = data_sz;
   frame_worker_data->user_priv = user_priv;
   frame_worker_data->received_frame = 1;
 
-  frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode;
+  frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode;
   frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
   frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
   frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
   frame_worker_data->pbi->row_mt = ctx->row_mt;
   frame_worker_data->pbi->ext_refs = ctx->ext_refs;
 
-  frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
+  frame_worker_data->pbi->is_annexb = ctx->is_annexb;
 
   worker->had_error = 0;
   winterface->execute(worker);
@@ -576,14 +538,15 @@
                                        void *user_priv) {
   aom_codec_err_t res = AOM_CODEC_OK;
 
+  const uint8_t *const data_end = data + data_sz;
   Av1DecodeReturn *data2 = (Av1DecodeReturn *)user_priv;
 
-  if (ctx->frame_workers == NULL) {
+  if (ctx->frame_worker == NULL) {
     res = init_decoder(ctx);
     if (res != AOM_CODEC_OK) return res;
   }
   FrameWorkerData *const frame_worker_data =
-      (FrameWorkerData *)ctx->frame_workers[0].data1;
+      (FrameWorkerData *)ctx->frame_worker->data1;
   AV1Decoder *const pbi = frame_worker_data->pbi;
   AV1_COMMON *const cm = &pbi->common;
   frame_worker_data->pbi->inspect_cb = ctx->inspect_cb;
@@ -591,9 +554,16 @@
   res = av1_receive_compressed_data(frame_worker_data->pbi, data_sz, &data);
   check_resync(ctx, frame_worker_data->pbi);
 
-  if (ctx->frame_workers->had_error)
+  if (ctx->frame_worker->had_error)
     return update_error_state(ctx, &frame_worker_data->pbi->common.error);
 
+  // Allow extra zero bytes after the frame end
+  while (data < data_end) {
+    const uint8_t marker = data[0];
+    if (marker) break;
+    ++data;
+  }
+
   data2->idx = -1;
   for (int i = 0; i < REF_FRAMES; ++i)
     if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i;
@@ -616,20 +586,24 @@
   // Release any pending output frames from the previous decoder_decode call.
   // We need to do this even if the decoder is being flushed or the input
   // arguments are invalid.
-  if (ctx->frame_workers) {
+  if (ctx->frame_worker) {
     BufferPool *const pool = ctx->buffer_pool;
     lock_buffer_pool(pool);
-    for (int i = 0; i < ctx->num_frame_workers; ++i) {
-      AVxWorker *const worker = &ctx->frame_workers[i];
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      struct AV1Decoder *pbi = frame_worker_data->pbi;
-      for (size_t j = 0; j < pbi->num_output_frames; j++) {
-        decrease_ref_count(pbi->output_frames[j], pool);
-      }
-      pbi->num_output_frames = 0;
+    AVxWorker *const worker = ctx->frame_worker;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    struct AV1Decoder *pbi = frame_worker_data->pbi;
+    for (size_t j = 0; j < pbi->num_output_frames; j++) {
+      decrease_ref_count(pbi->output_frames[j], pool);
     }
-    unlock_buffer_pool(ctx->buffer_pool);
+    pbi->num_output_frames = 0;
+    unlock_buffer_pool(pool);
+    for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) {
+      pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]);
+      ctx->grain_image_frame_buffers[j].data = NULL;
+      ctx->grain_image_frame_buffers[j].size = 0;
+      ctx->grain_image_frame_buffers[j].priv = NULL;
+    }
+    ctx->num_grain_image_frame_buffers = 0;
   }
 
   /* Sanity checks */
@@ -643,8 +617,8 @@
   // Reset flushed when receiving a valid frame.
   ctx->flushed = 0;
 
-  // Initialize the decoder workers on the first frame.
-  if (ctx->frame_workers == NULL) {
+  // Initialize the decoder worker on the first frame.
+  if (ctx->frame_worker == NULL) {
     res = init_decoder(ctx);
     if (res != AOM_CODEC_OK) return res;
   }
@@ -697,45 +671,59 @@
   return res;
 }
 
+typedef struct {
+  BufferPool *pool;
+  aom_codec_frame_buffer_t *fb;
+} AllocCbParam;
+
+static void *AllocWithGetFrameBufferCb(void *priv, size_t size) {
+  AllocCbParam *param = (AllocCbParam *)priv;
+  if (param->pool->get_fb_cb(param->pool->cb_priv, size, param->fb) < 0)
+    return NULL;
+  if (param->fb->data == NULL || param->fb->size < size) return NULL;
+  return param->fb->data;
+}
+
 // If grain_params->apply_grain is false, returns img. Otherwise, adds film
-// grain to img, saves the result in *grain_img_ptr (allocating *grain_img_ptr
-// if necessary), and returns *grain_img_ptr.
-static aom_image_t *add_grain_if_needed(aom_image_t *img,
-                                        aom_image_t **grain_img_ptr,
+// grain to img, saves the result in grain_img, and returns grain_img.
+static aom_image_t *add_grain_if_needed(aom_codec_alg_priv_t *ctx,
+                                        aom_image_t *img,
+                                        aom_image_t *grain_img,
                                         aom_film_grain_t *grain_params) {
   if (!grain_params->apply_grain) return img;
 
-  aom_image_t *grain_img_buf = *grain_img_ptr;
-
   const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
   const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
 
-  if (grain_img_buf) {
-    const int alloc_w = ALIGN_POWER_OF_TWO(grain_img_buf->d_w, 1);
-    const int alloc_h = ALIGN_POWER_OF_TWO(grain_img_buf->d_h, 1);
-    if (w_even != alloc_w || h_even != alloc_h ||
-        img->fmt != grain_img_buf->fmt) {
-      aom_img_free(grain_img_buf);
-      grain_img_buf = NULL;
-      *grain_img_ptr = NULL;
-    }
-  }
-  if (!grain_img_buf) {
-    grain_img_buf = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16);
-    *grain_img_ptr = grain_img_buf;
+  BufferPool *const pool = ctx->buffer_pool;
+  aom_codec_frame_buffer_t *fb =
+      &ctx->grain_image_frame_buffers[ctx->num_grain_image_frame_buffers];
+  AllocCbParam param;
+  param.pool = pool;
+  param.fb = fb;
+  if (!aom_img_alloc_with_cb(grain_img, img->fmt, w_even, h_even, 16,
+                             AllocWithGetFrameBufferCb, &param)) {
+    return NULL;
   }
 
-  if (grain_img_buf) {
-    grain_img_buf->user_priv = img->user_priv;
-    grain_img_buf->fb_priv = img->fb_priv;
-    if (av1_add_film_grain(grain_params, img, grain_img_buf)) {
-      aom_img_free(grain_img_buf);
-      grain_img_buf = NULL;
-      *grain_img_ptr = NULL;
-    }
+  grain_img->user_priv = img->user_priv;
+  grain_img->fb_priv = fb->priv;
+  if (av1_add_film_grain(grain_params, img, grain_img)) {
+    pool->release_fb_cb(pool->cb_priv, fb);
+    return NULL;
   }
 
-  return grain_img_buf;
+  ctx->num_grain_image_frame_buffers++;
+  return grain_img;
+}
+
+// Copies and clears the metadata from AV1Decoder.
+static void move_decoder_metadata_to_img(AV1Decoder *pbi, aom_image_t *img) {
+  if (pbi->metadata && img) {
+    assert(!img->metadata);
+    img->metadata = pbi->metadata;
+    pbi->metadata = NULL;
+  }
 }
 
 static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
@@ -750,103 +738,100 @@
   // simply a pointer to an integer index
   uintptr_t *index = (uintptr_t *)iter;
 
-  if (ctx->frame_workers != NULL) {
-    do {
-      YV12_BUFFER_CONFIG *sd;
-      // NOTE(david.barker): This code does not support multiple worker threads
-      // yet. We should probably move the iteration over threads into *iter
-      // instead of using ctx->next_output_worker_id.
-      const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-      AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      AV1Decoder *const pbi = frame_worker_data->pbi;
-      AV1_COMMON *const cm = &pbi->common;
-      ctx->next_output_worker_id =
-          (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
-      // Wait for the frame from worker thread.
-      if (winterface->sync(worker)) {
-        // Check if worker has received any frames.
-        if (frame_worker_data->received_frame == 1) {
-          frame_worker_data->received_frame = 0;
-          check_resync(ctx, frame_worker_data->pbi);
-        }
-        aom_film_grain_t *grain_params;
-        if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
-                              &grain_params) == 0) {
-          RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
-          ctx->last_show_frame = output_frame_buf;
-          if (ctx->need_resync) return NULL;
-          yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
-
-          if (!pbi->ext_tile_debug && cm->large_scale_tile) {
-            *index += 1;  // Advance the iterator to point to the next image
-
-            yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL);
-            img = &ctx->img;
-            return img;
-          }
-
-          const int num_planes = av1_num_planes(cm);
-          if (pbi->ext_tile_debug && cm->single_tile_decoding &&
-              pbi->dec_tile_row >= 0) {
-            int tile_width, tile_height;
-            av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
-            const int tile_row = AOMMIN(pbi->dec_tile_row, cm->tile_rows - 1);
-            const int mi_row = tile_row * tile_height;
-            const int ssy = ctx->img.y_chroma_shift;
-            int plane;
-            ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
-            if (num_planes > 1) {
-              for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-                ctx->img.planes[plane] +=
-                    mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
-              }
-            }
-            ctx->img.d_h = AOMMIN(tile_height, cm->mi_rows - mi_row) * MI_SIZE;
-          }
-
-          if (pbi->ext_tile_debug && cm->single_tile_decoding &&
-              pbi->dec_tile_col >= 0) {
-            int tile_width, tile_height;
-            av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
-            const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
-            const int mi_col = tile_col * tile_width;
-            const int ssx = ctx->img.x_chroma_shift;
-            const int is_hbd =
-                (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
-            int plane;
-            ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
-            if (num_planes > 1) {
-              for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-                ctx->img.planes[plane] +=
-                    mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
-              }
-            }
-            ctx->img.d_w = AOMMIN(tile_width, cm->mi_cols - mi_col) * MI_SIZE;
-          }
-
-          ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
-          img = &ctx->img;
-          img->temporal_id = cm->temporal_layer_id;
-          img->spatial_id = cm->spatial_layer_id;
-          if (cm->skip_film_grain) grain_params->apply_grain = 0;
-          aom_image_t *res = add_grain_if_needed(
-              img, &ctx->image_with_grain[*index], grain_params);
-          if (!res) {
-            aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
-                               "Grain systhesis failed\n");
-          }
-          *index += 1;  // Advance the iterator to point to the next image
-          return res;
-        }
-      } else {
-        // Decoding failed. Release the worker thread.
+  if (ctx->frame_worker != NULL) {
+    const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+    AVxWorker *const worker = ctx->frame_worker;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    AV1Decoder *const pbi = frame_worker_data->pbi;
+    AV1_COMMON *const cm = &pbi->common;
+    CommonTileParams *const tiles = &cm->tiles;
+    // Wait for the frame from worker thread.
+    if (winterface->sync(worker)) {
+      // Check if worker has received any frames.
+      if (frame_worker_data->received_frame == 1) {
         frame_worker_data->received_frame = 0;
-        ctx->need_resync = 1;
-        if (ctx->flushed != 1) return NULL;
+        check_resync(ctx, frame_worker_data->pbi);
       }
-    } while (ctx->next_output_worker_id != 0);
+      YV12_BUFFER_CONFIG *sd;
+      aom_film_grain_t *grain_params;
+      if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
+                            &grain_params) == 0) {
+        RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
+        ctx->last_show_frame = output_frame_buf;
+        if (ctx->need_resync) return NULL;
+        aom_img_remove_metadata(&ctx->img);
+        yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
+        move_decoder_metadata_to_img(pbi, &ctx->img);
+
+        if (!pbi->ext_tile_debug && tiles->large_scale) {
+          *index += 1;  // Advance the iterator to point to the next image
+          aom_img_remove_metadata(&ctx->img);
+          yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL);
+          move_decoder_metadata_to_img(pbi, &ctx->img);
+          img = &ctx->img;
+          return img;
+        }
+
+        const int num_planes = av1_num_planes(cm);
+        if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
+            pbi->dec_tile_row >= 0) {
+          int tile_width, tile_height;
+          av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+          const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1);
+          const int mi_row = tile_row * tile_height;
+          const int ssy = ctx->img.y_chroma_shift;
+          int plane;
+          ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
+          if (num_planes > 1) {
+            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+              ctx->img.planes[plane] +=
+                  mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+            }
+          }
+          ctx->img.d_h =
+              AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE;
+        }
+
+        if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
+            pbi->dec_tile_col >= 0) {
+          int tile_width, tile_height;
+          av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+          const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1);
+          const int mi_col = tile_col * tile_width;
+          const int ssx = ctx->img.x_chroma_shift;
+          const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+          int plane;
+          ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
+          if (num_planes > 1) {
+            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+              ctx->img.planes[plane] +=
+                  mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
+            }
+          }
+          ctx->img.d_w =
+              AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE;
+        }
+
+        ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
+        img = &ctx->img;
+        img->temporal_id = cm->temporal_layer_id;
+        img->spatial_id = cm->spatial_layer_id;
+        if (pbi->skip_film_grain) grain_params->apply_grain = 0;
+        aom_image_t *res =
+            add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
+        if (!res) {
+          aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+                             "Grain systhesis failed\n");
+        }
+        *index += 1;  // Advance the iterator to point to the next image
+        return res;
+      }
+    } else {
+      // Decoding failed. Release the worker thread.
+      frame_worker_data->received_frame = 0;
+      ctx->need_resync = 1;
+      if (ctx->flushed != 1) return NULL;
+    }
   }
   return NULL;
 }
@@ -856,7 +841,7 @@
     aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
   if (cb_get == NULL || cb_release == NULL) {
     return AOM_CODEC_INVALID_PARAM;
-  } else if (ctx->frame_workers == NULL) {
+  } else if (ctx->frame_worker == NULL) {
     // If the decoder has already been initialized, do not accept changes to
     // the frame buffer functions.
     ctx->get_ext_fb_cb = cb_get;
@@ -875,7 +860,7 @@
   if (data) {
     av1_ref_frame_t *const frame = data;
     YV12_BUFFER_CONFIG sd;
-    AVxWorker *const worker = ctx->frame_workers;
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
     return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx,
@@ -890,7 +875,7 @@
   const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
   if (frame) {
     YV12_BUFFER_CONFIG sd;
-    AVxWorker *const worker = ctx->frame_workers;
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
     return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd);
@@ -904,7 +889,7 @@
   av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *);
   if (data) {
     YV12_BUFFER_CONFIG *fb;
-    AVxWorker *const worker = ctx->frame_workers;
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
     if (fb == NULL) return AOM_CODEC_ERROR;
@@ -920,7 +905,7 @@
   aom_image_t *new_img = va_arg(args, aom_image_t *);
   if (new_img) {
     YV12_BUFFER_CONFIG new_frame;
-    AVxWorker *const worker = ctx->frame_workers;
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
 
     if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
@@ -939,7 +924,7 @@
   aom_image_t *img = va_arg(args, aom_image_t *);
   if (img) {
     YV12_BUFFER_CONFIG new_frame;
-    AVxWorker *const worker = ctx->frame_workers;
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
 
     if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
@@ -955,27 +940,13 @@
   }
 }
 
-static aom_codec_err_t ctrl_set_postproc(aom_codec_alg_priv_t *ctx,
-                                         va_list args) {
-  (void)ctx;
-  (void)args;
-  return AOM_CODEC_INCAPABLE;
-}
-
-static aom_codec_err_t ctrl_set_dbg_options(aom_codec_alg_priv_t *ctx,
-                                            va_list args) {
-  (void)ctx;
-  (void)args;
-  return AOM_CODEC_INCAPABLE;
-}
-
 static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
   int *const update_info = va_arg(args, int *);
 
   if (update_info) {
-    if (ctx->frame_workers) {
-      AVxWorker *const worker = ctx->frame_workers;
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       *update_info =
@@ -993,8 +964,8 @@
                                                va_list args) {
   int *const arg = va_arg(args, int *);
   if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
-  *arg =
-      ((FrameWorkerData *)ctx->frame_workers[0].data1)->pbi->common.base_qindex;
+  *arg = ((FrameWorkerData *)ctx->frame_worker->data1)
+             ->pbi->common.quant_params.base_qindex;
   return AOM_CODEC_OK;
 }
 
@@ -1003,8 +974,8 @@
   int *corrupted = va_arg(args, int *);
 
   if (corrupted) {
-    if (ctx->frame_workers) {
-      AVxWorker *const worker = ctx->frame_workers;
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       AV1Decoder *const pbi = frame_worker_data->pbi;
@@ -1026,8 +997,8 @@
   int *const frame_size = va_arg(args, int *);
 
   if (frame_size) {
-    if (ctx->frame_workers) {
-      AVxWorker *const worker = ctx->frame_workers;
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
@@ -1047,8 +1018,8 @@
   aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *);
 
   if (frame_header_info) {
-    if (ctx->frame_workers) {
-      AVxWorker *const worker = ctx->frame_workers;
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1Decoder *pbi = frame_worker_data->pbi;
@@ -1068,8 +1039,8 @@
   aom_tile_data *const tile_data = va_arg(args, aom_tile_data *);
 
   if (tile_data) {
-    if (ctx->frame_workers) {
-      AVxWorker *const worker = ctx->frame_workers;
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1Decoder *pbi = frame_worker_data->pbi;
@@ -1107,8 +1078,8 @@
   int *const render_size = va_arg(args, int *);
 
   if (render_size) {
-    if (ctx->frame_workers) {
-      AVxWorker *const worker = ctx->frame_workers;
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
@@ -1126,7 +1097,7 @@
 static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   unsigned int *const bit_depth = va_arg(args, unsigned int *);
-  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  AVxWorker *const worker = ctx->frame_worker;
 
   if (bit_depth) {
     if (worker) {
@@ -1161,7 +1132,7 @@
 static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
   aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *);
-  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  AVxWorker *const worker = ctx->frame_worker;
 
   if (img_fmt) {
     if (worker) {
@@ -1184,7 +1155,7 @@
 static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   unsigned int *const tile_size = va_arg(args, unsigned int *);
-  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  AVxWorker *const worker = ctx->frame_worker;
 
   if (tile_size) {
     if (worker) {
@@ -1207,7 +1178,7 @@
   unsigned int *const tile_count = va_arg(args, unsigned int *);
 
   if (tile_count) {
-    AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+    AVxWorker *const worker = ctx->frame_worker;
     if (worker) {
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
@@ -1240,10 +1211,10 @@
     return AOM_CODEC_INVALID_PARAM;
 
   ctx->byte_alignment = byte_alignment;
-  if (ctx->frame_workers) {
-    AVxWorker *const worker = ctx->frame_workers;
+  if (ctx->frame_worker) {
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+    frame_worker_data->pbi->common.features.byte_alignment = byte_alignment;
   }
   return AOM_CODEC_OK;
 }
@@ -1252,10 +1223,10 @@
                                                  va_list args) {
   ctx->skip_loop_filter = va_arg(args, int);
 
-  if (ctx->frame_workers) {
-    AVxWorker *const worker = ctx->frame_workers;
+  if (ctx->frame_worker) {
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
+    frame_worker_data->pbi->skip_loop_filter = ctx->skip_loop_filter;
   }
 
   return AOM_CODEC_OK;
@@ -1265,10 +1236,10 @@
                                                 va_list args) {
   ctx->skip_film_grain = va_arg(args, int);
 
-  if (ctx->frame_workers) {
-    AVxWorker *const worker = ctx->frame_workers;
+  if (ctx->frame_worker) {
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->pbi->common.skip_film_grain = ctx->skip_film_grain;
+    frame_worker_data->pbi->skip_film_grain = ctx->skip_film_grain;
   }
 
   return AOM_CODEC_OK;
@@ -1281,8 +1252,8 @@
   (void)args;
   return AOM_CODEC_INCAPABLE;
 #else
-  if (ctx->frame_workers) {
-    AVxWorker *const worker = ctx->frame_workers;
+  if (ctx->frame_worker) {
+    AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     AV1Decoder *pbi = frame_worker_data->pbi;
     Accounting **acct = va_arg(args, Accounting **);
@@ -1359,11 +1330,6 @@
 
   // Setters
   { AV1_SET_REFERENCE, ctrl_set_reference },
-  { AOM_SET_POSTPROC, ctrl_set_postproc },
-  { AOM_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options },
-  { AOM_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options },
-  { AOM_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options },
-  { AOM_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options },
   { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order },
   { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
   { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
@@ -1421,12 +1387,11 @@
   {
       // NOLINT
       0,
-      NULL,  // aom_codec_enc_cfg_map_t
+      NULL,  // aom_codec_enc_cfg_t
       NULL,  // aom_codec_encode_fn_t
       NULL,  // aom_codec_get_cx_data_fn_t
       NULL,  // aom_codec_enc_config_set_fn_t
       NULL,  // aom_codec_get_global_headers_fn_t
-      NULL,  // aom_codec_get_preview_frame_fn_t
-      NULL   // aom_codec_enc_mr_get_mem_loc_fn_t
+      NULL   // aom_codec_get_preview_frame_fn_t
   }
 };
diff --git a/libaom/av1/av1_iface_common.h b/libaom/av1/av1_iface_common.h
index 5568c89..9b5ffcb 100644
--- a/libaom/av1/av1_iface_common.h
+++ b/libaom/av1/av1_iface_common.h
@@ -11,6 +11,8 @@
 #ifndef AOM_AV1_AV1_IFACE_COMMON_H_
 #define AOM_AV1_AV1_IFACE_COMMON_H_
 
+#include <assert.h>
+
 #include "aom_ports/mem.h"
 #include "aom_scale/yv12config.h"
 
@@ -56,6 +58,7 @@
   img->stride[AOM_PLANE_U] = yv12->uv_stride;
   img->stride[AOM_PLANE_V] = yv12->uv_stride;
   if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    bps *= 2;
     // aom_image_t uses byte strides and a pointer to the first byte
     // of the image.
     img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
@@ -73,6 +76,8 @@
   img->img_data_owner = 0;
   img->self_allocd = 0;
   img->sz = yv12->frame_size;
+  assert(!yv12->metadata);
+  img->metadata = NULL;
 }
 
 static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
@@ -132,6 +137,7 @@
   yv12->border = (border < 0) ? 0 : border;
   yv12->subsampling_x = img->x_chroma_shift;
   yv12->subsampling_y = img->y_chroma_shift;
+  yv12->metadata = img->metadata;
   return AOM_CODEC_OK;
 }
 
diff --git a/libaom/av1/common/alloccommon.c b/libaom/av1/common/alloccommon.c
index 1c8528a..badee3d 100644
--- a/libaom/av1/common/alloccommon.c
+++ b/libaom/av1/common/alloccommon.c
@@ -15,10 +15,10 @@
 #include "aom_mem/aom_mem.h"
 
 #include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/entropymv.h"
-#include "av1/common/onyxc_int.h"
 
 int av1_get_MBs(int width, int height) {
   const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
@@ -31,60 +31,6 @@
   return mb_rows * mb_cols;
 }
 
-#if LOOP_FILTER_BITMASK
-static int alloc_loop_filter_mask(AV1_COMMON *cm) {
-  aom_free(cm->lf.lfm);
-  cm->lf.lfm = NULL;
-
-  // Each lfm holds bit masks for all the 4x4 blocks in a max
-  // 64x64 (128x128 for ext_partitions) region.  The stride
-  // and rows are rounded up / truncated to a multiple of 16
-  // (32 for ext_partition).
-  cm->lf.lfm_stride = (cm->mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
-  cm->lf.lfm_num = ((cm->mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
-                   cm->lf.lfm_stride;
-  cm->lf.lfm =
-      (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
-  if (!cm->lf.lfm) return 1;
-
-  unsigned int i;
-  for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]);
-
-  return 0;
-}
-
-static void free_loop_filter_mask(AV1_COMMON *cm) {
-  if (cm->lf.lfm == NULL) return;
-
-  aom_free(cm->lf.lfm);
-  cm->lf.lfm = NULL;
-  cm->lf.lfm_num = 0;
-  cm->lf.lfm_stride = 0;
-}
-#endif
-
-void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) {
-  // Ensure that the decoded width and height are both multiples of
-  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
-  // subsampling is used).
-  // This simplifies the implementation of various experiments,
-  // eg. cdef, which operates on units of 8x8 luma pixels.
-  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
-  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
-
-  cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
-  cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
-  cm->mi_stride = calc_mi_size(cm->mi_cols);
-
-  cm->mb_cols = (cm->mi_cols + 2) >> 2;
-  cm->mb_rows = (cm->mi_rows + 2) >> 2;
-  cm->MBs = cm->mb_rows * cm->mb_cols;
-
-#if LOOP_FILTER_BITMASK
-  alloc_loop_filter_mask(cm);
-#endif
-}
-
 void av1_free_ref_frame_buffers(BufferPool *pool) {
   int i;
 
@@ -127,7 +73,7 @@
   // able to quickly answer the question "Where is the <n>'th stripe for tile
   // row <m>?" To make that efficient, we generate the rst_last_stripe array.
   int num_stripes = 0;
-  for (int i = 0; i < cm->tile_rows; ++i) {
+  for (int i = 0; i < cm->tiles.rows; ++i) {
     TileInfo tile_info;
     av1_tile_set_row(&tile_info, cm, i);
     const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start;
@@ -186,106 +132,131 @@
   aom_free_frame_buffer(&cm->rst_frame);
 }
 
-void av1_free_above_context_buffers(AV1_COMMON *cm,
-                                    int num_free_above_contexts) {
+void av1_free_above_context_buffers(CommonContexts *above_contexts) {
   int i;
-  const int num_planes = cm->num_allocated_above_context_planes;
+  const int num_planes = above_contexts->num_planes;
 
-  for (int tile_row = 0; tile_row < num_free_above_contexts; tile_row++) {
+  for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) {
     for (i = 0; i < num_planes; i++) {
-      aom_free(cm->above_context[i][tile_row]);
-      cm->above_context[i][tile_row] = NULL;
+      aom_free(above_contexts->entropy[i][tile_row]);
+      above_contexts->entropy[i][tile_row] = NULL;
     }
-    aom_free(cm->above_seg_context[tile_row]);
-    cm->above_seg_context[tile_row] = NULL;
+    aom_free(above_contexts->partition[tile_row]);
+    above_contexts->partition[tile_row] = NULL;
 
-    aom_free(cm->above_txfm_context[tile_row]);
-    cm->above_txfm_context[tile_row] = NULL;
+    aom_free(above_contexts->txfm[tile_row]);
+    above_contexts->txfm[tile_row] = NULL;
   }
   for (i = 0; i < num_planes; i++) {
-    aom_free(cm->above_context[i]);
-    cm->above_context[i] = NULL;
+    aom_free(above_contexts->entropy[i]);
+    above_contexts->entropy[i] = NULL;
   }
-  aom_free(cm->above_seg_context);
-  cm->above_seg_context = NULL;
+  aom_free(above_contexts->partition);
+  above_contexts->partition = NULL;
 
-  aom_free(cm->above_txfm_context);
-  cm->above_txfm_context = NULL;
+  aom_free(above_contexts->txfm);
+  above_contexts->txfm = NULL;
 
-  cm->num_allocated_above_contexts = 0;
-  cm->num_allocated_above_context_mi_col = 0;
-  cm->num_allocated_above_context_planes = 0;
+  above_contexts->num_tile_rows = 0;
+  above_contexts->num_mi_cols = 0;
+  above_contexts->num_planes = 0;
 }
 
 void av1_free_context_buffers(AV1_COMMON *cm) {
-  cm->free_mi(cm);
+  cm->mi_params.free_mi(&cm->mi_params);
 
-  av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+  av1_free_above_context_buffers(&cm->above_contexts);
 
-#if LOOP_FILTER_BITMASK
-  free_loop_filter_mask(cm);
+#if CONFIG_LPF_MASK
+  av1_free_loop_filter_mask(cm);
 #endif
 }
 
-int av1_alloc_above_context_buffers(AV1_COMMON *cm,
-                                    int num_alloc_above_contexts) {
-  const int num_planes = av1_num_planes(cm);
-  int plane_idx;
+int av1_alloc_above_context_buffers(CommonContexts *above_contexts,
+                                    int num_tile_rows, int num_mi_cols,
+                                    int num_planes) {
   const int aligned_mi_cols =
-      ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+      ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2);
 
   // Allocate above context buffers
-  cm->num_allocated_above_contexts = num_alloc_above_contexts;
-  cm->num_allocated_above_context_mi_col = aligned_mi_cols;
-  cm->num_allocated_above_context_planes = num_planes;
-  for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
-    cm->above_context[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
-        num_alloc_above_contexts, sizeof(cm->above_context[0]));
-    if (!cm->above_context[plane_idx]) return 1;
+  above_contexts->num_tile_rows = num_tile_rows;
+  above_contexts->num_mi_cols = aligned_mi_cols;
+  above_contexts->num_planes = num_planes;
+  for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+    above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
+        num_tile_rows, sizeof(above_contexts->entropy[0]));
+    if (!above_contexts->entropy[plane_idx]) return 1;
   }
 
-  cm->above_seg_context = (PARTITION_CONTEXT **)aom_calloc(
-      num_alloc_above_contexts, sizeof(cm->above_seg_context));
-  if (!cm->above_seg_context) return 1;
+  above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc(
+      num_tile_rows, sizeof(above_contexts->partition));
+  if (!above_contexts->partition) return 1;
 
-  cm->above_txfm_context = (TXFM_CONTEXT **)aom_calloc(
-      num_alloc_above_contexts, sizeof(cm->above_txfm_context));
-  if (!cm->above_txfm_context) return 1;
+  above_contexts->txfm =
+      (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm));
+  if (!above_contexts->txfm) return 1;
 
-  for (int tile_row = 0; tile_row < num_alloc_above_contexts; tile_row++) {
-    for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
-      cm->above_context[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc(
-          aligned_mi_cols, sizeof(*cm->above_context[0][tile_row]));
-      if (!cm->above_context[plane_idx][tile_row]) return 1;
+  for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) {
+    for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+      above_contexts->entropy[plane_idx][tile_row] =
+          (ENTROPY_CONTEXT *)aom_calloc(
+              aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row]));
+      if (!above_contexts->entropy[plane_idx][tile_row]) return 1;
     }
 
-    cm->above_seg_context[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
-        aligned_mi_cols, sizeof(*cm->above_seg_context[tile_row]));
-    if (!cm->above_seg_context[tile_row]) return 1;
+    above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
+        aligned_mi_cols, sizeof(*above_contexts->partition[tile_row]));
+    if (!above_contexts->partition[tile_row]) return 1;
 
-    cm->above_txfm_context[tile_row] = (TXFM_CONTEXT *)aom_calloc(
-        aligned_mi_cols, sizeof(*cm->above_txfm_context[tile_row]));
-    if (!cm->above_txfm_context[tile_row]) return 1;
+    above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc(
+        aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row]));
+    if (!above_contexts->txfm[tile_row]) return 1;
+  }
+
+  return 0;
+}
+
+// Allocate the dynamically allocated arrays in 'mi_params' assuming
+// 'mi_params->set_mb_mi()' was already called earlier to initialize the rest of
+// the struct members.
+static int alloc_mi(CommonModeInfoParams *mi_params) {
+  const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows);
+  const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows;
+  const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int alloc_mi_size =
+      mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d);
+
+  if (mi_params->mi_alloc_size < alloc_mi_size ||
+      mi_params->mi_grid_size < mi_grid_size) {
+    mi_params->free_mi(mi_params);
+
+    mi_params->mi_alloc =
+        aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc));
+    if (!mi_params->mi_alloc) return 1;
+    mi_params->mi_alloc_size = alloc_mi_size;
+
+    mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc(
+        mi_grid_size, sizeof(*mi_params->mi_grid_base));
+    if (!mi_params->mi_grid_base) return 1;
+    mi_params->mi_grid_size = mi_grid_size;
+
+    mi_params->tx_type_map =
+        aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map));
+    if (!mi_params->tx_type_map) return 1;
   }
 
   return 0;
 }
 
 int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
-  int new_mi_size;
-
-  av1_set_mb_mi(cm, width, height);
-  new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
-  if (cm->mi_alloc_size < new_mi_size) {
-    cm->free_mi(cm);
-    if (cm->alloc_mi(cm, new_mi_size)) goto fail;
-  }
-
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  mi_params->set_mb_mi(mi_params, width, height);
+  if (alloc_mi(mi_params)) goto fail;
   return 0;
 
 fail:
   // clear the mi_* values to force a realloc on resync
-  av1_set_mb_mi(cm, 0, 0);
+  mi_params->set_mb_mi(mi_params, 0, 0);
   av1_free_context_buffers(cm);
   return 1;
 }
@@ -299,4 +270,40 @@
   cm->default_frame_context = NULL;
 }
 
-void av1_init_context_buffers(AV1_COMMON *cm) { cm->setup_mi(cm); }
+void av1_init_mi_buffers(CommonModeInfoParams *mi_params) {
+  mi_params->setup_mi(mi_params);
+}
+
+#if CONFIG_LPF_MASK
+int av1_alloc_loop_filter_mask(AV1_COMMON *cm) {
+  aom_free(cm->lf.lfm);
+  cm->lf.lfm = NULL;
+
+  // Each lfm holds bit masks for all the 4x4 blocks in a max
+  // 64x64 (128x128 for ext_partitions) region.  The stride
+  // and rows are rounded up / truncated to a multiple of 16
+  // (32 for ext_partition).
+  cm->lf.lfm_stride =
+      (cm->mi_params.mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
+  cm->lf.lfm_num =
+      ((cm->mi_params.mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
+      cm->lf.lfm_stride;
+  cm->lf.lfm =
+      (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
+  if (!cm->lf.lfm) return 1;
+
+  unsigned int i;
+  for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]);
+
+  return 0;
+}
+
+void av1_free_loop_filter_mask(AV1_COMMON *cm) {
+  if (cm->lf.lfm == NULL) return;
+
+  aom_free(cm->lf.lfm);
+  cm->lf.lfm = NULL;
+  cm->lf.lfm_num = 0;
+  cm->lf.lfm_stride = 0;
+}
+#endif
diff --git a/libaom/av1/common/alloccommon.h b/libaom/av1/common/alloccommon.h
index 8e58969..fe8e0c5 100644
--- a/libaom/av1/common/alloccommon.h
+++ b/libaom/av1/common/alloccommon.h
@@ -14,21 +14,25 @@
 
 #define INVALID_IDX -1  // Invalid buffer index.
 
+#include "config/aom_config.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct AV1Common;
 struct BufferPool;
+struct CommonContexts;
+struct CommonModeInfoParams;
 
 void av1_remove_common(struct AV1Common *cm);
 
-int av1_alloc_above_context_buffers(struct AV1Common *cm,
-                                    int num_alloc_above_contexts);
-void av1_free_above_context_buffers(struct AV1Common *cm,
-                                    int num_free_above_contexts);
+int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts,
+                                    int num_tile_rows, int num_mi_cols,
+                                    int num_planes);
+void av1_free_above_context_buffers(struct CommonContexts *above_contexts);
 int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height);
-void av1_init_context_buffers(struct AV1Common *cm);
+void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params);
 void av1_free_context_buffers(struct AV1Common *cm);
 
 void av1_free_ref_frame_buffers(struct BufferPool *pool);
@@ -38,9 +42,13 @@
 int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
 void av1_free_state_buffers(struct AV1Common *cm);
 
-void av1_set_mb_mi(struct AV1Common *cm, int width, int height);
 int av1_get_MBs(int width, int height);
 
+#if CONFIG_LPF_MASK
+int av1_alloc_loop_filter_mask(struct AV1Common *cm);
+void av1_free_loop_filter_mask(struct AV1Common *cm);
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/common/arm/av1_inv_txfm_neon.c b/libaom/av1/common/arm/av1_inv_txfm_neon.c
index 7a23174..2f3567a 100644
--- a/libaom/av1/common/arm/av1_inv_txfm_neon.c
+++ b/libaom/av1/common/arm/av1_inv_txfm_neon.c
@@ -48,11 +48,11 @@
 
 // 1D functions
 static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
-  { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
-  { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
-  { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
-  { av1_idct32_new, NULL, NULL },
-  { av1_idct64_new, NULL, NULL },
+  { av1_idct4, av1_iadst4, av1_iidentity4_c },
+  { av1_idct8, av1_iadst8, av1_iidentity8_c },
+  { av1_idct16, av1_iadst16, av1_iidentity16_c },
+  { av1_idct32, NULL, NULL },
+  { av1_idct64, NULL, NULL },
 };
 
 static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
@@ -248,31 +248,27 @@
   x[1] = vcombine_s16(v1[0], v1[1]);
 }
 
-static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
-                                          int16_t *const c2,
-                                          int16_t *const c3) {
+static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
+                                       const int16_t c2, const int16_t c3) {
   int16x4_t val = vdup_n_s16((int16_t)0);
-  val = vld1_lane_s16(c0, val, 0);
-  val = vld1_lane_s16(c1, val, 1);
-  val = vld1_lane_s16(c2, val, 2);
-  val = vld1_lane_s16(c3, val, 3);
+  val = vset_lane_s16(c0, val, 0);
+  val = vset_lane_s16(c1, val, 1);
+  val = vset_lane_s16(c2, val, 2);
+  val = vset_lane_s16(c3, val, 3);
   return val;
 }
 
-static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
-                                   int8_t cos_bit, int bit) {
+static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out,
+                               int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
 
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
-                        (int16_t *)(cospi + 20), (int16_t *)(cospi + 44));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28),
-                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+                                      (int16_t)cospi[20], (int16_t)cospi[44]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28],
+                                      (int16_t)cospi[52], (int16_t)cospi[12]);
+  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
 
   int16x8_t x[8];
   int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -327,22 +323,21 @@
 
   // Stage 7
   out[0] = x[0];
-  out[1] = vnegq_s16(x[4]);
+  out[1] = vqnegq_s16(x[4]);
   out[2] = x[6];
-  out[3] = vnegq_s16(x[2]);
+  out[3] = vqnegq_s16(x[2]);
   out[4] = x[3];
-  out[5] = vnegq_s16(x[7]);
+  out[5] = vqnegq_s16(x[7]);
   out[6] = x[5];
-  out[7] = vnegq_s16(x[1]);
+  out[7] = vqnegq_s16(x[1]);
 }
 
-static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
-                                        int8_t cos_bit, int bit) {
+static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
 
   int16x8_t x[8];
   int16x8_t s0, s1, s4, s5;
@@ -381,34 +376,32 @@
 
   // Stage 7
   out[0] = x[0];
-  out[1] = vnegq_s16(x[4]);
+  out[1] = vqnegq_s16(x[4]);
   out[2] = x[6];
-  out[3] = vnegq_s16(x[2]);
+  out[3] = vqnegq_s16(x[2]);
   out[4] = x[3];
-  out[5] = vnegq_s16(x[7]);
+  out[5] = vqnegq_s16(x[7]);
   out[6] = x[5];
-  out[7] = vnegq_s16(x[1]);
+  out[7] = vqnegq_s16(x[1]);
 }
 
-static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
-                                  int bit) {
+static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+                              int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[8], step2[8];
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
 
   // stage 2
   btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
   btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
 
   // stage 3
-  btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
-  btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
+  btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]);
+  btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]);
   step2[4] = vqaddq_s16(step1[4], step1[5]);
   step2[5] = vqsubq_s16(step1[4], step1[5]);
   step2[6] = vqsubq_s16(step1[7], step1[6]);
@@ -419,7 +412,7 @@
   step1[1] = vqaddq_s16(step2[1], step2[2]);
   step1[2] = vqsubq_s16(step2[1], step2[2]);
   step1[3] = vqsubq_s16(step2[0], step2[3]);
-  btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);
+  btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]);
 
   // stage 5
   out[0] = vqaddq_s16(step1[0], step2[7]);
@@ -432,8 +425,8 @@
   out[7] = vqsubq_s16(step1[0], step2[7]);
 }
 
-static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
-                                       int8_t cos_bit, int bit) {
+static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1;
@@ -489,19 +482,24 @@
   }
 }
 
-static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
-                                      int8_t cos_bit, int bit) {
-  (void)bit;
-  (void)cos_bit;
+static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+                                         4 * 5793 };
 
-  output[0] = vmulq_n_s16(input[0], (int16_t)2);
-  output[1] = vmulq_n_s16(input[1], (int16_t)2);
-  output[2] = vmulq_n_s16(input[2], (int16_t)2);
-  output[3] = vmulq_n_s16(input[3], (int16_t)2);
-  output[4] = vmulq_n_s16(input[4], (int16_t)2);
-  output[5] = vmulq_n_s16(input[5], (int16_t)2);
-  output[6] = vmulq_n_s16(input[6], (int16_t)2);
-  output[7] = vmulq_n_s16(input[7], (int16_t)2);
+static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output,
+                                            int txw_idx, int8_t size, int bit) {
+  const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit));
+  int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]);
+  int16x4_t low_i16, high_i16;
+  int32x4_t low_i32, high_i32;
+  for (int i = 0; i < size; i++) {
+    int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale);
+    int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale);
+    low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4);
+    high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4);
+    low_i16 = vqmovn_s32(low_i32);
+    high_i16 = vqmovn_s32(high_i32);
+    output[i] = vcombine_s16(low_i16, high_i16);
+  }
 }
 
 static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
@@ -520,38 +518,8 @@
   }
 }
 
-static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
-                                       int8_t cos_bit, int bit) {
-  (void)bit;
-  (void)cos_bit;
-
-  int32x4_t out_low, out_high;
-  int16x4_t low, high;
-  int16_t scale = (int16_t)(2 * NewSqrt2);
-
-  for (int z = 0; z < 16; ++z) {
-    out_low = vmull_n_s16(vget_low_s16(input[z]), scale);
-    out_high = vmull_n_s16(vget_high_s16(input[z]), scale);
-
-    low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
-    high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
-
-    output[z] = vcombine_s16(low, high);
-  }
-}
-
-static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
-                                       int8_t cos_bit, int bit) {
-  (void)bit;
-  (void)cos_bit;
-
-  for (int z = 0; z < 32; ++z) {
-    output[z] = vmulq_n_s16(input[z], (int16_t)4);
-  }
-}
-
-static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
-                                        int8_t cos_bit, int bit) {
+static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1;
@@ -584,25 +552,23 @@
   out[15] = step1;
 }
 
-static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
-                                   int8_t cos_bit, int bit) {
+static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+                               int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[16], step2[16];
 
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
-                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
-                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c3 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
-
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+                                      (int16_t)cospi[36], (int16_t)cospi[28]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+                                      (int16_t)cospi[52], (int16_t)cospi[12]);
+  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+  const int16x4_t c4 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
   // stage 2
 
   btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
@@ -642,8 +608,7 @@
   btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
   btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
   btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
-                       &step2[10], &step2[13]);
+  btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]);
 
   step2[4] = vqaddq_s16(step1[4], step1[5]);
   step2[5] = vqsubq_s16(step1[4], step1[5]);
@@ -710,14 +675,16 @@
   out[15] = vqsubq_s16(step2[0], step2[15]);
 }
 
-static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
-                                        int8_t cos_bit, int bit) {
+static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[16], step2[16];
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+  const int16x4_t c1 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
 
   // stage 1
   // stage 2
@@ -753,8 +720,7 @@
   btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
   btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
   btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
-                       &step2[10], &step2[13]);
+  btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]);
 
   step2[4] = vqaddq_s16(step1[4], step1[5]);
   step2[5] = vqsubq_s16(step1[4], step1[5]);
@@ -820,30 +786,23 @@
   out[15] = vqsubq_s16(step2[0], step2[15]);
 }
 
-static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
+static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out,
+                                int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
 
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
-                        (int16_t *)(cospi + 10), (int16_t *)(cospi + 54));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
-                        (int16_t *)(cospi + 26), (int16_t *)(cospi + 38));
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30),
-                        (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
-  const int16x4_t c3 =
-      create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14),
-                        (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
-  const int16x4_t c4 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-
-  const int16x4_t c =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
+                                      (int16_t)cospi[10], (int16_t)cospi[54]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
+                                      (int16_t)cospi[26], (int16_t)cospi[38]);
+  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30],
+                                      (int16_t)cospi[42], (int16_t)cospi[22]);
+  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14],
+                                      (int16_t)cospi[58], (int16_t)cospi[6]);
+  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
 
   int16x8_t x[16];
   int16x8_t t[14];
@@ -933,14 +892,14 @@
   t[1] = x[1];
   t[2] = x[2];
   t[3] = x[3];
-  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
-  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+  btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6);
   t[8] = x[8];
   t[9] = x[9];
   t[10] = x[10];
   t[11] = x[11];
-  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
-  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+  btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13);
+  btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14);
 
   // Stage 7
   x[0] = vqaddq_s16(t[0], t[2]);
@@ -961,40 +920,38 @@
   x[15] = vqsubq_s16(s13, s15);
 
   // Stage 8
-  btf_16_half_neon(x + 2, c);
-  btf_16_half_neon(x + 6, c);
-  btf_16_half_neon(x + 10, c);
-  btf_16_half_neon(x + 14, c);
+  btf_16_half_neon(x + 2, c5);
+  btf_16_half_neon(x + 6, c5);
+  btf_16_half_neon(x + 10, c5);
+  btf_16_half_neon(x + 14, c5);
 
   // Stage 9
   out[0] = x[0];
-  out[1] = vnegq_s16(x[8]);
+  out[1] = vqnegq_s16(x[8]);
   out[2] = x[12];
-  out[3] = vnegq_s16(x[4]);
+  out[3] = vqnegq_s16(x[4]);
   out[4] = x[6];
-  out[5] = vnegq_s16(x[14]);
+  out[5] = vqnegq_s16(x[14]);
   out[6] = x[10];
-  out[7] = vnegq_s16(x[2]);
+  out[7] = vqnegq_s16(x[2]);
   out[8] = x[3];
-  out[9] = vnegq_s16(x[11]);
+  out[9] = vqnegq_s16(x[11]);
   out[10] = x[15];
-  out[11] = vnegq_s16(x[7]);
+  out[11] = vqnegq_s16(x[7]);
   out[12] = x[5];
-  out[13] = vnegq_s16(x[13]);
+  out[13] = vqnegq_s16(x[13]);
   out[14] = x[9];
-  out[15] = vnegq_s16(x[1]);
+  out[15] = vqnegq_s16(x[1]);
 }
 
-static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
-                                         int8_t cos_bit, int bit) {
+static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
+                                     int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
-  const int16x4_t c4 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
 
   int16x8_t x[16];
   int16x8_t t[10];
@@ -1016,7 +973,7 @@
   // Stage 4
   t[0] = x[0];
   t[1] = x[1];
-  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
 
   // Stage 5
   x[0] = t[0];
@@ -1031,10 +988,10 @@
   // stage 6
   t[0] = x[0];
   t[1] = x[1];
-  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
   t[8] = x[8];
   t[9] = x[9];
-  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
 
   // Stage 7
   x[0] = t[0];
@@ -1055,41 +1012,39 @@
   x[15] = s13;
 
   // Stage 8
-  btf_16_half_neon(x + 2, c);
-  btf_16_half_neon(x + 6, c);
-  btf_16_half_neon(x + 10, c);
-  btf_16_half_neon(x + 14, c);
+  btf_16_half_neon(x + 2, c1);
+  btf_16_half_neon(x + 6, c1);
+  btf_16_half_neon(x + 10, c1);
+  btf_16_half_neon(x + 14, c1);
 
   // Stage 9
   out[0] = x[0];
-  out[1] = vnegq_s16(x[8]);
+  out[1] = vqnegq_s16(x[8]);
   out[2] = x[12];
-  out[3] = vnegq_s16(x[4]);
+  out[3] = vqnegq_s16(x[4]);
   out[4] = x[6];
-  out[5] = vnegq_s16(x[14]);
+  out[5] = vqnegq_s16(x[14]);
   out[6] = x[10];
-  out[7] = vnegq_s16(x[2]);
+  out[7] = vqnegq_s16(x[2]);
   out[8] = x[3];
-  out[9] = vnegq_s16(x[11]);
+  out[9] = vqnegq_s16(x[11]);
   out[10] = x[15];
-  out[11] = vnegq_s16(x[7]);
+  out[11] = vqnegq_s16(x[7]);
   out[12] = x[5];
-  out[13] = vnegq_s16(x[13]);
+  out[13] = vqnegq_s16(x[13]);
   out[14] = x[9];
-  out[15] = vnegq_s16(x[1]);
+  out[15] = vqnegq_s16(x[1]);
 }
 
-static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
-                                         int8_t cos_bit, int bit) {
+static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
+                                     int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
 
-  const int16x4_t c4 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
 
   int16x8_t x[16];
   int16x8_t t[14];
@@ -1144,10 +1099,10 @@
   t[5] = x[5];
   t[6] = x[6];
   t[7] = x[7];
-  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
-  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
-  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
-  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11);
+  btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12);
+  btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14);
 
   // Stage 5
   x[0] = vqaddq_s16(t[0], t[4]);
@@ -1172,14 +1127,14 @@
   t[1] = x[1];
   t[2] = x[2];
   t[3] = x[3];
-  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
-  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6);
   t[8] = x[8];
   t[9] = x[9];
   t[10] = x[10];
   t[11] = x[11];
-  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
-  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
+  btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14);
 
   // Stage 7
   x[0] = vqaddq_s16(t[0], t[2]);
@@ -1200,60 +1155,58 @@
   x[15] = vqsubq_s16(s13, s15);
 
   // Stage 8
-  btf_16_half_neon(x + 2, c);
-  btf_16_half_neon(x + 6, c);
-  btf_16_half_neon(x + 10, c);
-  btf_16_half_neon(x + 14, c);
+  btf_16_half_neon(x + 2, c1);
+  btf_16_half_neon(x + 6, c1);
+  btf_16_half_neon(x + 10, c1);
+  btf_16_half_neon(x + 14, c1);
 
   // Stage 9
   out[0] = x[0];
-  out[1] = vnegq_s16(x[8]);
+  out[1] = vqnegq_s16(x[8]);
   out[2] = x[12];
-  out[3] = vnegq_s16(x[4]);
+  out[3] = vqnegq_s16(x[4]);
   out[4] = x[6];
-  out[5] = vnegq_s16(x[14]);
+  out[5] = vqnegq_s16(x[14]);
   out[6] = x[10];
-  out[7] = vnegq_s16(x[2]);
+  out[7] = vqnegq_s16(x[2]);
   out[8] = x[3];
-  out[9] = vnegq_s16(x[11]);
+  out[9] = vqnegq_s16(x[11]);
   out[10] = x[15];
-  out[11] = vnegq_s16(x[7]);
+  out[11] = vqnegq_s16(x[7]);
   out[12] = x[5];
-  out[13] = vnegq_s16(x[13]);
+  out[13] = vqnegq_s16(x[13]);
   out[14] = x[9];
-  out[15] = vnegq_s16(x[1]);
+  out[15] = vqnegq_s16(x[1]);
 }
 
-static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
-                                   int8_t cos_bit, int bit) {
+static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+                               int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[32], step2[32];
 
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
-                        (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
-                        (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
-                        (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
-  const int16x4_t c3 =
-      create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
-                        (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
-  const int16x4_t c4 =
-      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
-                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
-  const int16x4_t c5 =
-      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
-                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
-  const int16x4_t c6 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c7 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
+                                      (int16_t)cospi[34], (int16_t)cospi[30]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
+                                      (int16_t)cospi[50], (int16_t)cospi[14]);
+  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54],
+                                      (int16_t)cospi[42], (int16_t)cospi[22]);
+  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38],
+                                      (int16_t)cospi[58], (int16_t)cospi[6]);
+  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+                                      (int16_t)cospi[36], (int16_t)cospi[28]);
+  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+                                      (int16_t)cospi[52], (int16_t)cospi[12]);
+  const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+  const int16x4_t c8 =
+      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+  const int16x4_t c9 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
 
   // stage 2
 
@@ -1321,11 +1274,9 @@
   btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
   btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
   btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
-                       &step2[18], &step2[29]);
+  btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]);
   btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
-                       &step2[22], &step2[25]);
+  btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]);
 
   step2[0] = step1[0];
   step2[1] = step1[1];
@@ -1353,8 +1304,7 @@
   btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
   btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
   btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
-                       &step1[10], &step1[13]);
+  btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]);
 
   step1[4] = vqaddq_s16(step2[4], step2[5]);
   step1[5] = vqsubq_s16(step2[4], step2[5]);
@@ -1386,10 +1336,8 @@
   btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
   btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
   btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
-                       &step2[20], &step2[27]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
-                       &step2[21], &step2[26]);
+  btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]);
 
   step2[0] = vqaddq_s16(step1[0], step1[3]);
   step2[1] = vqaddq_s16(step1[1], step1[2]);
@@ -1516,8 +1464,8 @@
   out[31] = vqsubq_s16(step2[0], step2[31]);
 }
 
-static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
-                                        int8_t cos_bit, int bit) {
+static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1;
@@ -1573,19 +1521,22 @@
   out[31] = step1;
 }
 
-static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
-                                        int8_t cos_bit, int bit) {
+static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[32], step2[32];
   int32x4_t t32[16];
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
-
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], cospi[48]);
+  const int16x4_t c2 =
+      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+  const int16x4_t c3 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
   // stage 1
   // stage 2
 
@@ -1627,11 +1578,9 @@
 
   btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
   btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
-                       &step2[18], &step2[29]);
+  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
   btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
-                       &step2[22], &step2[25]);
+  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
 
   step2[0] = step1[0];
   step2[8] = step1[8];
@@ -1659,8 +1608,7 @@
                           vrshrn_n_s32(t32[1], INV_COS_BIT));
 
   btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
-                       &step1[10], &step1[13]);
+  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
 
   step1[4] = step2[4];
   step1[5] = step2[4];
@@ -1692,10 +1640,8 @@
   btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
   btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
   btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
-                       &step2[20], &step2[27]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
-                       &step2[21], &step2[26]);
+  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
 
   step2[0] = step1[0];
   step2[1] = step1[0];
@@ -1822,18 +1768,22 @@
   out[31] = vqsubq_s16(step2[0], step2[31]);
 }
 
-static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
-                                         int8_t cos_bit, int bit) {
+static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
+                                     int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[32], step2[32];
   int32x4_t t32[16];
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+  const int16x4_t c2 =
+      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+  const int16x4_t c3 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
 
   // stage 1
   // stage 2
@@ -1889,11 +1839,9 @@
   btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
   btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
   btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
-                       &step2[18], &step2[29]);
+  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
   btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
-                       &step2[22], &step2[25]);
+  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
 
   step2[0] = step1[0];
   step2[2] = step1[2];
@@ -1924,8 +1872,7 @@
 
   btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
   btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
-                       &step1[10], &step1[13]);
+  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
 
   step1[4] = vqaddq_s16(step2[4], step2[5]);
   step1[5] = vqsubq_s16(step2[4], step2[5]);
@@ -1957,10 +1904,8 @@
   btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
   btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
   btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
-                       &step2[20], &step2[27]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
-                       &step2[21], &step2[26]);
+  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
 
   step2[0] = vqaddq_s16(step1[0], step1[3]);
   step2[1] = vqaddq_s16(step1[0], step1[2]);
@@ -2089,9 +2034,8 @@
 static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
                                       int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
-  const int16x4_t c3 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
 
   btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]);
   btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]);
@@ -2159,9 +2103,8 @@
 static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
                                        int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
-  const int16x4_t c3 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
 
   btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]);
   btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]);
@@ -2222,23 +2165,31 @@
   step2[63] = step1[63];
 }
 
-static INLINE void idct64_low32_new_neon(int16x8_t *in, int16x8_t *out,
-                                         int8_t cos_bit, int bit) {
+static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
+                                     int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step2[64], step1[64];
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
-                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
-                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c3 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+                                      (int16_t)cospi[36], (int16_t)cospi[28]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+                                      (int16_t)cospi[52], (int16_t)cospi[12]);
+  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+  const int16x4_t c4 =
+      set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
+                     (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
+  const int16x4_t c5 =
+      set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
+                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+  const int16x4_t c6 =
+      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+  const int16x4_t c7 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
 
   // stage 1
   // stage 2
@@ -2342,17 +2293,13 @@
   btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
   btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
   btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[34]), vnegq_s16(step1[61]), c0,
-                       &step2[34], &step2[61]);
+  btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
   btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[38]), vnegq_s16(step1[57]), c0,
-                       &step2[38], &step2[57]);
+  btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
   btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c1,
-                       &step2[42], &step2[53]);
+  btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
   btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[46]), vnegq_s16(step1[49]), c1,
-                       &step2[46], &step2[49]);
+  btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
 
   step2[16] = vqaddq_s16(step1[16], step1[17]);
   step2[17] = vqsubq_s16(step1[16], step1[17]);
@@ -2395,11 +2342,9 @@
   btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
   btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
   btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
-  btf_16_lane_1_0_neon(vnegq_s16(step2[18]), vnegq_s16(step2[29]), c2,
-                       &step1[18], &step1[29]);
+  btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
   btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[22]), vnegq_s16(step2[25]), c2,
-                       &step1[22], &step1[25]);
+  btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
 
   step1[8] = vqaddq_s16(step2[8], step2[9]);
   step1[9] = vqsubq_s16(step2[8], step2[9]);
@@ -2455,20 +2400,15 @@
   btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
   btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
   btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
-                       &step2[10], &step2[13]);
+  btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
   btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
   btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[36]), vnegq_s16(step1[59]), c2,
-                       &step2[36], &step2[59]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[37]), vnegq_s16(step1[58]), c2,
-                       &step2[37], &step2[58]);
+  btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
+  btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
   btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
   btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[44]), vnegq_s16(step1[51]), c2,
-                       &step2[44], &step2[51]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[45]), vnegq_s16(step1[50]), c2,
-                       &step2[45], &step2[50]);
+  btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
+  btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
 
   step2[4] = vqaddq_s16(step1[4], step1[5]);
   step2[5] = vqsubq_s16(step1[4], step1[5]);
@@ -2516,10 +2456,8 @@
   btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
   btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
   btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[20]), vnegq_s16(step2[27]), c3,
-                       &step1[20], &step1[27]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[21]), vnegq_s16(step2[26]), c3,
-                       &step1[21], &step1[26]);
+  btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
+  btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
 
   step1[0] = vqaddq_s16(step2[0], step2[3]);
   step1[1] = vqaddq_s16(step2[1], step2[2]);
@@ -2584,14 +2522,10 @@
   btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
   btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
   btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[40]), vnegq_s16(step1[55]), c3,
-                       &step2[40], &step2[55]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[41]), vnegq_s16(step1[54]), c3,
-                       &step2[41], &step2[54]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c3,
-                       &step2[42], &step2[53]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[43]), vnegq_s16(step1[52]), c3,
-                       &step2[43], &step2[52]);
+  btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
+  btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
+  btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
+  btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
 
   step2[0] = vqaddq_s16(step1[0], step1[7]);
   step2[1] = vqaddq_s16(step1[1], step1[6]);
@@ -2712,8 +2646,8 @@
   out[63] = vqsubq_s16(step2[0], step2[63]);
 }
 
-static INLINE void idct64_low1_new_neon(int16x8_t *input, int16x8_t *out,
-                                        int8_t cos_bit, int bit) {
+static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1;
@@ -2802,24 +2736,29 @@
   out[63] = step1;
 }
 
-static INLINE void idct64_low8_new_neon(int16x8_t *in, int16x8_t *out,
-                                        int8_t cos_bit, int bit) {
+static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step2[64], step1[64];
 
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
-                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
-                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c3 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+                                      (int16_t)cospi[36], (int16_t)cospi[28]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+                                      (int16_t)cospi[52], (int16_t)cospi[12]);
+  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+  const int16x4_t c4 =
+      set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]),
+                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+  const int16x4_t c5 =
+      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+  const int16x4_t c6 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
 
   // stage 1
   // stage 2
@@ -2865,11 +2804,9 @@
 
   btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
   btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[38]), vnegq_s16(step1[57]), c0,
-                       &step2[38], &step2[57]);
+  btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
   btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[46]), vnegq_s16(step1[49]), c1,
-                       &step2[46], &step2[49]);
+  btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]);
 
   step2[16] = step1[16];
   step2[17] = step1[16];
@@ -2893,8 +2830,7 @@
   step1[0] = step2[0];
 
   btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[22]), vnegq_s16(step2[25]), c2,
-                       &step1[22], &step1[25]);
+  btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]);
 
   step1[8] = step2[8];
   step1[9] = step2[8];
@@ -2944,16 +2880,12 @@
   btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
   btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
   btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[36]), vnegq_s16(step1[59]), c2,
-                       &step2[36], &step2[59]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[37]), vnegq_s16(step1[58]), c2,
-                       &step2[37], &step2[58]);
+  btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]);
+  btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]);
   btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
   btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[44]), vnegq_s16(step1[51]), c2,
-                       &step2[44], &step2[51]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[45]), vnegq_s16(step1[50]), c2,
-                       &step2[45], &step2[50]);
+  btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]);
+  btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
@@ -2994,10 +2926,8 @@
 
   btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
   btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[20]), vnegq_s16(step2[27]), c3,
-                       &step1[20], &step1[27]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[21]), vnegq_s16(step2[26]), c3,
-                       &step1[21], &step1[26]);
+  btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]);
+  btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]);
 
   step1[0] = step2[0];
   step1[1] = step2[1];
@@ -3060,14 +2990,10 @@
   btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
   btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
   btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[40]), vnegq_s16(step1[55]), c3,
-                       &step2[40], &step2[55]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[41]), vnegq_s16(step1[54]), c3,
-                       &step2[41], &step2[54]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c3,
-                       &step2[42], &step2[53]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[43]), vnegq_s16(step1[52]), c3,
-                       &step2[43], &step2[52]);
+  btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]);
+  btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]);
+  btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]);
+  btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]);
 
   step2[0] = step1[0];
   step2[1] = step1[1];
@@ -3188,24 +3114,32 @@
   out[63] = vqsubq_s16(step2[0], step2[63]);
 }
 
-static INLINE void idct64_low16_new_neon(int16x8_t *in, int16x8_t *out,
-                                         int8_t cos_bit, int bit) {
+static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
+                                     int8_t cos_bit, int bit) {
   (void)bit;
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step2[64], step1[64];
 
-  const int16x4_t c0 =
-      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
-                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
-  const int16x4_t c1 =
-      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
-                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
-  const int16x4_t c2 =
-      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
-                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-  const int16x4_t c3 =
-      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
-                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+                                      (int16_t)cospi[36], (int16_t)cospi[28]);
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+                                      (int16_t)cospi[52], (int16_t)cospi[12]);
+  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+  const int16x4_t c4 =
+      set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
+                     (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
+  const int16x4_t c5 =
+      set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
+                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+  const int16x4_t c6 =
+      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+  const int16x4_t c7 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
 
   // stage 1
   // stage 2
@@ -3281,17 +3215,13 @@
   btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
   btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
   btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[34]), vnegq_s16(step1[61]), c0,
-                       &step2[34], &step2[61]);
+  btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
   btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[38]), vnegq_s16(step1[57]), c0,
-                       &step2[38], &step2[57]);
+  btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
   btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c1,
-                       &step2[42], &step2[53]);
+  btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
   btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[46]), vnegq_s16(step1[49]), c1,
-                       &step2[46], &step2[49]);
+  btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
 
   step2[16] = step1[16];
   step2[17] = step1[16];
@@ -3332,11 +3262,9 @@
 
   btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
   btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
-  btf_16_lane_1_0_neon(vnegq_s16(step2[18]), vnegq_s16(step2[29]), c2,
-                       &step1[18], &step1[29]);
+  btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
   btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[22]), vnegq_s16(step2[25]), c2,
-                       &step1[22], &step1[25]);
+  btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
 
   step1[8] = step2[8];
   step1[9] = step2[8];
@@ -3391,20 +3319,15 @@
 
   btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
   btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
-                       &step2[10], &step2[13]);
+  btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
   btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
   btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[36]), vnegq_s16(step1[59]), c2,
-                       &step2[36], &step2[59]);
-  btf_16_lane_1_0_neon(vnegq_s16(step1[37]), vnegq_s16(step1[58]), c2,
-                       &step2[37], &step2[58]);
+  btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
+  btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
   btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
   btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[44]), vnegq_s16(step1[51]), c2,
-                       &step2[44], &step2[51]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[45]), vnegq_s16(step1[50]), c2,
-                       &step2[45], &step2[50]);
+  btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
+  btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
 
   step2[4] = step1[4];
   step2[5] = step1[4];
@@ -3452,10 +3375,8 @@
   btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
   btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
   btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[20]), vnegq_s16(step2[27]), c3,
-                       &step1[20], &step1[27]);
-  btf_16_lane_3_2_neon(vnegq_s16(step2[21]), vnegq_s16(step2[26]), c3,
-                       &step1[21], &step1[26]);
+  btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
+  btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
 
   step1[0] = step2[0];
   step1[1] = step2[1];
@@ -3520,14 +3441,10 @@
   btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
   btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
   btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[40]), vnegq_s16(step1[55]), c3,
-                       &step2[40], &step2[55]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[41]), vnegq_s16(step1[54]), c3,
-                       &step2[41], &step2[54]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c3,
-                       &step2[42], &step2[53]);
-  btf_16_lane_3_2_neon(vnegq_s16(step1[43]), vnegq_s16(step1[52]), c3,
-                       &step2[43], &step2[52]);
+  btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
+  btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
+  btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
+  btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
 
   step2[0] = vqaddq_s16(step1[0], step1[7]);
   step2[1] = vqaddq_s16(step1[1], step1[6]);
@@ -3657,23 +3574,19 @@
           { NULL, NULL, NULL, NULL },
           { NULL, NULL, NULL, NULL },
       },
-      { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL },
-        { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },
-        { identity8_new_neon, identity8_new_neon, NULL, NULL } },
+      { { idct8_low1_neon, idct8_neon, NULL, NULL },
+        { iadst8_low1_neon, iadst8_neon, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
       {
-          { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },
-          { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,
-            NULL },
-          { identity16_new_neon, identity16_new_neon, identity16_new_neon,
-            NULL },
+          { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL },
+          { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL },
+          { NULL, NULL, NULL, NULL },
       },
-      { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon,
-          idct32_new_neon },
+      { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon },
         { NULL, NULL, NULL, NULL },
-        { identity32_new_neon, identity32_new_neon, identity32_new_neon,
-          identity32_new_neon } },
-      { { idct64_low1_new_neon, idct64_low8_new_neon, idct64_low16_new_neon,
-          idct64_low32_new_neon },
+        { NULL, NULL, NULL, NULL } },
+      { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon,
+          idct64_low32_neon },
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } }
     };
@@ -3682,15 +3595,14 @@
                                                   uint8_t *output, int stride,
                                                   TX_TYPE tx_type,
                                                   TX_SIZE tx_size, int eob) {
+  (void)tx_type;
   int16x8_t a[32 * 4];
   int16x8_t b[32 * 4];
   int eobx, eoby;
   get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
@@ -3701,17 +3613,8 @@
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
   const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
   const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
   const int32_t *input_1;
   int temp_b = 0;
-  const transform_neon row_txfm =
-      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
-  const transform_neon col_txfm =
-      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
-  assert(col_txfm != NULL);
-  assert(row_txfm != NULL);
 
   for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
     input_1 = input;
@@ -3726,9 +3629,8 @@
       int y = i * txfm_size_col;
       round_shift_for_rect(&a[y], &a[y], txfm_size_col);
     }
-    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
-    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
-                                  -shift[0]);
+    identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col],
+                             txw_idx, txfm_size_col, -shift[0]);
     for (int j = 0; j < buf_size_w_div8; ++j) {
       int k = j * 8 + i * txfm_size_col;
       transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
@@ -3736,9 +3638,8 @@
     temp_b += 8;
   }
   for (int j = 0; j < buf_size_w_div8; ++j) {
-    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
-    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
-                                  -shift[1]);
+    identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
+                             txh_idx, txfm_size_row, -shift[1]);
   }
   if (txfm_size_col >= 16) {
     for (int i = 0; i < (txfm_size_col >> 4); i++) {
@@ -3757,11 +3658,10 @@
   int16x8_t b[16 * 2];
   int eobx, eoby, ud_flip, lr_flip;
   get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
@@ -3771,15 +3671,11 @@
   const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
   const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
   const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
   const int32_t *input_1;
   int temp_b = 0;
   const transform_neon row_txfm =
       lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
-  const transform_neon col_txfm =
-      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 
-  assert(col_txfm != NULL);
   assert(row_txfm != NULL);
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
@@ -3817,9 +3713,8 @@
     }
   }
   for (int j = 0; j < buf_size_w_div8; ++j) {
-    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
-    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
-                                  -shift[1]);
+    identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
+                             txh_idx, txfm_size_row, -shift[1]);
   }
   if (txfm_size_col >= 16) {
     for (int i = 0; i < (txfm_size_col >> 4); i++) {
@@ -3838,11 +3733,10 @@
   int16x8_t b[16 * 2];
   int eobx, eoby, ud_flip, lr_flip;
   get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
@@ -3851,17 +3745,13 @@
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
   const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
   const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
   const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
   const int32_t *input_1;
   int temp_b = 0;
-  const transform_neon row_txfm =
-      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
   const transform_neon col_txfm =
       lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 
   assert(col_txfm != NULL);
-  assert(row_txfm != NULL);
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
@@ -3878,9 +3768,8 @@
       int y = i * txfm_size_col;
       round_shift_for_rect(&a[y], &a[y], txfm_size_col);
     }
-    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
-    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
-                                  -shift[0]);
+    identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col],
+                             txw_idx, txfm_size_col, -shift[0]);
     for (int j = 0; j < buf_size_w_div8; ++j) {
       int k = j * 8 + i * txfm_size_col;
       transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
@@ -3911,18 +3800,18 @@
   DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
   int32_t *temp_in = txfm_buf;
 
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
   int32_t *temp_out = temp_in + buf_offset;
   int32_t *buf = temp_out + buf_offset;
   int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
   int r, bd = 8;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -3948,6 +3837,7 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
+    clamp_buf(temp_in, txfm_size_row, bd + 8);
     col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
@@ -3973,18 +3863,19 @@
   DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
   int32_t *temp_in = txfm_buf;
 
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
   int32_t *temp_out = temp_in + buf_offset;
   int32_t *buf = temp_out + buf_offset;
   int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
+                                                   16, 16, 16, 16 };
   int r, bd = 8;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -4012,6 +3903,7 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
+    clamp_buf(temp_in, txfm_size_row, bd + 8);
     col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
@@ -4037,18 +3929,19 @@
   DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
   int32_t *temp_in = txfm_buf;
 
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
   int32_t *temp_out = temp_in + buf_offset;
   int32_t *buf = temp_out + buf_offset;
   int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
+                                                   16, 16, 16, 16 };
   int r, bd = 8;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -4076,6 +3969,7 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
+    clamp_buf(temp_in, txfm_size_row, bd + 8);
     col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
@@ -4101,18 +3995,19 @@
   DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
   int32_t *temp_in = txfm_buf;
 
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
   int32_t *temp_out = temp_in + buf_offset;
   int32_t *buf = temp_out + buf_offset;
   int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
+                                                   16, 16, 16, 16, 16 };
   int r, bd = 8;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -4138,6 +4033,7 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
+    clamp_buf(temp_in, txfm_size_row, bd + 8);
     col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
@@ -4163,18 +4059,19 @@
   DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
   int32_t *temp_in = txfm_buf;
 
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
   int32_t *temp_out = temp_in + buf_offset;
   int32_t *buf = temp_out + buf_offset;
   int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
+                                                   16, 16, 16, 16, 16 };
   int r, bd = 8;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -4200,6 +4097,7 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
+    clamp_buf(temp_in, txfm_size_row, bd + 8);
     col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
@@ -4225,11 +4123,11 @@
   int16x8_t b[64 * 8];
   int eobx, eoby, ud_flip, lr_flip;
   get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
diff --git a/libaom/av1/common/arm/cfl_neon.c b/libaom/av1/common/arm/cfl_neon.c
index 39025b5..371be5f 100644
--- a/libaom/av1/common/arm/cfl_neon.c
+++ b/libaom/av1/common/arm/cfl_neon.c
@@ -131,6 +131,7 @@
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 #ifndef __aarch64__
 uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
@@ -247,6 +248,7 @@
     input += input_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 CFL_GET_SUBSAMPLE_FUNCTION(neon)
 
@@ -511,6 +513,7 @@
 
 CFL_PREDICT_FN(neon, lbd)
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
   return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
 }
@@ -582,3 +585,4 @@
 }
 
 CFL_PREDICT_FN(neon, hbd)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/av1/common/arm/convolve_neon.c b/libaom/av1/common/arm/convolve_neon.c
index d0c4f8f..51c9696 100644
--- a/libaom/av1/common/arm/convolve_neon.c
+++ b/libaom/av1/common/arm/convolve_neon.c
@@ -195,12 +195,12 @@
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_q4, const int subpel_y_q4,
+                            const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
   const int8_t bits = FILTER_BITS - conv_params->round_0;
 
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
   (void)conv_params;
   (void)filter_params_y;
 
@@ -214,7 +214,7 @@
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
 
   const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
@@ -603,14 +603,14 @@
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_q4, const int subpel_y_q4,
+                            const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
   const int vert_offset = filter_params_y->taps / 2 - 1;
 
   src -= vert_offset * src_stride;
 
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
   (void)conv_params;
 
   assert(conv_params->round_0 <= FILTER_BITS);
@@ -618,7 +618,7 @@
          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
   if (w <= 4) {
     uint8x8_t d01;
@@ -844,17 +844,110 @@
   }
 }
 
+// Horizontal filtering for convolve_2d_sr for width multiple of 8
+// Processes one row at a time
+static INLINE void horiz_filter_w8_single_row(
+    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+    const int dst_stride, int width, int height, const int16_t *x_filter,
+    const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+  do {
+    uint8x8_t t0 = vld1_u8(src_ptr);
+    s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+
+    int width_tmp = width;
+    const uint8_t *s = src_ptr + 8;
+    int16_t *dst_tmp = dst_ptr;
+
+    __builtin_prefetch(dst_ptr);
+
+    do {
+      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+      s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t sum = s0;
+      s0 = s7;
+
+      s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+      s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+      s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+      s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+      s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+      s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+      s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+      int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7,
+                                         x_filter, horiz_const, shift_round_0);
+
+      vst1q_s16(dst_tmp, res0);
+
+      s += 8;
+      dst_tmp += 8;
+      width_tmp -= 8;
+    } while (width_tmp > 0);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    height--;
+  } while (height > 0);
+}
+
+// Horizontal filtering for convolve_2d_sr for width <= 4
+// Processes one row at a time
+static INLINE void horiz_filter_w4_single_row(
+    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+    const int dst_stride, int width, int height, const int16_t *x_filter,
+    const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+  do {
+    const uint8_t *s = src_ptr;
+
+    __builtin_prefetch(s);
+
+    uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
+    int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+    s0 = vget_low_s16(tt0);
+    s4 = vget_high_s16(tt0);
+
+    __builtin_prefetch(dst_ptr);
+    s += 8;
+
+    t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+    s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+    s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+    s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+    s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+    s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
+    s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
+    s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
+
+    int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                     horiz_const, shift_round_0);
+
+    if (width == 4) {
+      vst1_s16(dst_ptr, d0);
+      dst_ptr += dst_stride;
+    } else if (width == 2) {
+      vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+      dst_ptr += dst_stride;
+    }
+
+    src_ptr += src_stride;
+    height--;
+  } while (height > 0);
+}
+
 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              const InterpFilterParams *filter_params_x,
                              const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
+                             const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
   int im_dst_stride;
   int width, height;
-  uint8x8_t t0;
 #if defined(__aarch64__)
+  uint8x8_t t0;
   uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+  const uint8_t *s;
 #endif
 
   DECLARE_ALIGNED(16, int16_t,
@@ -867,7 +960,7 @@
   const int horiz_offset = filter_params_x->taps / 2 - 1;
 
   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-  const uint8_t *s;
+
   int16_t *dst_ptr;
 
   dst_ptr = im_block;
@@ -880,7 +973,7 @@
   const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
 
   int16_t x_filter_tmp[8];
   int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -893,18 +986,14 @@
   assert(conv_params->round_0 > 0);
 
   if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-#if defined(__aarch64__)
-    int16x4_t s8, s9, s10, d1, d2, d3;
-#endif
-
     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
 
-    do {
-      s = src_ptr;
-
 #if defined(__aarch64__)
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+    do {
+      assert(height >= 4);
+      s = src_ptr;
       __builtin_prefetch(s + 0 * src_stride);
       __builtin_prefetch(s + 1 * src_stride);
       __builtin_prefetch(s + 2 * src_stride);
@@ -963,57 +1052,30 @@
       src_ptr += 4 * src_stride;
       dst_ptr += 4 * im_dst_stride;
       height -= 4;
+    } while (height >= 4);
+
+    if (height) {
+      assert(height < 4);
+      horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+                                 height, x_filter_tmp, horiz_const,
+                                 shift_round_0);
+    }
 #else
-      int16x8_t tt0;
-
-      __builtin_prefetch(s);
-
-      t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s0 = vget_low_s16(tt0);
-      s4 = vget_high_s16(tt0);
-
-      __builtin_prefetch(dst_ptr);
-      s += 8;
-
-      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-      s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
-      s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
-      s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
-      s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
-      s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
-      s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
-
-      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                             horiz_const, shift_round_0);
-
-      if (w == 4) {
-        vst1_s16(dst_ptr, d0);
-        dst_ptr += im_dst_stride;
-      } else if (w == 2) {
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
-        dst_ptr += im_dst_stride;
-      }
-
-      src_ptr += src_stride;
-      height -= 1;
+    horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+                               height, x_filter_tmp, horiz_const,
+                               shift_round_0);
 #endif
-    } while (height > 0);
+
   } else {
-    int16_t *d_tmp;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
-#if defined(__aarch64__)
-    int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
-    int16x8_t s11, s12, s13, s14;
-#endif
-
     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
 
 #if defined(__aarch64__)
+    int16_t *d_tmp;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
     do {
+      assert(height >= 8);
       __builtin_prefetch(src_ptr + 0 * src_stride);
       __builtin_prefetch(src_ptr + 1 * src_stride);
       __builtin_prefetch(src_ptr + 2 * src_stride);
@@ -1099,45 +1161,121 @@
       src_ptr += 8 * src_stride;
       dst_ptr += 8 * im_dst_stride;
       height -= 8;
-    } while (height > 0);
-#else
-    do {
-      t0 = vld1_u8(src_ptr);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+    } while (height >= 8);
 
-      width = w;
-      s = src_ptr + 8;
+    if (height >= 4) {
+      assert(height < 8);
+      int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+          reg10, reg11, reg12, reg13, reg14;
+      int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+      int16x8_t out0, out1, out2, out3;
+
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+
+      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+      reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+      reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+      __builtin_prefetch(dst_ptr + 0 * dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+      s = src_ptr + 7;
       d_tmp = dst_ptr;
-
-      __builtin_prefetch(dst_ptr);
+      width = w;
 
       do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        int16x8_t sum = s0;
-        s0 = s7;
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+        transpose_u8_8x4(&t0, &t1, &t2, &t3);
 
-        s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+        reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
-        res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                                 horiz_const, shift_round_0);
+        d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+                           x_filter_tmp);
 
-        vst1q_s16(d_tmp, res0);
+        d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
+                           x_filter_tmp);
 
+        d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+                           x_filter_tmp);
+
+        d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
+                           x_filter_tmp);
+
+        d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
+                           x_filter_tmp);
+
+        d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
+                           x_filter_tmp);
+
+        d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
+                           x_filter_tmp);
+
+        d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
+                           x_filter_tmp);
+
+        transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
+                          &out2, &out3);
+
+        out0 = vaddq_s16(out0, horiz_const);
+        out0 = vqrshlq_s16(out0, shift_round_0);
+
+        out1 = vaddq_s16(out1, horiz_const);
+        out1 = vqrshlq_s16(out1, shift_round_0);
+
+        out2 = vaddq_s16(out2, horiz_const);
+        out2 = vqrshlq_s16(out2, shift_round_0);
+
+        out3 = vaddq_s16(out3, horiz_const);
+        out3 = vqrshlq_s16(out3, shift_round_0);
+
+        store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+
+        reg0 = reg8;
+        reg1 = reg9;
+        reg2 = reg10;
+        reg3 = reg11;
+        reg4 = reg12;
+        reg5 = reg13;
+        reg6 = reg14;
         s += 8;
         d_tmp += 8;
         width -= 8;
       } while (width > 0);
-      src_ptr += src_stride;
-      dst_ptr += im_dst_stride;
-      height -= 1;
-    } while (height > 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * im_dst_stride;
+      height -= 4;
+    }
+
+    if (height) {
+      assert(height < 4);
+      horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
+                                 height, x_filter_tmp, horiz_const,
+                                 shift_round_0);
+    }
+#else
+
+    horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
+                               height, x_filter_tmp, horiz_const,
+                               shift_round_0);
 #endif
   }
 
@@ -1149,7 +1287,7 @@
     const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
                               (1 << (offset_bits - conv_params->round_1 - 1));
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
     const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
     const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
@@ -1409,12 +1547,12 @@
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
   (void)conv_params;
 
   const uint8_t *src1;
diff --git a/libaom/av1/common/arm/convolve_neon.h b/libaom/av1/common/arm/convolve_neon.h
index f382984..dbcfab6 100644
--- a/libaom/av1/common/arm/convolve_neon.h
+++ b/libaom/av1/common/arm/convolve_neon.h
@@ -73,7 +73,7 @@
   int32x4_t sum_0, sum_1;
   int32x4_t s3_0, s3_1;
   const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
-  const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits));
+  const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
 
   /* for the purpose of right shift by { conv_params->round_0 } */
   const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
@@ -124,7 +124,7 @@
   int16x4_t sum, temp0, temp1, temp2;
 
   const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
-  const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits));
+  const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
   const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
   const int32x4_t zero = vdupq_n_s32(0);
   const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
diff --git a/libaom/av1/common/arm/jnt_convolve_neon.c b/libaom/av1/common/arm/jnt_convolve_neon.c
index 379ff98..92112fb 100644
--- a/libaom/av1/common/arm/jnt_convolve_neon.c
+++ b/libaom/av1/common/arm/jnt_convolve_neon.c
@@ -717,7 +717,7 @@
                                    uint8_t *dst8, int dst8_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   const int subpel_x_qn, const int subpel_y_qn,
                                    ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
@@ -732,9 +732,9 @@
   const int round_0 = conv_params->round_0 - 1;
   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
   int16_t x_filter_tmp[8];
   int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -754,8 +754,8 @@
 void av1_dist_wtd_convolve_2d_copy_neon(
     const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
   uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
       tmp_shift3;
   uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3;
@@ -778,8 +778,8 @@
 
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
 
   if (!(w & 0x07)) {
     for (y = 0; y < (h >> 2); ++y) {
@@ -880,7 +880,7 @@
                                   uint8_t *dst8, int dst8_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
@@ -900,11 +900,11 @@
   const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
 
   const uint8_t *src_ptr = src - horiz_offset;
 
@@ -1343,7 +1343,7 @@
                                   uint8_t *dst8, int dst8_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
@@ -1364,11 +1364,11 @@
   const int shift_value = (conv_params->round_1 - 1 - bits);
 
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
   const uint8_t *src_ptr = src - (vert_offset * src_stride);
 
diff --git a/libaom/av1/common/arm/mem_neon.h b/libaom/av1/common/arm/mem_neon.h
index beae4ed..171055f 100644
--- a/libaom/av1/common/arm/mem_neon.h
+++ b/libaom/av1/common/arm/mem_neon.h
@@ -13,6 +13,7 @@
 
 #include <arm_neon.h>
 #include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
 
 static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                      const uint8x8_t s1) {
@@ -315,6 +316,26 @@
   *s3 = vld1q_s16(s);
 }
 
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+  uint32_t a;
+  uint32x4_t a_u32 = vdupq_n_u32(0);
+  if (stride == 4) return vld1q_u8(buf);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 2);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 3);
+  return vreinterpretq_u8_u32(a_u32);
+}
+
 static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                          uint32x2_t *tu0, uint32x2_t *tu1,
                                          uint32x2_t *tu2, uint32x2_t *tu3) {
@@ -500,4 +521,19 @@
   vst1q_u32(s, s4);
 }
 
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+  const int32x4_t v0 = vld1q_s32(buf);
+  const int32x4_t v1 = vld1q_s32(buf + 4);
+  const int16x4_t s0 = vmovn_s32(v0);
+  const int16x4_t s1 = vmovn_s32(v1);
+  return vcombine_s16(s0, s1);
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+  vst1q_s32(buf, v0);
+  vst1q_s32(buf + 4, v1);
+}
+
 #endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/libaom/av1/common/arm/selfguided_neon.c b/libaom/av1/common/arm/selfguided_neon.c
index b3a37c4..fc404a6 100644
--- a/libaom/av1/common/arm/selfguided_neon.c
+++ b/libaom/av1/common/arm/selfguided_neon.c
@@ -19,8 +19,8 @@
 #include "aom_dsp/txfm_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/common.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/resize.h"
 #include "av1/common/restoration.h"
 #include "av1/common/arm/mem_neon.h"
@@ -86,7 +86,7 @@
 
     for (int x = 0; x < 4; x++) {
       for (int y = 0; y < 4; y++) {
-        dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+        dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
       }
     }
     load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
@@ -214,7 +214,7 @@
 
     for (int x = 0; x < 4; x++) {
       for (int y = 0; y < 8; y++) {
-        dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+        dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
       }
     }
     load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);
@@ -376,6 +376,21 @@
       w -= 8;
       count++;
     } while (w > 0);
+
+    // memset needed for row pixels as 2nd stage of boxsum filter uses
+    // first 2 rows of dst16, dst2 buffer which is not filled in first stage.
+    for (int x = 0; x < 2; x++) {
+      memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16));
+      memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+    }
+
+    // memset needed for extra columns as 2nd stage of boxsum filter uses
+    // last 2 columns of dst16, dst2 buffer which is not filled in first stage.
+    for (int x = 2; x < height + 2; x++) {
+      int dst_offset = x * dst_stride + width + 2;
+      memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16));
+      memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+    }
   }
 
   {
@@ -467,7 +482,7 @@
   const uint32_t n = (2 * r + 1) * (2 * r + 1);
   const uint32x4_t const_n_val = vdupq_n_u32(n);
   const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
-  const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+  const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]);
   const uint32x4_t const_val = vdupq_n_u32(255);
 
   uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7;
@@ -509,6 +524,7 @@
   } while (h > 0);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
                                         uint16_t *B16, int32_t *B,
                                         const int buf_stride, const int width,
@@ -522,7 +538,7 @@
   const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
   const uint32x4_t const_n_val = vdupq_n_u32(n);
   const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
-  const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+  const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]);
   const uint32x4_t const_val = vdupq_n_u32(255);
 
   int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -573,6 +589,7 @@
     h -= (ht_inc * 4);
   } while (h > 0);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
                                              int32_t *B, const int buf_stride,
@@ -584,7 +601,7 @@
   const uint32_t n = (2 * r + 1) * (2 * r + 1);
   const uint32x4_t const_n_val = vdupq_n_u32(n);
   const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
-  const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+  const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
   const uint32x4_t const_val = vdupq_n_u32(255);
 
   int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -626,6 +643,7 @@
   } while (h > 0);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
                                              int32_t *B, const int buf_stride,
                                              const int width, const int height,
@@ -638,7 +656,7 @@
   const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
   const uint32x4_t const_n_val = vdupq_n_u32(n);
   const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
-  const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+  const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
   const uint32x4_t const_val = vdupq_n_u32(255);
 
   int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -679,6 +697,7 @@
     h -= (ht_inc * 4);
   } while (h > 0);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
                            int32_t *dst2, const int dst_stride, const int width,
@@ -788,6 +807,21 @@
       w -= 8;
       count++;
     } while (w > 0);
+
+    // memset needed for row pixels as 2nd stage of boxsum filter uses
+    // first 2 rows of dst1, dst2 buffer which is not filled in first stage.
+    for (int x = 0; x < 2; x++) {
+      memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1));
+      memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+    }
+
+    // memset needed for extra columns as 2nd stage of boxsum filter uses
+    // last 2 columns of dst1, dst2 buffer which is not filled in first stage.
+    for (int x = 2; x < height + 2; x++) {
+      int dst_offset = x * dst_stride + width + 2;
+      memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1));
+      memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+    }
   }
 
   {
@@ -1145,7 +1179,7 @@
                                              int32_t *dst, int dst_stride,
                                              int bit_depth, int sgr_params_idx,
                                              int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -1181,17 +1215,25 @@
   // Calculation of a, b. a output is in 16bit tmp_buf which is in range of
   // [1, 256] for all bit depths. b output is kept in 32bit buffer.
 
-  if (8 == bit_depth) {
-    calc_ab_fast_internal_lbd(
-        (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
-        (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
-        params->s[radius_idx], 2);
-  } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (bit_depth > 8) {
     calc_ab_fast_internal_hbd(
         (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
         (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2,
         bit_depth, r, params->s[radius_idx], 2);
+  } else {
+    calc_ab_fast_internal_lbd(
+        (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+        (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
+        params->s[radius_idx], 2);
   }
+#else
+  (void)bit_depth;
+  calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1),
+                            (tmp16_buf - buf_stride - 1),
+                            (sum_buf - buf_stride - 1), buf_stride * 2,
+                            width + 2, height + 2, r, params->s[radius_idx], 2);
+#endif
   final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16,
                              dgd_stride, dst, dst_stride, width, height);
 }
@@ -1200,7 +1242,7 @@
                                         int dgd_stride, int32_t *dst,
                                         int dst_stride, int bit_depth,
                                         int sgr_params_idx, int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -1235,19 +1277,27 @@
   A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
   sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
 
+#if CONFIG_AV1_HIGHBITDEPTH
   // Calculation of a, b. a output is in 16bit tmp_buf which is in range of
   // [1, 256] for all bit depths. b output is kept in 32bit buffer.
-  if (8 == bit_depth) {
-    calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
-                         (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
-                         (B - buf_stride - 1), buf_stride, width + 2,
-                         height + 2, r, params->s[radius_idx], 1);
-  } else {
+  if (bit_depth > 8) {
     calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
                          (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
                          (B - buf_stride - 1), buf_stride, width + 2,
                          height + 2, bit_depth, r, params->s[radius_idx], 1);
+  } else {
+    calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+                         (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+                         (B - buf_stride - 1), buf_stride, width + 2,
+                         height + 2, r, params->s[radius_idx], 1);
   }
+#else
+  (void)bit_depth;
+  calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+                       (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+                       (B - buf_stride - 1), buf_stride, width + 2, height + 2,
+                       r, params->s[radius_idx], 1);
+#endif
   final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst,
                         dst_stride, width, height);
 }
@@ -1299,8 +1349,14 @@
       dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
     }
   }
+
+  // memset uninitialized rows of src buffer as they are needed for the
+  // boxsum filter calculation.
+  for (int x = height; x < height + 5; x++)
+    memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
                                         uint16_t *dst, const int dst_stride,
                                         int width, int height) {
@@ -1339,13 +1395,18 @@
     memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
            sizeof(uint16_t) * width);
   }
+  // memset uninitialized rows of src buffer as they are needed for the
+  // boxsum filter calculation.
+  for (int x = height; x < height + 5; x++)
+    memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
                                     int stride, int32_t *flt0, int32_t *flt1,
                                     int flt_stride, int sgr_params_idx,
                                     int bit_depth, int highbd) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   assert(!(params->r[0] == 0 && params->r[1] == 0));
 
   uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
@@ -1356,6 +1417,7 @@
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
   const int dgd_stride = stride;
 
+#if CONFIG_AV1_HIGHBITDEPTH
   if (highbd) {
     const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
     src_convert_hbd_copy(
@@ -1370,6 +1432,13 @@
         dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
         dgd16_stride, width_ext, height_ext);
   }
+#else
+  (void)highbd;
+  src_convert_u8_to_u16(
+      dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
+      dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+      dgd16_stride, width_ext, height_ext);
+#endif
 
   if (params->r[0] > 0)
     restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
@@ -1380,11 +1449,11 @@
   return 0;
 }
 
-void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
-                                       int height, int stride, int eps,
-                                       const int *xqd, uint8_t *dst8,
-                                       int dst_stride, int32_t *tmpbuf,
-                                       int bit_depth, int highbd) {
+void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+                                           int height, int stride, int eps,
+                                           const int *xqd, uint8_t *dst8,
+                                           int dst_stride, int32_t *tmpbuf,
+                                           int bit_depth, int highbd) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -1395,11 +1464,12 @@
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
   const int dgd_stride = stride;
-  const sgr_params_type *const params = &sgr_params[eps];
+  const sgr_params_type *const params = &av1_sgr_params[eps];
   int xq[2];
 
   assert(!(params->r[0] == 0 && params->r[1] == 0));
 
+#if CONFIG_AV1_HIGHBITDEPTH
   if (highbd) {
     const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
     src_convert_hbd_copy(
@@ -1414,7 +1484,13 @@
         dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
         dgd16_stride, width_ext, height_ext);
   }
-
+#else
+  (void)highbd;
+  src_convert_u8_to_u16(
+      dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
+      dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+      dgd16_stride, width_ext, height_ext);
+#endif
   if (params->r[0] > 0)
     restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
                               bit_depth, eps, 0);
@@ -1422,7 +1498,7 @@
     restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
                          bit_depth, eps, 1);
 
-  decode_xq(xqd, xq, params);
+  av1_decode_xq(xqd, xq, params);
 
   {
     int16_t *src_ptr;
@@ -1485,6 +1561,7 @@
 
         r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
 
+#if CONFIG_AV1_HIGHBITDEPTH
         if (highbd) {
           r4 = vminq_u16(r4, max);
           vst1q_u16(dst16_ptr, r4);
@@ -1492,6 +1569,11 @@
           t0 = vqmovn_u16(r4);
           vst1_u8(dst_ptr, t0);
         }
+#else
+        (void)max;
+        t0 = vqmovn_u16(r4);
+        vst1_u8(dst_ptr, t0);
+#endif
         w -= 8;
         count += 8;
         dst_ptr += 8;
diff --git a/libaom/av1/common/arm/transpose_neon.h b/libaom/av1/common/arm/transpose_neon.h
index 8a3d9f0..91d89b4 100644
--- a/libaom/av1/common/arm/transpose_neon.h
+++ b/libaom/av1/common/arm/transpose_neon.h
@@ -250,6 +250,71 @@
                      vreinterpret_u16_u32(c3.val[1]));
 }
 
+static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
+                                     int16x4_t *a2, int16x4_t *a3,
+                                     int16x4_t *a4, int16x4_t *a5,
+                                     int16x4_t *a6, int16x4_t *a7,
+                                     int16x8_t *o0, int16x8_t *o1,
+                                     int16x8_t *o2, int16x8_t *o3) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // a4: 40 41 42 43
+  // a5: 50 51 52 53
+  // a6: 60 61 62 63
+  // a7: 70 71 72 73
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+  // b2.val[0]: 40 50 42 52
+  // b2.val[1]: 41 51 43 53
+  // b3.val[0]: 60 70 62 72
+  // b3.val[1]: 61 71 63 73
+
+  int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+  int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+  int16x4x2_t b2 = vtrn_s16(*a4, *a5);
+  int16x4x2_t b3 = vtrn_s16(*a6, *a7);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+  // c2.val[0]: 40 50 60 70
+  // c2.val[1]: 42 52 62 72
+  // c3.val[0]: 41 51 61 71
+  // c3.val[1]: 43 53 63 73
+
+  int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+                            vreinterpret_s32_s16(b1.val[0]));
+  int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+                            vreinterpret_s32_s16(b1.val[1]));
+  int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
+                            vreinterpret_s32_s16(b3.val[0]));
+  int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
+                            vreinterpret_s32_s16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // o0: 00 10 20 30 40 50 60 70
+  // o1: 01 11 21 31 41 51 61 71
+  // o2: 02 12 22 32 42 52 62 72
+  // o3: 03 13 23 33 43 53 63 73
+
+  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
+                     vreinterpret_s16_s32(c2.val[0]));
+  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
+                     vreinterpret_s16_s32(c3.val[0]));
+  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
+                     vreinterpret_s16_s32(c2.val[1]));
+  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
+                     vreinterpret_s16_s32(c3.val[1]));
+}
+
 static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                      uint16x8_t *a2, uint16x8_t *a3,
                                      uint16x8_t *a4, uint16x8_t *a5,
@@ -386,7 +451,7 @@
                      vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
 }
 
-static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
   int16x8x2_t b0;
   b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                            vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -448,10 +513,10 @@
   // d2.val[1]: 06 16 26 36 46 56 66 76
   // d3.val[0]: 03 13 23 33 43 53 63 73
   // d3.val[1]: 07 17 27 37 47 57 67 77
-  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
-  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
-  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
-  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
 
   *out = d0.val[0];
   *(out + 1) = d1.val[0];
diff --git a/libaom/av1/common/arm/warp_plane_neon.c b/libaom/av1/common/arm/warp_plane_neon.c
index 1062cc3..c10a34f 100644
--- a/libaom/av1/common/arm/warp_plane_neon.c
+++ b/libaom/av1/common/arm/warp_plane_neon.c
@@ -20,7 +20,7 @@
 #include "av1/common/warped_motion.h"
 #include "av1/common/scale.h"
 
-/* This is a modified version of 'warped_filter' from warped_motion.c:
+/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
    * Each coefficient is stored in 8 bits instead of 16 bits
    * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
 
@@ -333,22 +333,22 @@
   c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
                  vreinterpretq_s32_s16(b3.val[1]));
 
-  f0 = vld1q_s16(
-      (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f1 = vld1q_s16(
-      (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f2 = vld1q_s16(
-      (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f3 = vld1q_s16(
-      (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f4 = vld1q_s16(
-      (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f5 = vld1q_s16(
-      (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f6 = vld1q_s16(
-      (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f7 = vld1q_s16(
-      (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f0 = vld1q_s16((int16_t *)(av1_warped_filter +
+                             ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f1 = vld1q_s16((int16_t *)(av1_warped_filter +
+                             ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f2 = vld1q_s16((int16_t *)(av1_warped_filter +
+                             ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f3 = vld1q_s16((int16_t *)(av1_warped_filter +
+                             ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f4 = vld1q_s16((int16_t *)(av1_warped_filter +
+                             ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f5 = vld1q_s16((int16_t *)(av1_warped_filter +
+                             ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f6 = vld1q_s16((int16_t *)(av1_warped_filter +
+                             ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f7 = vld1q_s16((int16_t *)(av1_warped_filter +
+                             ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
 
   d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
   d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
diff --git a/libaom/av1/common/onyxc_int.h b/libaom/av1/common/av1_common_int.h
similarity index 62%
rename from libaom/av1/common/onyxc_int.h
rename to libaom/av1/common/av1_common_int.h
index 8117dfc..0403405 100644
--- a/libaom/av1/common/onyxc_int.h
+++ b/libaom/av1/common/av1_common_int.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_AV1_COMMON_ONYXC_INT_H_
-#define AOM_AV1_COMMON_ONYXC_INT_H_
+#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_
+#define AOM_AV1_COMMON_AV1_COMMON_INT_H_
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
@@ -71,8 +71,8 @@
 // clang-format seems to think this is a pointer dereference and not a
 // multiplication.
 #define MAX_NUM_OPERATING_POINTS \
-  MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS
-/* clang-format on*/
+  (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS)
+/* clang-format on */
 
 // TODO(jingning): Turning this on to set up transform coefficient
 // processing timer.
@@ -109,13 +109,11 @@
   MV_REFERENCE_FRAME ref_frame;
 } MV_REF;
 
-
 typedef struct RefCntBuffer {
   // For a RefCntBuffer, the following are reference-holding variables:
   // - cm->ref_frame_map[]
   // - cm->cur_frame
   // - cm->scaled_ref_buf[] (encoder only)
-  // - cm->next_ref_frame_map[] (decoder only)
   // - pbi->output_frame_index[] (decoder only)
   // With that definition, 'ref_count' is the number of reference-holding
   // variables that are currently referencing this buffer.
@@ -129,6 +127,13 @@
   unsigned int order_hint;
   unsigned int ref_order_hints[INTER_REFS_PER_FRAME];
 
+  // These variables are used only in encoder and compare the absolute
+  // display order hint to compute the relative distance and overcome
+  // the limitation of get_relative_dist() which returns incorrect
+  // distance when a very old frame is used as a reference.
+  unsigned int display_order_hint;
+  unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME];
+
   MV_REF *mvs;
   uint8_t *seg_map;
   struct segmentation seg;
@@ -144,7 +149,6 @@
   aom_film_grain_t film_grain_params;
   aom_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
-  hash_table hash_table;
   FRAME_TYPE frame_type;
 
   // This is only used in the encoder but needs to be indexed per ref frame
@@ -183,8 +187,7 @@
 } BufferPool;
 
 typedef struct {
-  int cdef_pri_damping;
-  int cdef_sec_damping;
+  int cdef_damping;
   int nb_cdef_strengths;
   int cdef_strengths[CDEF_MAX_STRENGTHS];
   int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
@@ -205,21 +208,23 @@
 } DeltaQInfo;
 
 typedef struct {
-  int enable_order_hint;           // 0 - disable order hint, and related tools
-  int order_hint_bits_minus_1;     // dist_wtd_comp, ref_frame_mvs,
-                                   // frame_sign_bias
-                                   // if 0, enable_dist_wtd_comp and
-                                   // enable_ref_frame_mvs must be set as 0.
-  int enable_dist_wtd_comp;        // 0 - disable dist-wtd compound modes
-                                   // 1 - enable it
-  int enable_ref_frame_mvs;        // 0 - disable ref frame mvs
-                                   // 1 - enable it
+  int enable_order_hint;        // 0 - disable order hint, and related tools
+  int order_hint_bits_minus_1;  // dist_wtd_comp, ref_frame_mvs,
+                                // frame_sign_bias
+                                // if 0, enable_dist_wtd_comp and
+                                // enable_ref_frame_mvs must be set as 0.
+  int enable_dist_wtd_comp;     // 0 - disable dist-wtd compound modes
+                                // 1 - enable it
+  int enable_ref_frame_mvs;     // 0 - disable ref frame mvs
+                                // 1 - enable it
 } OrderHintInfo;
 
 // Sequence header structure.
 // Note: All syntax elements of sequence_header_obu that need to be
 // bit-identical across multiple sequence headers must be part of this struct,
 // so that consistency is checked by are_seq_headers_consistent() function.
+// One exception is the last member 'op_params' that is ignored by
+// are_seq_headers_consistent() function.
 typedef struct SequenceHeader {
   int num_bits_width;
   int num_bits_height;
@@ -258,15 +263,6 @@
   uint8_t enable_restoration;          // To turn on/off loop restoration
   BITSTREAM_PROFILE profile;
 
-  // Operating point info.
-  int operating_points_cnt_minus_1;
-  int operating_point_idc[MAX_NUM_OPERATING_POINTS];
-  uint8_t display_model_info_present_flag;
-  uint8_t decoder_model_info_present_flag;
-  AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS];
-  uint8_t tier[MAX_NUM_OPERATING_POINTS];  // seq_tier in the spec. One bit: 0
-                                           // or 1.
-
   // Color config.
   aom_bit_depth_t bit_depth;  // AOM_BITS_8 in profile 0 or 1,
                               // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
@@ -276,18 +272,34 @@
   aom_transfer_characteristics_t transfer_characteristics;
   aom_matrix_coefficients_t matrix_coefficients;
   int color_range;
-  int subsampling_x;          // Chroma subsampling for x
-  int subsampling_y;          // Chroma subsampling for y
+  int subsampling_x;  // Chroma subsampling for x
+  int subsampling_y;  // Chroma subsampling for y
   aom_chroma_sample_position_t chroma_sample_position;
   uint8_t separate_uv_delta_q;
   uint8_t film_grain_params_present;
+
+  // Operating point info.
+  int operating_points_cnt_minus_1;
+  int operating_point_idc[MAX_NUM_OPERATING_POINTS];
+  int timing_info_present;
+  aom_timing_info_t timing_info;
+  uint8_t decoder_model_info_present_flag;
+  aom_dec_model_info_t decoder_model_info;
+  uint8_t display_model_info_present_flag;
+  AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  uint8_t tier[MAX_NUM_OPERATING_POINTS];  // seq_tier in spec. One bit: 0 or 1.
+
+  // IMPORTANT: the op_params member must be at the end of the struct so that
+  // are_seq_headers_consistent() can be implemented with a memcmp() call.
+  // TODO(urvang): We probably don't need the +1 here.
+  aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
 } SequenceHeader;
 
 typedef struct {
-    int skip_mode_allowed;
-    int skip_mode_flag;
-    int ref_frame_idx_0;
-    int ref_frame_idx_1;
+  int skip_mode_allowed;
+  int skip_mode_flag;
+  int ref_frame_idx_0;
+  int ref_frame_idx_1;
 } SkipModeInfo;
 
 typedef struct {
@@ -295,34 +307,266 @@
   REFERENCE_MODE reference_mode;
 
   unsigned int order_hint;
+  unsigned int display_order_hint;
   unsigned int frame_number;
   SkipModeInfo skip_mode_info;
   int refresh_frame_flags;  // Which ref frames are overwritten by this frame
   int frame_refs_short_signaling;
 } CurrentFrame;
 
+// Struct containing some frame level features.
+typedef struct {
+  bool disable_cdf_update;
+  bool allow_high_precision_mv;
+  bool cur_frame_force_integer_mv;  // 0 the default in AOM, 1 only integer
+  bool allow_screen_content_tools;
+  bool allow_intrabc;
+  bool allow_warped_motion;
+  // Whether to use previous frames' motion vectors for prediction.
+  bool allow_ref_frame_mvs;
+  bool coded_lossless;  // frame is fully lossless at the coded resolution.
+  bool all_lossless;    // frame is fully lossless at the upscaled resolution.
+  bool reduced_tx_set_used;
+  bool error_resilient_mode;
+  bool switchable_motion_mode;
+  TX_MODE tx_mode;
+  InterpFilter interp_filter;
+  int primary_ref_frame;
+  int byte_alignment;
+  // Flag signaling how frame contexts should be updated at the end of
+  // a frame decode
+  REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+} FeatureFlags;
+
+// Struct containing params related to tiles.
+typedef struct CommonTileParams {
+  int cols;           // number of tile columns that frame is divided into
+  int rows;           // number of tile rows that frame is divided into
+  int max_width_sb;   // maximum tile width in superblock units.
+  int max_height_sb;  // maximum tile height in superblock units.
+  // Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+  int min_inner_width;
+
+  // If true, tiles are uniformly spaced with power-of-two number of rows and
+  // columns.
+  // If false, tiles have explicitly configured widths and heights.
+  int uniform_spacing;
+
+  // Following members are only valid when uniform_spacing == 1
+  int log2_cols;  // log2 of 'cols'.
+  int log2_rows;  // log2 of 'rows'.
+  int width;      // tile width in MI units
+  int height;     // tile height in MI units
+  // End of members that are only valid when uniform_spacing == 1
+
+  // Min num of tile columns possible based on 'max_width_sb' and frame width.
+  int min_log2_cols;
+  // Min num of tile rows possible based on 'max_height_sb' and frame height.
+  int min_log2_rows;
+  // Max num of tile columns possible based on frame width.
+  int max_log2_cols;
+  // Max num of tile rows possible based on frame height.
+  int max_log2_rows;
+  // log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+  int min_log2;
+  // col_start_sb[i] is the start position of tile column i in superblock units.
+  // valid for 0 <= i <= cols
+  int col_start_sb[MAX_TILE_COLS + 1];
+  // row_start_sb[i] is the start position of tile row i in superblock units.
+  // valid for 0 <= i <= rows
+  int row_start_sb[MAX_TILE_ROWS + 1];
+  // If true, we are using large scale tile mode.
+  unsigned int large_scale;
+  // Only relevant when large_scale == 1.
+  // If true, the independent decoding of a single tile or a section of a frame
+  // is allowed.
+  unsigned int single_tile_decoding;
+} CommonTileParams;
+
+// Struct containing params related to MB_MODE_INFO arrays and related info.
+typedef struct CommonModeInfoParams CommonModeInfoParams;
+struct CommonModeInfoParams {
+  // Number of rows/cols in the frame in 16 pixel units.
+  // This is computed from frame width and height aligned to a multiple of 8.
+  int mb_rows;
+  int mb_cols;
+  // Total MBs = mb_rows * mb_cols.
+  int MBs;
+
+  // Number of rows/cols in the frame in 4 pixel (MB_MODE_INFO) units.
+  // This is computed from frame width and height aligned to a multiple of 8.
+  int mi_rows;
+  int mi_cols;
+
+  // An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
+  // in the frame.
+  // Note: This array should be treated like a scratch memory, and should NOT be
+  // accessed directly, in most cases. Please use 'mi_grid_base' array instead.
+  MB_MODE_INFO *mi_alloc;
+  // Number of allocated elements in 'mi_alloc'.
+  int mi_alloc_size;
+  // Stride for 'mi_alloc' array.
+  int mi_alloc_stride;
+  // The minimum block size that each element in 'mi_alloc' can correspond to.
+  // For decoder, this is always BLOCK_4X4.
+  // For encoder, this is currently set to BLOCK_4X4 for resolution < 4k,
+  // and BLOCK_8X8 for resolution >= 4k.
+  BLOCK_SIZE mi_alloc_bsize;
+
+  // Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
+  // It's possible that:
+  // - Multiple pointers in the grid point to the same element in 'mi_alloc'
+  // (for example, for all 4x4 blocks that belong to the same partition block).
+  // - Some pointers can be NULL (for example, for blocks outside visible area).
+  MB_MODE_INFO **mi_grid_base;
+  // Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+  int mi_grid_size;
+  // Stride for 'mi_grid_base' (and 'tx_type_map' also).
+  int mi_stride;
+
+  // An array of tx types for each 4x4 block in the frame.
+  // Number of allocated elements is same as 'mi_grid_size', and stride is
+  // same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of
+  // 'mi_grid_base'.
+  TX_TYPE *tx_type_map;
+
+  // Function pointers to allow separate logic for encoder and decoder.
+  void (*free_mi)(struct CommonModeInfoParams *mi_params);
+  void (*setup_mi)(struct CommonModeInfoParams *mi_params);
+  void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width,
+                    int height);
+};
+
+// Parameters related to quantization at the frame level.
+typedef struct CommonQuantParams CommonQuantParams;
+struct CommonQuantParams {
+  // Base qindex of the frame in the range 0 to 255.
+  int base_qindex;
+
+  // Delta of qindex (from base_qindex) for Y plane DC coefficient.
+  // Note: y_ac_delta_q is implicitly 0.
+  int y_dc_delta_q;
+
+  // Delta of qindex (from base_qindex) for U plane DC and AC coefficients.
+  int u_dc_delta_q;
+  int v_dc_delta_q;
+
+  // Delta of qindex (from base_qindex) for V plane DC and AC coefficients.
+  // Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0.
+  int u_ac_delta_q;
+  int v_ac_delta_q;
+
+  // Note: The qindex per superblock may have a delta from the qindex obtained
+  // at frame level from parameters above, based on 'cm->delta_q_info'.
+
+  // The dequantizers below are true dequantizers used only in the
+  // dequantization process.  They have the same coefficient
+  // shift/scale as TX.
+  int16_t y_dequant_QTX[MAX_SEGMENTS][2];
+  int16_t u_dequant_QTX[MAX_SEGMENTS][2];
+  int16_t v_dequant_QTX[MAX_SEGMENTS][2];
+
+  // Global quant matrix tables
+  const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+  const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+
+  // Local quant matrix tables for each frame
+  const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+  // Flag indicating whether quantization matrices are being used:
+  //  - If true, qm_level_y, qm_level_u and qm_level_v indicate the level
+  //    indices to be used to access appropriate global quant matrix tables.
+  //  - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+  bool using_qmatrix;
+  int qmatrix_level_y;
+  int qmatrix_level_u;
+  int qmatrix_level_v;
+};
+
+// Context used for transmitting various symbols in the bistream.
+typedef struct CommonContexts CommonContexts;
+struct CommonContexts {
+  // Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
+  // partition[i][j] is the context for ith tile row, jth mi_col.
+  PARTITION_CONTEXT **partition;
+
+  // Context used to derive context for multiple symbols:
+  // - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
+  // to transmit skip_txfm flag.
+  // - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
+  // sign.
+  // entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+  ENTROPY_CONTEXT **entropy[MAX_MB_PLANE];
+
+  // Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
+  // transmit 'is_split' flag to indicate if this transform block should be
+  // split into smaller sub-blocks.
+  // txfm[i][j] is the context for ith tile row, jth mi_col.
+  TXFM_CONTEXT **txfm;
+
+  // Dimensions that were used to allocate the arrays above.
+  // If these dimensions change, the arrays may have to be re-allocated.
+  int num_planes;     // Corresponds to av1_num_planes(cm)
+  int num_tile_rows;  // Corresponds to cm->tiles.row
+  int num_mi_cols;    // Corresponds to cm->mi_params.mi_cols
+};
+
 typedef struct AV1Common {
+  // Information about the current frame that is being coded.
   CurrentFrame current_frame;
+  // Code and details about current error status.
   struct aom_internal_error_info error;
+
+  // AV1 allows two types of frame scaling operations:
+  // (1) Frame super-resolution: that allows coding a frame at lower resolution
+  // and after decoding the frame, normatively upscales and restores the frame --
+  // inside the coding loop.
+  // (2) Frame resize: that allows coding frame at lower/higher resolution, and
+  // then non-normatively upscale the frame at the time of rendering -- outside
+  // the coding loop.
+  // Hence, the need for 3 types of dimensions.
+
+  // Coded frame dimensions.
   int width;
   int height;
+
+  // Rendered frame dimensions, after applying both super-resolution and resize
+  // to the coded frame.
+  // Different from coded dimensions if super-resolution and/or resize are
+  // being used for this frame.
   int render_width;
   int render_height;
-  int timing_info_present;
-  aom_timing_info_t timing_info;
-  int buffer_removal_time_present;
-  aom_dec_model_info_t buffer_model;
-  aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
-  aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
+
+  // Frame dimensions after applying super-resolution to the coded frame (if
+  // present), but before applying resize.
+  // Larger than the coded dimensions if super-resolution is being used for
+  // this frame.
+  // Different from rendered dimensions if resize is being used for this frame.
+  int superres_upscaled_width;
+  int superres_upscaled_height;
+
+  // The denominator of the superres scale used by this frame.
+  // Note: The numerator is fixed to be SCALE_NUMERATOR.
+  uint8_t superres_scale_denominator;
+
+  // If true, buffer removal times are present.
+  bool buffer_removal_time_present;
+  // buffer_removal_times[op_num] specifies the frame removal time in units of
+  // DecCT clock ticks counted from the removal time of the last random access
+  // point for operating point op_num.
+  // TODO(urvang): We probably don't need the +1 here.
+  uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1];
+  // Presentation time of the frame in clock ticks DispCT counted from the
+  // removal time of the last random access point for the operating point that
+  // is being decoded.
   uint32_t frame_presentation_time;
 
-  int context_update_tile_id;
-
-  // Scale of the current frame with respect to itself.
-  struct scale_factors sf_identity;
-
+  // Buffer where previous frame is stored.
   RefCntBuffer *prev_frame;
 
+  // Buffer into which the current frame will be stored and other related info.
   // TODO(hkuang): Combine this with cur_buf in macroblockd.
   RefCntBuffer *cur_frame;
 
@@ -346,6 +590,15 @@
   // have a remapped index for the same.
   int remapped_ref_idx[REF_FRAMES];
 
+  // Scale of the current frame with respect to itself.
+  // This is currently used for intra block copy, which behaves like an inter
+  // prediction mode, where the reference frame is the current frame itself.
+  struct scale_factors sf_identity;
+
+  // Scale factors of the reference frame with respect to the current frame.
+  // This is required for generating inter prediction and will be non-identity
+  // for a reference frame, if it has different dimensions than the coded
+  // dimensions of the current frame.
   struct scale_factors ref_scale_factors[REF_FRAMES];
 
   // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
@@ -355,207 +608,133 @@
   // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
   RefCntBuffer *ref_frame_map[REF_FRAMES];
 
-  // Prepare ref_frame_map for the next frame.
-  // Only used in frame parallel decode.
-  RefCntBuffer *next_ref_frame_map[REF_FRAMES];
-  FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
-
+  // If true, this frame is actually shown after decoding.
+  // If false, this frame is coded in the bitstream, but not shown. It is only
+  // used as a reference for other frames coded later.
   int show_frame;
-  int showable_frame;  // frame can be used as show existing frame in future
+
+  // If true, this frame can be used as a show-existing frame for other frames
+  // coded later.
+  // When 'show_frame' is true, this is always true for all non-keyframes.
+  // When 'show_frame' is false, this value is transmitted in the bitstream.
+  int showable_frame;
+
+  // If true, show an existing frame coded before, instead of actually coding a
+  // frame. The existing frame comes from one of the existing reference buffers,
+  // as signaled in the bitstream.
   int show_existing_frame;
 
-  uint8_t disable_cdf_update;
-  int allow_high_precision_mv;
-  uint8_t cur_frame_force_integer_mv;  // 0 the default in AOM, 1 only integer
+  // Whether some features are allowed or not.
+  FeatureFlags features;
 
-  uint8_t allow_screen_content_tools;
-  int allow_intrabc;
-  int allow_warped_motion;
-
-  // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
-  // MB_MODE_INFO (8-pixel) units.
-  int MBs;
-  int mb_rows, mi_rows;
-  int mb_cols, mi_cols;
-  int mi_stride;
-
-  /* profile settings */
-  TX_MODE tx_mode;
+  // Params related to MB_MODE_INFO arrays and related info.
+  CommonModeInfoParams mi_params;
 
 #if CONFIG_ENTROPY_STATS
   int coef_cdf_category;
 #endif
+  // Quantization params.
+  CommonQuantParams quant_params;
 
-  int base_qindex;
-  int y_dc_delta_q;
-  int u_dc_delta_q;
-  int v_dc_delta_q;
-  int u_ac_delta_q;
-  int v_ac_delta_q;
+  // Segmentation info for current frame.
+  struct segmentation seg;
 
-  // The dequantizers below are true dequantizers used only in the
-  // dequantization process.  They have the same coefficient
-  // shift/scale as TX.
-  int16_t y_dequant_QTX[MAX_SEGMENTS][2];
-  int16_t u_dequant_QTX[MAX_SEGMENTS][2];
-  int16_t v_dequant_QTX[MAX_SEGMENTS][2];
-
-  // Global quant matrix tables
-  const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
-  const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
-
-  // Local quant matrix tables for each frame
-  const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
-  const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
-  const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
-
-  // Encoder
-  int using_qmatrix;
-  int qm_y;
-  int qm_u;
-  int qm_v;
-  int min_qmlevel;
-  int max_qmlevel;
-  int use_quant_b_adapt;
-
-  /* We allocate a MB_MODE_INFO struct for each macroblock, together with
-     an extra row on top and column on the left to simplify prediction. */
-  int mi_alloc_size;
-  MB_MODE_INFO *mip; /* Base of allocated array */
-  MB_MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
-
-  // TODO(agrange): Move prev_mi into encoder structure.
-  // prev_mip and prev_mi will only be allocated in encoder.
-  MB_MODE_INFO *prev_mip; /* MB_MODE_INFO array 'mip' from last decoded frame */
-  MB_MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
-
-  // Separate mi functions between encoder and decoder.
-  int (*alloc_mi)(struct AV1Common *cm, int mi_size);
-  void (*free_mi)(struct AV1Common *cm);
-  void (*setup_mi)(struct AV1Common *cm);
-
-  // Grid of pointers to 8x8 MB_MODE_INFO structs.  Any 8x8 not in the visible
-  // area will be NULL.
-  MB_MODE_INFO **mi_grid_base;
-  MB_MODE_INFO **mi_grid_visible;
-  MB_MODE_INFO **prev_mi_grid_base;
-  MB_MODE_INFO **prev_mi_grid_visible;
-
-  // Whether to use previous frames' motion vectors for prediction.
-  int allow_ref_frame_mvs;
-
+  // Segmentation map for previous frame.
   uint8_t *last_frame_seg_map;
 
-  InterpFilter interp_filter;
-
-  int switchable_motion_mode;
-
+  // Deblocking filter parameters.
   loop_filter_info_n lf_info;
-  // The denominator of the superres scale; the numerator is fixed.
-  uint8_t superres_scale_denominator;
-  int superres_upscaled_width;
-  int superres_upscaled_height;
-  RestorationInfo rst_info[MAX_MB_PLANE];
-
-  // Pointer to a scratch buffer used by self-guided restoration
-  int32_t *rst_tmpbuf;
-  RestorationLineBuffers *rlbs;
-
-  // Output of loop restoration
-  YV12_BUFFER_CONFIG rst_frame;
-
-  // Flag signaling how frame contexts should be updated at the end of
-  // a frame decode
-  REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
-
-  int ref_frame_sign_bias[REF_FRAMES]; /* Two state 0, 1 */
-
   struct loopfilter lf;
-  struct segmentation seg;
-  int coded_lossless;  // frame is fully lossless at the coded resolution.
-  int all_lossless;    // frame is fully lossless at the upscaled resolution.
 
-  int reduced_tx_set_used;
+  // Loop Restoration filter parameters.
+  RestorationInfo rst_info[MAX_MB_PLANE];  // Loop Restoration filter info.
+  int32_t *rst_tmpbuf;  // Scratch buffer for self-guided restoration filter.
+  RestorationLineBuffers *rlbs;  // Line buffers required by loop restoration.
+  YV12_BUFFER_CONFIG rst_frame;  // Stores the output of loop restoration.
 
-  // Context probabilities for reference frame prediction
-  MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS];
-  MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
+  // CDEF (Constrained Directional Enhancement Filter) parameters.
+  CdefInfo cdef_info;
 
-  FRAME_CONTEXT *fc;              /* this frame entropy */
+  // Parameters for film grain synthesis.
+  aom_film_grain_t film_grain_params;
+
+  // Parameters for delta quantization and delta loop filter level.
+  DeltaQInfo delta_q_info;
+
+  // Global motion parameters for each reference frame.
+  WarpedMotionParams global_motion[REF_FRAMES];
+
+  // Elements part of the sequence header, that are applicable for all the
+  // frames in the video.
+  SequenceHeader seq_params;
+
+  // Current CDFs of all the symbols for the current frame.
+  FRAME_CONTEXT *fc;
+  // Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
+  // (e.g. for a keyframe). These default CDFs are defined by the bitstream and
+  // copied from default CDF tables for each symbol.
   FRAME_CONTEXT *default_frame_context;
-  int primary_ref_frame;
 
-  int error_resilient_mode;
-
-  int tile_cols, tile_rows;
-
-  int max_tile_width_sb;
-  int min_log2_tile_cols;
-  int max_log2_tile_cols;
-  int max_log2_tile_rows;
-  int min_log2_tile_rows;
-  int min_log2_tiles;
-  int max_tile_height_sb;
-  int uniform_tile_spacing_flag;
-  int log2_tile_cols;                        // only valid for uniform tiles
-  int log2_tile_rows;                        // only valid for uniform tiles
-  int tile_col_start_sb[MAX_TILE_COLS + 1];  // valid for 0 <= i <= tile_cols
-  int tile_row_start_sb[MAX_TILE_ROWS + 1];  // valid for 0 <= i <= tile_rows
-  int tile_width, tile_height;               // In MI units
-  int min_inner_tile_width;                  // min width of non-rightmost tile
-
-  unsigned int large_scale_tile;
-  unsigned int single_tile_decoding;
-
-  int byte_alignment;
-  int skip_loop_filter;
-  int skip_film_grain;
+  // Parameters related to tiling.
+  CommonTileParams tiles;
 
   // External BufferPool passed from outside.
   BufferPool *buffer_pool;
 
-  PARTITION_CONTEXT **above_seg_context;
-  ENTROPY_CONTEXT **above_context[MAX_MB_PLANE];
-  TXFM_CONTEXT **above_txfm_context;
-  WarpedMotionParams global_motion[REF_FRAMES];
-  aom_film_grain_t film_grain_params;
+  // Above context buffers and their sizes.
+  // Note: above contexts are allocated in this struct, as their size is
+  // dependent on frame width, while left contexts are declared and allocated in
+  // MACROBLOCKD struct, as they have a fixed size.
+  CommonContexts above_contexts;
 
-  CdefInfo cdef_info;
-  DeltaQInfo delta_q_info;  // Delta Q and Delta LF parameters
-
-  int num_tg;
-  SequenceHeader seq_params;
+  // When cm->seq_params.frame_id_numbers_present_flag == 1, current and
+  // reference frame IDs are signaled in the bitstream.
   int current_frame_id;
   int ref_frame_id[REF_FRAMES];
-  int valid_for_referencing[REF_FRAMES];
+
+  // Motion vectors provided by motion field estimation.
+  // tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
+  // mi_row = 2 * row,
+  // mi_col = 2 * col, and
+  // stride = cm->mi_params.mi_stride / 2
   TPL_MV_REF *tpl_mvs;
+  // Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
   int tpl_mvs_mem_size;
+  // ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and
+  // current frame is positive; and 0 otherwise.
+  int ref_frame_sign_bias[REF_FRAMES];
+  // ref_frame_side[k] is 1 if relative distance between reference 'k' and
+  // current frame is positive, -1 if relative distance is 0; and 0 otherwise.
   // TODO(jingning): This can be combined with sign_bias later.
   int8_t ref_frame_side[REF_FRAMES];
 
-  int is_annexb;
-
-  int temporal_layer_id;
-  int spatial_layer_id;
+  // Number of temporal layers: may be > 1 for SVC (scalable vector coding).
   unsigned int number_temporal_layers;
+  // Temporal layer ID of this frame
+  // (in the range 0 ... (number_temporal_layers - 1)).
+  int temporal_layer_id;
+
+  // Number of spatial layers: may be > 1 for SVC (scalable vector coding).
   unsigned int number_spatial_layers;
-  int num_allocated_above_context_mi_col;
-  int num_allocated_above_contexts;
-  int num_allocated_above_context_planes;
+  // Spatial layer ID of this frame
+  // (in the range 0 ... (number_spatial_layers - 1)).
+  int spatial_layer_id;
 
 #if TXCOEFF_TIMER
   int64_t cum_txcoeff_timer;
   int64_t txcoeff_timer;
   int txb_count;
-#endif
+#endif  // TXCOEFF_TIMER
 
 #if TXCOEFF_COST_TIMER
   int64_t cum_txcoeff_cost_timer;
   int64_t txcoeff_cost_timer;
   int64_t txcoeff_cost_count;
-#endif
-  const cfg_options_t *options;
+#endif  // TXCOEFF_COST_TIMER
+
+#if CONFIG_LPF_MASK
   int is_decoding;
+#endif  // CONFIG_LPF_MASK
 } AV1_COMMON;
 
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -635,7 +814,7 @@
 // Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref
 // counts accordingly.
 static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr,
-                                       RefCntBuffer *rhs_ptr) {
+                                         RefCntBuffer *rhs_ptr) {
   RefCntBuffer *const old_ptr = *lhs_ptr;
   if (old_ptr != NULL) {
     assert(old_ptr->ref_count > 0);
@@ -650,7 +829,7 @@
 
 static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
   return cm->current_frame.frame_type == KEY_FRAME ||
-      cm->current_frame.frame_type == INTRA_ONLY_FRAME;
+         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
 }
 
 static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
@@ -689,45 +868,49 @@
 
 static INLINE RefCntBuffer *get_primary_ref_frame_buf(
     const AV1_COMMON *const cm) {
-  if (cm->primary_ref_frame == PRIMARY_REF_NONE) return NULL;
-  const int map_idx = get_ref_frame_map_idx(cm, cm->primary_ref_frame + 1);
+  const int primary_ref_frame = cm->features.primary_ref_frame;
+  if (primary_ref_frame == PRIMARY_REF_NONE) return NULL;
+  const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1);
   return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
 }
 
 // Returns 1 if this frame might allow mvs from some reference frame.
 static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
-  return !cm->error_resilient_mode &&
-    cm->seq_params.order_hint_info.enable_ref_frame_mvs &&
-    cm->seq_params.order_hint_info.enable_order_hint &&
-    !frame_is_intra_only(cm);
+  return !cm->features.error_resilient_mode &&
+         cm->seq_params.order_hint_info.enable_ref_frame_mvs &&
+         cm->seq_params.order_hint_info.enable_order_hint &&
+         !frame_is_intra_only(cm);
 }
 
 // Returns 1 if this frame might use warped_motion
 static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
-  return !cm->error_resilient_mode && !frame_is_intra_only(cm) &&
+  return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) &&
          cm->seq_params.enable_warped_motion;
 }
 
 static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
   const int buf_rows = buf->mi_rows;
   const int buf_cols = buf->mi_cols;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
 
-  if (buf->mvs == NULL || buf_rows != cm->mi_rows || buf_cols != cm->mi_cols) {
+  if (buf->mvs == NULL || buf_rows != mi_params->mi_rows ||
+      buf_cols != mi_params->mi_cols) {
     aom_free(buf->mvs);
-    buf->mi_rows = cm->mi_rows;
-    buf->mi_cols = cm->mi_cols;
+    buf->mi_rows = mi_params->mi_rows;
+    buf->mi_cols = mi_params->mi_cols;
     CHECK_MEM_ERROR(cm, buf->mvs,
-                    (MV_REF *)aom_calloc(
-                        ((cm->mi_rows + 1) >> 1) * ((cm->mi_cols + 1) >> 1),
-                        sizeof(*buf->mvs)));
+                    (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) *
+                                             ((mi_params->mi_cols + 1) >> 1),
+                                         sizeof(*buf->mvs)));
     aom_free(buf->seg_map);
-    CHECK_MEM_ERROR(cm, buf->seg_map,
-                    (uint8_t *)aom_calloc(cm->mi_rows * cm->mi_cols,
-                                          sizeof(*buf->seg_map)));
+    CHECK_MEM_ERROR(
+        cm, buf->seg_map,
+        (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols,
+                              sizeof(*buf->seg_map)));
   }
 
   const int mem_size =
-      ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+      ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1);
   int realloc = cm->tpl_mvs == NULL;
   if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size;
 
@@ -745,48 +928,51 @@
   return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
 }
 
-static INLINE void av1_init_above_context(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          const int tile_row) {
-  const int num_planes = av1_num_planes(cm);
+static INLINE void av1_init_above_context(CommonContexts *above_contexts,
+                                          int num_planes, int tile_row,
+                                          MACROBLOCKD *xd) {
   for (int i = 0; i < num_planes; ++i) {
-    xd->above_context[i] = cm->above_context[i][tile_row];
+    xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row];
   }
-  xd->above_seg_context = cm->above_seg_context[tile_row];
-  xd->above_txfm_context = cm->above_txfm_context[tile_row];
+  xd->above_partition_context = above_contexts->partition[tile_row];
+  xd->above_txfm_context = above_contexts->txfm[tile_row];
 }
 
 static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
                                         tran_low_t *dqcoeff) {
   const int num_planes = av1_num_planes(cm);
+  const CommonQuantParams *const quant_params = &cm->quant_params;
+
   for (int i = 0; i < num_planes; ++i) {
     xd->plane[i].dqcoeff = dqcoeff;
 
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
-      memcpy(xd->plane[i].seg_dequant_QTX, cm->y_dequant_QTX,
-             sizeof(cm->y_dequant_QTX));
-      memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix));
+      memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX,
+             sizeof(quant_params->y_dequant_QTX));
+      memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix,
+             sizeof(quant_params->y_iqmatrix));
 
     } else {
       if (i == AOM_PLANE_U) {
-        memcpy(xd->plane[i].seg_dequant_QTX, cm->u_dequant_QTX,
-               sizeof(cm->u_dequant_QTX));
-        memcpy(xd->plane[i].seg_iqmatrix, cm->u_iqmatrix,
-               sizeof(cm->u_iqmatrix));
+        memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX,
+               sizeof(quant_params->u_dequant_QTX));
+        memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix,
+               sizeof(quant_params->u_iqmatrix));
       } else {
-        memcpy(xd->plane[i].seg_dequant_QTX, cm->v_dequant_QTX,
-               sizeof(cm->v_dequant_QTX));
-        memcpy(xd->plane[i].seg_iqmatrix, cm->v_iqmatrix,
-               sizeof(cm->v_iqmatrix));
+        memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX,
+               sizeof(quant_params->v_dequant_QTX));
+        memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix,
+               sizeof(quant_params->v_iqmatrix));
       }
     }
   }
-  xd->mi_stride = cm->mi_stride;
+  xd->mi_stride = cm->mi_params.mi_stride;
   xd->error_info = &cm->error;
   cfl_init(&xd->cfl, &cm->seq_params);
 }
 
-static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                    const int num_planes) {
+static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                       const int num_planes) {
   int i;
   int row_offset = mi_row;
   int col_offset = mi_col;
@@ -800,8 +986,10 @@
       col_offset = mi_col - 1;
     int above_idx = col_offset;
     int left_idx = row_offset & MAX_MIB_MASK;
-    pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x];
-    pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y];
+    pd->above_entropy_context =
+        &xd->above_entropy_context[i][above_idx >> pd->subsampling_x];
+    pd->left_entropy_context =
+        &xd->left_entropy_context[i][left_idx >> pd->subsampling_y];
   }
 }
 
@@ -825,10 +1013,13 @@
 static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
                                   int mi_row, int bh, int mi_col, int bw,
                                   int mi_rows, int mi_cols) {
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
+  xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+  xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE);
+  xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE));
+  xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE);
+
+  xd->mi_row = mi_row;
+  xd->mi_col = mi_col;
 
   // Are edges available for intra prediction?
   xd->up_available = (mi_row > tile->mi_row_start);
@@ -857,6 +1048,7 @@
 
   const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
                          ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
+  xd->is_chroma_ref = chroma_ref;
   if (chroma_ref) {
     // To help calculate the "above" and "left" chroma blocks, note that the
     // current block may cover multiple luma blocks (eg, if partitioned into
@@ -878,18 +1070,18 @@
     xd->chroma_left_mbmi = chroma_left_mi;
   }
 
-  xd->n4_h = bh;
-  xd->n4_w = bw;
+  xd->height = bh;
+  xd->width = bw;
   xd->is_sec_rect = 0;
-  if (xd->n4_w < xd->n4_h) {
+  if (xd->width < xd->height) {
     // Only mark is_sec_rect as 1 for the last block.
     // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
     // For other partitions, it would be (0, 1).
-    if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1;
+    if (!((mi_col + xd->width) & (xd->height - 1))) xd->is_sec_rect = 1;
   }
 
-  if (xd->n4_w > xd->n4_h)
-    if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1;
+  if (xd->width > xd->height)
+    if (mi_row & (xd->width - 1)) xd->is_sec_rect = 1;
 }
 
 static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
@@ -905,9 +1097,9 @@
 static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
                                             int mi_col, BLOCK_SIZE subsize,
                                             BLOCK_SIZE bsize) {
-  PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
+  PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col;
   PARTITION_CONTEXT *const left_ctx =
-      xd->left_seg_context + (mi_row & MAX_MIB_MASK);
+      xd->left_partition_context + (mi_row & MAX_MIB_MASK);
 
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
@@ -917,6 +1109,7 @@
 
 static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int subsampling_x, int subsampling_y) {
+  assert(bsize < BLOCK_SIZES_ALL);
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) &&
@@ -924,55 +1117,6 @@
   return ref_pos;
 }
 
-static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
-                                            int subsampling_y) {
-  BLOCK_SIZE bs = bsize;
-  switch (bsize) {
-    case BLOCK_4X4:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_8X8;
-      else if (subsampling_x == 1)
-        bs = BLOCK_8X4;
-      else if (subsampling_y == 1)
-        bs = BLOCK_4X8;
-      break;
-    case BLOCK_4X8:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_8X8;
-      else if (subsampling_x == 1)
-        bs = BLOCK_8X8;
-      else if (subsampling_y == 1)
-        bs = BLOCK_4X8;
-      break;
-    case BLOCK_8X4:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_8X8;
-      else if (subsampling_x == 1)
-        bs = BLOCK_8X4;
-      else if (subsampling_y == 1)
-        bs = BLOCK_8X8;
-      break;
-    case BLOCK_4X16:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_8X16;
-      else if (subsampling_x == 1)
-        bs = BLOCK_8X16;
-      else if (subsampling_y == 1)
-        bs = BLOCK_4X16;
-      break;
-    case BLOCK_16X4:
-      if (subsampling_x == 1 && subsampling_y == 1)
-        bs = BLOCK_16X8;
-      else if (subsampling_x == 1)
-        bs = BLOCK_16X4;
-      else if (subsampling_y == 1)
-        bs = BLOCK_16X8;
-      break;
-    default: break;
-  }
-  return bs;
-}
-
 static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
                                             size_t element) {
   assert(cdf != NULL);
@@ -1050,9 +1194,9 @@
 
 static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
                                           int mi_col, BLOCK_SIZE bsize) {
-  const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+  const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col;
   const PARTITION_CONTEXT *left_ctx =
-      xd->left_seg_context + (mi_row & MAX_MIB_MASK);
+      xd->left_partition_context + (mi_row & MAX_MIB_MASK);
   // Minimum partition point is 8x8. Offset the bsl accordingly.
   const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
   int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
@@ -1076,78 +1220,70 @@
 
 static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                  int plane) {
+  assert(bsize < BLOCK_SIZES_ALL);
   int max_blocks_wide = block_size_wide[bsize];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
 
-  if (xd->mb_to_right_edge < 0)
+  if (xd->mb_to_right_edge < 0) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
     max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+  }
 
   // Scale the width in the transform block unit.
-  return max_blocks_wide >> tx_size_wide_log2[0];
+  return max_blocks_wide >> MI_SIZE_LOG2;
 }
 
 static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                  int plane) {
   int max_blocks_high = block_size_high[bsize];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
 
-  if (xd->mb_to_bottom_edge < 0)
+  if (xd->mb_to_bottom_edge < 0) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
     max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+  }
 
   // Scale the height in the transform block unit.
-  return max_blocks_high >> tx_size_high_log2[0];
+  return max_blocks_high >> MI_SIZE_LOG2;
 }
 
-static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
-                                        BLOCK_SIZE plane_bsize, int plane,
-                                        TX_SIZE tx_size) {
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
-                              << tx_size_wide_log2[0];
-  return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
-}
-
-static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
-                                         BLOCK_SIZE plane_bsize, int plane,
-                                         TX_SIZE tx_size) {
-  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
-                              << tx_size_high_log2[0];
-  return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
-}
-
-static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd,
-  int mi_col_start, int mi_col_end, const int tile_row) {
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+                                          const MACROBLOCKD *xd,
+                                          int mi_col_start, int mi_col_end,
+                                          const int tile_row) {
   const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   const int width = mi_col_end - mi_col_start;
   const int aligned_width =
-    ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
-
+      ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
   const int offset_y = mi_col_start;
   const int width_y = aligned_width;
   const int offset_uv = offset_y >> seq_params->subsampling_x;
   const int width_uv = width_y >> seq_params->subsampling_x;
+  CommonContexts *const above_contexts = &cm->above_contexts;
 
-  av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y);
+  av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y);
   if (num_planes > 1) {
-    if (cm->above_context[1][tile_row] && cm->above_context[2][tile_row]) {
-      av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv);
-      av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv);
+    if (above_contexts->entropy[1][tile_row] &&
+        above_contexts->entropy[2][tile_row]) {
+      av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv,
+                     width_uv);
+      av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv,
+                     width_uv);
     } else {
       aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid value of planes");
     }
   }
 
-  av1_zero_array(cm->above_seg_context[tile_row] + mi_col_start, aligned_width);
+  av1_zero_array(above_contexts->partition[tile_row] + mi_col_start,
+                 aligned_width);
 
-  memset(cm->above_txfm_context[tile_row] + mi_col_start,
-    tx_size_wide[TX_SIZES_LARGEST],
-    aligned_width * sizeof(TXFM_CONTEXT));
+  memset(above_contexts->txfm[tile_row] + mi_col_start,
+         tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT));
 }
 
 static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
-  av1_zero(xd->left_context);
-  av1_zero(xd->left_seg_context);
+  av1_zero(xd->left_entropy_context);
+  av1_zero(xd->left_partition_context);
 
   memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
          sizeof(xd->left_txfm_context_buffer));
@@ -1184,6 +1320,35 @@
   set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
 }
 
+static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params,
+                                  int mi_row, int mi_col) {
+  return mi_row * mi_params->mi_stride + mi_col;
+}
+
+static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params,
+                                   int mi_row, int mi_col) {
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int mi_alloc_row = mi_row / mi_alloc_size_1d;
+  const int mi_alloc_col = mi_col / mi_alloc_size_1d;
+
+  return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col;
+}
+
+// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi.
+static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params,
+                                  MACROBLOCKD *const xd, int mi_row,
+                                  int mi_col) {
+  // 'mi_grid_base' should point to appropriate memory in 'mi'.
+  const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+  const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
+  mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx];
+  // 'xd->mi' should point to an offset in 'mi_grid_base';
+  xd->mi = mi_params->mi_grid_base + mi_grid_idx;
+  // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'.
+  xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx;
+  xd->tx_type_map_stride = mi_params->mi_stride;
+}
+
 static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
                                          TXFM_CONTEXT *left_ctx,
                                          TX_SIZE tx_size, TX_SIZE txb_size) {
@@ -1276,10 +1441,12 @@
 static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
                                            int mi_row, int mi_col,
                                            BLOCK_SIZE bsize) {
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return PARTITION_INVALID;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols)
+    return PARTITION_INVALID;
 
-  const int offset = mi_row * cm->mi_stride + mi_col;
-  MB_MODE_INFO **mi = cm->mi_grid_visible + offset;
+  const int offset = mi_row * mi_params->mi_stride + mi_col;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
   const BLOCK_SIZE subsize = mi[0]->sb_type;
 
   if (subsize == bsize) return PARTITION_NONE;
@@ -1289,12 +1456,12 @@
   const int sshigh = mi_size_high[subsize];
   const int sswide = mi_size_wide[subsize];
 
-  if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < cm->mi_rows &&
-      mi_col + bhigh / 2 < cm->mi_cols) {
+  if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < mi_params->mi_rows &&
+      mi_col + bhigh / 2 < mi_params->mi_cols) {
     // In this case, the block might be using an extended partition
     // type.
     const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
-    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * cm->mi_stride];
+    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride];
 
     if (sswide == bwide) {
       // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
@@ -1373,11 +1540,18 @@
 }
 
 static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
-  return seq_level_idx < SEQ_LEVELS || seq_level_idx == SEQ_LEVEL_MAX;
+  return seq_level_idx == SEQ_LEVEL_MAX ||
+         (seq_level_idx < SEQ_LEVELS &&
+          // The following levels are currently undefined.
+          seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 &&
+          seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 &&
+          seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 &&
+          seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 &&
+          seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3);
 }
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AOM_AV1_COMMON_ONYXC_INT_H_
+#endif  // AOM_AV1_COMMON_AV1_COMMON_INT_H_
diff --git a/libaom/av1/common/av1_inv_txfm1d.c b/libaom/av1/common/av1_inv_txfm1d.c
index 7ef2d6d..8d69efc 100644
--- a/libaom/av1/common/av1_inv_txfm1d.c
+++ b/libaom/av1/common/av1_inv_txfm1d.c
@@ -13,11 +13,8 @@
 #include "av1/common/av1_inv_txfm1d.h"
 #include "av1/common/av1_txfm.h"
 
-// TODO(angiebird): Make 1-d txfm functions static
-//
-
-void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                   const int8_t *stage_range) {
+void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
   assert(output != input);
   const int32_t size = 4;
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -57,8 +54,8 @@
   bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
 }
 
-void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                   const int8_t *stage_range) {
+void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
   assert(output != input);
   const int32_t size = 8;
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -138,8 +135,8 @@
   bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
 }
 
-void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   assert(output != input);
   const int32_t size = 16;
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -303,8 +300,8 @@
   bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
 }
 
-void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   assert(output != input);
   const int32_t size = 32;
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -656,8 +653,8 @@
   bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
 }
 
-void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   int bit = cos_bit;
   const int32_t *sinpi = sinpi_arr(bit);
   int32_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -713,8 +710,8 @@
   output[3] = round_shift(x3, bit);
 }
 
-void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   assert(output != input);
   const int32_t size = 8;
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -809,7 +806,6 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
-  stage++;
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
@@ -822,8 +818,8 @@
   bf1[7] = -bf0[1];
 }
 
-void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                     const int8_t *stage_range) {
+void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range) {
   assert(output != input);
   const int32_t size = 16;
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -1010,7 +1006,6 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
-  stage++;
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
@@ -1064,8 +1059,8 @@
   for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4);
 }
 
-void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   assert(output != input);
   const int32_t size = 64;
   const int32_t *cospi = cospi_arr(cos_bit);
diff --git a/libaom/av1/common/av1_inv_txfm1d.h b/libaom/av1/common/av1_inv_txfm1d.h
index c31c019..e1d5d98 100644
--- a/libaom/av1/common/av1_inv_txfm1d.h
+++ b/libaom/av1/common/av1_inv_txfm1d.h
@@ -29,22 +29,22 @@
   for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit);
 }
 
-void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                   const int8_t *stage_range);
-void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                   const int8_t *stage_range);
-void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                     const int8_t *stage_range);
+void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range);
+void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range);
+void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range);
 void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range);
 void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
diff --git a/libaom/av1/common/av1_inv_txfm1d_cfg.h b/libaom/av1/common/av1_inv_txfm1d_cfg.h
index 7d80a00..47fedbd 100644
--- a/libaom/av1/common/av1_inv_txfm1d_cfg.h
+++ b/libaom/av1/common/av1_inv_txfm1d_cfg.h
@@ -36,12 +36,12 @@
   7,  // 64x16 transform
 };
 
-extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
+extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL];
 
-// Values in both inv_cos_bit_col and inv_cos_bit_row are always 12
+// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12
 // for each valid row and col combination
 #define INV_COS_BIT 12
-extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
-extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
+extern const int8_t av1_inv_cos_bit_col[5 /*row*/][5 /*col*/];
+extern const int8_t av1_inv_cos_bit_row[5 /*row*/][5 /*col*/];
 
 #endif  // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/libaom/av1/common/av1_inv_txfm2d.c b/libaom/av1/common/av1_inv_txfm2d.c
index fc9c8d2..559d121 100644
--- a/libaom/av1/common/av1_inv_txfm2d.c
+++ b/libaom/av1/common/av1_inv_txfm2d.c
@@ -113,14 +113,14 @@
 
 static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
-    case TXFM_TYPE_DCT4: return av1_idct4_new;
-    case TXFM_TYPE_DCT8: return av1_idct8_new;
-    case TXFM_TYPE_DCT16: return av1_idct16_new;
-    case TXFM_TYPE_DCT32: return av1_idct32_new;
-    case TXFM_TYPE_DCT64: return av1_idct64_new;
-    case TXFM_TYPE_ADST4: return av1_iadst4_new;
-    case TXFM_TYPE_ADST8: return av1_iadst8_new;
-    case TXFM_TYPE_ADST16: return av1_iadst16_new;
+    case TXFM_TYPE_DCT4: return av1_idct4;
+    case TXFM_TYPE_DCT8: return av1_idct8;
+    case TXFM_TYPE_DCT16: return av1_idct16;
+    case TXFM_TYPE_DCT32: return av1_idct32;
+    case TXFM_TYPE_DCT64: return av1_idct64;
+    case TXFM_TYPE_ADST4: return av1_iadst4;
+    case TXFM_TYPE_ADST8: return av1_iadst8;
+    case TXFM_TYPE_ADST16: return av1_iadst16;
     case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c;
     case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c;
@@ -149,7 +149,7 @@
 static const int8_t inv_shift_16x64[2] = { -2, -4 };
 static const int8_t inv_shift_64x16[2] = { -2, -4 };
 
-const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
+const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = {
   inv_shift_4x4,   inv_shift_8x8,   inv_shift_16x16, inv_shift_32x32,
   inv_shift_64x64, inv_shift_4x8,   inv_shift_8x4,   inv_shift_8x16,
   inv_shift_16x8,  inv_shift_16x32, inv_shift_32x16, inv_shift_32x64,
@@ -158,7 +158,7 @@
 };
 
 /* clang-format off */
-const int8_t inv_cos_bit_col[MAX_TXWH_IDX]      // txw_idx
+const int8_t av1_inv_cos_bit_col[MAX_TXWH_IDX]      // txw_idx
                             [MAX_TXWH_IDX] = {  // txh_idx
     { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0,           0 },
     { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0 },
@@ -167,7 +167,7 @@
     {           0,           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
   };
 
-const int8_t inv_cos_bit_row[MAX_TXWH_IDX]      // txw_idx
+const int8_t av1_inv_cos_bit_row[MAX_TXWH_IDX]      // txw_idx
                             [MAX_TXWH_IDX] = {  // txh_idx
     { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0,           0 },
     { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0 },
@@ -177,7 +177,7 @@
   };
 /* clang-format on */
 
-const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
+static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
 
 void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
                           TXFM_2D_FLIP_CFG *cfg) {
@@ -188,11 +188,11 @@
   set_flip_cfg(tx_type, cfg);
   const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
   const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
-  cfg->shift = inv_txfm_shift_ls[tx_size];
+  cfg->shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  cfg->cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
+  cfg->cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
   if (cfg->txfm_type_col == TXFM_TYPE_ADST4) {
     memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
diff --git a/libaom/av1/common/av1_loopfilter.c b/libaom/av1/common/av1_loopfilter.c
index 0aa1f9b..c756760 100644
--- a/libaom/av1/common/av1_loopfilter.c
+++ b/libaom/av1/common/av1_loopfilter.c
@@ -17,8 +17,8 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/av1_loopfilter.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/seg_common.h"
 
@@ -28,11 +28,9 @@
   { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
 };
 
-static const int delta_lf_id_lut[MAX_MB_PLANE][2] = {
-  { 0, 1 }, { 2, 2 }, { 3, 3 }
-};
-
-enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR);
+static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 },
+                                                      { 2, 2 },
+                                                      { 3, 3 } };
 
 static const int mode_lf_lut[] = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
@@ -40,397 +38,6 @@
   1, 1, 1, 1, 1, 1, 0, 1  // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0)
 };
 
-// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
-// We use 4 uint64_t to represent the 256 bit.
-// Each 1 represents a position where we should apply a loop filter
-// across the left border of an 4x4 block boundary.
-//
-// In the case of TX_8x8->  ( in low order byte first we end up with
-// a mask that looks like this (-- and | are used for better view)
-//
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    -----------------
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//
-// A loopfilter should be applied to every other 4x4 horizontally.
-
-// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
-// We use 4 uint64_t to represent the 256 bit.
-// Each 1 represents a position where we should apply a loop filter
-// across the top border of an 4x4 block boundary.
-//
-// In the case of TX_8x8->  ( in low order byte first we end up with
-// a mask that looks like this
-//
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    -----------------
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//
-// A loopfilter should be applied to every other 4x4 horizontally.
-
-const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
-  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
-};
-
-const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
-  -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
-};
-
-const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
-  -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
-};
-
-const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,
-                                                      -1, -1, -1, 0,  1,  2,
-                                                      3,  -1, -1, -1, -1, -1,
-                                                      -1, -1, -1, -1 };
-const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = { 0,  47, 49, 19, 51, 53,
-                                                         33, 55, 57, 42, 59, 60,
-                                                         46, -1, -1, -1, 61, 62,
-                                                         63, 64, 65, 66 };
-
-const FilterMask left_mask_univariant_reordered[67] = {
-  // TX_4X4
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
-  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
-  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
-  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
-      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
-  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
-  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
-      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
-  // TX_8X8
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
-  { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
-  { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
-  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
-  { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
-  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
-  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
-      0x0055005500550055ULL } },  // block size 32X64, TX_8X8
-  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
-  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
-      0x5555555555555555ULL } },  // block size 64X64, TX_8X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
-  { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
-  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
-      0x0005000500050005ULL } },  // block size 16X64, TX_8X8
-  { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
-  // TX_16X16
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
-  { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
-  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
-  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
-      0x0011001100110011ULL } },  // block size 32X64, TX_16X16
-  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
-  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
-      0x1111111111111111ULL } },  // block size 64X64, TX_16X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X16
-  { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
-  // TX_32X32
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
-      0x0101010101010101ULL } },  // block size 32X64, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
-      0x0101010101010101ULL } },  // block size 64X64, TX_32X32
-  // TX_64X64
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 64X64, TX_64X64
-  // 2:1, 1:2 transform sizes.
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
-  { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
-  { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X32
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
-  { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 32X64, TX_32X64
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
-  // 4:1, 1:4 transform sizes.
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X64
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
-};
-
-const FilterMask above_mask_univariant_reordered[67] = {
-  // TX_4X4
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
-  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
-  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
-  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
-      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-      0xffffffffffffffffULL } },  // block size 64X64, TX_4x4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
-  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
-  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
-      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
-  // TX_8X8
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
-  { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
-  { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
-  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
-  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
-      0x000000ff000000ffULL } },  // block size 32X64, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
-      0x0000ffff0000ffffULL } },  // block size 64X64, TX_8X8
-  { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
-      0x0000000f0000000fULL } },  // block size 16X64, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
-  // TX_16X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
-  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
-  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
-  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
-      0x00000000000000ffULL } },  // block size 32X64, TX_16X16
-  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
-  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
-      0x000000000000ffffULL } },  // block size 64X64, TX_16X16
-  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
-      0x000000000000000fULL } },  // block size 16X64, TX_16X16
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
-  // TX_32X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
-      0x0000000000000000ULL } },  // block size 32X64, TX_32X32
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
-      0x0000000000000000ULL } },  // block size 64X64, TX_32X32
-  // TX_64X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X64, TX_64X64
-  // 2:1, 1:2 transform sizes.
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
-  { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
-  { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
-      0x0000000000000000ULL } },  // block size 16X64, TX_16X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X64, TX_32X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
-  // 4:1, 1:4 transform sizes.
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
-};
-
-#if LOOP_FILTER_BITMASK
-LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
-                                     int mi_col) {
-  assert(cm->lf.lfm != NULL);
-  const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
-  const int col = mi_col >> MIN_MIB_SIZE_LOG2;
-  return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
-}
-
-typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
-                        const uint8_t *limit, const uint8_t *thresh);
-
-typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
-                            const uint8_t *limit0, const uint8_t *thresh0,
-                            const uint8_t *blimit1, const uint8_t *limit1,
-                            const uint8_t *thresh1);
-
-typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh, int bd);
-
-typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1, int bd);
-#endif  // LOOP_FILTER_BITMASK
-
 static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
   int lvl;
 
@@ -452,12 +59,12 @@
   }
 }
 
-uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
-                         const int dir_idx, int plane,
-                         const MB_MODE_INFO *mbmi) {
+uint8_t av1_get_filter_level(const AV1_COMMON *cm,
+                             const loop_filter_info_n *lfi_n, const int dir_idx,
+                             int plane, const MB_MODE_INFO *mbmi) {
   const int segment_id = mbmi->segment_id;
   if (cm->delta_q_info.delta_lf_present_flag) {
-    int delta_lf;
+    int8_t delta_lf;
     if (cm->delta_q_info.delta_lf_multi) {
       const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
       delta_lf = mbmi->delta_lf[delta_lf_idx];
@@ -581,1359 +188,6 @@
   }
 }
 
-#if LOOP_FILTER_BITMASK
-// A 64x64 tx block requires 256 bits to represent each 4x4 tx block.
-// Every 4 rows is represented by one uint64_t mask. Hence,
-// there are 4 uint64_t bitmask[4] to represent the 64x64 block.
-//
-// Given a location by (mi_col, mi_row), This function returns the index
-// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value.
-//
-// For example, mi_row is the offset of pixels in mi size (4),
-// (mi_row / 4) returns which uint64_t.
-// After locating which uint64_t, mi_row % 4 is the
-// row offset, and each row has 16 = 1 << stride_log2 4x4 units.
-// Therefore, shift = (row << stride_log2) + mi_col;
-int get_index_shift(int mi_col, int mi_row, int *index) {
-  // *index = mi_row >> 2;
-  // rows = mi_row % 4;
-  // stride_log2 = 4;
-  // shift = (rows << stride_log2) + mi_col;
-  *index = mi_row >> 2;
-  return ((mi_row & 3) << 4) | mi_col;
-}
-
-static void check_mask(const FilterMask *lfm) {
-#ifndef NDEBUG
-  for (int i = 0; i < 4; ++i) {
-    assert(!(lfm[TX_4X4].bits[i] & lfm[TX_8X8].bits[i]));
-    assert(!(lfm[TX_4X4].bits[i] & lfm[TX_16X16].bits[i]));
-    assert(!(lfm[TX_4X4].bits[i] & lfm[TX_32X32].bits[i]));
-    assert(!(lfm[TX_4X4].bits[i] & lfm[TX_64X64].bits[i]));
-    assert(!(lfm[TX_8X8].bits[i] & lfm[TX_16X16].bits[i]));
-    assert(!(lfm[TX_8X8].bits[i] & lfm[TX_32X32].bits[i]));
-    assert(!(lfm[TX_8X8].bits[i] & lfm[TX_64X64].bits[i]));
-    assert(!(lfm[TX_16X16].bits[i] & lfm[TX_32X32].bits[i]));
-    assert(!(lfm[TX_16X16].bits[i] & lfm[TX_64X64].bits[i]));
-    assert(!(lfm[TX_32X32].bits[i] & lfm[TX_64X64].bits[i]));
-  }
-#else
-  (void)lfm;
-#endif
-}
-
-static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) {
-  if (plane == 0) {
-    // Assert if we try to apply 2 different loop filters at the same
-    // position.
-    check_mask(lfm->left_y);
-    check_mask(lfm->above_y);
-  } else if (plane == 1) {
-    check_mask(lfm->left_u);
-    check_mask(lfm->above_u);
-  } else {
-    check_mask(lfm->left_v);
-    check_mask(lfm->above_v);
-  }
-}
-
-static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask,
-                         TX_SIZE sqr_tx_size, LoopFilterMask *lfm) {
-  if (dir == VERT_EDGE) {
-    switch (plane) {
-      case 0:
-        for (int i = 0; i < 4; ++i) lfm->left_y[sqr_tx_size].bits[i] |= mask[i];
-        break;
-      case 1:
-        for (int i = 0; i < 4; ++i) lfm->left_u[sqr_tx_size].bits[i] |= mask[i];
-        break;
-      case 2:
-        for (int i = 0; i < 4; ++i) lfm->left_v[sqr_tx_size].bits[i] |= mask[i];
-        break;
-      default: assert(plane <= 2);
-    }
-  } else {
-    switch (plane) {
-      case 0:
-        for (int i = 0; i < 4; ++i)
-          lfm->above_y[sqr_tx_size].bits[i] |= mask[i];
-        break;
-      case 1:
-        for (int i = 0; i < 4; ++i)
-          lfm->above_u[sqr_tx_size].bits[i] |= mask[i];
-        break;
-      case 2:
-        for (int i = 0; i < 4; ++i)
-          lfm->above_v[sqr_tx_size].bits[i] |= mask[i];
-        break;
-      default: assert(plane <= 2);
-    }
-  }
-}
-
-static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row,
-                             int mi_col, int ssx, int ssy, EDGE_DIR dir) {
-  if (plane && (ssx || ssy)) {
-    if (ssx && ssy) {  // format 420
-      if ((mi_row << MI_SIZE_LOG2) > cm->height ||
-          (mi_col << MI_SIZE_LOG2) > cm->width)
-        return 1;
-    } else if (ssx) {  // format 422
-      if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
-          (mi_col << MI_SIZE_LOG2) > cm->width)
-        return 1;
-    }
-  } else {
-    if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
-        (mi_col << MI_SIZE_LOG2) >= cm->width)
-      return 1;
-  }
-
-  int row_or_col;
-  if (plane == 0) {
-    row_or_col = dir == VERT_EDGE ? mi_col : mi_row;
-  } else {
-    // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
-    // So if mi_col == 1, it is actually the frame boundary.
-    if (dir == VERT_EDGE) {
-      row_or_col = ssx ? (mi_col & 0x0FFFFFFE) : mi_col;
-    } else {
-      row_or_col = ssy ? (mi_row & 0x0FFFFFFE) : mi_row;
-    }
-  }
-  return row_or_col == 0;
-}
-
-static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
-                        int ssx, int ssy, TX_SIZE tx_size) {
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const int x = (mi_col << (MI_SIZE_LOG2 - ssx));
-  const int y = (mi_row << (MI_SIZE_LOG2 - ssy));
-  // decide whether current vertical/horizontal edge needs loop filtering
-  for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) {
-    // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
-    mi_row |= ssy;
-    mi_col |= ssx;
-
-    MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
-    const MB_MODE_INFO *const mbmi = mi[0];
-    const int curr_skip = mbmi->skip && is_inter_block(mbmi);
-    const BLOCK_SIZE bsize = mbmi->sb_type;
-    const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
-    const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
-    const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi);
-    const int prediction_masks = dir == VERT_EDGE
-                                     ? block_size_wide[plane_bsize] - 1
-                                     : block_size_high[plane_bsize] - 1;
-    const int is_coding_block_border =
-        dir == VERT_EDGE ? !(x & prediction_masks) : !(y & prediction_masks);
-
-    // TODO(chengchen): step can be optimized.
-    const int row_step = mi_size_high[TX_4X4] << ssy;
-    const int col_step = mi_size_wide[TX_4X4] << ssx;
-    const int mi_height =
-        dir == VERT_EDGE ? tx_size_high_unit[tx_size] << ssy : row_step;
-    const int mi_width =
-        dir == VERT_EDGE ? col_step : tx_size_wide_unit[tx_size] << ssx;
-
-    // assign filter levels
-    for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
-      for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
-        // do not filter frame boundary
-        // Note: when chroma planes' size are half of luma plane,
-        // chroma plane mi corresponds to even position.
-        // If frame size is not even, we still need to filter this chroma
-        // position. Therefore the boundary condition check needs to be
-        // separated to two cases.
-        if (plane && (ssx || ssy)) {
-          if (ssx && ssy) {  // format 420
-            if ((r << MI_SIZE_LOG2) > cm->height ||
-                (c << MI_SIZE_LOG2) > cm->width)
-              continue;
-          } else if (ssx) {  // format 422
-            if ((r << MI_SIZE_LOG2) >= cm->height ||
-                (c << MI_SIZE_LOG2) > cm->width)
-              continue;
-          }
-        } else {
-          if ((r << MI_SIZE_LOG2) >= cm->height ||
-              (c << MI_SIZE_LOG2) >= cm->width)
-            continue;
-        }
-
-        const int row = r % MI_SIZE_64X64;
-        const int col = c % MI_SIZE_64X64;
-        if (plane == 0) {
-          if (dir == VERT_EDGE)
-            lfm->lfl_y_ver[row][col] = level;
-          else
-            lfm->lfl_y_hor[row][col] = level;
-        } else if (plane == 1) {
-          lfm->lfl_u_ver[row][col] = level;
-          lfm->lfl_u_hor[row][col] = level;
-        } else {
-          lfm->lfl_v_ver[row][col] = level;
-          lfm->lfl_v_hor[row][col] = level;
-        }
-      }
-    }
-
-    for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
-      for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
-        // do not filter frame boundary
-        if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue;
-
-        uint64_t mask[4] = { 0 };
-        const int prev_row = dir == VERT_EDGE ? r : r - (1 << ssy);
-        const int prev_col = dir == VERT_EDGE ? c - (1 << ssx) : c;
-        MB_MODE_INFO **mi_prev =
-            cm->mi_grid_visible + prev_row * cm->mi_stride + prev_col;
-        const MB_MODE_INFO *const mbmi_prev = mi_prev[0];
-        const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev);
-        const uint8_t level_prev =
-            get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev);
-        const int is_edge =
-            (level || level_prev) &&
-            (!curr_skip || !prev_skip || is_coding_block_border);
-
-        if (is_edge) {
-          const TX_SIZE prev_tx_size =
-              plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
-                    : mbmi_prev->tx_size;
-          TX_SIZE min_tx_size = (dir == VERT_EDGE)
-                                    ? AOMMIN(txsize_horz_map[tx_size],
-                                             txsize_horz_map[prev_tx_size])
-                                    : AOMMIN(txsize_vert_map[tx_size],
-                                             txsize_vert_map[prev_tx_size]);
-          min_tx_size = AOMMIN(min_tx_size, TX_16X16);
-          assert(min_tx_size < TX_SIZES);
-          const int row = r % MI_SIZE_64X64;
-          const int col = c % MI_SIZE_64X64;
-          int index = 0;
-          const int shift = get_index_shift(col, row, &index);
-          assert(index < 4 && index >= 0);
-          mask[index] |= ((uint64_t)1 << shift);
-          // set mask on corresponding bit
-          update_masks(dir, plane, mask, min_tx_size, lfm);
-        }
-      }
-    }
-  }
-}
-
-static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
-                                int blk_row, int blk_col,
-                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                                int plane, int ssx, int ssy) {
-  blk_row <<= ssy;
-  blk_col <<= ssx;
-  if (((mi_row + blk_row) << MI_SIZE_LOG2) >= cm->height ||
-      ((mi_col + blk_col) << MI_SIZE_LOG2) >= cm->width)
-    return;
-
-  // U/V plane, tx_size is always the largest size
-  if (plane) {
-    assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32);
-    setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
-                tx_size);
-    return;
-  }
-
-  MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
-  const MB_MODE_INFO *const mbmi = mi[0];
-  // For Y plane:
-  // If intra block, tx size is univariant.
-  // If inter block, tx size follows inter_tx_size.
-  TX_SIZE plane_tx_size = tx_size;
-  const int is_inter = is_inter_block(mbmi);
-
-  if (plane == 0) {
-    if (is_inter) {
-      if (mbmi->skip) {
-        // TODO(chengchen): change av1_get_transform_size() to be consistant.
-        // plane_tx_size = get_max_rect_tx_size(plane_bsize);
-        plane_tx_size = mbmi->tx_size;
-      } else {
-        plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
-            plane_bsize, blk_row, blk_col)];
-      }
-    } else {
-      MB_MODE_INFO **mi_this = cm->mi_grid_visible +
-                               (mi_row + blk_row) * cm->mi_stride + mi_col +
-                               blk_col;
-      const MB_MODE_INFO *const mbmi_this = mi_this[0];
-      plane_tx_size = mbmi_this->tx_size;
-    }
-  }
-
-  assert(txsize_to_bsize[plane_tx_size] <= plane_bsize);
-
-  if (plane || plane_tx_size == tx_size) {
-    setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
-                tx_size);
-  } else {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsw = tx_size_wide_unit[sub_txs];
-    const int bsh = tx_size_high_unit[sub_txs];
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        const int offsetr = blk_row + row;
-        const int offsetc = blk_col + col;
-        setup_tx_block_mask(cm, mi_row, mi_col, offsetr, offsetc, plane_bsize,
-                            sub_txs, plane, ssx, ssy);
-      }
-    }
-  }
-}
-
-static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
-                                 int plane, int ssx, int ssy) {
-  MB_MODE_INFO **mi =
-      cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx);
-  const MB_MODE_INFO *const mbmi = mi[0];
-
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
-  const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
-
-  const int block_width = mi_size_wide[plane_bsize];
-  const int block_height = mi_size_high[plane_bsize];
-
-  TX_SIZE max_txsize = max_txsize_rect_lookup[plane_bsize];
-  // The decoder is designed so that it can process 64x64 luma pixels at a
-  // time. If this is a chroma plane with subsampling and bsize corresponds to
-  // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That
-  // mustn't be used for the subsampled plane (because it would be bigger than
-  // a 64x64 luma block) so we round down to TX_32X32.
-  if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) {
-    if (max_txsize == TX_16X64)
-      max_txsize = TX_16X32;
-    else if (max_txsize == TX_64X16)
-      max_txsize = TX_32X16;
-    else
-      max_txsize = TX_32X32;
-  }
-
-  const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize];
-  const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
-  const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
-  const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy];
-  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-
-  mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide);
-  mu_blocks_high = AOMMIN(block_height, mu_blocks_high);
-
-  // Y: Largest tx_size is 64x64, while superblock size can be 128x128.
-  // Here we ensure that setup_tx_block_mask process at most a 64x64 block.
-  // U/V: largest tx size is 32x32.
-  for (int idy = 0; idy < block_height; idy += mu_blocks_high) {
-    for (int idx = 0; idx < block_width; idx += mu_blocks_wide) {
-      const int unit_height = AOMMIN(mu_blocks_high + idy, block_height);
-      const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width);
-      for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
-        for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
-          setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize,
-                              max_txsize, plane, ssx, ssy);
-        }
-      }
-    }
-  }
-}
-
-static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, int plane, int ssx, int ssy) {
-  if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
-      (mi_col << MI_SIZE_LOG2) >= cm->width)
-    return;
-
-  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int quarter_step = mi_size_wide[bsize] / 4;
-  const int allow_sub8x8 = (ssx || ssy) ? bsize > BLOCK_8X8 : 1;
-  const int has_next_row =
-      (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8;
-  const int has_next_col =
-      (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8;
-  int i;
-
-  switch (partition) {
-    case PARTITION_NONE:
-      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
-      break;
-    case PARTITION_HORZ:
-      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
-      if (has_next_row)
-        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
-      break;
-    case PARTITION_VERT:
-      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
-      if (has_next_col)
-        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
-      break;
-    case PARTITION_SPLIT:
-      setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy);
-      if (has_next_col)
-        setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy);
-      if (has_next_row)
-        setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy);
-      if (has_next_col & has_next_row)
-        setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx,
-                         ssy);
-      break;
-    case PARTITION_HORZ_A:
-      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
-      if (has_next_col)
-        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
-      if (has_next_row)
-        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
-      break;
-    case PARTITION_HORZ_B:
-      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
-      if (has_next_row)
-        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
-      if (has_next_col & has_next_row)
-        setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
-      break;
-    case PARTITION_VERT_A:
-      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
-      if (has_next_row)
-        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
-      if (has_next_col)
-        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
-      break;
-    case PARTITION_VERT_B:
-      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
-      if (has_next_col)
-        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
-      if (has_next_row)
-        setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
-      break;
-    case PARTITION_HORZ_4:
-      for (i = 0; i < 4; ++i) {
-        int this_mi_row = mi_row + i * quarter_step;
-        if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break;
-        // chroma plane filter the odd location
-        if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
-
-        setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy);
-      }
-      break;
-    case PARTITION_VERT_4:
-      for (i = 0; i < 4; ++i) {
-        int this_mi_col = mi_col + i * quarter_step;
-        if (i > 0 && this_mi_col >= cm->mi_cols) break;
-        // chroma plane filter the odd location
-        if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
-
-        setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy);
-      }
-      break;
-    default: assert(0);
-  }
-}
-
-// TODO(chengchen): if lossless, do not need to setup mask. But when
-// segments enabled, each segment has different lossless settings.
-void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
-                       int subsampling_x, int subsampling_y, int row_end,
-                       int col_end) {
-  const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2;
-  for (int y = 0; y < num_64x64; ++y) {
-    for (int x = 0; x < num_64x64; ++x) {
-      const int row = mi_row + y * MI_SIZE_64X64;
-      const int col = mi_col + x * MI_SIZE_64X64;
-      if (row >= row_end || col >= col_end) continue;
-      if ((row << MI_SIZE_LOG2) >= cm->height ||
-          (col << MI_SIZE_LOG2) >= cm->width)
-        continue;
-
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
-      if (lfm == NULL) return;
-
-      // init mask to zero
-      if (plane == 0) {
-        av1_zero(lfm->left_y);
-        av1_zero(lfm->above_y);
-        av1_zero(lfm->lfl_y_ver);
-        av1_zero(lfm->lfl_y_hor);
-      } else if (plane == 1) {
-        av1_zero(lfm->left_u);
-        av1_zero(lfm->above_u);
-        av1_zero(lfm->lfl_u_ver);
-        av1_zero(lfm->lfl_u_hor);
-      } else {
-        av1_zero(lfm->left_v);
-        av1_zero(lfm->above_v);
-        av1_zero(lfm->lfl_v_ver);
-        av1_zero(lfm->lfl_v_hor);
-      }
-    }
-  }
-
-  // set up bitmask for each superblock
-  setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane,
-                   subsampling_x, subsampling_y);
-
-  for (int y = 0; y < num_64x64; ++y) {
-    for (int x = 0; x < num_64x64; ++x) {
-      const int row = mi_row + y * MI_SIZE_64X64;
-      const int col = mi_col + x * MI_SIZE_64X64;
-      if (row >= row_end || col >= col_end) continue;
-      if ((row << MI_SIZE_LOG2) >= cm->height ||
-          (col << MI_SIZE_LOG2) >= cm->width)
-        continue;
-
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
-      if (lfm == NULL) return;
-
-      // check if the mask is valid
-      check_loop_filter_masks(lfm, plane);
-
-      {
-        // Let 16x16 hold 32x32 (Y/U/V) and 64x64(Y only).
-        // Even tx size is greater, we only apply max length filter, which
-        // is 16.
-        if (plane == 0) {
-          for (int j = 0; j < 4; ++j) {
-            lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j];
-            lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j];
-            lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j];
-            lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j];
-
-            // set 32x32 and 64x64 to 0
-            lfm->left_y[TX_32X32].bits[j] = 0;
-            lfm->left_y[TX_64X64].bits[j] = 0;
-            lfm->above_y[TX_32X32].bits[j] = 0;
-            lfm->above_y[TX_64X64].bits[j] = 0;
-          }
-        } else if (plane == 1) {
-          for (int j = 0; j < 4; ++j) {
-            lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j];
-            lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j];
-
-            // set 32x32 to 0
-            lfm->left_u[TX_32X32].bits[j] = 0;
-            lfm->above_u[TX_32X32].bits[j] = 0;
-          }
-        } else {
-          for (int j = 0; j < 4; ++j) {
-            lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j];
-            lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j];
-
-            // set 32x32 to 0
-            lfm->left_v[TX_32X32].bits[j] = 0;
-            lfm->above_v[TX_32X32].bits[j] = 0;
-          }
-        }
-      }
-
-      // check if the mask is valid
-      check_loop_filter_masks(lfm, plane);
-    }
-  }
-}
-
-static void filter_selectively_vert_row2(
-    int subsampling_factor, uint8_t *s, int pitch, int plane,
-    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
-    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
-    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
-  uint64_t mask;
-  const int step = 1 << subsampling_factor;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
-              mask_8x8_1 | mask_4x4_1;
-       mask; mask >>= step) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
-
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          if (plane) {
-            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          } else {
-            aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                     lfi1->hev_thr);
-          }
-        } else if (mask_16x16_0 & 1) {
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
-
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          if (plane) {
-            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          } else {
-            aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          }
-        } else if (mask_8x8_0 & 1) {
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_0 & 1) {
-          aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
-        }
-      }
-    }
-
-    s += 4;
-    lfl += step;
-    lfl2 += step;
-    mask_16x16_0 >>= step;
-    mask_8x8_0 >>= step;
-    mask_4x4_0 >>= step;
-    mask_16x16_1 >>= step;
-    mask_8x8_1 >>= step;
-    mask_4x4_1 >>= step;
-  }
-}
-
-static void highbd_filter_selectively_vert_row2(
-    int subsampling_factor, uint16_t *s, int pitch, int plane,
-    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
-    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
-    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
-  uint64_t mask;
-  const int step = 1 << subsampling_factor;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
-              mask_8x8_1 | mask_4x4_1;
-       mask; mask >>= step) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        HbdLpfFunc highbd_lpf_vertical =
-            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
-
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          if (plane) {
-            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                            lfi0->hev_thr, lfi1->mblim,
-                                            lfi1->lim, lfi1->hev_thr, bd);
-          }
-        } else if (mask_16x16_0 & 1) {
-          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                              bd);
-        } else {
-          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr, bd);
-        }
-      }
-
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        HbdLpfFunc highbd_lpf_vertical =
-            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
-
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          if (plane) {
-            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          }
-        } else if (mask_8x8_0 & 1) {
-          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                              bd);
-        } else {
-          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr, bd);
-        }
-      }
-
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_0 & 1) {
-          aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
-        } else {
-          aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
-        }
-      }
-    }
-
-    s += 4;
-    lfl += step;
-    lfl2 += step;
-    mask_16x16_0 >>= step;
-    mask_8x8_0 >>= step;
-    mask_4x4_0 >>= step;
-    mask_16x16_1 >>= step;
-    mask_8x8_1 >>= step;
-    mask_4x4_1 >>= step;
-  }
-}
-
-static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
-                                     int subsampling, uint64_t mask_16x16,
-                                     uint64_t mask_8x8, uint64_t mask_4x4,
-                                     const loop_filter_info_n *lfi_n,
-                                     const uint8_t *lfl) {
-  uint64_t mask;
-  int count;
-  const int step = 1 << subsampling;
-  const unsigned int two_block_mask = subsampling ? 5 : 3;
-  int offset = 0;
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-    // Next block's thresholds, when it is within current 64x64 block.
-    // If it is out of bound, its mask is zero, and it points to current edge's
-    // filter parameters, instead of next edge's.
-    int next_edge = step;
-    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
-    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
-
-    count = 1;
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_horizontal =
-            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
-
-        if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          } else {
-            aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, lfin->mblim, lfin->lim,
-                                       lfin->hev_thr);
-          }
-          count = 2;
-        } else {
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      } else if (mask_8x8 & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_horizontal =
-            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
-
-        if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          } else {
-            aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          }
-          count = 2;
-        } else {
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, lfin->mblim, lfin->lim,
-                                    lfin->hev_thr);
-          count = 2;
-        } else {
-          aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      }
-    }
-
-    s += 4 * count;
-    lfl += step * count;
-    mask_16x16 >>= step * count;
-    mask_8x8 >>= step * count;
-    mask_4x4 >>= step * count;
-    offset += step * count;
-  }
-}
-
-static void highbd_filter_selectively_horiz(
-    uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
-    uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
-    uint8_t *lfl, int bd) {
-  uint64_t mask;
-  int count;
-  const int step = 1 << subsampling;
-  const unsigned int two_block_mask = subsampling ? 5 : 3;
-  int offset = 0;
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-    // Next block's thresholds, when it is within current 64x64 block.
-    // If it is out of bound, its mask is zero, and it points to current edge's
-    // filter parameters, instead of next edge's.
-    int next_edge = step;
-    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
-    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
-
-    count = 1;
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        HbdLpfFunc highbd_lpf_horizontal =
-            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
-
-        if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                              lfi->hev_thr, lfin->mblim,
-                                              lfin->lim, lfin->hev_thr, bd);
-          }
-          count = 2;
-        } else {
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-        }
-      } else if (mask_8x8 & 1) {
-        HbdLpfFunc highbd_lpf_horizontal =
-            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
-
-        if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          }
-          count = 2;
-        } else {
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-        }
-      } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                             lfi->hev_thr, lfin->mblim,
-                                             lfin->lim, lfin->hev_thr, bd);
-          count = 2;
-        } else {
-          aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, bd);
-        }
-      }
-    }
-
-    s += 4 * count;
-    lfl += step * count;
-    mask_16x16 >>= step * count;
-    mask_8x8 >>= step * count;
-    mask_4x4 >>= step * count;
-    offset += step * count;
-  }
-}
-
-void av1_build_bitmask_vert_info(
-    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane) {
-  const int subsampling_x = plane_ptr->subsampling_x;
-  const int subsampling_y = plane_ptr->subsampling_y;
-  const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
-  const int is_uv = plane > 0;
-  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
-  uint8_t level, prev_level = 1;
-  uint64_t skip, prev_skip = 0;
-  uint64_t is_coding_block_border;
-
-  for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
-    const int mi_row = r << subsampling_y;
-    const int row = mi_row % MI_SIZE_64X64;
-    const int row_uv = row | subsampling_y;
-    int index = 0;
-    const int shift = get_index_shift(0, row, &index);
-
-    for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
-         c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
-      const int mi_col = c << subsampling_x;
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-
-      for (int col_in_unit = 0;
-           col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
-        const int x = (c + col_in_unit) << MI_SIZE_LOG2;
-        if (x >= plane_ptr->dst.width) break;
-        const int col = col_in_unit << subsampling_x;
-        const int col_uv = col | subsampling_x;
-        const uint64_t mask = ((uint64_t)1 << (shift | col));
-        skip = lfm->skip.bits[index] & mask;
-        is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
-        switch (plane) {
-          case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break;
-          case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break;
-          case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break;
-          default: assert(plane >= 0 && plane <= 2); return;
-        }
-        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
-          if (is_uv && ts == TX_64X64) continue;
-          if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
-            tx_size = ts;
-            break;
-          }
-        }
-        if ((c + col_in_unit > 0) && (level || prev_level) &&
-            (!prev_skip || !skip || is_coding_block_border)) {
-          const TX_SIZE min_tx_size =
-              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
-          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
-          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
-          switch (plane) {
-            case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
-            case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
-            case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
-            default: assert(plane >= 0 && plane <= 2); return;
-          }
-          if (level == 0 && prev_level != 0) {
-            switch (plane) {
-              case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break;
-              case 1: lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break;
-              case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break;
-              default: assert(plane >= 0 && plane <= 2); return;
-            }
-          }
-        }
-
-        // update prev info
-        prev_level = level;
-        prev_skip = skip;
-        prev_tx_size = tx_size;
-        // advance
-        col_in_unit += tx_size_wide_unit[tx_size];
-      }
-    }
-  }
-}
-
-void av1_build_bitmask_horz_info(
-    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane) {
-  const int subsampling_x = plane_ptr->subsampling_x;
-  const int subsampling_y = plane_ptr->subsampling_y;
-  const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
-  const int is_uv = plane > 0;
-  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
-  uint8_t level, prev_level = 1;
-  uint64_t skip, prev_skip = 0;
-  uint64_t is_coding_block_border;
-
-  for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
-    const int mi_col = c << subsampling_x;
-    const int col = mi_col % MI_SIZE_64X64;
-    const int col_uv = col | subsampling_x;
-
-    for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
-         r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
-      const int mi_row = r << subsampling_y;
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-
-      for (int r_in_unit = 0;
-           r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
-        const int y = (r + r_in_unit) << MI_SIZE_LOG2;
-        if (y >= plane_ptr->dst.height) break;
-        const int row = r_in_unit << subsampling_y;
-        const int row_uv = row | subsampling_y;
-        int index = 0;
-        const int shift = get_index_shift(col, row, &index);
-        const uint64_t mask = ((uint64_t)1 << shift);
-        skip = lfm->skip.bits[index] & mask;
-        is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
-        switch (plane) {
-          case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break;
-          case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break;
-          case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break;
-          default: assert(plane >= 0 && plane <= 2); return;
-        }
-        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
-          if (is_uv && ts == TX_64X64) continue;
-          if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
-            tx_size = ts;
-            break;
-          }
-        }
-        if ((r + r_in_unit > 0) && (level || prev_level) &&
-            (!prev_skip || !skip || is_coding_block_border)) {
-          const TX_SIZE min_tx_size =
-              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
-          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
-          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
-
-          switch (plane) {
-            case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
-            case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
-            case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
-            default: assert(plane >= 0 && plane <= 2); return;
-          }
-          if (level == 0 && prev_level != 0) {
-            switch (plane) {
-              case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break;
-              case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break;
-              case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break;
-              default: assert(plane >= 0 && plane <= 2); return;
-            }
-          }
-        }
-
-        // update prev info
-        prev_level = level;
-        prev_skip = skip;
-        prev_tx_size = tx_size;
-        // advance
-        r_in_unit += tx_size_high_unit[tx_size];
-      }
-    }
-  }
-}
-
-void av1_filter_block_plane_bitmask_vert(
-    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
-    int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  uint8_t *const buf0 = dst->buf;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int row_step = 1 << ssy;
-  const int two_row_step = 2 << ssy;
-  const int row_stride = dst->stride << MI_SIZE_LOG2;
-  const int two_row_stride = row_stride << 1;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  uint8_t *lfl2;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  assert(lfm);
-
-  // 1. vertical filtering. filter two rows at a time
-  for (int r = 0;
-       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
-       r += two_row_step) {
-    const int row = r | ssy;
-    const int row_next = row + row_step;
-    const int col = ssx;
-    int index = 0;
-    const int shift = get_index_shift(col, row, &index);
-    int index_next = 0;
-    const int shift_next = get_index_shift(col, row_next, &index_next);
-    const int has_next_row = row_next < cm->mi_rows;
-    switch (pl) {
-      case 0:
-        mask_16x16 = lfm->left_y[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_y[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_y[TX_4X4].bits[index];
-        lfl = &lfm->lfl_y_ver[row][col];
-        lfl2 = &lfm->lfl_y_ver[row_next][col];
-        break;
-      case 1:
-        mask_16x16 = lfm->left_u[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_u[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_u[TX_4X4].bits[index];
-        lfl = &lfm->lfl_u_ver[row][col];
-        lfl2 = &lfm->lfl_u_ver[row_next][col];
-        break;
-      case 2:
-        mask_16x16 = lfm->left_v[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_v[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_v[TX_4X4].bits[index];
-        lfl = &lfm->lfl_v_ver[row][col];
-        lfl2 = &lfm->lfl_v_ver[row_next][col];
-        break;
-      default: assert(pl >= 0 && pl <= 2); return;
-    }
-    uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
-    uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
-    uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
-    uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
-    uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
-    uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
-    if (!has_next_row) {
-      mask_16x16_1 = 0;
-      mask_8x8_1 = 0;
-      mask_4x4_1 = 0;
-    }
-
-    if (cm->seq_params.use_highbitdepth)
-      highbd_filter_selectively_vert_row2(
-          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
-          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
-    else
-      filter_selectively_vert_row2(
-          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
-          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
-    dst->buf += two_row_stride;
-  }
-  // reset buf pointer for horizontal filtering
-  dst->buf = buf0;
-}
-
-void av1_filter_block_plane_bitmask_horz(
-    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
-    int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  uint8_t *const buf0 = dst->buf;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int row_step = 1 << ssy;
-  const int row_stride = dst->stride << MI_SIZE_LOG2;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  assert(lfm);
-  for (int r = 0;
-       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
-       r += row_step) {
-    if (mi_row + r == 0) {
-      dst->buf += row_stride;
-      continue;
-    }
-    const int row = r | ssy;
-    const int col = ssx;
-    int index = 0;
-    const int shift = get_index_shift(col, row, &index);
-    switch (pl) {
-      case 0:
-        mask_16x16 = lfm->above_y[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_y[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_y[TX_4X4].bits[index];
-        lfl = &lfm->lfl_y_hor[row][col];
-        break;
-      case 1:
-        mask_16x16 = lfm->above_u[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_u[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_u[TX_4X4].bits[index];
-        lfl = &lfm->lfl_u_hor[row][col];
-        break;
-      case 2:
-        mask_16x16 = lfm->above_v[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_v[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_v[TX_4X4].bits[index];
-        lfl = &lfm->lfl_v_hor[row][col];
-        break;
-      default: assert(pl >= 0 && pl <= 2); return;
-    }
-    mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
-    mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
-    mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
-
-    if (cm->seq_params.use_highbitdepth)
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
-          mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
-    else
-      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
-    dst->buf += row_stride;
-  }
-  // reset buf pointer for next block
-  dst->buf = buf0;
-}
-
-void av1_filter_block_plane_ver(AV1_COMMON *const cm,
-                                struct macroblockd_plane *const plane_ptr,
-                                int pl, int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  int r, c;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int single_step = 1 << ssy;
-  const int r_step = 2 << ssy;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  uint8_t *lfl2;
-
-  // filter two rows at a time
-  for (r = 0; r < cm->seq_params.mib_size &&
-              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
-       r += r_step) {
-    for (c = 0; c < cm->seq_params.mib_size &&
-                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
-         c += MI_SIZE_64X64) {
-      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
-      assert(lfm);
-      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
-      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
-      int index = 0;
-      const int shift = get_index_shift(col, row, &index);
-      // current and next row should belong to the same mask_idx and index
-      // next row's shift
-      const int row_next = row + single_step;
-      int index_next = 0;
-      const int shift_next = get_index_shift(col, row_next, &index_next);
-      switch (pl) {
-        case 0:
-          mask_16x16 = lfm->left_y[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_y[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_y[TX_4X4].bits[index];
-          lfl = &lfm->lfl_y_ver[row][col];
-          lfl2 = &lfm->lfl_y_ver[row_next][col];
-          break;
-        case 1:
-          mask_16x16 = lfm->left_u[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_u[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_ver[row][col];
-          lfl2 = &lfm->lfl_u_ver[row_next][col];
-          break;
-        case 2:
-          mask_16x16 = lfm->left_v[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_v[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_ver[row][col];
-          lfl2 = &lfm->lfl_v_ver[row_next][col];
-          break;
-        default: assert(pl >= 0 && pl <= 2); return;
-      }
-      uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
-      uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
-      uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
-      uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
-      uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
-      uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
-
-      if (cm->seq_params.use_highbitdepth)
-        highbd_filter_selectively_vert_row2(
-            ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
-            mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-            &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
-      else
-        filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
-                                     mask_16x16_0, mask_8x8_0, mask_4x4_0,
-                                     mask_16x16_1, mask_8x8_1, mask_4x4_1,
-                                     &cm->lf_info, lfl, lfl2);
-      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
-    }
-    dst->buf += 2 * MI_SIZE * dst->stride;
-  }
-}
-
-void av1_filter_block_plane_hor(AV1_COMMON *const cm,
-                                struct macroblockd_plane *const plane_ptr,
-                                int pl, int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  int r, c;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int r_step = 1 << ssy;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-
-  for (r = 0; r < cm->seq_params.mib_size &&
-              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
-       r += r_step) {
-    for (c = 0; c < cm->seq_params.mib_size &&
-                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
-         c += MI_SIZE_64X64) {
-      if (mi_row + r == 0) continue;
-
-      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
-      assert(lfm);
-      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
-      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
-      int index = 0;
-      const int shift = get_index_shift(col, row, &index);
-      switch (pl) {
-        case 0:
-          mask_16x16 = lfm->above_y[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_y[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_y[TX_4X4].bits[index];
-          lfl = &lfm->lfl_y_hor[row][col];
-          break;
-        case 1:
-          mask_16x16 = lfm->above_u[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_u[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_hor[row][col];
-          break;
-        case 2:
-          mask_16x16 = lfm->above_v[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_v[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_hor[row][col];
-          break;
-        default: assert(pl >= 0 && pl <= 2); return;
-      }
-      mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
-      mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
-      mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
-
-      if (cm->seq_params.use_highbitdepth)
-        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
-                                        dst->stride, pl, ssx, mask_16x16,
-                                        mask_8x8, mask_4x4, &cm->lf_info, lfl,
-                                        (int)cm->seq_params.bit_depth);
-      else
-        filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                                 mask_8x8, mask_4x4, &cm->lf_info, lfl);
-      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
-    }
-    dst->buf += MI_SIZE * dst->stride;
-  }
-}
-#endif  // LOOP_FILTER_BITMASK
-
 static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
                                   const MB_MODE_INFO *const mbmi,
                                   const EDGE_DIR edge_dir, const int mi_row,
@@ -1958,7 +212,7 @@
     tx_size = mb_tx_size;
   }
 
-  // since in case of chrominance or non-square transorm need to convert
+  // since in case of chrominance or non-square transform need to convert
   // transform size into transform size in particular direction.
   // for vertical edge, filter direction is horizontal, for horizontal
   // edge, filter direction is vertical.
@@ -1977,7 +231,7 @@
 } AV1_DEBLOCKING_PARAMETERS;
 
 // Return TX_SIZE from get_transform_size(), so it is plane and direction
-// awared
+// aware
 static TX_SIZE set_lpf_parameters(
     AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step,
     const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
@@ -2002,7 +256,8 @@
   // and mi_col should be odd number for chroma plane.
   const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
   const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
-  MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+  MB_MODE_INFO **mi =
+      cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
   const MB_MODE_INFO *mbmi = mi[0];
   // If current mbmi is not correctly setup, return an invalid value to stop
   // filtering. One example is that if this tile is not coded, then its mbmi
@@ -2023,7 +278,7 @@
     // prepare outer edge parameters. deblock the edge if it's an edge of a TU
     {
       const uint32_t curr_level =
-          get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
+          av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
       const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
       uint32_t level = curr_level;
       if (coord) {
@@ -2038,12 +293,13 @@
               xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
 
           const uint32_t pv_lvl =
-              get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
+              av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
 
           const int pv_skip = mi_prev->skip && is_inter_block(mi_prev);
           const BLOCK_SIZE bsize =
               get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x,
                                    plane_ptr->subsampling_y);
+          assert(bsize < BLOCK_SIZES_ALL);
           const int prediction_masks = edge_dir == VERT_EDGE
                                            ? block_size_wide[bsize] - 1
                                            : block_size_high[bsize] - 1;
@@ -2091,21 +347,18 @@
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col) {
-  const int row_step = MI_SIZE >> MI_SIZE_LOG2;
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
   const int y_range = (MAX_MIB_SIZE >> scale_vert);
   const int x_range = (MAX_MIB_SIZE >> scale_horz);
-  const int use_highbitdepth = cm->seq_params.use_highbitdepth;
-  const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
-  for (int y = 0; y < y_range; y += row_step) {
+  for (int y = 0; y < y_range; y++) {
     uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
     for (int x = 0; x < x_range;) {
       // inner loop always filter vertical edges in a MI block. If MI size
       // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
-      // If 4x4 trasnform is used, it will then filter the internal edge
+      // If 4x4 transform is used, it will then filter the internal edge
       //  aligned with a 4x4 block
       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
@@ -2122,6 +375,9 @@
         tx_size = TX_4X4;
       }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+      const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+      const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
@@ -2166,6 +422,32 @@
         // no filtering
         default: break;
       }
+#else
+      switch (params.filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
+                             params.hev_thr);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          assert(plane != 0);
+          aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
+                             params.hev_thr);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
+                             params.hev_thr);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
+                              params.hev_thr);
+          break;
+        // no filtering
+        default: break;
+      }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
       // advance the destination pointer
       advance_units = tx_size_wide_unit[tx_size];
       x += advance_units;
@@ -2178,21 +460,18 @@
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col) {
-  const int col_step = MI_SIZE >> MI_SIZE_LOG2;
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
   const int y_range = (MAX_MIB_SIZE >> scale_vert);
   const int x_range = (MAX_MIB_SIZE >> scale_horz);
-  const int use_highbitdepth = cm->seq_params.use_highbitdepth;
-  const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
-  for (int x = 0; x < x_range; x += col_step) {
+  for (int x = 0; x < x_range; x++) {
     uint8_t *p = dst_ptr + x * MI_SIZE;
     for (int y = 0; y < y_range;) {
       // inner loop always filter vertical edges in a MI block. If MI size
       // is 8x8, it will first filter the vertical edge aligned with a 8x8
-      // block. If 4x4 trasnform is used, it will then filter the internal
+      // block. If 4x4 transform is used, it will then filter the internal
       // edge aligned with a 4x4 block
       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
@@ -2201,14 +480,17 @@
       AV1_DEBLOCKING_PARAMETERS params;
       memset(&params, 0, sizeof(params));
 
-      tx_size =
-          set_lpf_parameters(&params, (cm->mi_stride << scale_vert), cm, xd,
-                             HORZ_EDGE, curr_x, curr_y, plane, plane_ptr);
+      tx_size = set_lpf_parameters(
+          &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
+          curr_x, curr_y, plane, plane_ptr);
       if (tx_size == TX_INVALID) {
         params.filter_length = 0;
         tx_size = TX_4X4;
       }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+      const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+      const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
@@ -2254,6 +536,33 @@
         // no filtering
         default: break;
       }
+#else
+      switch (params.filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
+                               params.hev_thr);
+          break;
+        // apply 6-tap filtering
+        case 6:
+          assert(plane != 0);
+          aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
+                               params.hev_thr);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
+                               params.hev_thr);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
+                                params.hev_thr);
+          break;
+        // no filtering
+        default: break;
+      }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
       // advance the destination pointer
       advance_units = tx_size_high_unit[tx_size];
@@ -2269,19 +578,18 @@
                                       const MACROBLOCKD_PLANE *const plane_ptr,
                                       const uint32_t mi_row,
                                       const uint32_t mi_col) {
-  const int row_step = MI_SIZE >> MI_SIZE_LOG2;
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = cm->mi_rows >> scale_vert;
-  const int x_range = cm->mi_cols >> scale_horz;
-  for (int y = 0; y < y_range; y += row_step) {
+  const int y_range = cm->mi_params.mi_rows >> scale_vert;
+  const int x_range = cm->mi_params.mi_cols >> scale_horz;
+  for (int y = 0; y < y_range; y++) {
     uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
     for (int x = 0; x < x_range;) {
       // inner loop always filter vertical edges in a MI block. If MI size
       // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
-      // If 4x4 trasnform is used, it will then filter the internal edge
+      // If 4x4 transform is used, it will then filter the internal edge
       //  aligned with a 4x4 block
       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
@@ -2312,19 +620,18 @@
                                       const MACROBLOCKD_PLANE *const plane_ptr,
                                       const uint32_t mi_row,
                                       const uint32_t mi_col) {
-  const int col_step = MI_SIZE >> MI_SIZE_LOG2;
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = cm->mi_rows >> scale_vert;
-  const int x_range = cm->mi_cols >> scale_horz;
-  for (int x = 0; x < x_range; x += col_step) {
+  const int y_range = cm->mi_params.mi_rows >> scale_vert;
+  const int x_range = cm->mi_params.mi_cols >> scale_horz;
+  for (int x = 0; x < x_range; x++) {
     uint8_t *p = dst_ptr + x * MI_SIZE;
     for (int y = 0; y < y_range;) {
       // inner loop always filter vertical edges in a MI block. If MI size
       // is 8x8, it will first filter the vertical edge aligned with a 8x8
-      // block. If 4x4 trasnform is used, it will then filter the internal
+      // block. If 4x4 transform is used, it will then filter the internal
       // edge aligned with a 4x4 block
       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
@@ -2333,9 +640,9 @@
       AV1_DEBLOCKING_PARAMETERS params;
       memset(&params, 0, sizeof(params));
 
-      tx_size =
-          set_lpf_parameters(&params, (cm->mi_stride << scale_vert), cm, xd,
-                             HORZ_EDGE, curr_x, curr_y, plane, plane_ptr);
+      tx_size = set_lpf_parameters(
+          &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
+          curr_x, curr_y, plane, plane_ptr);
       if (tx_size == TX_INVALID) {
         params.filter_length = 0;
         tx_size = TX_4X4;
@@ -2351,17 +658,17 @@
 
 static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                              MACROBLOCKD *xd, int start, int stop,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                              int is_decoding,
 #endif
                              int plane_start, int plane_end) {
   struct macroblockd_plane *pd = xd->plane;
   const int col_start = 0;
-  const int col_end = cm->mi_cols;
+  const int col_end = cm->mi_params.mi_cols;
   int mi_row, mi_col;
   int plane;
 
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
   if (is_decoding) {
     cm->is_decoding = is_decoding;
     for (plane = plane_start; plane < plane_end; plane++) {
@@ -2460,23 +767,23 @@
 
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                            MACROBLOCKD *xd,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                            int is_decoding,
 #endif
                            int plane_start, int plane_end, int partial_frame) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 
   start_mi_row = 0;
-  mi_rows_to_filter = cm->mi_rows;
-  if (partial_frame && cm->mi_rows > 8) {
-    start_mi_row = cm->mi_rows >> 1;
+  mi_rows_to_filter = cm->mi_params.mi_rows;
+  if (partial_frame && cm->mi_params.mi_rows > 8) {
+    start_mi_row = cm->mi_params.mi_rows >> 1;
     start_mi_row &= 0xfffffff8;
-    mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+    mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8);
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
   av1_loop_filter_frame_init(cm, plane_start, plane_end);
   loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                    is_decoding,
 #endif
                    plane_start, plane_end);
diff --git a/libaom/av1/common/av1_loopfilter.h b/libaom/av1/common/av1_loopfilter.h
index ae4d372..ce26d16 100644
--- a/libaom/av1/common/av1_loopfilter.h
+++ b/libaom/av1/common/av1_loopfilter.h
@@ -33,11 +33,12 @@
   LF_PATH_SLOW,
 };
 
+enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR);
 typedef struct {
   uint64_t bits[4];
 } FilterMask;
 
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
 // This structure holds bit masks for all 4x4 blocks in a 64x64 region.
 // Each 1 bit represents a position in which we want to apply the loop filter.
 // For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4
@@ -76,7 +77,7 @@
   FilterMask tx_size_ver[2][5];
   FilterMask tx_size_hor[2][5];
 } LoopFilterMask;
-#endif  // LOOP_FILTER_BITMASK
+#endif  // CONFIG_LPF_MASK
 
 struct loopfilter {
   int filter_level[2];
@@ -97,11 +98,11 @@
 
   int combine_vert_horz_lf;
 
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
   LoopFilterMask *lfm;
   size_t lfm_num;
   int lfm_stride;
-#endif  // LOOP_FILTER_BITMASK
+#endif  // CONFIG_LPF_MASK
 };
 
 // Need to align this structure so when it is declared and
@@ -127,13 +128,13 @@
 void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
                                 int plane_end);
 
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                           struct macroblockd *mbd, int is_decoding,
+                           struct macroblockd *xd, int is_decoding,
                            int plane_start, int plane_end, int partial_frame);
 #else
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                           struct macroblockd *mbd, int plane_start,
+                           struct macroblockd *xd, int plane_start,
                            int plane_end, int partial_frame);
 #endif
 
@@ -156,14 +157,10 @@
   MACROBLOCKD *xd;
 } LFWorkerData;
 
-uint8_t get_filter_level(const struct AV1Common *cm,
-                         const loop_filter_info_n *lfi_n, const int dir_idx,
-                         int plane, const MB_MODE_INFO *mbmi);
-#if LOOP_FILTER_BITMASK
-void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
-                       int plane, int subsampling_x, int subsampling_y,
-                       int row_end, int col_end);
-
+uint8_t av1_get_filter_level(const struct AV1Common *cm,
+                             const loop_filter_info_n *lfi_n, const int dir_idx,
+                             int plane, const MB_MODE_INFO *mbmi);
+#if CONFIG_LPF_MASK
 void av1_filter_block_plane_ver(struct AV1Common *const cm,
                                 struct macroblockd_plane *const plane_ptr,
                                 int pl, int mi_row, int mi_col);
@@ -171,8 +168,7 @@
 void av1_filter_block_plane_hor(struct AV1Common *const cm,
                                 struct macroblockd_plane *const plane, int pl,
                                 int mi_row, int mi_col);
-LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
-                                     int mi_row, int mi_col);
+
 int get_index_shift(int mi_col, int mi_row, int *index);
 
 void av1_build_bitmask_vert_info(
@@ -191,23 +187,19 @@
     struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
     int pl, int mi_row, int mi_col);
 
-#endif  // LOOP_FILTER_BITMASK
+void av1_store_bitmask_univariant_tx(struct AV1Common *cm, int mi_row,
+                                     int mi_col, BLOCK_SIZE bsize,
+                                     MB_MODE_INFO *mbmi);
 
-extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
+void av1_store_bitmask_other_info(struct AV1Common *cm, int mi_row, int mi_col,
+                                  BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+                                  int is_horz_coding_block_border,
+                                  int is_vert_coding_block_border);
 
-extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
-
-extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL];
-
-extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL];
-
-// corresponds to entry id in table left_mask_univariant_reordered,
-// of block size mxn and TX_mxn.
-extern const int mask_id_table_vert_border[BLOCK_SIZES_ALL];
-
-extern const FilterMask left_mask_univariant_reordered[67];
-
-extern const FilterMask above_mask_univariant_reordered[67];
+void av1_store_bitmask_vartx(struct AV1Common *cm, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             MB_MODE_INFO *mbmi);
+#endif  // CONFIG_LPF_MASK
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/common/av1_rtcd_defs.pl b/libaom/av1/common/av1_rtcd_defs.pl
index d0e5030..296c6c5 100644
--- a/libaom/av1/common/av1_rtcd_defs.pl
+++ b/libaom/av1/common/av1_rtcd_defs.pl
@@ -36,20 +36,43 @@
 struct NN_CONFIG;
 typedef struct NN_CONFIG NN_CONFIG;
 
+enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION);
+#if CONFIG_NN_V2
+enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS);
+struct NN_CONFIG_V2;
+typedef struct NN_CONFIG_V2 NN_CONFIG_V2;
+struct FC_LAYER;
+typedef struct FC_LAYER FC_LAYER;
+#endif  // CONFIG_NN_V2
+
+struct CNN_CONFIG;
+typedef struct CNN_CONFIG CNN_CONFIG;
+struct CNN_LAYER_CONFIG;
+typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG;
+struct CNN_THREAD_DATA;
+typedef struct CNN_THREAD_DATA CNN_THREAD_DATA;
+struct CNN_BRANCH_CONFIG;
+typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG;
+struct CNN_MULTI_OUT;
+typedef struct CNN_MULTI_OUT CNN_MULTI_OUT;
+
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
                                      uint16_t *output_q3);
 
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+                                   int dst_stride, int alpha_q3, int bd);
+#endif
+
 typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
 
 typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
                                    int dst_stride, int alpha_q3);
 
-typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
-                                   int dst_stride, int alpha_q3, int bd);
 EOF
 }
 forward_decls qw/av1_common_forward_decls/;
@@ -67,17 +90,16 @@
 add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn";
 specialize qw/av1_convolve_horiz_rs sse4_1/;
 
-add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
-specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
+if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
+  specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
+
+  add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd";
+  specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2/;
+}
 
 add_proto qw/void av1_wiener_convolve_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
-
-add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps";
-
 specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
-specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
-specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
-
 
 # directional intra predictor functions
 add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
@@ -111,26 +133,22 @@
 
 #inv txfm
 add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector
-# mismatches.
-specialize qw/av1_inv_txfm_add ssse3 neon/;
+specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
 
-add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector
-# mismatches.
-specialize qw/av1_highbd_inv_txfm_add sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
 
-add_proto qw/void av1_highbd_inv_txfm_add_4x4/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_8x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_4x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add_4x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_8x4/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add_8x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_4x16/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add_4x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x4/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add_16x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1/;
 
 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -160,15 +178,15 @@
 add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 
-# directional intra predictor functions
-add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
-specialize qw/av1_highbd_dr_prediction_z1 avx2/;
-add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
-# TODO(niva213@gmail.com): Re-enable avx2 after fixing valgrind issue
-# https://crbug.com/aomedia/2316
-# specialize qw/av1_highbd_dr_prediction_z2 avx2/;
-add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
-specialize qw/av1_highbd_dr_prediction_z3 avx2/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  # directional intra predictor functions
+  add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
+  specialize qw/av1_highbd_dr_prediction_z1 avx2/;
+  add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
+  specialize qw/av1_highbd_dr_prediction_z2 avx2/;
+  add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
+  specialize qw/av1_highbd_dr_prediction_z3 avx2/;
+}
 
 # build compound seg mask functions
 add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
@@ -194,10 +212,17 @@
   # the transform coefficients are held in 32-bit
   # values, so the assembler code for  av1_block_error can no longer be used.
   add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/av1_block_error avx2/;
+  specialize qw/av1_block_error sse2 avx2 neon/;
+
+  add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size";
+  specialize qw/av1_block_error_lp avx2 neon/;
 
   add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/av1_quantize_fp sse2 avx2/;
+  specialize qw/av1_quantize_fp sse2 avx2 neon/;
+
+  add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan";
+  specialize qw/av1_quantize_lp avx2 neon/;
+
 
   add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/av1_quantize_fp_32x32 avx2/;
@@ -256,25 +281,29 @@
   #
   # Motion search
   #
-  add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg,  MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
-
   add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
 
-  add_proto qw/void av1_apply_temporal_filter/, "const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
-  specialize qw/av1_apply_temporal_filter sse4_1/;
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    specialize qw/av1_apply_temporal_filter_yuv sse4_1/;
+  }
 
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
+  }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
   # ENCODEMB INVOKE
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+    specialize qw/av1_highbd_block_error sse2 avx2/;
+  }
 
-  add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/av1_highbd_block_error sse2 avx2/;
-
-  add_proto qw/void av1_highbd_apply_temporal_filter/, "const uint8_t *yf, int y_stride, const uint8_t *yp, int y_buf_stride, const uint8_t *uf, const uint8_t *vf, int uv_stride, const uint8_t *up, const uint8_t *vp, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
-  specialize qw/av1_highbd_apply_temporal_filter sse4_1/;
-
-  add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
-  specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+    specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
+  }
 
   add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 
@@ -288,42 +317,57 @@
 
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
-  add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
+  add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
   specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
   add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
   specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
 
   # hash
-  add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
+  add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
   specialize qw/av1_get_crc32c_value sse4_2/;
 
   add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H";
   specialize qw/av1_compute_stats sse4_1 avx2/;
 
-  add_proto qw/void av1_compute_stats_highbd/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
-  specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void av1_compute_stats_highbd/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
+    specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+  }
+
+  add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+  specialize qw/av1_calc_proj_params avx2/;
 
   add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
   specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
 
-  add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
-  specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
-
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+    specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
+  }
   add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
 
-  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, float *const output";
+  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
   specialize qw/av1_nn_predict sse3/;
 }
 # end encoder functions
 
+# CNN functions
+
+add_proto qw/void av1_cnn_activate/, " float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
+add_proto qw/void av1_cnn_add/, " float **input, int channels, int width, int height, int stride, const float **add";
+add_proto qw/void av1_cnn_predict/, " const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
+add_proto qw/void av1_cnn_convolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step";
+add_proto qw/void av1_cnn_deconvolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
+add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std";
+
 # Deringing Functions
 
 add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
 add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift";
 
-add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
-add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
+add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
+add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
 
 # VS compiling for 32 bit targets does not support vector types in
 # structs as arguments, which makes the v256 type of the intrinsics
@@ -331,58 +375,61 @@
 if ($opts{config} !~ /libs-x86-win32-vs.*/) {
   specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
   specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
-  specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
-  specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
 }
 
 # WARPED_MOTION / GLOBAL_MOTION functions
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1 neon/;
+specialize qw/av1_warp_affine sse4_1 avx2 neon/;
 
-add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_highbd_warp_affine sse4_1/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+  specialize qw/av1_highbd_warp_affine sse4_1/;
+}
+
+add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
+specialize qw/av1_calc_frame_error sse2 avx2/;
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-  add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
-  specialize qw/compute_cross_correlation sse4_1 avx2/;
+  add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
+  specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
 }
 
 # LOOP_RESTORATION functions
 
-add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
-# TODO(b/141858830,b/141859709): neon is currently disabled due to use of
-# uninitialized memory.
-specialize qw/apply_selfguided_restoration sse4_1 avx2/;
+add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
 
 add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
                                  int sgr_params_idx, int bit_depth, int highbd";
-# TODO(b/141858830,b/141859709): neon is currently disabled due to use of
-# uninitialized memory.
-specialize qw/av1_selfguided_restoration sse4_1 avx2/;
+specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
 
 # CONVOLVE_ROUND/COMPOUND_ROUND functions
 
-add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd";
+}
 
-  add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
-  add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
 
   specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
@@ -393,15 +440,17 @@
   specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
   specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/;
   specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
-  specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
-  specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
-  specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
-  specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
-  specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
-  specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/;
-  specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/;
-  specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/;
-  specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/;
+  if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/;
+    specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/;
+    specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/;
+    specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/;
+    specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
+    specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
+    specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
+    specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
+    specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
+  }
 
 # INTRA_EDGE functions
 add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
@@ -415,8 +464,8 @@
 specialize qw/av1_upsample_intra_edge_high sse4_1/;
 
 # CFL
-add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size";
-specialize qw/get_subtract_average_fn sse2 avx2 neon vsx/;
+add_proto qw/cfl_subtract_average_fn cfl_get_subtract_average_fn/, "TX_SIZE tx_size";
+specialize qw/cfl_get_subtract_average_fn sse2 avx2 neon vsx/;
 
 add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
 specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
@@ -427,19 +476,21 @@
 add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size";
 specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/;
 
-add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
+  specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
 
-add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
+  add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
+  specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
 
-add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
+  add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
+  specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
 
-add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
-specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
+  add_proto qw/cfl_predict_hbd_fn cfl_get_predict_hbd_fn/, "TX_SIZE tx_size";
+  specialize qw/cfl_get_predict_hbd_fn ssse3 avx2 neon/;
+}
 
-add_proto qw/cfl_predict_hbd_fn get_predict_hbd_fn/, "TX_SIZE tx_size";
-specialize qw/get_predict_hbd_fn ssse3 avx2 neon/;
+add_proto qw/cfl_predict_lbd_fn cfl_get_predict_lbd_fn/, "TX_SIZE tx_size";
+specialize qw/cfl_get_predict_lbd_fn ssse3 avx2 neon/;
 
 1;
diff --git a/libaom/av1/common/blockd.c b/libaom/av1/common/blockd.c
index 2e796b6..00725ea 100644
--- a/libaom/av1/common/blockd.c
+++ b/libaom/av1/common/blockd.c
@@ -13,8 +13,8 @@
 
 #include "aom_ports/system_state.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
 
 PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
   if (!left_mi) return DC_PRED;
@@ -28,11 +28,12 @@
   return above_mi->mode;
 }
 
-void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                      int has_eob, int aoff, int loff) {
-  ENTROPY_CONTEXT *const a = pd->above_context + aoff;
-  ENTROPY_CONTEXT *const l = pd->left_context + loff;
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+                              struct macroblockd_plane *pd, int plane,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              int has_eob, int aoff, int loff) {
+  ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff;
+  ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff;
   const int txs_wide = tx_size_wide_unit[tx_size];
   const int txs_high = tx_size_high_unit[tx_size];
 
@@ -56,23 +57,18 @@
     memset(l, has_eob, sizeof(*l) * txs_high);
   }
 }
-void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize, const int num_planes) {
-  int i;
-  int nplanes;
-  int chroma_ref;
-  chroma_ref =
-      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y);
-  nplanes = 1 + (num_planes - 1) * chroma_ref;
-  for (i = 0; i < nplanes; i++) {
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                               const int num_planes) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref;
+  for (int i = 0; i < nplanes; i++) {
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE plane_bsize =
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    const int txs_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    const int txs_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
-    memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
-    memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
+    const int txs_wide = mi_size_wide[plane_bsize];
+    const int txs_high = mi_size_high[plane_bsize];
+    memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
+    memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
   }
 }
 
@@ -104,37 +100,3 @@
     xd->plane[i].subsampling_y = 1;
   }
 }
-
-const int16_t dr_intra_derivative[90] = {
-  // More evenly spread out angles and limited to 10-bit
-  // Values that are 0 will never be used
-  //                    Approx angle
-  0,    0, 0,        //
-  1023, 0, 0,        // 3, ...
-  547,  0, 0,        // 6, ...
-  372,  0, 0, 0, 0,  // 9, ...
-  273,  0, 0,        // 14, ...
-  215,  0, 0,        // 17, ...
-  178,  0, 0,        // 20, ...
-  151,  0, 0,        // 23, ... (113 & 203 are base angles)
-  132,  0, 0,        // 26, ...
-  116,  0, 0,        // 29, ...
-  102,  0, 0, 0,     // 32, ...
-  90,   0, 0,        // 36, ...
-  80,   0, 0,        // 39, ...
-  71,   0, 0,        // 42, ...
-  64,   0, 0,        // 45, ... (45 & 135 are base angles)
-  57,   0, 0,        // 48, ...
-  51,   0, 0,        // 51, ...
-  45,   0, 0, 0,     // 54, ...
-  40,   0, 0,        // 58, ...
-  35,   0, 0,        // 61, ...
-  31,   0, 0,        // 64, ...
-  27,   0, 0,        // 67, ... (67 & 157 are base angles)
-  23,   0, 0,        // 70, ...
-  19,   0, 0,        // 73, ...
-  15,   0, 0, 0, 0,  // 76, ...
-  11,   0, 0,        // 81, ...
-  7,    0, 0,        // 84, ...
-  3,    0, 0,        // 87, ...
-};
diff --git a/libaom/av1/common/blockd.h b/libaom/av1/common/blockd.h
index 91ef3df..47597bc 100644
--- a/libaom/av1/common/blockd.h
+++ b/libaom/av1/common/blockd.h
@@ -37,6 +37,8 @@
 
 #define MAX_DIFFWTD_MASK_BITS 1
 
+#define INTERINTRA_WEDGE_SIGN 0
+
 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
 enum {
   DIFFWTD_38 = 0,
@@ -73,24 +75,24 @@
 }
 
 static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
-  static PREDICTION_MODE lut[] = {
-    MB_MODE_COUNT,  // DC_PRED
-    MB_MODE_COUNT,  // V_PRED
-    MB_MODE_COUNT,  // H_PRED
-    MB_MODE_COUNT,  // D45_PRED
-    MB_MODE_COUNT,  // D135_PRED
-    MB_MODE_COUNT,  // D113_PRED
-    MB_MODE_COUNT,  // D157_PRED
-    MB_MODE_COUNT,  // D203_PRED
-    MB_MODE_COUNT,  // D67_PRED
-    MB_MODE_COUNT,  // SMOOTH_PRED
-    MB_MODE_COUNT,  // SMOOTH_V_PRED
-    MB_MODE_COUNT,  // SMOOTH_H_PRED
-    MB_MODE_COUNT,  // PAETH_PRED
-    MB_MODE_COUNT,  // NEARESTMV
-    MB_MODE_COUNT,  // NEARMV
-    MB_MODE_COUNT,  // GLOBALMV
-    MB_MODE_COUNT,  // NEWMV
+  static const PREDICTION_MODE lut[] = {
+    DC_PRED,        // DC_PRED
+    V_PRED,         // V_PRED
+    H_PRED,         // H_PRED
+    D45_PRED,       // D45_PRED
+    D135_PRED,      // D135_PRED
+    D113_PRED,      // D113_PRED
+    D157_PRED,      // D157_PRED
+    D203_PRED,      // D203_PRED
+    D67_PRED,       // D67_PRED
+    SMOOTH_PRED,    // SMOOTH_PRED
+    SMOOTH_V_PRED,  // SMOOTH_V_PRED
+    SMOOTH_H_PRED,  // SMOOTH_H_PRED
+    PAETH_PRED,     // PAETH_PRED
+    NEARESTMV,      // NEARESTMV
+    NEARMV,         // NEARMV
+    GLOBALMV,       // GLOBALMV
+    NEWMV,          // NEWMV
     NEARESTMV,      // NEAREST_NEARESTMV
     NEARMV,         // NEAR_NEARMV
     NEARESTMV,      // NEAREST_NEWMV
@@ -101,12 +103,12 @@
     NEWMV,          // NEW_NEWMV
   };
   assert(NELEMENTS(lut) == MB_MODE_COUNT);
-  assert(is_inter_compound_mode(mode));
+  assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode));
   return lut[mode];
 }
 
 static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
-  static PREDICTION_MODE lut[] = {
+  static const PREDICTION_MODE lut[] = {
     MB_MODE_COUNT,  // DC_PRED
     MB_MODE_COUNT,  // V_PRED
     MB_MODE_COUNT,  // H_PRED
@@ -187,13 +189,14 @@
   int64_t rdcost;
   int64_t sse;
   int skip;  // sse should equal to dist when skip == 1
-  int64_t ref_rdcost;
   int zero_rate;
-  uint8_t invalid_rate;
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost[MAX_MB_PLANE];
-  int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
-                        [TXB_COEFF_COST_MAP_SIZE];
+  // TODO(jingning): Temporary solution to silence stack over-size warning
+  // in handle_inter_mode. This should be fixed after rate-distortion
+  // optimization refactoring.
+  int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
+                            [TXB_COEFF_COST_MAP_SIZE];
 #endif  // CONFIG_RD_DEBUG
 } RD_STATS;
 
@@ -201,8 +204,8 @@
 // sent together in functions related to interinter compound modes
 typedef struct {
   uint8_t *seg_mask;
-  int wedge_index;
-  int wedge_sign;
+  int8_t wedge_index;
+  int8_t wedge_sign;
   DIFFWTD_MASK_TYPE mask_type;
   COMPOUND_TYPE type;
 } INTERINTER_COMPOUND_DATA;
@@ -211,40 +214,23 @@
 #define TXK_TYPE_BUF_LEN 64
 // This structure now relates to 4x4 block regions.
 typedef struct MB_MODE_INFO {
-  PALETTE_MODE_INFO palette_mode_info;
-  WarpedMotionParams wm_params;
   // interinter members
   INTERINTER_COMPOUND_DATA interinter_comp;
-  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  WarpedMotionParams wm_params;
   int_mv mv[2];
-  // Only for INTER blocks
-  InterpFilters interp_filters;
-  // TODO(debargha): Consolidate these flags
-  int interintra_wedge_index;
-  int interintra_wedge_sign;
-  int overlappable_neighbors[2];
   int current_qindex;
-  int delta_lf_from_base;
-  int delta_lf[FRAME_LF_COUNT];
+  // Only for INTER blocks
+  int_interpfilters interp_filters;
+  // TODO(debargha): Consolidate these flags
 #if CONFIG_RD_DEBUG
   RD_STATS rd_stats;
   int mi_row;
   int mi_col;
 #endif
-  int num_proj_ref;
-
-  // Index of the alpha Cb and alpha Cr combination
-  int cfl_alpha_idx;
-  // Joint sign of alpha Cb and alpha Cr
-  int cfl_alpha_signs;
-
-  // Indicate if masked compound is used(1) or not(0).
-  int comp_group_idx;
-  // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used.
-  int compound_idx;
 #if CONFIG_INSPECTION
   int16_t tx_skip[TXK_TYPE_BUF_LEN];
 #endif
+  PALETTE_MODE_INFO palette_mode_info;
   // Common for both INTER and INTRA blocks
   BLOCK_SIZE sb_type;
   PREDICTION_MODE mode;
@@ -254,21 +240,34 @@
   INTERINTRA_MODE interintra_mode;
   MOTION_MODE motion_mode;
   PARTITION_TYPE partition;
-  TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
   MV_REFERENCE_FRAME ref_frame[2];
-  int8_t use_wedge_interintra;
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
   int8_t skip;
-  int8_t skip_mode;
   uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
   TX_SIZE tx_size;
-  int8_t segment_id;
-  int8_t seg_id_predicted;  // valid only when temporal_update is enabled
-  uint8_t use_intrabc;
+  int8_t delta_lf_from_base;
+  int8_t delta_lf[FRAME_LF_COUNT];
+  int8_t interintra_wedge_index;
   // The actual prediction angle is the base angle + (angle_delta * step).
   int8_t angle_delta[PLANE_TYPES];
   /* deringing gain *per-superblock* */
-  int8_t cdef_strength;
-  uint8_t ref_mv_idx;
+  // Joint sign of alpha Cb and alpha Cr
+  int8_t cfl_alpha_signs;
+  // Index of the alpha Cb and alpha Cr combination
+  uint8_t cfl_alpha_idx;
+  uint8_t num_proj_ref;
+  uint8_t overlappable_neighbors[2];
+  // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used.
+  uint8_t compound_idx;
+  uint8_t use_wedge_interintra : 1;
+  uint8_t segment_id : 3;
+  uint8_t seg_id_predicted : 1;  // valid only when temporal_update is enabled
+  uint8_t skip_mode : 1;
+  uint8_t use_intrabc : 1;
+  uint8_t ref_mv_idx : 2;
+  // Indicate if masked compound is used(1) or not(0).
+  uint8_t comp_group_idx : 1;
+  int8_t cdef_strength : 4;
 } MB_MODE_INFO;
 
 static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
@@ -362,9 +361,9 @@
                                    int mi_row, int tx_blk_col, int tx_blk_row,
                                    int subsampling_x, int subsampling_y) {
   *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) +
-             (tx_blk_col << tx_size_wide_log2[0]);
+             (tx_blk_col << MI_SIZE_LOG2);
   *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) +
-             (tx_blk_row << tx_size_high_log2[0]);
+             (tx_blk_row << MI_SIZE_LOG2);
 }
 #endif
 
@@ -399,8 +398,8 @@
   int subsampling_y;
   struct buf_2d dst;
   struct buf_2d pre[2];
-  ENTROPY_CONTEXT *above_context;
-  ENTROPY_CONTEXT *left_context;
+  ENTROPY_CONTEXT *above_entropy_context;
+  ENTROPY_CONTEXT *left_entropy_context;
 
   // The dequantizers below are true dequantizers used only in the
   // dequantization process.  They have the same coefficient
@@ -413,16 +412,9 @@
 
   qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
   qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
-
-  // the 'dequantizers' below are not literal dequantizer values.
-  // They're used by encoder RDO to generate ad-hoc lambda values.
-  // They use a hardwired Q3 coeff shift and do not necessarily match
-  // the TX scale in use.
-  const int16_t *dequant_Q3;
 } MACROBLOCKD_PLANE;
 
-#define BLOCK_OFFSET(x, i) \
-  ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0])))
+#define BLOCK_OFFSET(i) ((i) << 4)
 
 typedef struct {
   DECLARE_ALIGNED(16, InterpKernel, vfilter);
@@ -467,16 +459,12 @@
   // Chroma subsampling
   int subsampling_x, subsampling_y;
 
-  int mi_row, mi_col;
-
   // Whether the reconstructed luma pixels need to be stored
   int store_y;
 
 #if CONFIG_DEBUG
   int rate;
 #endif  // CONFIG_DEBUG
-
-  int is_chroma_reference;
 } CFL_CTX;
 
 typedef struct dist_wtd_comp_params {
@@ -490,53 +478,129 @@
 // Most/all of the pointers are mere pointers to actual arrays are allocated
 // elsewhere. This is mostly for coding convenience.
 typedef struct macroblockd {
+  // Row and column position of current macroblock in mi units.
+  int mi_row;
+  int mi_col;
+  // Same as cm->mi_params.mi_stride, copied here for convenience.
+  int mi_stride;
+
+  // True if current block transmits chroma information.
+  // More detail:
+  // Smallest supported block size for both luma and chroma plane is 4x4. Hence,
+  // in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma
+  // blocks smaller than 8x8 may be combined into one chroma block.
+  // For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4
+  // luma blocks. Then, a single chroma block of size 4x4 will cover the area of
+  // these four luma blocks. This is implemented in bitstream as follows:
+  // - There are four MB_MODE_INFO structs for the four luma blocks.
+  // - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit
+  // any information for chroma planes.
+  // - Last block will have is_chroma_ref = true and transmits chroma
+  // information for the 4x4 chroma block that covers whole 8x8 area covered by
+  // four luma blocks.
+  // Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+  bool is_chroma_ref;
+
   struct macroblockd_plane plane[MAX_MB_PLANE];
 
   TileInfo tile;
 
-  int mi_stride;
-
+  // Appropriate offset inside cm->mi_params.mi_grid_base based on current
+  // mi_row and mi_col.
   MB_MODE_INFO **mi;
+
+  // True if 4x4 block above the current block is available.
+  bool up_available;
+  // True if 4x4 block to the left of the current block is available.
+  bool left_available;
+  // True if the above chroma reference block is available.
+  bool chroma_up_available;
+  // True if the left chroma reference block is available.
+  bool chroma_left_available;
+
+  // MB_MODE_INFO for 4x4 block to the left of the current block, if
+  // left_available == true; otherwise NULL.
   MB_MODE_INFO *left_mbmi;
+  // MB_MODE_INFO for 4x4 block above the current block, if
+  // up_available == true; otherwise NULL.
   MB_MODE_INFO *above_mbmi;
+  // Left chroma reference block if is_chroma_ref == true for the current block
+  // and chroma_left_available == true; otherwise NULL.
+  // See also: the special case logic when current chroma block covers more than
+  // one luma block in set_mi_row_col().
   MB_MODE_INFO *chroma_left_mbmi;
+  // Above chroma reference block if is_chroma_ref == true for the current block
+  // and chroma_up_available == true; otherwise NULL.
+  // See also: the special case logic when current chroma block covers more than
+  // one luma block in set_mi_row_col().
   MB_MODE_INFO *chroma_above_mbmi;
 
-  int up_available;
-  int left_available;
-  int chroma_up_available;
-  int chroma_left_available;
+  // Appropriate offset based on current 'mi_row' and 'mi_col', inside
+  // 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
+  // 'MACROBLOCK' structs.
+  uint8_t *tx_type_map;
+  // Stride for 'tx_type_map'. Note that this may / may not be same as
+  // 'mi_stride', depending on which actual array 'tx_type_map' points to.
+  int tx_type_map_stride;
 
-  /* Distance of MB away from frame edges in subpixels (1/8th pixel)  */
+  // Distance of this macroblock from frame edges in 1/8th pixel units.
   int mb_to_left_edge;
   int mb_to_right_edge;
   int mb_to_top_edge;
   int mb_to_bottom_edge;
 
-  /* pointers to reference frame scale factors */
+  // Scale factors for reference frames of the current block.
+  // These are pointers into 'cm->ref_scale_factors'.
   const struct scale_factors *block_ref_scale_factors[2];
 
-  /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
 
-  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][MAX_MIB_SIZE];
+  // Entropy contexts for the above blocks.
+  // above_entropy_context[i][j] corresponds to above entropy context for ith
+  // plane and jth mi column of this *frame*, wrt current 'mi_row'.
+  // These are pointers into 'cm->above_contexts.entropy'.
+  ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE];
+  // Entropy contexts for the left blocks.
+  // left_entropy_context[i][j] corresponds to left entropy context for ith
+  // plane and jth mi row of this *superblock*, wrt current 'mi_col'.
+  // Note: These contain actual data, NOT pointers.
+  ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE];
 
-  PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
+  // Partition contexts for the above blocks.
+  // above_partition_context[i] corresponds to above partition context for ith
+  // mi column of this *frame*, wrt current 'mi_row'.
+  // These are pointers into 'cm->above_contexts.partition'.
+  PARTITION_CONTEXT *above_partition_context;
+  // Partition contexts for the left blocks.
+  // left_partition_context[i] corresponds to left partition context for ith
+  // mi row of this *superblock*, wrt current 'mi_col'.
+  // Note: These contain actual data, NOT pointers.
+  PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE];
 
+  // Transform contexts for the above blocks.
+  // TODO(urvang): Indexed two different ways from cm->above_contexts.txfm in
+  // code currently. Need to make it consistent / document why.
   TXFM_CONTEXT *above_txfm_context;
+  // Transform contexts for the left blocks.
   TXFM_CONTEXT *left_txfm_context;
+  // TODO(urvang): 'left_txfm_context' points to 'left_txfm_context_buffer'.
+  // Can we remove this indirection?
   TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
 
+  // Default values for the two restoration filters for each plane.
+  // These values are used as reference values when writing the bitstream. That
+  // is, we transmit the delta between the actual values in
+  // cm->rst_info[plane].unit_info[unit_idx] and these reference values.
   WienerInfo wiener_info[MAX_MB_PLANE];
   SgrprojInfo sgrproj_info[MAX_MB_PLANE];
 
-  // block dimension in the unit of mode_info.
-  uint8_t n4_w, n4_h;
+  // Block dimensions in MB_MODE_INFO units.
+  uint8_t width;
+  uint8_t height;
 
   uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
   uint8_t is_sec_rect;
 
   // Counts of each reference frame in the above and left neighboring blocks.
@@ -544,15 +608,18 @@
   uint8_t neighbors_ref_counts[REF_FRAMES];
 
   FRAME_CONTEXT *tile_ctx;
-  /* Bit depth: 8, 10, 12 */
+  // Bit depth: copied from cm->seq_params.bit_depth for convenience.
   int bd;
 
   int qindex[MAX_SEGMENTS];
   int lossless[MAX_SEGMENTS];
+  // TODO(urvang): Move to decoder.
   int corrupted;
+  // Same as cm->features.cur_frame_force_integer_mv.
   int cur_frame_force_integer_mv;
-  // same with that in AV1_COMMON
+  // Pointer to cm->error.
   struct aom_internal_error_info *error_info;
+  // Same as cm->global_motion.
   const WarpedMotionParams *global_motion;
   int delta_qindex;
   int current_qindex;
@@ -562,7 +629,7 @@
   // filtering level) and code the delta between previous superblock's delta
   // lf and current delta lf. It is equivalent to the delta between previous
   // superblock's actual lf and current lf.
-  int delta_lf_from_base;
+  int8_t delta_lf_from_base;
   // For this experiment, we have four frame filter levels for different plane
   // and direction. So, to support the per superblock update, we need to add
   // a few more params as below.
@@ -576,8 +643,21 @@
   // SEG_LVL_ALT_LF_Y_H = 2;
   // SEG_LVL_ALT_LF_U   = 3;
   // SEG_LVL_ALT_LF_V   = 4;
-  int delta_lf[FRAME_LF_COUNT];
-  int cdef_preset[4];
+  int8_t delta_lf[FRAME_LF_COUNT];
+  // cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
+  // current superblock has already been read from (decoder) / written to
+  // (encoder) the bitstream; and false otherwise.
+  // More detail:
+  // (1) CDEF strength is transmitted only once per CDEF unit, in the 1st
+  // non-skip coding block. So, we need this array to keep track of whether CDEF
+  // strengths for the given CDEF units have been transmitted yet or not.
+  // (2) Superblock size can be either 128x128 or 64x64, but CDEF unit size is
+  // fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if
+  // superblock size is 128x128). Hence the array size is 4.
+  // (3) In the current implementation, CDEF strength for this CDEF unit is
+  // stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
+  // cm->mi_params.mi_grid_base).
+  bool cdef_transmitted[4];
 
   DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
   uint8_t *mc_buf[2];
@@ -677,6 +757,22 @@
   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
 };
 
+static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = {
+  0x080F,  // DC_PRED:       0000 1000 0000 1111
+  0x040F,  // V_PRED:        0000 0100 0000 1111
+  0x080F,  // H_PRED:        0000 1000 0000 1111
+  0x020F,  // D45_PRED:      0000 0010 0000 1111
+  0x080F,  // D135_PRED:     0000 1000 0000 1111
+  0x040F,  // D113_PRED:     0000 0100 0000 1111
+  0x080F,  // D157_PRED:     0000 1000 0000 1111
+  0x080F,  // D203_PRED:     0000 1000 0000 1111
+  0x040F,  // D67_PRED:      0000 0100 0000 1111
+  0x080F,  // SMOOTH_PRED:   0000 1000 0000 1111
+  0x040F,  // SMOOTH_V_PRED: 0000 0100 0000 1111
+  0x080F,  // SMOOTH_H_PRED: 0000 1000 0000 1111
+  0x0C0E,  // PAETH_PRED:    0000 1100 0000 1110
+};
+
 static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
   0x0001,  // 0000 0000 0000 0001
   0x0201,  // 0000 0010 0000 0001
@@ -686,6 +782,11 @@
   0xFFFF,  // 1111 1111 1111 1111
 };
 
+static const TxSetType av1_ext_tx_set_lookup[2][2] = {
+  { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX },
+  { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT },
+};
+
 static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
                                                 int use_reduced_set) {
   const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
@@ -695,13 +796,7 @@
   if (use_reduced_set)
     return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
   const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
-  if (is_inter) {
-    return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
-                                    : EXT_TX_SET_ALL16);
-  } else {
-    return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT4_IDTX
-                                    : EXT_TX_SET_DTT4_IDTX_1DDCT);
-  }
+  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 }
 
 // Maps tx set types to the indices.
@@ -740,7 +835,6 @@
     return largest_tx_size;
 }
 
-extern const int16_t dr_intra_derivative[90];
 static const uint8_t mode_to_angle_map[] = {
   0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0,
 };
@@ -785,45 +879,77 @@
 static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
                                               int subsampling_x,
                                               int subsampling_y) {
-  if (bsize == BLOCK_INVALID) return BLOCK_INVALID;
+  assert(bsize < BLOCK_SIZES_ALL);
+  assert(subsampling_x >= 0 && subsampling_x < 2);
+  assert(subsampling_y >= 0 && subsampling_y < 2);
   return ss_size_lookup[bsize][subsampling_x][subsampling_y];
 }
 
+/*
+ * Logic to generate the lookup tables:
+ *
+ * TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
+ *   txs = sub_tx_size_map[txs];
+ * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ * const int bw_uint_log2 = mi_size_wide_log2[bsize];
+ * const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ */
 static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row,
                                          int blk_col) {
-  TX_SIZE txs = max_txsize_rect_lookup[bsize];
-  for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
-    txs = sub_tx_size_map[txs];
-  const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
-  const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
-  const int bw_log2 = mi_size_wide_log2[bsize];
-  const int stride_log2 = bw_log2 - tx_w_log2;
+  static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = {
+    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3,
+  };
+  static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = {
+    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2,
+  };
+  static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = {
+    0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1,
+  };
   const int index =
-      ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+      ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) +
+      (blk_col >> tw_w_log2_table[bsize]);
   assert(index < INTER_TX_SIZE_BUF_LEN);
   return index;
 }
 
+#if CONFIG_INSPECTION
+/*
+ * Here is the logic to generate the lookup tables:
+ *
+ * TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ * for (int level = 0; level < MAX_VARTX_DEPTH; ++level)
+ *   txs = sub_tx_size_map[txs];
+ * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ * const int bw_uint_log2 = mi_size_wide_log2[bsize];
+ * const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ */
 static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row,
                                          int blk_col) {
-  TX_SIZE txs = max_txsize_rect_lookup[bsize];
-  for (int level = 0; level < MAX_VARTX_DEPTH; ++level)
-    txs = sub_tx_size_map[txs];
-  const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
-  const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
-  const int bw_uint_log2 = mi_size_wide_log2[bsize];
-  const int stride_log2 = bw_uint_log2 - tx_w_log2;
+  static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = {
+    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2,
+  };
+  static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = {
+    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2,
+  };
+  static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = {
+    0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2,
+  };
   const int index =
-      ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+      ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) +
+      (blk_col >> tw_w_log2_table[bsize]);
   assert(index < TXK_TYPE_BUF_LEN);
   return index;
 }
+#endif  // CONFIG_INSPECTION
 
-static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize,
-                                    int blk_row, int blk_col, TX_SIZE tx_size,
+static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row,
+                                    int blk_col, TX_SIZE tx_size,
                                     TX_TYPE tx_type) {
-  const int txk_type_idx = av1_get_txk_type_index(bsize, blk_row, blk_col);
-  txk_type[txk_type_idx] = tx_type;
+  const int stride = xd->tx_type_map_stride;
+  xd->tx_type_map[blk_row * stride + blk_col] = tx_type;
 
   const int txw = tx_size_wide_unit[tx_size];
   const int txh = tx_size_high_unit[tx_size];
@@ -836,71 +962,84 @@
     const int tx_unit = tx_size_wide_unit[TX_16X16];
     for (int idy = 0; idy < txh; idy += tx_unit) {
       for (int idx = 0; idx < txw; idx += tx_unit) {
-        const int this_index =
-            av1_get_txk_type_index(bsize, blk_row + idy, blk_col + idx);
-        txk_type[this_index] = tx_type;
+        xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type;
       }
     }
   }
 }
 
-static INLINE TX_TYPE av1_get_tx_type(PLANE_TYPE plane_type,
-                                      const MACROBLOCKD *xd, int blk_row,
+static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd,
+                                      PLANE_TYPE plane_type, int blk_row,
                                       int blk_col, TX_SIZE tx_size,
                                       int reduced_tx_set) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const struct macroblockd_plane *const pd = &xd->plane[plane_type];
-  const TxSetType tx_set_type =
-      av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
+  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
+    return DCT_DCT;
+  }
 
   TX_TYPE tx_type;
-  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
-    tx_type = DCT_DCT;
+  if (plane_type == PLANE_TYPE_Y) {
+    tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
   } else {
-    if (plane_type == PLANE_TYPE_Y) {
-      const int txk_type_idx =
-          av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
-      tx_type = mbmi->txk_type[txk_type_idx];
-    } else if (is_inter_block(mbmi)) {
+    if (is_inter_block(mbmi)) {
       // scale back to y plane's coordinate
+      const struct macroblockd_plane *const pd = &xd->plane[plane_type];
       blk_row <<= pd->subsampling_y;
       blk_col <<= pd->subsampling_x;
-      const int txk_type_idx =
-          av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
-      tx_type = mbmi->txk_type[txk_type_idx];
+      tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
     } else {
       // In intra mode, uv planes don't share the same prediction mode as y
       // plane, so the tx_type should not be shared
       tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
     }
+    const TxSetType tx_set_type =
+        av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
+    if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT;
   }
   assert(tx_type < TX_TYPES);
-  if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT;
+  assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi),
+                                                 reduced_tx_set)][tx_type]);
   return tx_type;
 }
 
 void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
                             const int num_planes);
 
+/*
+ * Logic to generate the lookup table:
+ *
+ * TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ * int depth = 0;
+ * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) {
+ *   depth++;
+ *   tx_size = sub_tx_size_map[tx_size];
+ * }
+ */
 static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) {
-  TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
-  int depth = 0;
-  while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) {
-    depth++;
-    tx_size = sub_tx_size_map[tx_size];
-  }
-  return depth;
+  static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = {
+    0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  };
+  return bsize_to_max_depth_table[bsize];
 }
 
+/*
+ * Logic to generate the lookup table:
+ *
+ * TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ * assert(tx_size != TX_4X4);
+ * int depth = 0;
+ * while (tx_size != TX_4X4) {
+ *   depth++;
+ *   tx_size = sub_tx_size_map[tx_size];
+ * }
+ * assert(depth < 10);
+ */
 static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) {
-  TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
-  assert(tx_size != TX_4X4);
-  int depth = 0;
-  while (tx_size != TX_4X4) {
-    depth++;
-    tx_size = sub_tx_size_map[tx_size];
-    assert(depth < 10);
-  }
+  assert(bsize < BLOCK_SIZES_ALL);
+  static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = {
+    0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4,
+  };
+  const int depth = bsize_to_tx_size_depth_table[bsize];
   assert(depth <= MAX_TX_CATS);
   return depth - 1;
 }
@@ -941,8 +1080,8 @@
                                pd->subsampling_y);
 }
 
-void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize, const int num_planes);
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                               const int num_planes);
 
 void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes);
 
@@ -953,9 +1092,10 @@
                                                   BLOCK_SIZE plane_bsize,
                                                   TX_SIZE tx_size, void *arg);
 
-void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                      int has_eob, int aoff, int loff);
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+                              struct macroblockd_plane *pd, int plane,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              int has_eob, int aoff, int loff);
 
 #define MAX_INTERINTRA_SB_SQUARE 32 * 32
 static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
@@ -1006,15 +1146,13 @@
 }
 
 static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
   return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
 }
 
 static INLINE int is_motion_variation_allowed_compound(
     const MB_MODE_INFO *mbmi) {
-  if (!has_second_ref(mbmi))
-    return 1;
-  else
-    return 0;
+  return !has_second_ref(mbmi);
 }
 
 // input: log2 of length, 0(4), 1(8), ...
@@ -1051,25 +1189,13 @@
   }
 }
 
-static INLINE void assert_motion_mode_valid(MOTION_MODE mode,
-                                            const WarpedMotionParams *gm_params,
-                                            const MACROBLOCKD *xd,
-                                            const MB_MODE_INFO *mbmi,
-                                            int allow_warped_motion) {
-  const MOTION_MODE last_motion_mode_allowed =
-      motion_mode_allowed(gm_params, xd, mbmi, allow_warped_motion);
-
-  // Check that the input mode is not illegal
-  if (last_motion_mode_allowed < mode)
-    assert(0 && "Illegal motion mode selected");
-}
-
 static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
   return (is_inter_block(mbmi));
 }
 
 static INLINE int av1_allow_palette(int allow_screen_content_tools,
                                     BLOCK_SIZE sb_type) {
+  assert(sb_type < BLOCK_SIZES_ALL);
   return allow_screen_content_tools && block_size_wide[sb_type] <= 64 &&
          block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8;
 }
diff --git a/libaom/av1/common/cdef.c b/libaom/av1/common/cdef.c
index 63f9883..ef7b866 100644
--- a/libaom/av1/common/cdef.c
+++ b/libaom/av1/common/cdef.c
@@ -16,45 +16,29 @@
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/cdef.h"
 #include "av1/common/cdef_block.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
 
-int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) {
-  int maxc, maxr;
-  int skip = 1;
-  maxc = cm->mi_cols - mi_col;
-  maxr = cm->mi_rows - mi_row;
-
-  maxr = AOMMIN(maxr, MI_SIZE_64X64);
-  maxc = AOMMIN(maxc, MI_SIZE_64X64);
-
-  for (int r = 0; r < maxr; r++) {
-    for (int c = 0; c < maxc; c++) {
-      skip =
-          skip &&
-          cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]->skip;
-    }
-  }
-  return skip;
-}
-
 static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
                              int mi_stride) {
-  int is_skip = 1;
-  for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r)
-    for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c)
-      is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->skip;
+  MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col;
+  for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) {
+    for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) {
+      if (!mbmi[c]->skip) return 0;
+    }
+  }
 
-  return is_skip;
+  return 1;
 }
 
-int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
-                         cdef_list *dlist, BLOCK_SIZE bs) {
-  MB_MODE_INFO **grid = cm->mi_grid_visible;
-  int maxc = cm->mi_cols - mi_col;
-  int maxr = cm->mi_rows - mi_row;
+int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, cdef_list *dlist,
+                             BLOCK_SIZE bs) {
+  MB_MODE_INFO **grid = mi_params->mi_grid_base;
+  int maxc = mi_params->mi_cols - mi_col;
+  int maxr = mi_params->mi_rows - mi_row;
 
   if (bs == BLOCK_128X128 || bs == BLOCK_128X64)
     maxc = AOMMIN(maxc, MI_SIZE_128X128);
@@ -65,19 +49,15 @@
   else
     maxr = AOMMIN(maxr, MI_SIZE_64X64);
 
-  const int r_step = mi_size_high[BLOCK_8X8];
-  const int c_step = mi_size_wide[BLOCK_8X8];
-  const int r_shift = (r_step == 2);
-  const int c_shift = (c_step == 2);
-
-  assert(r_step == 1 || r_step == 2);
-  assert(c_step == 1 || c_step == 2);
-
+  const int r_step = 2;  // mi_size_high[BLOCK_8X8]
+  const int c_step = 2;  // mi_size_wide[BLOCK_8X8]
+  const int r_shift = 1;
+  const int c_shift = 1;
   int count = 0;
-
   for (int r = 0; r < maxr; r += r_step) {
     for (int c = 0; c < maxc; c += c_step) {
-      if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) {
+      if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c,
+                             mi_params->mi_stride)) {
         dlist[count].by = r >> r_shift;
         dlist[count].bx = c >> c_shift;
         count++;
@@ -87,8 +67,9 @@
   return count;
 }
 
-void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
-                                int sstride, int v, int h) {
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride,
+                                     const uint8_t *src, int sstride, int v,
+                                     int h) {
   for (int i = 0; i < v; i++) {
     for (int j = 0; j < h; j++) {
       dst[i * dstride + j] = src[i * sstride + j];
@@ -96,9 +77,9 @@
   }
 }
 
-void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
-                                 const uint16_t *src, int sstride, int v,
-                                 int h) {
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
+                                      const uint16_t *src, int sstride, int v,
+                                      int h) {
   for (int i = 0; i < v; i++) {
     for (int j = 0; j < h; j++) {
       dst[i * dstride + j] = src[i * sstride + j];
@@ -112,10 +93,10 @@
   if (cm->seq_params.use_highbitdepth) {
     const uint16_t *base =
         &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
-    copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+    cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
   } else {
     const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
-    copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+    cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
   }
 }
 
@@ -140,6 +121,7 @@
 void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                     MACROBLOCKD *xd) {
   const CdefInfo *const cdef_info = &cm->cdef_info;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int num_planes = av1_num_planes(cm);
   DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
   uint16_t *linebuf[3];
@@ -154,8 +136,8 @@
   int xdec[3];
   int ydec[3];
   int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
-  const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
                        num_planes);
   row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
@@ -168,7 +150,7 @@
     mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
     mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
   }
-  const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
+  const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
   for (int pli = 0; pli < num_planes; pli++) {
     linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
     colbuf[pli] =
@@ -190,17 +172,18 @@
       int nhb, nvb;
       int cstart = 0;
       curr_row_cdef[fbc] = 0;
-      if (cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
-                              MI_SIZE_64X64 * fbc] == NULL ||
-          cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
-                              MI_SIZE_64X64 * fbc]
+      if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                                  MI_SIZE_64X64 * fbc] == NULL ||
+          mi_params
+                  ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                                 MI_SIZE_64X64 * fbc]
                   ->cdef_strength == -1) {
         cdef_left = 0;
         continue;
       }
       if (!cdef_left) cstart = -CDEF_HBORDER;
-      nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
-      nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
+      nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+      nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
       int frame_top, frame_left, frame_bottom, frame_right;
 
       int mi_row = MI_SIZE_64X64 * fbr;
@@ -218,18 +201,19 @@
       frame_left = (mi_col == 0) ? 1 : 0;
 
       if (fbr != nvfb - 1)
-        frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0;
+        frame_bottom = (mi_row + MI_SIZE_64X64 == mi_params->mi_rows) ? 1 : 0;
       else
         frame_bottom = 1;
 
       if (fbc != nhfb - 1)
-        frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0;
+        frame_right = (mi_col + MI_SIZE_64X64 == mi_params->mi_cols) ? 1 : 0;
       else
         frame_right = 1;
 
       const int mbmi_cdef_strength =
-          cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
-                              MI_SIZE_64X64 * fbc]
+          mi_params
+              ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                             MI_SIZE_64X64 * fbc]
               ->cdef_strength;
       level =
           cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
@@ -243,9 +227,9 @@
       uv_sec_strength += uv_sec_strength == 3;
       if ((level == 0 && sec_strength == 0 && uv_level == 0 &&
            uv_sec_strength == 0) ||
-          (cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
-                                             fbc * MI_SIZE_64X64, dlist,
-                                             BLOCK_64X64)) == 0) {
+          (cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64,
+                                                 fbc * MI_SIZE_64X64, dlist,
+                                                 BLOCK_64X64)) == 0) {
         cdef_left = 0;
         continue;
       }
@@ -254,8 +238,7 @@
       for (int pli = 0; pli < num_planes; pli++) {
         int coffset;
         int rend, cend;
-        int pri_damping = cdef_info->cdef_pri_damping;
-        int sec_damping = cdef_info->cdef_sec_damping;
+        int damping = cdef_info->cdef_damping;
         int hsize = nhb << mi_wide_l2[pli];
         int vsize = nvb << mi_high_l2[pli];
 
@@ -366,7 +349,7 @@
         }
 
         if (cm->seq_params.use_highbitdepth) {
-          cdef_filter_fb(
+          av1_cdef_filter_fb(
               NULL,
               &CONVERT_TO_SHORTPTR(
                   xd->plane[pli]
@@ -376,9 +359,9 @@
               xd->plane[pli].dst.stride,
               &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
               ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
-              sec_strength, pri_damping, sec_damping, coeff_shift);
+              sec_strength, damping, coeff_shift);
         } else {
-          cdef_filter_fb(
+          av1_cdef_filter_fb(
               &xd->plane[pli]
                    .dst.buf[xd->plane[pli].dst.stride *
                                 (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
@@ -386,7 +369,7 @@
               NULL, xd->plane[pli].dst.stride,
               &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
               ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
-              sec_strength, pri_damping, sec_damping, coeff_shift);
+              sec_strength, damping, coeff_shift);
         }
       }
       cdef_left = 1;
diff --git a/libaom/av1/common/cdef.h b/libaom/av1/common/cdef.h
index 3b2eac8..c36fd13 100644
--- a/libaom/av1/common/cdef.h
+++ b/libaom/av1/common/cdef.h
@@ -20,8 +20,8 @@
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/cdef_block.h"
-#include "av1/common/onyxc_int.h"
 
 static INLINE int sign(int i) { return i < 0 ? -1 : 1; }
 
@@ -37,13 +37,14 @@
 extern "C" {
 #endif
 
-int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
-int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
-                         cdef_list *dlist, BLOCK_SIZE bsize);
+int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, cdef_list *dlist,
+                             BLOCK_SIZE bsize);
 void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
 
 void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
-                     AV1_COMMON *cm, MACROBLOCKD *xd, int fast);
+                     AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
+                     int rdmult);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/common/cdef_block.c b/libaom/av1/common/cdef_block.c
index dfd5882..7120705 100644
--- a/libaom/av1/common/cdef_block.c
+++ b/libaom/av1/common/cdef_block.c
@@ -108,7 +108,7 @@
 }
 
 const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
-const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
+const int cdef_sec_taps[2] = { 2, 1 };
 
 /* Smooth in the direction detected. */
 void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
@@ -118,7 +118,7 @@
   int i, j, k;
   const int s = CDEF_BSTRIDE;
   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
-  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
   for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) {
     for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) {
       int16_t sum = 0;
@@ -173,25 +173,20 @@
   return var ? (strength * (4 + i) + 8) >> 4 : 0;
 }
 
-void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
-                    int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
-                    int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
-                    cdef_list *dlist, int cdef_count, int level,
-                    int sec_strength, int pri_damping, int sec_damping,
-                    int coeff_shift) {
+void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
+                        uint16_t *in, int xdec, int ydec,
+                        int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+                        int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+                        cdef_list *dlist, int cdef_count, int level,
+                        int sec_strength, int damping, int coeff_shift) {
   int bi;
   int bx;
   int by;
-  int bsize, bsizex, bsizey;
-
-  int pri_strength = level << coeff_shift;
+  const int pri_strength = level << coeff_shift;
   sec_strength <<= coeff_shift;
-  sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
-  pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
-  bsize =
-      ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
-  bsizex = 3 - xdec;
-  bsizey = 3 - ydec;
+  damping += coeff_shift - (pli != AOM_PLANE_Y);
+  const int bw_log2 = 3 - xdec;
+  const int bh_log2 = 3 - ydec;
   if (dirinit && pri_strength == 0 && sec_strength == 0) {
     // If we're here, both primary and secondary strengths are 0, and
     // we still haven't written anything to y[] yet, so we just copy
@@ -200,12 +195,12 @@
     for (bi = 0; bi < cdef_count; bi++) {
       by = dlist[bi].by;
       bx = dlist[bi].bx;
-      int iy, ix;
       // TODO(stemidts/jmvalin): SIMD optimisations
-      for (iy = 0; iy < 1 << bsizey; iy++)
-        for (ix = 0; ix < 1 << bsizex; ix++)
-          dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
-              in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix];
+      for (int iy = 0; iy < 1 << bh_log2; iy++) {
+        memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)],
+               &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)],
+               ((size_t)1 << bw_log2) * sizeof(*dst16));
+      }
     }
     return;
   }
@@ -231,25 +226,28 @@
     }
   }
 
+  const int bsize =
+      ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
+  const int t = pri_strength;
+  const int s = sec_strength;
   for (bi = 0; bi < cdef_count; bi++) {
-    int t = pri_strength;
-    int s = sec_strength;
     by = dlist[bi].by;
     bx = dlist[bi].bx;
-    if (dst8)
+    if (dst8) {
       cdef_filter_block(
-          &dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL, dstride,
-          &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+          &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], NULL, dstride,
+          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)],
           (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
-          pri_damping, sec_damping, bsize, coeff_shift);
-    else
+          damping, damping, bsize, coeff_shift);
+    } else {
       cdef_filter_block(
           NULL,
-          &dst16[dirinit ? bi << (bsizex + bsizey)
-                         : (by << bsizey) * dstride + (bx << bsizex)],
-          dirinit ? 1 << bsizex : dstride,
-          &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+          &dst16[dirinit ? bi << (bw_log2 + bh_log2)
+                         : (by << bh_log2) * dstride + (bx << bw_log2)],
+          dirinit ? 1 << bw_log2 : dstride,
+          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)],
           (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
-          pri_damping, sec_damping, bsize, coeff_shift);
+          damping, damping, bsize, coeff_shift);
+    }
   }
 }
diff --git a/libaom/av1/common/cdef_block.h b/libaom/av1/common/cdef_block.h
index 8321d48..6b0ae0a 100644
--- a/libaom/av1/common/cdef_block.h
+++ b/libaom/av1/common/cdef_block.h
@@ -32,7 +32,7 @@
   (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
 
 extern const int cdef_pri_taps[2][2];
-extern const int cdef_sec_taps[2][2];
+extern const int cdef_sec_taps[2];
 DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
 
 typedef struct {
@@ -49,10 +49,10 @@
 void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
                               cdef_list *dlist, int cdef_count, int bsize);
 
-void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
-                    int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
-                    int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
-                    cdef_list *dlist, int cdef_count, int level,
-                    int sec_strength, int pri_damping, int sec_damping,
-                    int coeff_shift);
+void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
+                        uint16_t *in, int xdec, int ydec,
+                        int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+                        int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+                        cdef_list *dlist, int cdef_count, int level,
+                        int sec_strength, int damping, int coeff_shift);
 #endif  // AOM_AV1_COMMON_CDEF_BLOCK_H_
diff --git a/libaom/av1/common/cdef_block_simd.h b/libaom/av1/common/cdef_block_simd.h
index a3368ec..5a52bc1 100644
--- a/libaom/av1/common/cdef_block_simd.h
+++ b/libaom/av1/common/cdef_block_simd.h
@@ -238,7 +238,7 @@
   int s2o2 = cdef_directions[(dir + 6) & 7][1];
 
   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
-  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
 
   if (pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -405,7 +405,7 @@
   int s2o2 = cdef_directions[(dir + 6) & 7][1];
 
   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
-  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
 
   if (pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -551,7 +551,7 @@
   int s2o2 = cdef_directions[(dir + 6) & 7][1];
 
   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
-  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
 
   if (pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -708,7 +708,7 @@
   int s2o2 = cdef_directions[(dir + 6) & 7][1];
 
   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
-  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
 
   if (pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -882,9 +882,9 @@
   }
 }
 
-void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
-                                         const uint8_t *src, int sstride, int v,
-                                         int h) {
+void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
+                                              const uint8_t *src, int sstride,
+                                              int v, int h) {
   int i, j;
   for (i = 0; i < v; i++) {
     for (j = 0; j < (h & ~0x7); j += 8) {
@@ -897,9 +897,9 @@
   }
 }
 
-void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
-                                          const uint16_t *src, int sstride,
-                                          int v, int h) {
+void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
+                                               const uint16_t *src, int sstride,
+                                               int v, int h) {
   int i, j;
   for (i = 0; i < v; i++) {
     for (j = 0; j < (h & ~0x7); j += 8) {
diff --git a/libaom/av1/common/cfl.c b/libaom/av1/common/cfl.c
index 65e18e8..98199cb 100644
--- a/libaom/av1/common/cfl.c
+++ b/libaom/av1/common/cfl.c
@@ -9,9 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/cfl.h"
 #include "av1/common/common_data.h"
-#include "av1/common/onyxc_int.h"
 
 #include "config/av1_rtcd.h"
 
@@ -136,7 +136,7 @@
 
 CFL_SUB_AVG_FN(c)
 
-static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
+static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign,
                                    CFL_PRED_TYPE pred_type) {
   const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign)
                                                    : CFL_SIGN_V(joint_sign);
@@ -160,6 +160,7 @@
 
 CFL_PREDICT_FN(c, lbd)
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
                        int alpha_q3, int bit_depth, int width, int height) {
   for (int j = 0; j < height; j++) {
@@ -173,6 +174,7 @@
 }
 
 CFL_PREDICT_FN(c, hbd)
+#endif
 
 static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
   CFL_CTX *const cfl = &xd->cfl;
@@ -180,7 +182,7 @@
   assert(cfl->are_parameters_computed == 0);
 
   cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]);
-  get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);
+  cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);
   cfl->are_parameters_computed = 1;
 }
 
@@ -196,13 +198,15 @@
       cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
   assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
          CFL_BUF_SQUARE);
+#if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
-    get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
-                                xd->bd);
+    cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride,
+                                    alpha_q3, xd->bd);
     return;
   }
-  get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
+#endif
+  cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
 }
 
 static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input,
@@ -248,6 +252,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input,
                                            int input_stride,
                                            uint16_t *output_q3, int width,
@@ -290,9 +295,11 @@
     output_q3 += CFL_BUF_LINE;
   }
 }
+#endif
 
 CFL_GET_SUBSAMPLE_FUNCTION(c)
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
                                                        int sub_x, int sub_y) {
   if (sub_x == 1) {
@@ -303,6 +310,7 @@
   }
   return cfl_get_luma_subsampling_444_hbd(tx_size);
 }
+#endif
 
 static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
                                                        int sub_x, int sub_y) {
@@ -319,7 +327,7 @@
                       int row, int col, TX_SIZE tx_size, int use_hbd) {
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
-  const int tx_off_log2 = tx_size_wide_log2[0];
+  const int tx_off_log2 = MI_SIZE_LOG2;
   const int sub_x = cfl->subsampling_x;
   const int sub_y = cfl->subsampling_y;
   const int store_row = row << (tx_off_log2 - sub_y);
@@ -348,7 +356,7 @@
   // Store the input into the CfL pixel buffer
   uint16_t *recon_buf_q3 =
       cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
-
+#if CONFIG_AV1_HIGHBITDEPTH
   if (use_hbd) {
     cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
                                                input_stride, recon_buf_q3);
@@ -356,20 +364,25 @@
     cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
                                                recon_buf_q3);
   }
+#else
+  (void)use_hbd;
+  cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3);
+#endif
 }
 
 // Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
 // and non-chroma-referenced blocks are stored together in the CfL buffer.
-static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out,
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row,
+                                        int mi_col, int *row_out,
                                         int *col_out) {
   // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
-  if ((cfl->mi_row & 0x01) && cfl->subsampling_y) {
+  if ((mi_row & 0x01) && cfl->subsampling_y) {
     assert(*row_out == 0);
     (*row_out)++;
   }
 
   // Increment col index for right: 4x8, 4x16 or both right 4x4s.
-  if ((cfl->mi_col & 0x01) && cfl->subsampling_x) {
+  if ((mi_col & 0x01) && cfl->subsampling_x) {
     assert(*col_out == 0);
     (*col_out)++;
   }
@@ -379,18 +392,33 @@
                   BLOCK_SIZE bsize) {
   CFL_CTX *const cfl = &xd->cfl;
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
-  uint8_t *dst =
-      &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+  uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
 
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     // Only dimensions of size 4 can have an odd offset.
     assert(!((col & 1) && tx_size_wide[tx_size] != 4));
     assert(!((row & 1) && tx_size_high[tx_size] != 4));
-    sub8x8_adjust_offset(cfl, &row, &col);
+    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
   }
   cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
 }
 
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+                                        BLOCK_SIZE plane_bsize, int plane,
+                                        TX_SIZE tx_size) {
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
+                              << MI_SIZE_LOG2;
+  return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
+}
+
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         TX_SIZE tx_size) {
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
+                              << MI_SIZE_LOG2;
+  return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
+}
+
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
   CFL_CTX *const cfl = &xd->cfl;
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
@@ -398,7 +426,7 @@
   int col = 0;
 
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
-    sub8x8_adjust_offset(cfl, &row, &col);
+    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
   }
   const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
   const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
diff --git a/libaom/av1/common/cfl.h b/libaom/av1/common/cfl.h
index 3b91d85..a1d6dc2 100644
--- a/libaom/av1/common/cfl.h
+++ b/libaom/av1/common/cfl.h
@@ -12,8 +12,8 @@
 #ifndef AOM_AV1_COMMON_CFL_H_
 #define AOM_AV1_COMMON_CFL_H_
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
 
 // Can we use CfL for the current block?
 static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
@@ -41,7 +41,7 @@
 
   if (cm->seq_params.monochrome) return CFL_DISALLOWED;
 
-  if (!xd->cfl.is_chroma_reference) {
+  if (!xd->is_chroma_ref) {
     // For non-chroma-reference blocks, we should always store the luma pixels,
     // in case the corresponding chroma-reference block uses CfL.
     // Note that this can only happen for block sizes which are <8 on
@@ -89,7 +89,7 @@
 // will be constant allowing for loop unrolling and other constant propagated
 // goodness.
 #define CFL_SUBSAMPLE(arch, sub, bd, width, height)                       \
-  void subsample_##bd##_##sub##_##width##x##height##_##arch(              \
+  void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch(          \
       const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
     cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
                                                output_q3, width, height); \
@@ -119,31 +119,32 @@
 
 // Declare an architecture-specific array of function pointers for size-specific
 // wrappers.
-#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                       \
-  static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {      \
-    subsample_##bd##_##sub##_4x4_##arch,   /* 4x4 */                      \
-    subsample_##bd##_##sub##_8x8_##arch,   /* 8x8 */                      \
-    subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */                    \
-    subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */                    \
-    NULL,                                  /* 64x64 (invalid CFL size) */ \
-    subsample_##bd##_##sub##_4x8_##arch,   /* 4x8 */                      \
-    subsample_##bd##_##sub##_8x4_##arch,   /* 8x4 */                      \
-    subsample_##bd##_##sub##_8x16_##arch,  /* 8x16 */                     \
-    subsample_##bd##_##sub##_16x8_##arch,  /* 16x8 */                     \
-    subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */                    \
-    subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */                    \
-    NULL,                                  /* 32x64 (invalid CFL size) */ \
-    NULL,                                  /* 64x32 (invalid CFL size) */ \
-    subsample_##bd##_##sub##_4x16_##arch,  /* 4x16  */                    \
-    subsample_##bd##_##sub##_16x4_##arch,  /* 16x4  */                    \
-    subsample_##bd##_##sub##_8x32_##arch,  /* 8x32  */                    \
-    subsample_##bd##_##sub##_32x8_##arch,  /* 32x8  */                    \
-    NULL,                                  /* 16x64 (invalid CFL size) */ \
-    NULL,                                  /* 64x16 (invalid CFL size) */ \
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                           \
+  static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {          \
+    cfl_subsample_##bd##_##sub##_4x4_##arch,   /* 4x4 */                      \
+    cfl_subsample_##bd##_##sub##_8x8_##arch,   /* 8x8 */                      \
+    cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */                    \
+    cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */                    \
+    NULL,                                      /* 64x64 (invalid CFL size) */ \
+    cfl_subsample_##bd##_##sub##_4x8_##arch,   /* 4x8 */                      \
+    cfl_subsample_##bd##_##sub##_8x4_##arch,   /* 8x4 */                      \
+    cfl_subsample_##bd##_##sub##_8x16_##arch,  /* 8x16 */                     \
+    cfl_subsample_##bd##_##sub##_16x8_##arch,  /* 16x8 */                     \
+    cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */                    \
+    cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */                    \
+    NULL,                                      /* 32x64 (invalid CFL size) */ \
+    NULL,                                      /* 64x32 (invalid CFL size) */ \
+    cfl_subsample_##bd##_##sub##_4x16_##arch,  /* 4x16  */                    \
+    cfl_subsample_##bd##_##sub##_16x4_##arch,  /* 16x4  */                    \
+    cfl_subsample_##bd##_##sub##_8x32_##arch,  /* 8x32  */                    \
+    cfl_subsample_##bd##_##sub##_32x8_##arch,  /* 32x8  */                    \
+    NULL,                                      /* 16x64 (invalid CFL size) */ \
+    NULL,                                      /* 64x16 (invalid CFL size) */ \
   };
 
 // The RTCD script does not support passing in an array, so we wrap it in this
 // function.
+#if CONFIG_AV1_HIGHBITDEPTH
 #define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
@@ -151,128 +152,137 @@
   CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
+#else
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd)
+#endif
 
 // Declare a size-specific wrapper for the size-generic function. The compiler
 // will inline the size generic function in here, the advantage is that the size
 // will be constant allowing for loop unrolling and other constant propagated
 // goodness.
-#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)   \
-  void subtract_average_##width##x##height##_##arch(const uint16_t *src, \
-                                                    int16_t *dst) {      \
-    subtract_average_##arch(src, dst, width, height, round_offset,       \
-                            num_pel_log2);                               \
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)       \
+  void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+                                                        int16_t *dst) {      \
+    subtract_average_##arch(src, dst, width, height, round_offset,           \
+                            num_pel_log2);                                   \
   }
 
 // Declare size-specific wrappers for all valid CfL sizes.
-#define CFL_SUB_AVG_FN(arch)                                                \
-  CFL_SUB_AVG_X(arch, 4, 4, 8, 4)                                           \
-  CFL_SUB_AVG_X(arch, 4, 8, 16, 5)                                          \
-  CFL_SUB_AVG_X(arch, 4, 16, 32, 6)                                         \
-  CFL_SUB_AVG_X(arch, 8, 4, 16, 5)                                          \
-  CFL_SUB_AVG_X(arch, 8, 8, 32, 6)                                          \
-  CFL_SUB_AVG_X(arch, 8, 16, 64, 7)                                         \
-  CFL_SUB_AVG_X(arch, 8, 32, 128, 8)                                        \
-  CFL_SUB_AVG_X(arch, 16, 4, 32, 6)                                         \
-  CFL_SUB_AVG_X(arch, 16, 8, 64, 7)                                         \
-  CFL_SUB_AVG_X(arch, 16, 16, 128, 8)                                       \
-  CFL_SUB_AVG_X(arch, 16, 32, 256, 9)                                       \
-  CFL_SUB_AVG_X(arch, 32, 8, 128, 8)                                        \
-  CFL_SUB_AVG_X(arch, 32, 16, 256, 9)                                       \
-  CFL_SUB_AVG_X(arch, 32, 32, 512, 10)                                      \
-  cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \
-    static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {          \
-      subtract_average_4x4_##arch,   /* 4x4 */                              \
-      subtract_average_8x8_##arch,   /* 8x8 */                              \
-      subtract_average_16x16_##arch, /* 16x16 */                            \
-      subtract_average_32x32_##arch, /* 32x32 */                            \
-      NULL,                          /* 64x64 (invalid CFL size) */         \
-      subtract_average_4x8_##arch,   /* 4x8 */                              \
-      subtract_average_8x4_##arch,   /* 8x4 */                              \
-      subtract_average_8x16_##arch,  /* 8x16 */                             \
-      subtract_average_16x8_##arch,  /* 16x8 */                             \
-      subtract_average_16x32_##arch, /* 16x32 */                            \
-      subtract_average_32x16_##arch, /* 32x16 */                            \
-      NULL,                          /* 32x64 (invalid CFL size) */         \
-      NULL,                          /* 64x32 (invalid CFL size) */         \
-      subtract_average_4x16_##arch,  /* 4x16 (invalid CFL size) */          \
-      subtract_average_16x4_##arch,  /* 16x4 (invalid CFL size) */          \
-      subtract_average_8x32_##arch,  /* 8x32 (invalid CFL size) */          \
-      subtract_average_32x8_##arch,  /* 32x8 (invalid CFL size) */          \
-      NULL,                          /* 16x64 (invalid CFL size) */         \
-      NULL,                          /* 64x16 (invalid CFL size) */         \
-    };                                                                      \
-    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */   \
-    /* index the function pointer array out of bounds. */                   \
-    return sub_avg[tx_size % TX_SIZES_ALL];                                 \
+#define CFL_SUB_AVG_FN(arch)                                              \
+  CFL_SUB_AVG_X(arch, 4, 4, 8, 4)                                         \
+  CFL_SUB_AVG_X(arch, 4, 8, 16, 5)                                        \
+  CFL_SUB_AVG_X(arch, 4, 16, 32, 6)                                       \
+  CFL_SUB_AVG_X(arch, 8, 4, 16, 5)                                        \
+  CFL_SUB_AVG_X(arch, 8, 8, 32, 6)                                        \
+  CFL_SUB_AVG_X(arch, 8, 16, 64, 7)                                       \
+  CFL_SUB_AVG_X(arch, 8, 32, 128, 8)                                      \
+  CFL_SUB_AVG_X(arch, 16, 4, 32, 6)                                       \
+  CFL_SUB_AVG_X(arch, 16, 8, 64, 7)                                       \
+  CFL_SUB_AVG_X(arch, 16, 16, 128, 8)                                     \
+  CFL_SUB_AVG_X(arch, 16, 32, 256, 9)                                     \
+  CFL_SUB_AVG_X(arch, 32, 8, 128, 8)                                      \
+  CFL_SUB_AVG_X(arch, 32, 16, 256, 9)                                     \
+  CFL_SUB_AVG_X(arch, 32, 32, 512, 10)                                    \
+  cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch(             \
+      TX_SIZE tx_size) {                                                  \
+    static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {        \
+      cfl_subtract_average_4x4_##arch,   /* 4x4 */                        \
+      cfl_subtract_average_8x8_##arch,   /* 8x8 */                        \
+      cfl_subtract_average_16x16_##arch, /* 16x16 */                      \
+      cfl_subtract_average_32x32_##arch, /* 32x32 */                      \
+      NULL,                              /* 64x64 (invalid CFL size) */   \
+      cfl_subtract_average_4x8_##arch,   /* 4x8 */                        \
+      cfl_subtract_average_8x4_##arch,   /* 8x4 */                        \
+      cfl_subtract_average_8x16_##arch,  /* 8x16 */                       \
+      cfl_subtract_average_16x8_##arch,  /* 16x8 */                       \
+      cfl_subtract_average_16x32_##arch, /* 16x32 */                      \
+      cfl_subtract_average_32x16_##arch, /* 32x16 */                      \
+      NULL,                              /* 32x64 (invalid CFL size) */   \
+      NULL,                              /* 64x32 (invalid CFL size) */   \
+      cfl_subtract_average_4x16_##arch,  /* 4x16  */                     \
+      cfl_subtract_average_16x4_##arch,  /* 16x4  */                     \
+      cfl_subtract_average_8x32_##arch,  /* 8x32  */                     \
+      cfl_subtract_average_32x8_##arch,  /* 32x8  */                     \
+      NULL,                              /* 16x64 (invalid CFL size) */   \
+      NULL,                              /* 64x16 (invalid CFL size) */   \
+    };                                                                    \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+    /* index the function pointer array out of bounds. */                 \
+    return sub_avg[tx_size % TX_SIZES_ALL];                               \
   }
 
 // For VSX SIMD optimization, the C versions of width == 4 subtract are
 // faster than the VSX. As such, the VSX code calls the C versions.
-void subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
-void subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
-void subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
 
-#define CFL_PREDICT_lbd(arch, width, height)                                 \
-  void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3,   \
-                                               uint8_t *dst, int dst_stride, \
-                                               int alpha_q3) {               \
-    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,    \
-                           height);                                          \
+#define CFL_PREDICT_lbd(arch, width, height)                              \
+  void cfl_predict_lbd_##width##x##height##_##arch(                       \
+      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride,           \
+      int alpha_q3) {                                                     \
+    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
+                           height);                                       \
   }
 
-#define CFL_PREDICT_hbd(arch, width, height)                                  \
-  void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3,    \
-                                               uint16_t *dst, int dst_stride, \
-                                               int alpha_q3, int bd) {        \
-    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
-                           height);                                           \
+#if CONFIG_AV1_HIGHBITDEPTH
+#define CFL_PREDICT_hbd(arch, width, height)                                   \
+  void cfl_predict_hbd_##width##x##height##_##arch(                            \
+      const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
+      int bd) {                                                                \
+    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
+                           height);                                            \
   }
+#endif
 
 // This wrapper exists because clang format does not like calling macros with
 // lowercase letters.
 #define CFL_PREDICT_X(arch, width, height, bd) \
   CFL_PREDICT_##bd(arch, width, height)
 
-#define CFL_PREDICT_FN(arch, bd)                                          \
-  CFL_PREDICT_X(arch, 4, 4, bd)                                           \
-  CFL_PREDICT_X(arch, 4, 8, bd)                                           \
-  CFL_PREDICT_X(arch, 4, 16, bd)                                          \
-  CFL_PREDICT_X(arch, 8, 4, bd)                                           \
-  CFL_PREDICT_X(arch, 8, 8, bd)                                           \
-  CFL_PREDICT_X(arch, 8, 16, bd)                                          \
-  CFL_PREDICT_X(arch, 8, 32, bd)                                          \
-  CFL_PREDICT_X(arch, 16, 4, bd)                                          \
-  CFL_PREDICT_X(arch, 16, 8, bd)                                          \
-  CFL_PREDICT_X(arch, 16, 16, bd)                                         \
-  CFL_PREDICT_X(arch, 16, 32, bd)                                         \
-  CFL_PREDICT_X(arch, 32, 8, bd)                                          \
-  CFL_PREDICT_X(arch, 32, 16, bd)                                         \
-  CFL_PREDICT_X(arch, 32, 32, bd)                                         \
-  cfl_predict_##bd##_fn get_predict_##bd##_fn_##arch(TX_SIZE tx_size) {   \
-    static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = {             \
-      predict_##bd##_4x4_##arch,   /* 4x4 */                              \
-      predict_##bd##_8x8_##arch,   /* 8x8 */                              \
-      predict_##bd##_16x16_##arch, /* 16x16 */                            \
-      predict_##bd##_32x32_##arch, /* 32x32 */                            \
-      NULL,                        /* 64x64 (invalid CFL size) */         \
-      predict_##bd##_4x8_##arch,   /* 4x8 */                              \
-      predict_##bd##_8x4_##arch,   /* 8x4 */                              \
-      predict_##bd##_8x16_##arch,  /* 8x16 */                             \
-      predict_##bd##_16x8_##arch,  /* 16x8 */                             \
-      predict_##bd##_16x32_##arch, /* 16x32 */                            \
-      predict_##bd##_32x16_##arch, /* 32x16 */                            \
-      NULL,                        /* 32x64 (invalid CFL size) */         \
-      NULL,                        /* 64x32 (invalid CFL size) */         \
-      predict_##bd##_4x16_##arch,  /* 4x16  */                            \
-      predict_##bd##_16x4_##arch,  /* 16x4  */                            \
-      predict_##bd##_8x32_##arch,  /* 8x32  */                            \
-      predict_##bd##_32x8_##arch,  /* 32x8  */                            \
-      NULL,                        /* 16x64 (invalid CFL size) */         \
-      NULL,                        /* 64x16 (invalid CFL size) */         \
-    };                                                                    \
-    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
-    /* index the function pointer array out of bounds. */                 \
-    return pred[tx_size % TX_SIZES_ALL];                                  \
+#define CFL_PREDICT_FN(arch, bd)                                            \
+  CFL_PREDICT_X(arch, 4, 4, bd)                                             \
+  CFL_PREDICT_X(arch, 4, 8, bd)                                             \
+  CFL_PREDICT_X(arch, 4, 16, bd)                                            \
+  CFL_PREDICT_X(arch, 8, 4, bd)                                             \
+  CFL_PREDICT_X(arch, 8, 8, bd)                                             \
+  CFL_PREDICT_X(arch, 8, 16, bd)                                            \
+  CFL_PREDICT_X(arch, 8, 32, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 4, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 8, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 16, bd)                                           \
+  CFL_PREDICT_X(arch, 16, 32, bd)                                           \
+  CFL_PREDICT_X(arch, 32, 8, bd)                                            \
+  CFL_PREDICT_X(arch, 32, 16, bd)                                           \
+  CFL_PREDICT_X(arch, 32, 32, bd)                                           \
+  cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \
+    static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = {               \
+      cfl_predict_##bd##_4x4_##arch,   /* 4x4 */                            \
+      cfl_predict_##bd##_8x8_##arch,   /* 8x8 */                            \
+      cfl_predict_##bd##_16x16_##arch, /* 16x16 */                          \
+      cfl_predict_##bd##_32x32_##arch, /* 32x32 */                          \
+      NULL,                            /* 64x64 (invalid CFL size) */       \
+      cfl_predict_##bd##_4x8_##arch,   /* 4x8 */                            \
+      cfl_predict_##bd##_8x4_##arch,   /* 8x4 */                            \
+      cfl_predict_##bd##_8x16_##arch,  /* 8x16 */                           \
+      cfl_predict_##bd##_16x8_##arch,  /* 16x8 */                           \
+      cfl_predict_##bd##_16x32_##arch, /* 16x32 */                          \
+      cfl_predict_##bd##_32x16_##arch, /* 32x16 */                          \
+      NULL,                            /* 32x64 (invalid CFL size) */       \
+      NULL,                            /* 64x32 (invalid CFL size) */       \
+      cfl_predict_##bd##_4x16_##arch,  /* 4x16  */                          \
+      cfl_predict_##bd##_16x4_##arch,  /* 16x4  */                          \
+      cfl_predict_##bd##_8x32_##arch,  /* 8x32  */                          \
+      cfl_predict_##bd##_32x8_##arch,  /* 32x8  */                          \
+      NULL,                            /* 16x64 (invalid CFL size) */       \
+      NULL,                            /* 64x16 (invalid CFL size) */       \
+    };                                                                      \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */   \
+    /* index the function pointer array out of bounds. */                   \
+    return pred[tx_size % TX_SIZES_ALL];                                    \
   }
 
 #endif  // AOM_AV1_COMMON_CFL_H_
diff --git a/libaom/av1/common/common_data.h b/libaom/av1/common/common_data.h
index 46e455f..402845c 100644
--- a/libaom/av1/common/common_data.h
+++ b/libaom/av1/common/common_data.h
@@ -82,16 +82,16 @@
     BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8,
     BLOCK_16X16, BLOCK_32X32, BLOCK_64X64
   }, {  // PARTITION_HORZ_A
-    BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
     BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
   }, {  // PARTITION_HORZ_B
-    BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
     BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
   }, {  // PARTITION_VERT_A
-    BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
     BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
   }, {  // PARTITION_VERT_B
-    BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
     BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
   }, {  // PARTITION_HORZ_4
     BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
diff --git a/libaom/av1/common/convolve.c b/libaom/av1/common/convolve.c
index 5a55ece..e177e3c 100644
--- a/libaom/av1/common/convolve.c
+++ b/libaom/av1/common/convolve.c
@@ -15,10 +15,10 @@
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/resize.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
@@ -116,11 +116,12 @@
                           int dst_stride, int w, int h,
                           const InterpFilterParams *filter_params_x,
                           const InterpFilterParams *filter_params_y,
-                          const int subpel_x_q4, const int subpel_y_q4,
+                          const int subpel_x_qn, const int subpel_y_qn,
                           ConvolveParams *conv_params) {
   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bd = 8;
@@ -130,7 +131,7 @@
   // horizontal filter
   const uint8_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   for (int y = 0; y < im_h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -146,7 +147,7 @@
   // vertical filter
   int16_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -167,11 +168,11 @@
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
-                         const int subpel_x_q4, const int subpel_y_q4,
+                         const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
   (void)conv_params;
 
   assert(conv_params->round_0 <= FILTER_BITS);
@@ -180,7 +181,7 @@
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -197,12 +198,12 @@
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
-                         const int subpel_x_q4, const int subpel_y_q4,
+                         const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_0;
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
   (void)conv_params;
 
   assert(bits >= 0);
@@ -211,7 +212,7 @@
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
 
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -229,12 +230,12 @@
                                int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
-                               const int subpel_x_q4, const int subpel_y_q4,
+                               const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
   (void)conv_params;
 
   for (int y = 0; y < h; ++y) {
@@ -243,13 +244,13 @@
 }
 
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst8, int dst8_stride, int w, int h,
+                                uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
-                                const int subpel_x_q4, const int subpel_y_q4,
+                                const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
@@ -262,7 +263,7 @@
   // horizontal filter
   const uint8_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   for (int y = 0; y < im_h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -278,7 +279,7 @@
   // vertical filter
   int16_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -289,7 +290,7 @@
       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
       if (conv_params->do_average) {
-        int32_t tmp = dst[y * dst_stride + x];
+        int32_t tmp = dst16[y * dst16_stride + x];
         if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
@@ -299,23 +300,23 @@
         }
         tmp -= (1 << (offset_bits - conv_params->round_1)) +
                (1 << (offset_bits - conv_params->round_1 - 1));
-        dst8[y * dst8_stride + x] =
+        dst[y * dst_stride + x] =
             clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
       } else {
-        dst[y * dst_stride + x] = res;
+        dst16[y * dst16_stride + x] = res;
       }
     }
   }
 }
 
-void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst8, int dst8_stride, int w, int h,
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
-                               const int subpel_x_q4, const int subpel_y_q4,
+                               const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_0;
   const int bd = 8;
@@ -325,11 +326,11 @@
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -340,7 +341,7 @@
       res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
 
       if (conv_params->do_average) {
-        int32_t tmp = dst[y * dst_stride + x];
+        int32_t tmp = dst16[y * dst16_stride + x];
         if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
@@ -349,23 +350,23 @@
           tmp = tmp >> 1;
         }
         tmp -= round_offset;
-        dst8[y * dst8_stride + x] =
+        dst[y * dst_stride + x] =
             clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
       } else {
-        dst[y * dst_stride + x] = res;
+        dst16[y * dst16_stride + x] = res;
       }
     }
   }
 }
 
-void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst8, int dst8_stride, int w, int h,
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
-                               const int subpel_x_q4, const int subpel_y_q4,
+                               const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_1;
   const int bd = 8;
@@ -375,11 +376,11 @@
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -390,7 +391,7 @@
       res += round_offset;
 
       if (conv_params->do_average) {
-        int32_t tmp = dst[y * dst_stride + x];
+        int32_t tmp = dst16[y * dst16_stride + x];
         if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
@@ -399,22 +400,24 @@
           tmp = tmp >> 1;
         }
         tmp -= round_offset;
-        dst8[y * dst8_stride + x] =
+        dst[y * dst_stride + x] =
             clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
       } else {
-        dst[y * dst_stride + x] = res;
+        dst16[y * dst16_stride + x] = res;
       }
     }
   }
 }
 
-void av1_dist_wtd_convolve_2d_copy_c(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
+                                     uint8_t *dst, int dst_stride, int w, int h,
+                                     const InterpFilterParams *filter_params_x,
+                                     const InterpFilterParams *filter_params_y,
+                                     const int subpel_x_qn,
+                                     const int subpel_y_qn,
+                                     ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
   const int bd = 8;
@@ -423,8 +426,8 @@
                            (1 << (offset_bits - conv_params->round_1 - 1));
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
 
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -432,7 +435,7 @@
       res += round_offset;
 
       if (conv_params->do_average) {
-        int32_t tmp = dst[y * dst_stride + x];
+        int32_t tmp = dst16[y * dst16_stride + x];
         if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
@@ -441,16 +444,16 @@
           tmp = tmp >> 1;
         }
         tmp -= round_offset;
-        dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
       } else {
-        dst[y * dst_stride + x] = res;
+        dst16[y * dst16_stride + x] = res;
       }
     }
   }
 }
 
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
-                             int dst8_stride, int w, int h,
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
                              const InterpFilterParams *filter_params_x,
                              const InterpFilterParams *filter_params_y,
                              const int subpel_x_qn, const int x_step_qn,
@@ -520,7 +523,7 @@
           /* Subtract round offset and convolve round */
           tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
                        (1 << (offset_bits - conv_params->round_1 - 1)));
-          dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
         } else {
           dst16[y * dst16_stride + x] = res;
         }
@@ -528,7 +531,7 @@
         /* Subtract round offset and convolve round */
         int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
                              (1 << (offset_bits - conv_params->round_1 - 1)));
-        dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
       }
     }
     src_vert++;
@@ -549,84 +552,66 @@
                         y_step_qn, conv_params);
 }
 
-// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
-// we may create optimized code to do 2-tap filtering for all bilinear filtering
-// usages, not just IntraBC.
-static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
-                                    uint8_t *dst, int dst_stride, int w, int h,
-                                    int subpel_x_q4, int subpel_y_q4,
-                                    ConvolveParams *conv_params) {
-  const InterpFilterParams *filter_params_x =
-      subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
-  const InterpFilterParams *filter_params_y =
-      subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
-  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
-    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
-                         filter_params_x, filter_params_y, 0, 0, conv_params);
-  } else if (subpel_x_q4 != 0) {
-    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
-                        filter_params_y, 0, 0, conv_params);
-  } else {
-    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
-                        filter_params_y, 0, 0, conv_params);
-  }
-}
-
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            InterpFilters interp_filters, const int subpel_x_q4,
-                            int x_step_q4, const int subpel_y_q4, int y_step_q4,
-                            int scaled, ConvolveParams *conv_params,
-                            const struct scale_factors *sf, int is_intrabc) {
-  assert(IMPLIES(is_intrabc, !scaled));
+                            const InterpFilterParams *interp_filters[2],
+                            const int subpel_x_qn, int x_step_q4,
+                            const int subpel_y_qn, int y_step_q4, int scaled,
+                            ConvolveParams *conv_params,
+                            const struct scale_factors *sf) {
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst;
   (void)dst_stride;
 
-  if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
-    convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
-                            subpel_y_q4, conv_params);
-    return;
-  }
+  const InterpFilterParams *filter_params_x = interp_filters[0];
+  const InterpFilterParams *filter_params_y = interp_filters[1];
 
-  InterpFilter filter_x = 0;
-  InterpFilter filter_y = 0;
-  const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
-  const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
-  if (need_filter_params_x)
-    filter_x = av1_extract_interp_filter(interp_filters, 1);
-  if (need_filter_params_y)
-    filter_y = av1_extract_interp_filter(interp_filters, 0);
-  const InterpFilterParams *filter_params_x =
-      need_filter_params_x
-          ? av1_get_interp_filter_params_with_block_size(filter_x, w)
-          : NULL;
-  const InterpFilterParams *filter_params_y =
-      need_filter_params_y
-          ? av1_get_interp_filter_params_with_block_size(filter_y, h)
-          : NULL;
+  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
+  // Do we have SIMD support to 4-tap case?
+  // 2-tap filter indicates that it is for IntraBC.
+  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
+    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+    assert(!scaled);
+    if (subpel_x_qn && subpel_y_qn) {
+      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                           filter_params_x, filter_params_y, subpel_x_qn,
+                           subpel_y_qn, conv_params);
+      return;
+    } else if (subpel_x_qn) {
+      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_x, filter_params_y, subpel_x_qn,
+                          subpel_y_qn, conv_params);
+      return;
+    } else if (subpel_y_qn) {
+      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_x, filter_params_y, subpel_x_qn,
+                          subpel_y_qn, conv_params);
+      return;
+    }
+  }
 
   if (scaled) {
     convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
-                              filter_params_x, filter_params_y, subpel_x_q4,
-                              x_step_q4, subpel_y_q4, y_step_q4, conv_params);
+                              filter_params_x, filter_params_y, subpel_x_qn,
+                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
   } else {
-    sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
+    sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound](
         src, src_stride, dst, dst_stride, w, h, filter_params_x,
-        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void av1_highbd_convolve_2d_copy_sr_c(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
   (void)conv_params;
   (void)bd;
 
@@ -639,12 +624,12 @@
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
-                                const int subpel_x_q4, const int subpel_y_q4,
+                                const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_0;
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -652,7 +637,7 @@
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -670,11 +655,11 @@
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
-                                const int subpel_x_q4, const int subpel_y_q4,
+                                const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
   (void)conv_params;
 
   assert(conv_params->round_0 <= FILTER_BITS);
@@ -682,7 +667,7 @@
          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -699,11 +684,12 @@
                                  uint16_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const InterpFilterParams *filter_params_y,
-                                 const int subpel_x_q4, const int subpel_y_q4,
+                                 const int subpel_x_qn, const int subpel_y_qn,
                                  ConvolveParams *conv_params, int bd) {
   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits =
@@ -713,7 +699,7 @@
   // horizontal filter
   const uint16_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   for (int y = 0; y < im_h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -729,7 +715,7 @@
   // vertical filter
   int16_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -748,14 +734,14 @@
 }
 
 void av1_highbd_dist_wtd_convolve_2d_c(
-    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
-    int w, int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   int x, y, k;
   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
   const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -767,7 +753,7 @@
   // horizontal filter
   const uint16_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   for (y = 0; y < im_h; ++y) {
     for (x = 0; x < w; ++x) {
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -785,7 +771,7 @@
   int16_t *src_vert = im_block + fo_vert * im_stride;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   for (y = 0; y < h; ++y) {
     for (x = 0; x < w; ++x) {
       int32_t sum = 1 << offset_bits;
@@ -795,7 +781,7 @@
       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
       if (conv_params->do_average) {
-        int32_t tmp = dst[y * dst_stride + x];
+        int32_t tmp = dst16[y * dst16_stride + x];
         if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
@@ -805,22 +791,22 @@
         }
         tmp -= (1 << (offset_bits - conv_params->round_1)) +
                (1 << (offset_bits - conv_params->round_1 - 1));
-        dst16[y * dst16_stride + x] =
+        dst[y * dst_stride + x] =
             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
       } else {
-        dst[y * dst_stride + x] = res;
+        dst16[y * dst16_stride + x] = res;
       }
     }
   }
 }
 
 void av1_highbd_dist_wtd_convolve_x_c(
-    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
-    int w, int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_1;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
@@ -830,11 +816,11 @@
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   assert(round_bits >= 0);
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
   assert(bits >= 0);
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -845,7 +831,7 @@
       res += round_offset;
 
       if (conv_params->do_average) {
-        int32_t tmp = dst[y * dst_stride + x];
+        int32_t tmp = dst16[y * dst16_stride + x];
         if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
@@ -854,22 +840,22 @@
           tmp = tmp >> 1;
         }
         tmp -= round_offset;
-        dst16[y * dst16_stride + x] =
+        dst[y * dst_stride + x] =
             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
       } else {
-        dst[y * dst_stride + x] = res;
+        dst16[y * dst16_stride + x] = res;
       }
     }
   }
 }
 
 void av1_highbd_dist_wtd_convolve_y_c(
-    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
-    int w, int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_0;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
@@ -879,11 +865,11 @@
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   assert(round_bits >= 0);
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
   assert(bits >= 0);
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -894,7 +880,7 @@
       res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
 
       if (conv_params->do_average) {
-        int32_t tmp = dst[y * dst_stride + x];
+        int32_t tmp = dst16[y * dst16_stride + x];
         if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
@@ -903,22 +889,22 @@
           tmp = tmp >> 1;
         }
         tmp -= round_offset;
-        dst16[y * dst16_stride + x] =
+        dst[y * dst_stride + x] =
             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
       } else {
-        dst[y * dst_stride + x] = res;
+        dst16[y * dst16_stride + x] = res;
       }
     }
   }
 }
 
 void av1_highbd_dist_wtd_convolve_2d_copy_c(
-    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
-    int w, int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
@@ -927,15 +913,15 @@
   assert(bits >= 0);
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
 
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
       res += round_offset;
       if (conv_params->do_average) {
-        int32_t tmp = dst[y * dst_stride + x];
+        int32_t tmp = dst16[y * dst16_stride + x];
         if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
@@ -944,10 +930,10 @@
           tmp = tmp >> 1;
         }
         tmp -= round_offset;
-        dst16[y * dst16_stride + x] =
+        dst[y * dst_stride + x] =
             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
       } else {
-        dst[y * dst_stride + x] = res;
+        dst16[y * dst16_stride + x] = res;
       }
     }
   }
@@ -1039,68 +1025,24 @@
   }
 }
 
-static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
-                                           uint16_t *dst, int dst_stride, int w,
-                                           int h, int subpel_x_q4,
-                                           int subpel_y_q4,
-                                           ConvolveParams *conv_params,
-                                           int bd) {
-  const InterpFilterParams *filter_params_x =
-      subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
-  const InterpFilterParams *filter_params_y =
-      subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
-  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
-    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
-                                filter_params_x, filter_params_y, 0, 0,
-                                conv_params, bd);
-  } else if (subpel_x_q4 != 0) {
-    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
-                               filter_params_x, filter_params_y, 0, 0,
-                               conv_params, bd);
-  } else {
-    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
-                               filter_params_x, filter_params_y, 0, 0,
-                               conv_params, bd);
-  }
-}
-
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst8, int dst_stride, int w, int h,
-                                   InterpFilters interp_filters,
-                                   const int subpel_x_q4, int x_step_q4,
-                                   const int subpel_y_q4, int y_step_q4,
+                                   const InterpFilterParams *interp_filters[2],
+                                   const int subpel_x_qn, int x_step_q4,
+                                   const int subpel_y_qn, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf,
-                                   int is_intrabc, int bd) {
-  assert(IMPLIES(is_intrabc, !scaled));
+                                   const struct scale_factors *sf, int bd) {
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst_stride;
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 
-  if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-    highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
-                                   subpel_x_q4, subpel_y_q4, conv_params, bd);
-    return;
-  }
-
-  InterpFilter filter_x = 0;
-  InterpFilter filter_y = 0;
-  const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
-  const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
-  if (need_filter_params_x)
-    filter_x = av1_extract_interp_filter(interp_filters, 1);
-  if (need_filter_params_y)
-    filter_y = av1_extract_interp_filter(interp_filters, 0);
+  const int need_filter_params_x = (subpel_x_qn != 0) | scaled;
+  const int need_filter_params_y = (subpel_y_qn != 0) | scaled;
   const InterpFilterParams *filter_params_x =
-      need_filter_params_x
-          ? av1_get_interp_filter_params_with_block_size(filter_x, w)
-          : NULL;
+      need_filter_params_x ? interp_filters[0] : NULL;
   const InterpFilterParams *filter_params_y =
-      need_filter_params_y
-          ? av1_get_interp_filter_params_with_block_size(filter_y, h)
-          : NULL;
+      need_filter_params_y ? interp_filters[1] : NULL;
 
   if (scaled) {
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -1108,18 +1050,19 @@
       assert(conv_params->dst != NULL);
     }
     av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
-                                 filter_params_x, filter_params_y, subpel_x_q4,
-                                 x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+                                 filter_params_x, filter_params_y, subpel_x_qn,
+                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                  bd);
   } else {
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
 
-    sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
+    sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn !=
                                           0][conv_params->is_compound](
         src, src_stride, dst, dst_stride, w, h, filter_params_x,
-        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 // Note: Fixed size intermediate buffers, place limits on parameters
 // of some functions. 2d filtering proceeds in 2 steps:
@@ -1141,12 +1084,14 @@
   return sum;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
                                              const int16_t *b) {
   int sum = 0;
   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
   return sum;
 }
+#endif
 
 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
                                              ptrdiff_t a_stride,
@@ -1247,6 +1192,7 @@
                             y_step_q4, w, h, conv_params->round_1);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void highbd_convolve_add_src_horiz_hip(
     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
@@ -1325,3 +1271,4 @@
       temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
       filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/av1/common/convolve.h b/libaom/av1/common/convolve.h
index e5479e6..04df86c 100644
--- a/libaom/av1/common/convolve.h
+++ b/libaom/av1/common/convolve.h
@@ -26,6 +26,7 @@
   int round_1;
   int plane;
   int is_compound;
+  int compound_index;  // 0: the first single in compound mode, 1: the second.
   int use_dist_wtd_comp_avg;
   int fwd_offset;
   int bck_offset;
@@ -41,32 +42,34 @@
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params);
 
 typedef void (*aom_highbd_convolve_fn_t)(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 
 struct AV1Common;
 struct scale_factors;
 
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            InterpFilters interp_filters, const int subpel_x_q4,
-                            int x_step_q4, const int subpel_y_q4, int y_step_q4,
-                            int scaled, ConvolveParams *conv_params,
-                            const struct scale_factors *sf, int is_intrabc);
+                            const InterpFilterParams *interp_filters[2],
+                            const int subpel_x_qn, int x_step_q4,
+                            const int subpel_y_qn, int y_step_q4, int scaled,
+                            ConvolveParams *conv_params,
+                            const struct scale_factors *sf);
 
-static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
+static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane,
                                                       CONV_BUF_TYPE *dst,
                                                       int dst_stride,
                                                       int is_compound, int bd) {
   ConvolveParams conv_params;
-  conv_params.do_average = do_average;
-  assert(IMPLIES(do_average, is_compound));
+  conv_params.compound_index = cmp_index;
+  assert(IMPLIES(cmp_index, is_compound));
+
   conv_params.is_compound = is_compound;
   conv_params.round_0 = ROUND0_BITS;
   conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
@@ -82,6 +85,10 @@
   conv_params.dst = dst;
   conv_params.dst_stride = dst_stride;
   conv_params.plane = plane;
+
+  // By default, set do average to 1 if this is the second single prediction
+  // in a compound mode.
+  conv_params.do_average = cmp_index;
   return conv_params;
 }
 
@@ -111,12 +118,11 @@
 
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
-                                   InterpFilters interp_filters,
-                                   const int subpel_x_q4, int x_step_q4,
-                                   const int subpel_y_q4, int y_step_q4,
+                                   const InterpFilterParams *interp_filters[2],
+                                   const int subpel_x_qn, int x_step_q4,
+                                   const int subpel_y_qn, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf,
-                                   int is_intrabc, int bd);
+                                   const struct scale_factors *sf, int bd);
 
 // TODO(sarahparker) This will need to be integerized and optimized
 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
diff --git a/libaom/av1/common/debugmodes.c b/libaom/av1/common/debugmodes.c
index b26c7dd..ff02ddd 100644
--- a/libaom/av1/common/debugmodes.c
+++ b/libaom/av1/common/debugmodes.c
@@ -11,14 +11,14 @@
 
 #include <stdio.h>
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
-#include "av1/common/onyxc_int.h"
 
 static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
   fprintf(f, "%s", str);
   fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
-          cm->show_frame, cm->base_qindex);
+          cm->show_frame, cm->quant_params.base_qindex);
 }
 /* This function dereferences a pointer to the mbmi structure
  * and uses the passed in member offset to print out the value of an integer
@@ -26,32 +26,31 @@
  */
 static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
                           size_t member_offset) {
-  int mi_row, mi_col;
-  MB_MODE_INFO **mi = cm->mi_grid_visible;
-  int rows = cm->mi_rows;
-  int cols = cm->mi_cols;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base;
+  int rows = mi_params->mi_rows;
+  int cols = mi_params->mi_cols;
   char prefix = descriptor[0];
 
   log_frame_info(cm, descriptor, file);
-  for (mi_row = 0; mi_row < rows; mi_row++) {
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(file, "%c ", prefix);
-    for (mi_col = 0; mi_col < cols; mi_col++) {
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset)));
       mi++;
     }
     fprintf(file, "\n");
-    mi += cm->mi_stride - cols;
+    mi += mi_params->mi_stride - cols;
   }
   fprintf(file, "\n");
 }
 
 void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
-  int mi_row;
-  int mi_col;
+  CommonModeInfoParams *mi_params = &cm->mi_params;
   FILE *mvs = fopen(file, "a");
-  MB_MODE_INFO **mi = cm->mi_grid_visible;
-  int rows = cm->mi_rows;
-  int cols = cm->mi_cols;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base;
+  const int rows = mi_params->mi_rows;
+  const int cols = mi_params->mi_cols;
 
   print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
   print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
@@ -61,28 +60,28 @@
 
   // output skip infomation.
   log_frame_info(cm, "Skips:", mvs);
-  for (mi_row = 0; mi_row < rows; mi_row++) {
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "S ");
-    for (mi_col = 0; mi_col < cols; mi_col++) {
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(mvs, "%2d ", mi[0]->skip);
       mi++;
     }
     fprintf(mvs, "\n");
-    mi += cm->mi_stride - cols;
+    mi += mi_params->mi_stride - cols;
   }
   fprintf(mvs, "\n");
 
   // output motion vectors.
   log_frame_info(cm, "Vectors ", mvs);
-  mi = cm->mi_grid_visible;
-  for (mi_row = 0; mi_row < rows; mi_row++) {
+  mi = mi_params->mi_grid_base;
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "V ");
-    for (mi_col = 0; mi_col < cols; mi_col++) {
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col);
       mi++;
     }
     fprintf(mvs, "\n");
-    mi += cm->mi_stride - cols;
+    mi += mi_params->mi_stride - cols;
   }
   fprintf(mvs, "\n");
 
@@ -93,6 +92,13 @@
                                          const char *filename) {
   FILE *hdrFile = fopen(filename, "w");
   fwrite(data, size, sizeof(uint8_t), hdrFile);
+
+  // Reset order hints(7bit + a previous bit) to 0, so that all camera frame
+  // headers are identical in large scale coding.
+  uint8_t zero = 0;
+  fseek(hdrFile, 1, SEEK_SET);
+  // Reset second byte.
+  fwrite(&zero, 1, sizeof(uint8_t), hdrFile);
   fclose(hdrFile);
 }
 
diff --git a/libaom/av1/common/entropy.c b/libaom/av1/common/entropy.c
index f63ac98..1f7a0ef 100644
--- a/libaom/av1/common/entropy.c
+++ b/libaom/av1/common/entropy.c
@@ -13,10 +13,10 @@
 
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/scan.h"
 #include "av1/common/token_cdfs.h"
 #include "av1/common/txb_common.h"
@@ -29,7 +29,7 @@
 }
 
 void av1_default_coef_probs(AV1_COMMON *cm) {
-  const int index = get_q_ctx(cm->base_qindex);
+  const int index = get_q_ctx(cm->quant_params.base_qindex);
 #if CONFIG_ENTROPY_STATS
   cm->coef_cdf_category = index;
 #endif
@@ -50,8 +50,9 @@
   av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]);
 }
 
-static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs,
-                                     int cdf_stride, int nsymbs) {
+static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr,
+                                                int num_cdfs, int cdf_stride,
+                                                int nsymbs) {
   for (int i = 0; i < num_cdfs; i++) {
     cdf_ptr[i * cdf_stride + nsymbs] = 0;
   }
@@ -68,7 +69,7 @@
     reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
   } while (0)
 
-static void reset_nmv_counter(nmv_context *nmv) {
+static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) {
   RESET_CDF_COUNTER(nmv->joints_cdf, 4);
   for (int i = 0; i < 2; i++) {
     RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES);
diff --git a/libaom/av1/common/entropy.h b/libaom/av1/common/entropy.h
index 41218d3..ee78f56 100644
--- a/libaom/av1/common/entropy.h
+++ b/libaom/av1/common/entropy.h
@@ -48,7 +48,7 @@
 #define BR_CDF_SIZE (4)
 #define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
 
-#define COEFF_CONTEXT_BITS 6
+#define COEFF_CONTEXT_BITS 3
 #define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
 #define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
 
diff --git a/libaom/av1/common/entropymode.c b/libaom/av1/common/entropymode.c
index 90702ac..5f061be 100644
--- a/libaom/av1/common/entropymode.c
+++ b/libaom/av1/common/entropymode.c
@@ -11,9 +11,9 @@
 
 #include "aom_mem/aom_mem.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/scan.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/seg_common.h"
 #include "av1/common/txb_common.h"
 
@@ -435,16 +435,16 @@
       { AOM_CDF3(601, 943) },     { AOM_CDF3(14969, 21398) }
     };
 
-static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] =
-    { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) },
-      { AOM_CDF2(8386) },  { AOM_CDF2(12222) }, { AOM_CDF2(4676) } };
+static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(
+    2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) },
+            { AOM_CDF2(8386) },  { AOM_CDF2(12222) }, { AOM_CDF2(4676) } };
 
 static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(
     2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } };
 
-static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] =
-    { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) },
-      { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } };
+static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(
+    2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) },
+            { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } };
 
 static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = {
   { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) }
@@ -470,11 +470,11 @@
             { AOM_CDF2(30237) } };
 
 static const aom_cdf_prob
-    default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTERINTRA_MODES)] =
-        { { AOM_CDF4(8192, 16384, 24576) },
-          { AOM_CDF4(1875, 11082, 27332) },
-          { AOM_CDF4(2473, 9996, 26388) },
-          { AOM_CDF4(4238, 11537, 25926) } };
+    default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+        INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) },
+                               { AOM_CDF4(1875, 11082, 27332) },
+                               { AOM_CDF4(2473, 9996, 26388) },
+                               { AOM_CDF4(4238, 11537, 25926) } };
 
 static const aom_cdf_prob
     default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
@@ -500,51 +500,51 @@
   { AOM_CDF2(16384) }
 };
 
-static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] =
-    { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359,
-                  22362, 24127, 25702, 27752, 29450, 31171) },
-      { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367,
-                  18452, 19422, 22839, 26127, 29629) },
-      { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332,
-                  24520, 27470, 29456, 30529, 31656) },
-      { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163,
-                  20961, 22884, 24471, 26719, 28714, 30877) },
-      { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730,
-                  18114, 19313, 22521, 26012, 29550) },
-      { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270,
-                  20533, 23434, 25972, 27944, 29570, 31416) },
-      { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638,
-                  22038, 23963, 25311, 26988, 28766, 31012) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284,
-                  24985, 25684, 27259, 28883, 30911) },
-      { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057,
-                  27251, 29173, 30089, 30960, 31933) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) },
-      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
-                  20480, 22528, 24576, 26624, 28672, 30720) } };
+static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+    16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094,
+                         20359, 22362, 24127, 25702, 27752, 29450, 31171) },
+             { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323,
+                         17367, 18452, 19422, 22839, 26127, 29629) },
+             { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939,
+                         21332, 24520, 27470, 29456, 30529, 31656) },
+             { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144,
+                         19163, 20961, 22884, 24471, 26719, 28714, 30877) },
+             { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369,
+                         16730, 18114, 19313, 22521, 26012, 29550) },
+             { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124,
+                         17270, 20533, 23434, 25972, 27944, 29570, 31416) },
+             { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944,
+                         20638, 22038, 23963, 25311, 26988, 28766, 31012) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703,
+                         24284, 24985, 25684, 27259, 28883, 30911) },
+             { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935,
+                         25057, 27251, 29173, 30089, 30960, 31933) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+             { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+                         18432, 20480, 22528, 24576, 26624, 28672, 30720) } };
 
 static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
     MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
@@ -1071,7 +1071,7 @@
   *cm->default_frame_context = *cm->fc;
   // TODO(jack.haughton@argondesign.com): don't think this should be necessary,
   // but could do with fuller testing
-  if (cm->large_scale_tile) {
+  if (cm->tiles.large_scale) {
     for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
       RefCntBuffer *const buf = get_ref_frame_buf(cm, i);
       if (buf != NULL) buf->frame_context = *cm->fc;
@@ -1087,7 +1087,8 @@
   av1_clearall_segfeatures(&cm->seg);
 
   if (cm->cur_frame->seg_map)
-    memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols));
+    memset(cm->cur_frame->seg_map, 0,
+           (cm->mi_params.mi_rows * cm->mi_params.mi_cols));
 
   // reset mode ref deltas
   av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
@@ -1099,9 +1100,4 @@
   av1_init_mv_probs(cm);
   cm->fc->initialized = 1;
   av1_setup_frame_contexts(cm);
-
-  // prev_mip will only be allocated in encoder.
-  if (frame_is_intra_only(cm) && cm->prev_mip)
-    memset(cm->prev_mip, 0,
-           cm->mi_stride * cm->mi_rows * sizeof(*cm->prev_mip));
 }
diff --git a/libaom/av1/common/entropymode.h b/libaom/av1/common/entropymode.h
index 69b5218..bbbf55d 100644
--- a/libaom/av1/common/entropymode.h
+++ b/libaom/av1/common/entropymode.h
@@ -63,7 +63,6 @@
 typedef struct {
   const int16_t *scan;
   const int16_t *iscan;
-  const int16_t *neighbors;
 } SCAN_ORDER;
 
 typedef struct frame_contexts {
diff --git a/libaom/av1/common/entropymv.c b/libaom/av1/common/entropymv.c
index 4913373..e1e42f2 100644
--- a/libaom/av1/common/entropymv.c
+++ b/libaom/av1/common/entropymv.c
@@ -9,7 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/entropymv.h"
 
 static const nmv_context default_nmv_context = {
diff --git a/libaom/av1/common/enums.h b/libaom/av1/common/enums.h
index fbacc89..0c09a1b 100644
--- a/libaom/av1/common/enums.h
+++ b/libaom/av1/common/enums.h
@@ -73,11 +73,6 @@
 #define DIST_PRECISION_BITS 4
 #define DIST_PRECISION (1 << DIST_PRECISION_BITS)  // 16
 
-// TODO(chengchen): Temporal flag serve as experimental flag for WIP
-// bitmask construction.
-// Shall be removed when bitmask code is completely checkedin
-#define LOOP_FILTER_BITMASK 0
-
 #define PROFILE_BITS 3
 // The following three profiles are currently defined.
 // Profile 0.  8-bit and 10-bit 4:2:0 and 4:0:0 only.
@@ -130,6 +125,27 @@
 // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
 #define SQR_BLOCK_SIZES 6
 
+//  Partition types.  R: Recursive
+//
+//  NONE          HORZ          VERT          SPLIT
+//  +-------+     +-------+     +---+---+     +---+---+
+//  |       |     |       |     |   |   |     | R | R |
+//  |       |     +-------+     |   |   |     +---+---+
+//  |       |     |       |     |   |   |     | R | R |
+//  +-------+     +-------+     +---+---+     +---+---+
+//
+//  HORZ_A        HORZ_B        VERT_A        VERT_B
+//  +---+---+     +-------+     +---+---+     +---+---+
+//  |   |   |     |       |     |   |   |     |   |   |
+//  +---+---+     +---+---+     +---+   |     |   +---+
+//  |       |     |   |   |     |   |   |     |   |   |
+//  +-------+     +---+---+     +---+---+     +---+---+
+//
+//  HORZ_4        VERT_4
+//  +-----+       +-+-+-+
+//  +-----+       | | | |
+//  +-----+       | | | |
+//  +-----+       +-+-+-+
 enum {
   PARTITION_NONE,
   PARTITION_HORZ,
@@ -244,6 +260,7 @@
   V_FLIPADST,         // FLIPADST in vertical, identity in horizontal
   H_FLIPADST,         // Identity in vertical, FLIPADST in horizontal
   TX_TYPES,
+  DCT_ADST_TX_MASK = 0x000F,  // Either DCT or ADST in each direction
 } UENUM1BYTE(TX_TYPE);
 
 enum {
@@ -274,8 +291,6 @@
   EXT_TX_SET_TYPES
 } UENUM1BYTE(TxSetType);
 
-#define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX)
-
 #define EXT_TX_SIZES 4       // number of sizes that use extended transforms
 #define EXT_TX_SETS_INTER 4  // Sets of transform selections for INTER
 #define EXT_TX_SETS_INTRA 3  // Sets of transform selections for INTRA
@@ -395,6 +410,8 @@
   MB_MODE_COUNT,
   INTRA_MODE_START = DC_PRED,
   INTRA_MODE_END = NEARESTMV,
+  DIR_MODE_START = V_PRED,
+  DIR_MODE_END = D67_PRED + 1,
   INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START,
   SINGLE_INTER_MODE_START = NEARESTMV,
   SINGLE_INTER_MODE_END = NEAREST_NEARESTMV,
@@ -526,7 +543,9 @@
 
 #define DELTA_Q_SMALL 3
 #define DELTA_Q_PROBS (DELTA_Q_SMALL)
-#define DEFAULT_DELTA_Q_RES 4
+#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4
+#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4
+
 #define DELTA_LF_SMALL 3
 #define DELTA_LF_PROBS (DELTA_LF_SMALL)
 #define DEFAULT_DELTA_LF_RES 2
@@ -535,6 +554,7 @@
 #define MAX_MV_REF_CANDIDATES 2
 
 #define MAX_REF_MV_STACK_SIZE 8
+#define USABLE_REF_MV_STACK_SIZE 4
 #define REF_CAT_LEVEL 640
 
 #define INTRA_INTER_CONTEXTS 4
@@ -625,6 +645,25 @@
   RESTORE_TYPES = 4,
 } UENUM1BYTE(RestorationType);
 
+// Picture prediction structures (0-12 are predefined) in scalability metadata.
+enum {
+  SCALABILITY_L1T2 = 0,
+  SCALABILITY_L1T3 = 1,
+  SCALABILITY_L2T1 = 2,
+  SCALABILITY_L2T2 = 3,
+  SCALABILITY_L2T3 = 4,
+  SCALABILITY_S2T1 = 5,
+  SCALABILITY_S2T2 = 6,
+  SCALABILITY_S2T3 = 7,
+  SCALABILITY_L2T1h = 8,
+  SCALABILITY_L2T2h = 9,
+  SCALABILITY_L2T3h = 10,
+  SCALABILITY_S2T1h = 11,
+  SCALABILITY_S2T2h = 12,
+  SCALABILITY_S2T3h = 13,
+  SCALABILITY_SS = 14
+} UENUM1BYTE(SCALABILITY_STRUCTURES);
+
 #define SUPERRES_SCALE_BITS 3
 #define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
 
diff --git a/libaom/av1/common/filter.h b/libaom/av1/common/filter.h
index 184f5b2..91791d3 100644
--- a/libaom/av1/common/filter.h
+++ b/libaom/av1/common/filter.h
@@ -19,6 +19,7 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
+#include "av1/common/enums.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -35,6 +36,7 @@
   SWITCHABLE_FILTERS = BILINEAR,
   SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
   EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+  INTERP_INVALID = 0xff,
 } InterpFilter;
 
 enum {
@@ -44,24 +46,45 @@
   USE_8_TAPS,
 } UENUM1BYTE(SUBPEL_SEARCH_TYPE);
 
+enum {
+  INTERP_EVAL_LUMA_EVAL_CHROMA = 0,
+  INTERP_SKIP_LUMA_EVAL_CHROMA,
+  INTERP_EVAL_INVALID,
+  INTERP_SKIP_LUMA_SKIP_CHROMA,
+} UENUM1BYTE(INTERP_EVAL_PLANE);
+
+enum {
+  INTERP_HORZ_NEQ_VERT_NEQ = 0,
+  INTERP_HORZ_EQ_VERT_NEQ,
+  INTERP_HORZ_NEQ_VERT_EQ,
+  INTERP_HORZ_EQ_VERT_EQ,
+  INTERP_PRED_TYPE_ALL,
+} UENUM1BYTE(INTERP_PRED_TYPE);
 // Pack two InterpFilter's into a uint32_t: since there are at most 10 filters,
 // we can use 16 bits for each and have more than enough space. This reduces
 // argument passing and unifies the operation of setting a (pair of) filters.
-typedef uint32_t InterpFilters;
-static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters,
-                                                     int x_filter) {
-  return (InterpFilter)((filters >> (x_filter ? 16 : 0)) & 0xf);
+typedef struct InterpFilters {
+  uint16_t y_filter;
+  uint16_t x_filter;
+} InterpFilters;
+
+typedef union int_interpfilters {
+  uint32_t as_int;
+  InterpFilters as_filters;
+} int_interpfilters;
+
+static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters,
+                                                     int dir) {
+  return (InterpFilter)((dir) ? filters.as_filters.x_filter
+                              : filters.as_filters.y_filter);
 }
 
-static INLINE InterpFilters av1_make_interp_filters(InterpFilter y_filter,
-                                                    InterpFilter x_filter) {
-  uint16_t y16 = y_filter & 0xf;
-  uint16_t x16 = x_filter & 0xf;
-  return y16 | ((uint32_t)x16 << 16);
-}
-
-static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) {
-  return av1_make_interp_filters(filter, filter);
+static INLINE int_interpfilters
+av1_broadcast_interp_filter(InterpFilter filter) {
+  int_interpfilters filters;
+  filters.as_filters.x_filter = filter;
+  filters.as_filters.y_filter = filter;
+  return filters;
 }
 
 static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
@@ -71,10 +94,10 @@
 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
 #define LOG_SWITCHABLE_FILTERS 2
 
-#define MAX_SUBPEL_TAPS 12
 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
 #define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
 #define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
+#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff)
 
 typedef struct InterpFilterParams {
   const int16_t *filter_ptr;
@@ -145,9 +168,10 @@
 
 // A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel
 // MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV.
-DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
-  64,
-  64,
+DECLARE_ALIGNED(256, static const int16_t,
+                av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
+  128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 
 static const InterpFilterParams av1_intrabc_filter_params = {
@@ -177,6 +201,16 @@
   { 0, 0, 4, 36, 62, 26, 0, 0 },  { 0, 0, 2, 34, 62, 30, 0, 0 }
 };
 
+static const uint16_t
+    av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = {
+      { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG),
+        (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH),
+        (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) },
+      { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP),
+        (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP),
+        (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) }
+    };
+
 // For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
 static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
   { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
@@ -196,11 +230,6 @@
   return &av1_interp_filter_params_list[interp_filter];
 }
 
-static INLINE const InterpFilterParams *get_4tap_interp_filter_params(
-    const InterpFilter interp_filter) {
-  return &av1_interp_4tap[interp_filter];
-}
-
 static INLINE const int16_t *av1_get_interp_filter_kernel(
     const InterpFilter interp_filter, int subpel_search) {
   assert(subpel_search >= USE_2_TAPS);
@@ -220,13 +249,29 @@
   assert(subpel_search >= USE_2_TAPS);
 
   switch (subpel_search) {
-    case USE_2_TAPS: return get_4tap_interp_filter_params(BILINEAR);
-    case USE_4_TAPS: return get_4tap_interp_filter_params(EIGHTTAP_REGULAR);
+    case USE_2_TAPS: return &av1_interp_4tap[BILINEAR];
+    case USE_4_TAPS: return &av1_interp_4tap[EIGHTTAP_REGULAR];
     case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR];
     default: assert(0); return NULL;
   }
 }
 
+static INLINE void reset_interp_filter_allowed_mask(
+    uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  uint16_t tmp = (~(1 << filt_type)) & 0xffff;
+  *allow_interp_mask &= (tmp & ALLOW_ALL_INTERP_FILT_MASK);
+}
+
+static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask,
+                                                  DUAL_FILTER_TYPE filt_type) {
+  *allow_interp_mask |= (1 << filt_type);
+}
+
+static INLINE uint8_t get_interp_filter_allowed_mask(
+    uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  return (allow_interp_mask >> filt_type) & 1;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/common/loopfiltermask.c b/libaom/av1/common/loopfiltermask.c
new file mode 100644
index 0000000..157310f
--- /dev/null
+++ b/libaom/av1/common/loopfiltermask.c
@@ -0,0 +1,1458 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+
+// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the left border of an 4x4 block boundary.
+//
+// In the case of TX_8x8->  ( in low order byte first we end up with
+// a mask that looks like this (-- and | are used for better view)
+//
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    -----------------
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+
+// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the top border of an 4x4 block boundary.
+//
+// In the case of TX_8x8->  ( in low order byte first we end up with
+// a mask that looks like this
+//
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    -----------------
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+#if CONFIG_LPF_MASK
+static const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
+};
+
+static const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
+  -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
+};
+
+static const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
+  -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
+};
+
+static const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = {
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,
+  2,  3,  -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+static const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = {
+  0,  47, 49, 19, 51, 53, 33, 55, 57, 42, 59,
+  60, 46, -1, -1, -1, 61, 62, 63, 64, 65, 66
+};
+
+static const FilterMask left_mask_univariant_reordered[67] = {
+  // TX_4X4
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
+  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
+  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
+  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
+  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
+  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
+  // TX_8X8
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
+  { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
+  { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
+  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
+  { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
+  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
+  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
+      0x0055005500550055ULL } },  // block size 32X64, TX_8X8
+  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
+  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
+      0x5555555555555555ULL } },  // block size 64X64, TX_8X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
+  { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
+  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
+      0x0005000500050005ULL } },  // block size 16X64, TX_8X8
+  { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
+  // TX_16X16
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
+  { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
+  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
+  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
+      0x0011001100110011ULL } },  // block size 32X64, TX_16X16
+  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
+  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
+      0x1111111111111111ULL } },  // block size 64X64, TX_16X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X16
+  { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
+  // TX_32X32
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
+  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+      0x0101010101010101ULL } },  // block size 32X64, TX_32X32
+  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
+  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+      0x0101010101010101ULL } },  // block size 64X64, TX_32X32
+  // TX_64X64
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 64X64, TX_64X64
+  // 2:1, 1:2 transform sizes.
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
+  { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
+  { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X32
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
+  { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 32X64, TX_32X64
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
+  // 4:1, 1:4 transform sizes.
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X64
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
+};
+
+static const FilterMask above_mask_univariant_reordered[67] = {
+  // TX_4X4
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
+  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
+  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
+  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+      0xffffffffffffffffULL } },  // block size 64X64, TX_4x4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
+  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
+  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
+  // TX_8X8
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
+  { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
+  { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
+  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
+  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
+      0x000000ff000000ffULL } },  // block size 32X64, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+      0x0000ffff0000ffffULL } },  // block size 64X64, TX_8X8
+  { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
+      0x0000000f0000000fULL } },  // block size 16X64, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
+  // TX_16X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
+  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
+  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
+  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
+      0x00000000000000ffULL } },  // block size 32X64, TX_16X16
+  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
+  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
+      0x000000000000ffffULL } },  // block size 64X64, TX_16X16
+  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
+      0x000000000000000fULL } },  // block size 16X64, TX_16X16
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
+  // TX_32X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
+      0x0000000000000000ULL } },  // block size 32X64, TX_32X32
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
+      0x0000000000000000ULL } },  // block size 64X64, TX_32X32
+  // TX_64X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X64, TX_64X64
+  // 2:1, 1:2 transform sizes.
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
+  { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
+  { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
+      0x0000000000000000ULL } },  // block size 16X64, TX_16X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X64, TX_32X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
+  // 4:1, 1:4 transform sizes.
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
+};
+
+// Returns the LoopFilterMask covering the 64x64 area that contains the
+// mi-unit location (mi_row, mi_col).  cm->lf.lfm is a row-major grid of
+// masks with stride cm->lf.lfm_stride, one entry per 64x64 area.
+static LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm,
+                                            int mi_row, int mi_col) {
+  assert(cm->lf.lfm != NULL);
+  // Convert mi units to 64x64-mask-grid units.
+  const int lfm_row = mi_row >> MIN_MIB_SIZE_LOG2;
+  const int lfm_col = mi_col >> MIN_MIB_SIZE_LOG2;
+  return cm->lf.lfm + lfm_row * cm->lf.lfm_stride + lfm_col;
+}
+
+// Low-bitdepth loop filter for a single edge segment: s points at the
+// edge, p is the stride, blimit/limit/thresh are the filter thresholds.
+typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
+                        const uint8_t *limit, const uint8_t *thresh);
+
+// Low-bitdepth loop filter for two adjacent edge segments in one call,
+// each segment with its own threshold set.
+typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
+                            const uint8_t *limit0, const uint8_t *thresh0,
+                            const uint8_t *blimit1, const uint8_t *limit1,
+                            const uint8_t *thresh1);
+
+// High-bitdepth counterpart of LpfFunc; bd is the bit depth.
+typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh, int bd);
+
+// High-bitdepth counterpart of LpfDualFunc; bd is the bit depth.
+typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1, int bd);
+// A 64x64 area needs 256 bits to flag each of its 4x4 units.  They are
+// stored as bitmask[4]: every uint64_t covers 4 rows of 16 columns of 4x4
+// units.  Given a location (mi_col, mi_row) in 4x4 units, this writes the
+// word number (mi_row / 4) to *index and returns the bit position inside
+// that word: (mi_row % 4) full 16-bit rows plus the column offset.
+int get_index_shift(int mi_col, int mi_row, int *index) {
+  const int row_in_word = mi_row & 3;  // row within the selected uint64_t
+  *index = mi_row >> 2;                // which of bitmask[0..3]
+  return (row_in_word << 4) | mi_col;  // 16 == (1 << 4) units per row
+}
+
+// Filters vertical edges for two adjacent 4-pixel-high rows in one sweep.
+// The *_0 masks describe the first row, the *_1 masks the second (its
+// pixels live at s + 4 * pitch).  Each mask bit marks one 4x4 unit; bits
+// are consumed LSB first, with s advancing 4 pixels and lfl/lfl2 (the
+// per-unit filter levels for each row) advancing `step` per unit.  When
+// both rows need the same-size filter at a column, a dual variant filters
+// both edges in one call.  Nonzero `plane` (chroma) selects the narrower
+// 6-tap filters in place of the 14/8-tap luma ones.
+static void filter_selectively_vert_row2(
+    int subsampling_factor, uint8_t *s, int pitch, int plane,
+    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
+  uint64_t mask;
+  const int step = 1 << subsampling_factor;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+              mask_8x8_1 | mask_4x4_1;
+       mask; mask >>= step) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
+
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
+
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {
+          // Both rows need this filter size: one dual call covers them.
+          if (plane) {
+            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
+          } else {
+            aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                     lfi1->hev_thr);
+          }
+        } else if (mask_16x16_0 & 1) {
+          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+        } else {
+          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                       lfi1->hev_thr);
+        }
+      }
+
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
+
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {
+          if (plane) {
+            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
+          } else {
+            aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
+          }
+        } else if (mask_8x8_0 & 1) {
+          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+        } else {
+          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                       lfi1->hev_thr);
+        }
+      }
+
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {
+          aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                  lfi1->hev_thr);
+        } else if (mask_4x4_0 & 1) {
+          aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+        } else {
+          aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                             lfi1->hev_thr);
+        }
+      }
+    }
+
+    // Advance one 4x4 unit: 4 pixels, `step` level entries, `step` mask
+    // bits (the loop condition shifts the combined mask the same amount).
+    s += 4;
+    lfl += step;
+    lfl2 += step;
+    mask_16x16_0 >>= step;
+    mask_8x8_0 >>= step;
+    mask_4x4_0 >>= step;
+    mask_16x16_1 >>= step;
+    mask_8x8_1 >>= step;
+    mask_4x4_1 >>= step;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of filter_selectively_vert_row2: identical
+// mask-walking logic over uint16_t pixels, with bit depth `bd` forwarded
+// to the aom_highbd_lpf_* kernels.  See that function for the mask and
+// pointer-advance conventions.
+static void highbd_filter_selectively_vert_row2(
+    int subsampling_factor, uint16_t *s, int pitch, int plane,
+    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
+  uint64_t mask;
+  const int step = 1 << subsampling_factor;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+              mask_8x8_1 | mask_4x4_1;
+       mask; mask >>= step) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
+
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        HbdLpfFunc highbd_lpf_vertical =
+            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
+
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {
+          if (plane) {
+            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                            lfi0->hev_thr, lfi1->mblim,
+                                            lfi1->lim, lfi1->hev_thr, bd);
+          }
+        } else if (mask_16x16_0 & 1) {
+          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+                              bd);
+        } else {
+          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                              lfi1->hev_thr, bd);
+        }
+      }
+
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        HbdLpfFunc highbd_lpf_vertical =
+            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
+
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {
+          if (plane) {
+            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
+          }
+        } else if (mask_8x8_0 & 1) {
+          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+                              bd);
+        } else {
+          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                              lfi1->hev_thr, bd);
+        }
+      }
+
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {
+          aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_0 & 1) {
+          aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, bd);
+        } else {
+          aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, bd);
+        }
+      }
+    }
+
+    s += 4;
+    lfl += step;
+    lfl2 += step;
+    mask_16x16_0 >>= step;
+    mask_8x8_0 >>= step;
+    mask_4x4_0 >>= step;
+    mask_16x16_1 >>= step;
+    mask_8x8_1 >>= step;
+    mask_4x4_1 >>= step;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Filters horizontal edges across one 4-pixel row of a 64x64 area.  One
+// mask bit per 4x4 unit, consumed LSB first.  When two adjacent units need
+// the same filter size (the `two_block_mask` pattern matches) a dual call
+// handles both edges and the scan advances two units (count == 2).
+// `next_edge` is forced to 0 at the 64x64 boundary so `lfin` never reads a
+// level past the current level array; its mask bit is zero there, so the
+// value is unused anyway.  Nonzero `plane` (chroma) selects the 6-tap
+// filters in place of the 14/8-tap luma ones.
+static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
+                                     int subsampling, uint64_t mask_16x16,
+                                     uint64_t mask_8x8, uint64_t mask_4x4,
+                                     const loop_filter_info_n *lfi_n,
+                                     const uint8_t *lfl) {
+  uint64_t mask;
+  int count;
+  const int step = 1 << subsampling;
+  const unsigned int two_block_mask = subsampling ? 5 : 3;
+  int offset = 0;
+
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    // Next block's thresholds, when it is within current 64x64 block.
+    // If it is out of bound, its mask is zero, and it points to current edge's
+    // filter parameters, instead of next edge's.
+    int next_edge = step;
+    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
+    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        LpfFunc lpf_horizontal =
+            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
+
+        if ((mask_16x16 & two_block_mask) == two_block_mask) {
+          if (plane) {
+            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          } else {
+            aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, lfin->mblim, lfin->lim,
+                                       lfin->hev_thr);
+          }
+          count = 2;
+        } else {
+          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+        }
+      } else if (mask_8x8 & 1) {
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        LpfFunc lpf_horizontal =
+            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
+
+        if ((mask_8x8 & two_block_mask) == two_block_mask) {
+          if (plane) {
+            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          } else {
+            aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          }
+          count = 2;
+        } else {
+          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & two_block_mask) == two_block_mask) {
+          aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, lfin->mblim, lfin->lim,
+                                    lfin->hev_thr);
+          count = 2;
+        } else {
+          aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+        }
+      }
+    }
+
+    // Advance `count` 4x4 units (4 pixels and `step` level entries each).
+    s += 4 * count;
+    lfl += step * count;
+    mask_16x16 >>= step * count;
+    mask_8x8 >>= step * count;
+    mask_4x4 >>= step * count;
+    offset += step * count;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of filter_selectively_horiz; see that function
+// for the mask, two_block_mask, and next_edge conventions.  `bd` is the
+// bit depth forwarded to the aom_highbd_lpf_* kernels.
+// NOTE(review): unlike the lowbd path, the dual calls below use the `_c`
+// suffixed implementations directly, bypassing runtime CPU dispatch —
+// presumably because optimized duals are not wired into rtcd for these;
+// confirm against aom_dsp_rtcd before changing.
+static void highbd_filter_selectively_horiz(
+    uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
+    uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
+    uint8_t *lfl, int bd) {
+  uint64_t mask;
+  int count;
+  const int step = 1 << subsampling;
+  const unsigned int two_block_mask = subsampling ? 5 : 3;
+  int offset = 0;
+
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    // Next block's thresholds, when it is within current 64x64 block.
+    // If it is out of bound, its mask is zero, and it points to current edge's
+    // filter parameters, instead of next edge's.
+    int next_edge = step;
+    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
+    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        HbdLpfFunc highbd_lpf_horizontal =
+            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
+
+        if ((mask_16x16 & two_block_mask) == two_block_mask) {
+          if (plane) {
+            aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
+                                               lfi->hev_thr, lfin->mblim,
+                                               lfin->lim, lfin->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+                                              lfi->hev_thr, lfin->mblim,
+                                              lfin->lim, lfin->hev_thr, bd);
+          }
+          count = 2;
+        } else {
+          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+                                bd);
+        }
+      } else if (mask_8x8 & 1) {
+        HbdLpfFunc highbd_lpf_horizontal =
+            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
+
+        if ((mask_8x8 & two_block_mask) == two_block_mask) {
+          if (plane) {
+            aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
+                                               lfi->hev_thr, lfin->mblim,
+                                               lfin->lim, lfin->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim,
+                                               lfi->hev_thr, lfin->mblim,
+                                               lfin->lim, lfin->hev_thr, bd);
+          }
+          count = 2;
+        } else {
+          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+                                bd);
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & two_block_mask) == two_block_mask) {
+          aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim,
+                                             lfi->hev_thr, lfin->mblim,
+                                             lfin->lim, lfin->hev_thr, bd);
+          count = 2;
+        } else {
+          aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, bd);
+        }
+      }
+    }
+
+    s += 4 * count;
+    lfl += step * count;
+    mask_16x16 >>= step * count;
+    mask_8x8 >>= step * count;
+    mask_4x4 >>= step * count;
+    offset += step * count;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+void av1_build_bitmask_vert_info(
+    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+    int plane) {
+  const int subsampling_x = plane_ptr->subsampling_x;
+  const int subsampling_y = plane_ptr->subsampling_y;
+  const int is_uv = plane > 0;
+  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+  uint8_t level, prev_level = 1;
+  uint64_t skip, prev_skip = 0;
+  uint64_t is_coding_block_border;
+
+  for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r++) {
+    const int mi_row = r << subsampling_y;
+    const int row = mi_row % MI_SIZE_64X64;
+    const int row_uv = row | subsampling_y;
+    int index = 0;
+    const int shift = get_index_shift(0, row, &index);
+
+    for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
+         c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
+      const int mi_col = c << subsampling_x;
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+      for (int col_in_unit = 0;
+           col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
+        const int x = (c + col_in_unit) << MI_SIZE_LOG2;
+        if (x >= plane_ptr->dst.width) break;
+        const int col = col_in_unit << subsampling_x;
+        const int col_uv = col | subsampling_x;
+        const uint64_t mask = ((uint64_t)1 << (shift | col));
+        skip = lfm->skip.bits[index] & mask;
+        is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
+        switch (plane) {
+          case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break;
+          case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break;
+          case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break;
+          default: assert(plane >= 0 && plane <= 2); return;
+        }
+        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+          if (is_uv && ts == TX_64X64) continue;
+          if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
+            tx_size = ts;
+            break;
+          }
+        }
+        if ((c + col_in_unit > 0) && (level || prev_level) &&
+            (!prev_skip || !skip || is_coding_block_border)) {
+          const TX_SIZE min_tx_size =
+              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
+          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+          switch (plane) {
+            case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
+            case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
+            case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
+            default: assert(plane >= 0 && plane <= 2); return;
+          }
+          if (level == 0 && prev_level != 0) {
+            switch (plane) {
+              case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break;
+              case 1: lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break;
+              case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break;
+              default: assert(plane >= 0 && plane <= 2); return;
+            }
+          }
+        }
+
+        // update prev info
+        prev_level = level;
+        prev_skip = skip;
+        prev_tx_size = tx_size;
+        // advance
+        col_in_unit += tx_size_wide_unit[tx_size];
+      }
+    }
+  }
+}
+
+void av1_build_bitmask_horz_info(
+    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+    int plane) {
+  const int subsampling_x = plane_ptr->subsampling_x;
+  const int subsampling_y = plane_ptr->subsampling_y;
+  const int is_uv = plane > 0;
+  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+  uint8_t level, prev_level = 1;
+  uint64_t skip, prev_skip = 0;
+  uint64_t is_coding_block_border;
+
+  for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c++) {
+    const int mi_col = c << subsampling_x;
+    const int col = mi_col % MI_SIZE_64X64;
+    const int col_uv = col | subsampling_x;
+
+    for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
+         r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
+      const int mi_row = r << subsampling_y;
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+      for (int r_in_unit = 0;
+           r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
+        const int y = (r + r_in_unit) << MI_SIZE_LOG2;
+        if (y >= plane_ptr->dst.height) break;
+        const int row = r_in_unit << subsampling_y;
+        const int row_uv = row | subsampling_y;
+        int index = 0;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        skip = lfm->skip.bits[index] & mask;
+        is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
+        switch (plane) {
+          case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break;
+          case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break;
+          case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break;
+          default: assert(plane >= 0 && plane <= 2); return;
+        }
+        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+          if (is_uv && ts == TX_64X64) continue;
+          if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
+            tx_size = ts;
+            break;
+          }
+        }
+        if ((r + r_in_unit > 0) && (level || prev_level) &&
+            (!prev_skip || !skip || is_coding_block_border)) {
+          const TX_SIZE min_tx_size =
+              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
+          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+
+          switch (plane) {
+            case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
+            case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
+            case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
+            default: assert(plane >= 0 && plane <= 2); return;
+          }
+          if (level == 0 && prev_level != 0) {
+            switch (plane) {
+              case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break;
+              case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break;
+              case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break;
+              default: assert(plane >= 0 && plane <= 2); return;
+            }
+          }
+        }
+
+        // update prev info
+        prev_level = level;
+        prev_skip = skip;
+        prev_tx_size = tx_size;
+        // advance
+        r_in_unit += tx_size_high_unit[tx_size];
+      }
+    }
+  }
+}
+
+void av1_filter_block_plane_bitmask_vert(
+    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+    int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
+  uint8_t *const buf0 = dst->buf;
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;
+  const int row_step = 1 << ssy;
+  const int two_row_step = 2 << ssy;
+  const int row_stride = dst->stride << MI_SIZE_LOG2;
+  const int two_row_stride = row_stride << 1;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+  uint8_t *lfl2;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  assert(lfm);
+
+  // 1. vertical filtering. filter two rows at a time
+  for (int r = 0;
+       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+       r += two_row_step) {
+    const int row = r | ssy;
+    const int row_next = row + row_step;
+    const int col = ssx;
+    int index = 0;
+    const int shift = get_index_shift(col, row, &index);
+    int index_next = 0;
+    const int shift_next = get_index_shift(col, row_next, &index_next);
+    const int has_next_row = row_next < cm->mi_params.mi_rows;
+    switch (pl) {
+      case 0:
+        mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+        lfl = &lfm->lfl_y_ver[row][col];
+        lfl2 = &lfm->lfl_y_ver[row_next][col];
+        break;
+      case 1:
+        mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+        lfl = &lfm->lfl_u_ver[row][col];
+        lfl2 = &lfm->lfl_u_ver[row_next][col];
+        break;
+      case 2:
+        mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+        lfl = &lfm->lfl_v_ver[row][col];
+        lfl2 = &lfm->lfl_v_ver[row_next][col];
+        break;
+      default: assert(pl >= 0 && pl <= 2); return;
+    }
+    uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+    uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+    uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+    uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+    uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+    uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+    if (!has_next_row) {
+      mask_16x16_1 = 0;
+      mask_8x8_1 = 0;
+      mask_4x4_1 = 0;
+    }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (cm->seq_params.use_highbitdepth)
+      highbd_filter_selectively_vert_row2(
+          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+    else
+      filter_selectively_vert_row2(
+          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+#else
+    filter_selectively_vert_row2(
+        ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+        mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+#endif
+    dst->buf += two_row_stride;
+  }
+  // reset buf pointer for horizontal filtering
+  dst->buf = buf0;
+}
+
+void av1_filter_block_plane_bitmask_horz(
+    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+    int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
+  uint8_t *const buf0 = dst->buf;
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;
+  const int row_step = 1 << ssy;
+  const int row_stride = dst->stride << MI_SIZE_LOG2;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  assert(lfm);
+  for (int r = 0;
+       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+       r += row_step) {
+    if (mi_row + r == 0) {
+      dst->buf += row_stride;
+      continue;
+    }
+    const int row = r | ssy;
+    const int col = ssx;
+    int index = 0;
+    const int shift = get_index_shift(col, row, &index);
+    switch (pl) {
+      case 0:
+        mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+        lfl = &lfm->lfl_y_hor[row][col];
+        break;
+      case 1:
+        mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+        lfl = &lfm->lfl_u_hor[row][col];
+        break;
+      case 2:
+        mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+        lfl = &lfm->lfl_v_hor[row][col];
+        break;
+      default: assert(pl >= 0 && pl <= 2); return;
+    }
+    mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+    mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+    mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (cm->seq_params.use_highbitdepth)
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+          mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
+    else
+      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
+#else
+    filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+                             mask_8x8, mask_4x4, &cm->lf_info, lfl);
+#endif
+    dst->buf += row_stride;
+  }
+  // reset buf pointer for next block
+  dst->buf = buf0;
+}
+
+void av1_filter_block_plane_ver(AV1_COMMON *const cm,
+                                struct macroblockd_plane *const plane_ptr,
+                                int pl, int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
+  int r, c;
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;
+  const int single_step = 1 << ssy;
+  const int r_step = 2 << ssy;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+  uint8_t *lfl2;
+
+  // filter two rows at a time
+  for (r = 0; r < cm->seq_params.mib_size &&
+              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
+       r += r_step) {
+    for (c = 0; c < cm->seq_params.mib_size &&
+                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
+         c += MI_SIZE_64X64) {
+      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+      assert(lfm);
+      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+      int index = 0;
+      const int shift = get_index_shift(col, row, &index);
+      // current and next row should belong to the same mask_idx and index
+      // next row's shift
+      const int row_next = row + single_step;
+      int index_next = 0;
+      const int shift_next = get_index_shift(col, row_next, &index_next);
+      switch (pl) {
+        case 0:
+          mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+          mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+          mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+          lfl = &lfm->lfl_y_ver[row][col];
+          lfl2 = &lfm->lfl_y_ver[row_next][col];
+          break;
+        case 1:
+          mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+          mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+          mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+          lfl = &lfm->lfl_u_ver[row][col];
+          lfl2 = &lfm->lfl_u_ver[row_next][col];
+          break;
+        case 2:
+          mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+          mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+          mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+          lfl = &lfm->lfl_v_ver[row][col];
+          lfl2 = &lfm->lfl_v_ver[row_next][col];
+          break;
+        default: assert(pl >= 0 && pl <= 2); return;
+      }
+      uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+      uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+      uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+      uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+      uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+      uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (cm->seq_params.use_highbitdepth)
+        highbd_filter_selectively_vert_row2(
+            ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+            mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+            &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+      else
+        filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
+                                     mask_16x16_0, mask_8x8_0, mask_4x4_0,
+                                     mask_16x16_1, mask_8x8_1, mask_4x4_1,
+                                     &cm->lf_info, lfl, lfl2);
+#else
+      filter_selectively_vert_row2(
+          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+#endif
+      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
+    }
+    dst->buf += 2 * MI_SIZE * dst->stride;
+  }
+}
+
+void av1_filter_block_plane_hor(AV1_COMMON *const cm,
+                                struct macroblockd_plane *const plane_ptr,
+                                int pl, int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
+  int r, c;
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;
+  const int r_step = 1 << ssy;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+
+  for (r = 0; r < cm->seq_params.mib_size &&
+              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
+       r += r_step) {
+    for (c = 0; c < cm->seq_params.mib_size &&
+                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
+         c += MI_SIZE_64X64) {
+      if (mi_row + r == 0) continue;
+
+      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+      assert(lfm);
+      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+      int index = 0;
+      const int shift = get_index_shift(col, row, &index);
+      switch (pl) {
+        case 0:
+          mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+          mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+          mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+          lfl = &lfm->lfl_y_hor[row][col];
+          break;
+        case 1:
+          mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+          mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+          mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+          lfl = &lfm->lfl_u_hor[row][col];
+          break;
+        case 2:
+          mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+          mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+          mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+          lfl = &lfm->lfl_v_hor[row][col];
+          break;
+        default: assert(pl >= 0 && pl <= 2); return;
+      }
+      mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+      mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+      mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (cm->seq_params.use_highbitdepth)
+        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                        dst->stride, pl, ssx, mask_16x16,
+                                        mask_8x8, mask_4x4, &cm->lf_info, lfl,
+                                        (int)cm->seq_params.bit_depth);
+      else
+        filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+                                 mask_8x8, mask_4x4, &cm->lf_info, lfl);
+#else
+      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
+#endif
+      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
+    }
+    dst->buf += MI_SIZE * dst->stride;
+  }
+}
+
+void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             MB_MODE_INFO *mbmi) {
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
+  const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
+  const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+      mbmi->sb_type, cm->seq_params.subsampling_x,
+      cm->seq_params.subsampling_y)];
+  const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+      mbmi->sb_type, cm->seq_params.subsampling_x,
+      cm->seq_params.subsampling_y)];
+  const int is_square_transform_size = tx_size <= TX_64X64;
+  int mask_id = 0;
+  int offset = 0;
+  const int half_ratio_tx_size_max32 =
+      (tx_size > TX_64X64) & (tx_size <= TX_32X16);
+  if (is_square_transform_size) {
+    switch (tx_size) {
+      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+      case TX_8X8:
+        mask_id = mask_id_table_tx_8x8[bsize];
+        offset = 19;
+        break;
+      case TX_16X16:
+        mask_id = mask_id_table_tx_16x16[bsize];
+        offset = 33;
+        break;
+      case TX_32X32:
+        mask_id = mask_id_table_tx_32x32[bsize];
+        offset = 42;
+        break;
+      case TX_64X64: mask_id = 46; break;
+      default: assert(!is_square_transform_size); return;
+    }
+    mask_id += offset;
+  } else if (half_ratio_tx_size_max32) {
+    int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
+    mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+  } else if (tx_size == TX_32X64) {
+    mask_id = 59;
+  } else if (tx_size == TX_64X32) {
+    mask_id = 60;
+  } else {  // quarter ratio tx size
+    mask_id = 61 + (tx_size - TX_4X16);
+  }
+  int index = 0;
+  const int row = mi_row % MI_SIZE_64X64;
+  const int col = mi_col % MI_SIZE_64X64;
+  const int shift = get_index_shift(col, row, &index);
+  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+  for (int i = 0; i + index < 4; ++i) {
+    // y vertical.
+    lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+    // y horizontal.
+    lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+    // u/v vertical.
+    lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+    // u/v horizontal.
+    lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+  }
+}
+
+void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+  // Use a lookup table that provides one bitmask for a given block size and
+  // a univariant transform size.
+  int index;
+  int shift;
+  int row;
+  int col;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
+  const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
+  const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+      mbmi->sb_type, cm->seq_params.subsampling_x,
+      cm->seq_params.subsampling_y)];
+  const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+      mbmi->sb_type, cm->seq_params.subsampling_x,
+      cm->seq_params.subsampling_y)];
+  const int is_square_transform_size = mbmi->tx_size <= TX_64X64;
+  int mask_id = 0;
+  int offset = 0;
+  const int half_ratio_tx_size_max32 =
+      (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16);
+  if (is_square_transform_size) {
+    switch (mbmi->tx_size) {
+      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+      case TX_8X8:
+        mask_id = mask_id_table_tx_8x8[bsize];
+        offset = 19;
+        break;
+      case TX_16X16:
+        mask_id = mask_id_table_tx_16x16[bsize];
+        offset = 33;
+        break;
+      case TX_32X32:
+        mask_id = mask_id_table_tx_32x32[bsize];
+        offset = 42;
+        break;
+      case TX_64X64: mask_id = 46; break;
+      default: assert(!is_square_transform_size); return;
+    }
+    mask_id += offset;
+  } else if (half_ratio_tx_size_max32) {
+    int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size];
+    mask_id =
+        47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+  } else if (mbmi->tx_size == TX_32X64) {
+    mask_id = 59;
+  } else if (mbmi->tx_size == TX_64X32) {
+    mask_id = 60;
+  } else {  // quarter ratio tx size
+    mask_id = 61 + (mbmi->tx_size - TX_4X16);
+  }
+  row = mi_row % MI_SIZE_64X64;
+  col = mi_col % MI_SIZE_64X64;
+  shift = get_index_shift(col, row, &index);
+  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+  for (int i = 0; i + index < 4; ++i) {
+    // y vertical.
+    lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+    // y horizontal.
+    lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+    // u/v vertical.
+    lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+    // u/v horizontal.
+    lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+  }
+}
+
+void av1_store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
+                                  BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+                                  int is_horz_coding_block_border,
+                                  int is_vert_coding_block_border) {
+  int index;
+  int shift;
+  int row;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  const int row_start = mi_row % MI_SIZE_64X64;
+  const int col_start = mi_col % MI_SIZE_64X64;
+  shift = get_index_shift(col_start, row_start, &index);
+  if (is_horz_coding_block_border) {
+    const int block_shift = shift + mi_size_wide[bsize];
+    assert(block_shift <= 64);
+    const uint64_t right_edge_shift =
+        (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift);
+    const uint64_t left_edge_shift = (block_shift == 64)
+                                         ? (((uint64_t)1 << shift) - 1)
+                                         : ((uint64_t)1 << shift);
+    assert(right_edge_shift > left_edge_shift);
+    const uint64_t top_edge_mask = right_edge_shift - left_edge_shift;
+    lfm->is_horz_border.bits[index] |= top_edge_mask;
+  }
+  if (is_vert_coding_block_border) {
+    const int is_vert_border = mask_id_table_vert_border[bsize];
+    const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
+    for (int i = 0; i + index < 4; ++i) {
+      lfm->is_vert_border.bits[i + index] |=
+          (left_mask_univariant_reordered[is_vert_border].bits[i]
+           << vert_shift);
+    }
+  }
+  const int is_skip = mbmi->skip && is_inter_block(mbmi);
+  if (is_skip) {
+    const int is_skip_mask = mask_id_table_tx_4x4[bsize];
+    for (int i = 0; i + index < 4; ++i) {
+      lfm->skip.bits[i + index] |=
+          (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
+    }
+  }
+  const uint8_t level_vert_y =
+      av1_get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+  const uint8_t level_horz_y =
+      av1_get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+  const uint8_t level_u = av1_get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+  const uint8_t level_v = av1_get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+  for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+    index = 0;
+    row = r % MI_SIZE_64X64;
+    memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
+           sizeof(uint8_t) * mi_size_wide[bsize]);
+    memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
+           sizeof(uint8_t) * mi_size_wide[bsize]);
+    memset(&lfm->lfl_u_ver[row][col_start], level_u,
+           sizeof(uint8_t) * mi_size_wide[bsize]);
+    memset(&lfm->lfl_u_hor[row][col_start], level_u,
+           sizeof(uint8_t) * mi_size_wide[bsize]);
+    memset(&lfm->lfl_v_ver[row][col_start], level_v,
+           sizeof(uint8_t) * mi_size_wide[bsize]);
+    memset(&lfm->lfl_v_hor[row][col_start], level_v,
+           sizeof(uint8_t) * mi_size_wide[bsize]);
+  }
+}
+#endif  // CONFIG_LPF_MASK
diff --git a/libaom/av1/common/mv.h b/libaom/av1/common/mv.h
index d097f9e..be539e8 100644
--- a/libaom/av1/common/mv.h
+++ b/libaom/av1/common/mv.h
@@ -21,17 +21,34 @@
 #endif
 
 #define INVALID_MV 0x80008000
+#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3)
+#define GET_MV_SUBPEL(x) ((x)*8)
 
+#define MARK_MV_INVALID(mv)                \
+  do {                                     \
+    ((int_mv *)(mv))->as_int = INVALID_MV; \
+  } while (0);
+#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col))
+
+// The motion vector in units of full pixel
+typedef struct fullpel_mv {
+  int16_t row;
+  int16_t col;
+} FULLPEL_MV;
+
+// The motion vector in units of 1/8-pel
 typedef struct mv {
   int16_t row;
   int16_t col;
 } MV;
 
 static const MV kZeroMv = { 0, 0 };
+static const FULLPEL_MV kZeroFullMv = { 0, 0 };
 
 typedef union int_mv {
   uint32_t as_int;
   MV as_mv;
+  FULLPEL_MV as_fullmv;
 } int_mv; /* facilitates faster equality tests and copies */
 
 typedef struct mv32 {
@@ -39,6 +56,38 @@
   int32_t col;
 } MV32;
 
+// The mv limit for fullpel mvs
+typedef struct {
+  int col_min;
+  int col_max;
+  int row_min;
+  int row_max;
+} FullMvLimits;
+
+// The mv limit for subpel mvs
+typedef struct {
+  int col_min;
+  int col_max;
+  int row_min;
+  int row_max;
+} SubpelMvLimits;
+
+static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) {
+  const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row),
+                               (int16_t)GET_MV_RAWPEL(subpel_mv->col) };
+  return full_mv;
+}
+
+static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) {
+  const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row),
+                         (int16_t)GET_MV_SUBPEL(full_mv->col) };
+  return subpel_mv;
+}
+
+static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) {
+  mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv);
+}
+
 // Bits of precision used for the model
 #define WARPEDMODEL_PREC_BITS 16
 #define WARPEDMODEL_ROW3HOMO_PREC_BITS 16
@@ -225,7 +274,8 @@
     // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
     // bits of fractional precision. The offset for a translation is stored in
     // entries 0 and 1. For translations, all but the top three (two if
-    // cm->allow_high_precision_mv is false) fractional bits are always zero.
+    // cm->features.allow_high_precision_mv is false) fractional bits are always
+    // zero.
     //
     // After the right shifts, there are 3 fractional bits of precision. If
     // allow_hp is false, the bottom bit is always zero (so we don't need a
@@ -277,7 +327,6 @@
 typedef struct candidate_mv {
   int_mv this_mv;
   int_mv comp_mv;
-  int weight;
 } CANDIDATE_MV;
 
 static INLINE int is_zero_mv(const MV *mv) {
@@ -288,10 +337,14 @@
   return *((const uint32_t *)a) == *((const uint32_t *)b);
 }
 
-static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
-                            int max_row) {
-  mv->col = clamp(mv->col, min_col, max_col);
-  mv->row = clamp(mv->row, min_row, max_row);
+static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) {
+  mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
+  mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
+}
+
+static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) {
+  mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
+  mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
 }
 
 #ifdef __cplusplus
diff --git a/libaom/av1/common/mvref_common.c b/libaom/av1/common/mvref_common.c
index e38891f..db3098c 100644
--- a/libaom/av1/common/mvref_common.c
+++ b/libaom/av1/common/mvref_common.c
@@ -23,7 +23,7 @@
 
 // TODO(jingning): Consider the use of lookup table for (num / den)
 // altogether.
-static void get_mv_projection(MV *output, MV ref, int num, int den) {
+static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) {
   den = AOMMIN(den, MAX_FRAME_DISTANCE);
   num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
                 : AOMMAX(num, -MAX_FRAME_DISTANCE);
@@ -40,7 +40,7 @@
 void av1_copy_frame_mvs(const AV1_COMMON *const cm,
                         const MB_MODE_INFO *const mi, int mi_row, int mi_col,
                         int x_mis, int y_mis) {
-  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
+  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
   MV_REF *frame_mvs =
       cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
   x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
@@ -71,34 +71,35 @@
   }
 }
 
-static void add_ref_mv_candidate(
+static AOM_INLINE void add_ref_mv_candidate(
     const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
     uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count,
-    CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates,
-    const WarpedMotionParams *gm_params, int col, int weight) {
-  if (!is_inter_block(candidate)) return;  // for intrabc
-  int index = 0, ref;
+    CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight,
+    int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params,
+    uint16_t weight) {
+  if (!is_inter_block(candidate)) return;
   assert(weight % 2 == 0);
+  int index, ref;
 
   if (rf[1] == NONE_FRAME) {
     // single reference frame
     for (ref = 0; ref < 2; ++ref) {
       if (candidate->ref_frame[ref] == rf[0]) {
-        int_mv this_refmv;
-        if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype))
-          this_refmv = gm_mv_candidates[0];
-        else
-          this_refmv = get_sub_block_mv(candidate, ref, col);
-
-        for (index = 0; index < *refmv_count; ++index)
-          if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
-
-        if (index < *refmv_count) ref_mv_stack[index].weight += weight;
+        const int is_gm_block =
+            is_global_mv_block(candidate, gm_params[rf[0]].wmtype);
+        const int_mv this_refmv =
+            is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref);
+        for (index = 0; index < *refmv_count; ++index) {
+          if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) {
+            ref_mv_weight[index] += weight;
+            break;
+          }
+        }
 
         // Add a new item to the list.
         if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
           ref_mv_stack[index].this_mv = this_refmv;
-          ref_mv_stack[index].weight = weight;
+          ref_mv_weight[index] = weight;
           ++(*refmv_count);
         }
         if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
@@ -114,21 +115,22 @@
         if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
           this_refmv[ref] = gm_mv_candidates[ref];
         else
-          this_refmv[ref] = get_sub_block_mv(candidate, ref, col);
+          this_refmv[ref] = get_block_mv(candidate, ref);
       }
 
-      for (index = 0; index < *refmv_count; ++index)
+      for (index = 0; index < *refmv_count; ++index) {
         if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
-            (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int))
+            (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) {
+          ref_mv_weight[index] += weight;
           break;
-
-      if (index < *refmv_count) ref_mv_stack[index].weight += weight;
+        }
+      }
 
       // Add a new item to the list.
       if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
         ref_mv_stack[index].this_mv = this_refmv[0];
         ref_mv_stack[index].comp_mv = this_refmv[1];
-        ref_mv_stack[index].weight = weight;
+        ref_mv_weight[index] = weight;
         ++(*refmv_count);
       }
       if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
@@ -137,42 +139,39 @@
   }
 }
 
-static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                          int mi_row, int mi_col,
-                          const MV_REFERENCE_FRAME rf[2], int row_offset,
-                          CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
-                          uint8_t *ref_match_count, uint8_t *newmv_count,
-                          int_mv *gm_mv_candidates, int max_row_offset,
-                          int *processed_rows) {
-  int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col);
+static AOM_INLINE void scan_row_mbmi(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col,
+    const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack,
+    uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count,
+    uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset,
+    int *processed_rows) {
+  int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
   end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
-  const int n8_w_8 = mi_size_wide[BLOCK_8X8];
-  const int n8_w_16 = mi_size_wide[BLOCK_16X16];
-  int i;
+  const int width_8x8 = mi_size_wide[BLOCK_8X8];
+  const int width_16x16 = mi_size_wide[BLOCK_16X16];
   int col_offset = 0;
   // TODO(jingning): Revisit this part after cb4x4 is stable.
   if (abs(row_offset) > 1) {
     col_offset = 1;
-    if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset;
+    if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset;
   }
-  const int use_step_16 = (xd->n4_w >= 16);
+  const int use_step_16 = (xd->width >= 16);
   MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
-  (void)mi_row;
 
-  for (i = 0; i < end_mi;) {
+  for (int i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
     const int candidate_bsize = candidate->sb_type;
     const int n4_w = mi_size_wide[candidate_bsize];
-    int len = AOMMIN(xd->n4_w, n4_w);
+    int len = AOMMIN(xd->width, n4_w);
     if (use_step_16)
-      len = AOMMAX(n8_w_16, len);
+      len = AOMMAX(width_16x16, len);
     else if (abs(row_offset) > 1)
-      len = AOMMAX(len, n8_w_8);
+      len = AOMMAX(len, width_8x8);
 
-    int weight = 2;
-    if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) {
-      int inc = AOMMIN(-max_row_offset + row_offset + 1,
-                       mi_size_high[candidate_bsize]);
+    uint16_t weight = 2;
+    if (xd->width >= width_8x8 && xd->width <= n4_w) {
+      uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1,
+                            mi_size_high[candidate_bsize]);
       // Obtain range used in weight calculation.
       weight = AOMMAX(weight, inc);
       // Update processed rows.
@@ -180,21 +179,20 @@
     }
 
     add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
-                         newmv_count, ref_mv_stack, gm_mv_candidates,
-                         cm->global_motion, col_offset + i, len * weight);
+                         newmv_count, ref_mv_stack, ref_mv_weight,
+                         gm_mv_candidates, cm->global_motion, len * weight);
 
     i += len;
   }
 }
 
-static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                          int mi_row, int mi_col,
-                          const MV_REFERENCE_FRAME rf[2], int col_offset,
-                          CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
-                          uint8_t *ref_match_count, uint8_t *newmv_count,
-                          int_mv *gm_mv_candidates, int max_col_offset,
-                          int *processed_cols) {
-  int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row);
+static AOM_INLINE void scan_col_mbmi(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row,
+    const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack,
+    uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count,
+    uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset,
+    int *processed_cols) {
+  int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
   end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
   const int n8_h_8 = mi_size_high[BLOCK_8X8];
   const int n8_h_16 = mi_size_high[BLOCK_16X16];
@@ -202,24 +200,23 @@
   int row_offset = 0;
   if (abs(col_offset) > 1) {
     row_offset = 1;
-    if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset;
+    if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset;
   }
-  const int use_step_16 = (xd->n4_h >= 16);
-  (void)mi_col;
+  const int use_step_16 = (xd->height >= 16);
 
   for (i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate =
         xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
     const int candidate_bsize = candidate->sb_type;
     const int n4_h = mi_size_high[candidate_bsize];
-    int len = AOMMIN(xd->n4_h, n4_h);
+    int len = AOMMIN(xd->height, n4_h);
     if (use_step_16)
       len = AOMMAX(n8_h_16, len);
     else if (abs(col_offset) > 1)
       len = AOMMAX(len, n8_h_8);
 
     int weight = 2;
-    if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) {
+    if (xd->height >= n8_h_8 && xd->height <= n4_h) {
       int inc = AOMMIN(-max_col_offset + col_offset + 1,
                        mi_size_wide[candidate_bsize]);
       // Obtain range used in weight calculation.
@@ -229,20 +226,19 @@
     }
 
     add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
-                         newmv_count, ref_mv_stack, gm_mv_candidates,
-                         cm->global_motion, col_offset, len * weight);
+                         newmv_count, ref_mv_stack, ref_mv_weight,
+                         gm_mv_candidates, cm->global_motion, len * weight);
 
     i += len;
   }
 }
 
-static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                          const int mi_row, const int mi_col,
-                          const MV_REFERENCE_FRAME rf[2], int row_offset,
-                          int col_offset, CANDIDATE_MV *ref_mv_stack,
-                          uint8_t *ref_match_count, uint8_t *newmv_count,
-                          int_mv *gm_mv_candidates,
-                          uint8_t refmv_count[MODE_CTX_REF_FRAMES]) {
+static AOM_INLINE void scan_blk_mbmi(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row,
+    const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset,
+    int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight,
+    uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates,
+    uint8_t *refmv_count) {
   const TileInfo *const tile = &xd->tile;
   POSITION mi_pos;
 
@@ -255,8 +251,8 @@
     const int len = mi_size_wide[BLOCK_8X8];
 
     add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
-                         newmv_count, ref_mv_stack, gm_mv_candidates,
-                         cm->global_motion, mi_pos.col, 2 * len);
+                         newmv_count, ref_mv_stack, ref_mv_weight,
+                         gm_mv_candidates, cm->global_motion, 2 * len);
   }  // Analyze a single 8x8 block motion information.
 }
 
@@ -291,19 +287,19 @@
 
   // The left hand of two vertical rectangles always has a top right (as the
   // block above will have been decoded)
-  if (xd->n4_w < xd->n4_h)
+  if (xd->width < xd->height)
     if (!xd->is_sec_rect) has_tr = 1;
 
   // The bottom of two horizontal rectangles never has a top right (as the block
   // to the right won't have been decoded)
-  if (xd->n4_w > xd->n4_h)
+  if (xd->width > xd->height)
     if (xd->is_sec_rect) has_tr = 0;
 
   // The bottom left square of a Vertical A (in the old format) does
   // not have a top right as it is decoded before the right hand
   // rectangle of the partition
   if (xd->mi[0]->partition == PARTITION_VERT_A) {
-    if (xd->n4_w == xd->n4_h)
+    if (xd->width == xd->height)
       if (mask_row & bs) has_tr = 0;
   }
 
@@ -326,115 +322,98 @@
 static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                           int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
                           int blk_row, int blk_col, int_mv *gm_mv_candidates,
-                          uint8_t refmv_count[MODE_CTX_REF_FRAMES],
-                          CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE],
+                          uint8_t *const refmv_count,
+                          CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+                          uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
                           int16_t *mode_context) {
   POSITION mi_pos;
-  int idx;
-  const int weight_unit = 1;  // mi_size_wide[BLOCK_8X8];
-
   mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
   mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
 
   if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
 
   const TPL_MV_REF *prev_frame_mvs =
-      cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
+      cm->tpl_mvs +
+      ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) +
       ((mi_col + mi_pos.col) >> 1);
+  if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0;
 
   MV_REFERENCE_FRAME rf[2];
   av1_set_ref_frame(rf, ref_frame);
 
+  const uint16_t weight_unit = 1;  // mi_size_wide[BLOCK_8X8];
+  const int cur_frame_index = cm->cur_frame->order_hint;
+  const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
+  const int frame0_index = buf_0->order_hint;
+  const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
+                                             cur_frame_index, frame0_index);
+  int idx;
+  const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
+  const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
+
+  int_mv this_refmv;
+  get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+                    cur_offset_0, prev_frame_mvs->ref_frame_offset);
+  lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv,
+                     force_integer_mv);
+
   if (rf[1] == NONE_FRAME) {
-    int cur_frame_index = cm->cur_frame->order_hint;
-    const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
-    int frame0_index = buf_0->order_hint;
-    int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
-                                         cur_frame_index, frame0_index);
-    CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]];
+    if (blk_row == 0 && blk_col == 0) {
+      if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+          abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
+        mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+    }
 
-    if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
-      int_mv this_refmv;
+    for (idx = 0; idx < *refmv_count; ++idx)
+      if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
 
-      get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
-                        cur_offset_0, prev_frame_mvs->ref_frame_offset);
-      lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_force_integer_mv);
+    if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
 
-      if (blk_row == 0 && blk_col == 0)
-        if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
-            abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
-          mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
-
-      for (idx = 0; idx < refmv_count[rf[0]]; ++idx)
-        if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
-
-      if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit;
-
-      if (idx == refmv_count[rf[0]] &&
-          refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) {
-        ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
-        ref_mv_stack[idx].weight = 2 * weight_unit;
-        ++(refmv_count[rf[0]]);
-      }
-      return 1;
+    if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+      ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+      ref_mv_weight[idx] = 2 * weight_unit;
+      ++(*refmv_count);
     }
   } else {
     // Process compound inter mode
-    int cur_frame_index = cm->cur_frame->order_hint;
-    const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
-    int frame0_index = buf_0->order_hint;
-
-    int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
-                                         cur_frame_index, frame0_index);
     const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
-    int frame1_index = buf_1->order_hint;
-    int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info,
-                                         cur_frame_index, frame1_index);
-    CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame];
+    const int frame1_index = buf_1->order_hint;
+    const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info,
+                                               cur_frame_index, frame1_index);
+    int_mv comp_refmv;
+    get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+                      cur_offset_1, prev_frame_mvs->ref_frame_offset);
+    lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv,
+                       force_integer_mv);
 
-    if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
-      int_mv this_refmv;
-      int_mv comp_refmv;
-      get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
-                        cur_offset_0, prev_frame_mvs->ref_frame_offset);
-      get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
-                        cur_offset_1, prev_frame_mvs->ref_frame_offset);
+    if (blk_row == 0 && blk_col == 0) {
+      if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+          abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
+          abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
+          abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
+        mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+    }
 
-      lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_force_integer_mv);
-      lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_force_integer_mv);
+    for (idx = 0; idx < *refmv_count; ++idx) {
+      if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
+          comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
+        break;
+    }
 
-      if (blk_row == 0 && blk_col == 0)
-        if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
-            abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
-            abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
-            abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
-          mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+    if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
 
-      for (idx = 0; idx < refmv_count[ref_frame]; ++idx)
-        if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
-            comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
-          break;
-
-      if (idx < refmv_count[ref_frame])
-        ref_mv_stack[idx].weight += 2 * weight_unit;
-
-      if (idx == refmv_count[ref_frame] &&
-          refmv_count[ref_frame] < MAX_REF_MV_STACK_SIZE) {
-        ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
-        ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
-        ref_mv_stack[idx].weight = 2 * weight_unit;
-        ++(refmv_count[ref_frame]);
-      }
-      return 1;
+    if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+      ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+      ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
+      ref_mv_weight[idx] = 2 * weight_unit;
+      ++(*refmv_count);
     }
   }
-  return 0;
+
+  return 1;
 }
 
-static void process_compound_ref_mv_candidate(
+static AOM_INLINE void process_compound_ref_mv_candidate(
     const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
     const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
     int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
@@ -459,10 +438,11 @@
   }
 }
 
-static void process_single_ref_mv_candidate(
+static AOM_INLINE void process_single_ref_mv_candidate(
     const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
-    MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES],
-    CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) {
+    MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count,
+    CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+    uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) {
   for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
     if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
       int_mv this_mv = candidate->mv[rf_idx];
@@ -472,49 +452,50 @@
         this_mv.as_mv.col = -this_mv.as_mv.col;
       }
       int stack_idx;
-      for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
-        const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+      for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) {
+        const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv;
         if (this_mv.as_int == stack_mv.as_int) break;
       }
 
-      if (stack_idx == refmv_count[ref_frame]) {
-        ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+      if (stack_idx == *refmv_count) {
+        ref_mv_stack[stack_idx].this_mv = this_mv;
 
         // TODO(jingning): Set an arbitrary small number here. The weight
         // doesn't matter as long as it is properly initialized.
-        ref_mv_stack[ref_frame][stack_idx].weight = 2;
-        ++refmv_count[ref_frame];
+        ref_mv_weight[stack_idx] = 2;
+        ++(*refmv_count);
       }
     }
   }
 }
 
-static void setup_ref_mv_list(
+static AOM_INLINE void setup_ref_mv_list(
     const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
-    uint8_t refmv_count[MODE_CTX_REF_FRAMES],
-    CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
-    int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
+    uint8_t *const refmv_count,
+    CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+    uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
+    int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
     int mi_row, int mi_col, int16_t *mode_context) {
-  const int bs = AOMMAX(xd->n4_w, xd->n4_h);
+  const int bs = AOMMAX(xd->width, xd->height);
   const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
   MV_REFERENCE_FRAME rf[2];
 
   const TileInfo *const tile = &xd->tile;
   int max_row_offset = 0, max_col_offset = 0;
-  const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
-  const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+  const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+  const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
   int processed_rows = 0;
   int processed_cols = 0;
 
   av1_set_ref_frame(rf, ref_frame);
   mode_context[ref_frame] = 0;
-  refmv_count[ref_frame] = 0;
+  *refmv_count = 0;
 
   // Find valid maximum row/col offset.
   if (xd->up_available) {
     max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
 
-    if (xd->n4_h < mi_size_high[BLOCK_8X8])
+    if (xd->height < mi_size_high[BLOCK_8X8])
       max_row_offset = -(2 << 1) + row_adj;
 
     max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
@@ -523,7 +504,7 @@
   if (xd->left_available) {
     max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
 
-    if (xd->n4_w < mi_size_wide[BLOCK_8X8])
+    if (xd->width < mi_size_wide[BLOCK_8X8])
       max_col_offset = -(2 << 1) + col_adj;
 
     max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
@@ -535,48 +516,48 @@
 
   // Scan the first above row mode info. row_offset = -1;
   if (abs(max_row_offset) >= 1)
-    scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
-                  &refmv_count[ref_frame], &row_match_count, &newmv_count,
-                  gm_mv_candidates, max_row_offset, &processed_rows);
+    scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight,
+                  refmv_count, &row_match_count, &newmv_count, gm_mv_candidates,
+                  max_row_offset, &processed_rows);
   // Scan the first left column mode info. col_offset = -1;
   if (abs(max_col_offset) >= 1)
-    scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
-                  &refmv_count[ref_frame], &col_match_count, &newmv_count,
-                  gm_mv_candidates, max_col_offset, &processed_cols);
+    scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight,
+                  refmv_count, &col_match_count, &newmv_count, gm_mv_candidates,
+                  max_col_offset, &processed_cols);
   // Check top-right boundary
   if (has_tr)
-    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w,
-                  ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
-                  gm_mv_candidates, &refmv_count[ref_frame]);
+    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack,
+                  ref_mv_weight, &row_match_count, &newmv_count,
+                  gm_mv_candidates, refmv_count);
 
   const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
-  const uint8_t nearest_refmv_count = refmv_count[ref_frame];
+  const uint8_t nearest_refmv_count = *refmv_count;
 
   // TODO(yunqing): for comp_search, do it for all 3 cases.
   for (int idx = 0; idx < nearest_refmv_count; ++idx)
-    ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL;
+    ref_mv_weight[idx] += REF_CAT_LEVEL;
 
-  if (cm->allow_ref_frame_mvs) {
+  if (cm->features.allow_ref_frame_mvs) {
     int is_available = 0;
-    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h);
-    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w);
-    const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]);
-    const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]);
+    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height);
+    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width);
+    const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]);
+    const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]);
 
     const int tpl_sample_pos[3][2] = {
       { voffset, -2 },
       { voffset, hoffset },
       { voffset - 2, hoffset },
     };
-    const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) &&
-                                (xd->n4_h < mi_size_high[BLOCK_64X64]) &&
-                                (xd->n4_w >= mi_size_wide[BLOCK_8X8]) &&
-                                (xd->n4_w < mi_size_wide[BLOCK_64X64]);
+    const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) &&
+                                (xd->height < mi_size_high[BLOCK_64X64]) &&
+                                (xd->width >= mi_size_wide[BLOCK_8X8]) &&
+                                (xd->width < mi_size_wide[BLOCK_64X64]);
 
-    const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64])
+    const int step_h = (xd->height >= mi_size_high[BLOCK_64X64])
                            ? mi_size_high[BLOCK_16X16]
                            : mi_size_high[BLOCK_8X8];
-    const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64])
+    const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64])
                            ? mi_size_wide[BLOCK_16X16]
                            : mi_size_wide[BLOCK_8X8];
 
@@ -584,7 +565,7 @@
       for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
         int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
                                  blk_col, gm_mv_candidates, refmv_count,
-                                 ref_mv_stack, mode_context);
+                                 ref_mv_stack, ref_mv_weight, mode_context);
         if (blk_row == 0 && blk_col == 0) is_available = ret;
       }
     }
@@ -597,16 +578,17 @@
 
       if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
       add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
-                     gm_mv_candidates, refmv_count, ref_mv_stack, mode_context);
+                     gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight,
+                     mode_context);
     }
   }
 
   uint8_t dummy_newmv_count = 0;
 
   // Scan the second outer area.
-  scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame],
+  scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight,
                 &row_match_count, &dummy_newmv_count, gm_mv_candidates,
-                &refmv_count[ref_frame]);
+                refmv_count);
 
   for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
     const int row_offset = -(idx << 1) + 1 + row_adj;
@@ -614,24 +596,21 @@
 
     if (abs(row_offset) <= abs(max_row_offset) &&
         abs(row_offset) > processed_rows)
-      scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset,
-                    ref_mv_stack[ref_frame], &refmv_count[ref_frame],
-                    &row_match_count, &dummy_newmv_count, gm_mv_candidates,
-                    max_row_offset, &processed_rows);
+      scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight,
+                    refmv_count, &row_match_count, &dummy_newmv_count,
+                    gm_mv_candidates, max_row_offset, &processed_rows);
 
     if (abs(col_offset) <= abs(max_col_offset) &&
         abs(col_offset) > processed_cols)
-      scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset,
-                    ref_mv_stack[ref_frame], &refmv_count[ref_frame],
-                    &col_match_count, &dummy_newmv_count, gm_mv_candidates,
-                    max_col_offset, &processed_cols);
+      scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight,
+                    refmv_count, &col_match_count, &dummy_newmv_count,
+                    gm_mv_candidates, max_col_offset, &processed_cols);
   }
 
   const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
 
   switch (nearest_match) {
     case 0:
-      mode_context[ref_frame] |= 0;
       if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
       if (ref_match_count == 1)
         mode_context[ref_frame] |= (1 << REFMV_OFFSET);
@@ -661,45 +640,48 @@
   while (len > 0) {
     int nr_len = 0;
     for (int idx = 1; idx < len; ++idx) {
-      if (ref_mv_stack[ref_frame][idx - 1].weight <
-          ref_mv_stack[ref_frame][idx].weight) {
-        CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
-        ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
-        ref_mv_stack[ref_frame][idx] = tmp_mv;
+      if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
+        const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
+        const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
+        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+        ref_mv_stack[idx] = tmp_mv;
+        ref_mv_weight[idx - 1] = ref_mv_weight[idx];
+        ref_mv_weight[idx] = tmp_ref_mv_weight;
         nr_len = idx;
       }
     }
     len = nr_len;
   }
 
-  len = refmv_count[ref_frame];
+  len = *refmv_count;
   while (len > nearest_refmv_count) {
     int nr_len = nearest_refmv_count;
     for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
-      if (ref_mv_stack[ref_frame][idx - 1].weight <
-          ref_mv_stack[ref_frame][idx].weight) {
-        CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
-        ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
-        ref_mv_stack[ref_frame][idx] = tmp_mv;
+      if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
+        const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
+        const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
+        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+        ref_mv_stack[idx] = tmp_mv;
+        ref_mv_weight[idx - 1] = ref_mv_weight[idx];
+        ref_mv_weight[idx] = tmp_ref_mv_weight;
         nr_len = idx;
       }
     }
     len = nr_len;
   }
 
+  int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width);
+  mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col);
+  int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height);
+  mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row);
+  const int mi_size = AOMMIN(mi_width, mi_height);
   if (rf[1] > NONE_FRAME) {
     // TODO(jingning, yunqing): Refactor and consolidate the compound and
     // single reference frame modes. Reduce unnecessary redundancy.
-    if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) {
+    if (*refmv_count < MAX_MV_REF_CANDIDATES) {
       int_mv ref_id[2][2], ref_diff[2][2];
       int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
 
-      int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
-      mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
-      int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
-      mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
-      int mi_size = AOMMIN(mi_width, mi_height);
-
       for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
         const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
         process_compound_ref_mv_candidate(
@@ -715,95 +697,82 @@
       }
 
       // Build up the compound mv predictor
-      int_mv comp_list[3][2];
+      int_mv comp_list[MAX_MV_REF_CANDIDATES][2];
 
       for (int idx = 0; idx < 2; ++idx) {
         int comp_idx = 0;
-        for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2;
+        for (int list_idx = 0;
+             list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
              ++list_idx, ++comp_idx)
           comp_list[comp_idx][idx] = ref_id[idx][list_idx];
-        for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2;
+        for (int list_idx = 0;
+             list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
              ++list_idx, ++comp_idx)
           comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
-        for (; comp_idx < 3; ++comp_idx)
+        for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx)
           comp_list[comp_idx][idx] = gm_mv_candidates[idx];
       }
 
-      if (refmv_count[ref_frame]) {
-        assert(refmv_count[ref_frame] == 1);
-        if (comp_list[0][0].as_int ==
-                ref_mv_stack[ref_frame][0].this_mv.as_int &&
-            comp_list[0][1].as_int ==
-                ref_mv_stack[ref_frame][0].comp_mv.as_int) {
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
-              comp_list[1][0];
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
-              comp_list[1][1];
+      if (*refmv_count) {
+        assert(*refmv_count == 1);
+        if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int &&
+            comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) {
+          ref_mv_stack[*refmv_count].this_mv = comp_list[1][0];
+          ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1];
         } else {
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
-              comp_list[0][0];
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
-              comp_list[0][1];
+          ref_mv_stack[*refmv_count].this_mv = comp_list[0][0];
+          ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1];
         }
-        ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
-        ++refmv_count[ref_frame];
+        ref_mv_weight[*refmv_count] = 2;
+        ++*refmv_count;
       } else {
         for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
-              comp_list[idx][0];
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
-              comp_list[idx][1];
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
-          ++refmv_count[ref_frame];
+          ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0];
+          ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1];
+          ref_mv_weight[*refmv_count] = 2;
+          ++*refmv_count;
         }
       }
     }
 
-    assert(refmv_count[ref_frame] >= 2);
+    assert(*refmv_count >= 2);
 
-    for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
-      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
-                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
-      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
-                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+    for (int idx = 0; idx < *refmv_count; ++idx) {
+      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
+                   xd->height << MI_SIZE_LOG2, xd);
+      clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2,
+                   xd->height << MI_SIZE_LOG2, xd);
     }
   } else {
     // Handle single reference frame extension
-    int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
-    mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
-    int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
-    mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
-    int mi_size = AOMMIN(mi_width, mi_height);
-
     for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
-                      refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+                      *refmv_count < MAX_MV_REF_CANDIDATES;) {
       const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
       process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
-                                      ref_mv_stack);
+                                      ref_mv_stack, ref_mv_weight);
       idx += mi_size_wide[candidate->sb_type];
     }
 
     for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
-                      refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+                      *refmv_count < MAX_MV_REF_CANDIDATES;) {
       const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
       process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
-                                      ref_mv_stack);
+                                      ref_mv_stack, ref_mv_weight);
       idx += mi_size_high[candidate->sb_type];
     }
 
-    for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
-      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
-                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+    for (int idx = 0; idx < *refmv_count; ++idx) {
+      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
+                   xd->height << MI_SIZE_LOG2, xd);
     }
 
     if (mv_ref_list != NULL) {
-      for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx)
-        mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int;
+      for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx)
+        mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int;
 
-      for (int idx = 0;
-           idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) {
-        mv_ref_list[rf[0]][idx].as_int =
-            ref_mv_stack[ref_frame][idx].this_mv.as_int;
+      for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count);
+           ++idx) {
+        mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
       }
     }
   }
@@ -813,43 +782,44 @@
                       MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
                       CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
                       int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
-                      int_mv *global_mvs, int mi_row, int mi_col,
-                      int16_t *mode_context) {
-  int_mv zeromv[2];
-  BLOCK_SIZE bsize = mi->sb_type;
-  MV_REFERENCE_FRAME rf[2];
-  av1_set_ref_frame(rf, ref_frame);
+                      int_mv *global_mvs, int16_t *mode_context) {
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  int_mv gm_mv[2];
 
-  if (global_mvs != NULL && ref_frame < REF_FRAMES) {
-    if (ref_frame != INTRA_FRAME) {
-      global_mvs[ref_frame] = gm_get_motion_vector(
-          &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize,
-          mi_col, mi_row, cm->cur_frame_force_integer_mv);
-    } else {
+  if (ref_frame == INTRA_FRAME) {
+    gm_mv[0].as_int = gm_mv[1].as_int = 0;
+    if (global_mvs != NULL) {
       global_mvs[ref_frame].as_int = INVALID_MV;
     }
+  } else {
+    const BLOCK_SIZE bsize = mi->sb_type;
+    const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
+    const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
+    if (ref_frame < REF_FRAMES) {
+      gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame],
+                                      allow_high_precision_mv, bsize, mi_col,
+                                      mi_row, force_integer_mv);
+      gm_mv[1].as_int = 0;
+      if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0];
+    } else {
+      MV_REFERENCE_FRAME rf[2];
+      av1_set_ref_frame(rf, ref_frame);
+      gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]],
+                                      allow_high_precision_mv, bsize, mi_col,
+                                      mi_row, force_integer_mv);
+      gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]],
+                                      allow_high_precision_mv, bsize, mi_col,
+                                      mi_row, force_integer_mv);
+    }
   }
 
-  if (ref_frame != INTRA_FRAME) {
-    zeromv[0].as_int =
-        gm_get_motion_vector(&cm->global_motion[rf[0]],
-                             cm->allow_high_precision_mv, bsize, mi_col, mi_row,
-                             cm->cur_frame_force_integer_mv)
-            .as_int;
-    zeromv[1].as_int =
-        (rf[1] != NONE_FRAME)
-            ? gm_get_motion_vector(&cm->global_motion[rf[1]],
-                                   cm->allow_high_precision_mv, bsize, mi_col,
-                                   mi_row, cm->cur_frame_force_integer_mv)
-                  .as_int
-            : 0;
-  } else {
-    zeromv[0].as_int = zeromv[1].as_int = 0;
-  }
-
-  setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list,
-                    zeromv, mi_row, mi_col, mode_context);
+  setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame],
+                    ref_mv_stack[ref_frame], ref_mv_weight[ref_frame],
+                    mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row,
+                    mi_col, mode_context);
 }
 
 void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
@@ -865,12 +835,16 @@
 
 void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
   cm->cur_frame->order_hint = cm->current_frame.order_hint;
+  cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint;
 
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
-    if (buf != NULL)
+    if (buf != NULL) {
       cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint;
+      cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] =
+          buf->display_order_hint;
+    }
   }
 }
 
@@ -910,8 +884,8 @@
   const int col =
       (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
 
-  if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
-      col >= (cm->mi_cols >> 1))
+  if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 ||
+      col >= (cm->mi_params.mi_cols >> 1))
     return 0;
 
   if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
@@ -945,8 +919,8 @@
       start_frame_buf->frame_type == INTRA_ONLY_FRAME)
     return 0;
 
-  if (start_frame_buf->mi_rows != cm->mi_rows ||
-      start_frame_buf->mi_cols != cm->mi_cols)
+  if (start_frame_buf->mi_rows != cm->mi_params.mi_rows ||
+      start_frame_buf->mi_cols != cm->mi_params.mi_cols)
     return 0;
 
   const int start_frame_order_hint = start_frame_buf->order_hint;
@@ -965,8 +939,8 @@
   if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
 
   MV_REF *mv_ref_base = start_frame_buf->mvs;
-  const int mvs_rows = (cm->mi_rows + 1) >> 1;
-  const int mvs_cols = (cm->mi_cols + 1) >> 1;
+  const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1;
+  const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1;
 
   for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
     for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
@@ -991,7 +965,7 @@
         }
 
         if (pos_valid) {
-          const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
+          const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c;
 
           tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
           tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
@@ -1011,7 +985,8 @@
   if (!order_hint_info->enable_order_hint) return;
 
   TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
-  int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+  int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) *
+             (cm->mi_params.mi_stride >> 1);
   for (int idx = 0; idx < size; ++idx) {
     tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
     tpl_mvs_base[idx].ref_frame_offset = 0;
@@ -1072,28 +1047,29 @@
   if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2);
 }
 
-static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref,
-                                  int row_offset, int sign_r, int col_offset,
-                                  int sign_c) {
+static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts,
+                                  int *pts_inref, int row_offset, int sign_r,
+                                  int col_offset, int sign_c) {
   int bw = block_size_wide[mbmi->sb_type];
   int bh = block_size_high[mbmi->sb_type];
   int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1;
   int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1;
 
-  pts[0] = (x * 8);
-  pts[1] = (y * 8);
-  pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col;
-  pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row;
+  pts[0] = GET_MV_SUBPEL(x);
+  pts[1] = GET_MV_SUBPEL(y);
+  pts_inref[0] = GET_MV_SUBPEL(x) + mbmi->mv[0].as_mv.col;
+  pts_inref[1] = GET_MV_SUBPEL(y) + mbmi->mv[0].as_mv.row;
 }
 
 // Select samples according to the motion vector difference.
-int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) {
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+                          BLOCK_SIZE bsize) {
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
   int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 };
   int i, j, k, l = len;
-  int ret = 0;
+  uint8_t ret = 0;
   assert(len <= LEAST_SQUARES_SAMPLES_MAX);
 
   // Obtain the motion vector difference.
@@ -1134,30 +1110,32 @@
 // Note: Samples returned are at 1/8-pel precision
 // Sample are the neighbor block center point's coordinates relative to the
 // left-top pixel of current block.
-int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
-                int *pts, int *pts_inref) {
-  MB_MODE_INFO *const mbmi0 = xd->mi[0];
-  int ref_frame = mbmi0->ref_frame[0];
-  int up_available = xd->up_available;
-  int left_available = xd->left_available;
-  int i, mi_step = 1, np = 0;
-
-  const TileInfo *const tile = &xd->tile;
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref) {
+  const MB_MODE_INFO *const mbmi0 = xd->mi[0];
+  const int ref_frame = mbmi0->ref_frame[0];
+  const int up_available = xd->up_available;
+  const int left_available = xd->left_available;
+  int i, mi_step;
+  uint8_t np = 0;
   int do_tl = 1;
   int do_tr = 1;
+  const int mi_stride = xd->mi_stride;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
 
   // scan the nearest above rows
   if (up_available) {
-    int mi_row_offset = -1;
-    MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
-    uint8_t n4_w = mi_size_wide[mbmi->sb_type];
+    const int mi_row_offset = -1;
+    const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
+    uint8_t superblock_width = mi_size_wide[mbmi->sb_type];
 
-    if (xd->n4_w <= n4_w) {
+    if (xd->width <= superblock_width) {
       // Handle "current block width <= above block width" case.
-      int col_offset = -mi_col % n4_w;
+      const int col_offset = -mi_col % superblock_width;
 
       if (col_offset < 0) do_tl = 0;
-      if (col_offset + n4_w > xd->n4_w) do_tr = 0;
+      if (col_offset + superblock_width > xd->width) do_tr = 0;
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
         record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
@@ -1168,11 +1146,11 @@
       }
     } else {
       // Handle "current block width > above block width" case.
-      for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) {
-        int mi_col_offset = i;
-        mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        n4_w = mi_size_wide[mbmi->sb_type];
-        mi_step = AOMMIN(xd->n4_w, n4_w);
+      for (i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+           i += mi_step) {
+        mbmi = xd->mi[i + mi_row_offset * mi_stride];
+        superblock_width = mi_size_wide[mbmi->sb_type];
+        mi_step = AOMMIN(xd->width, superblock_width);
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1189,14 +1167,13 @@
 
   // scan the nearest left columns
   if (left_available) {
-    int mi_col_offset = -1;
+    const int mi_col_offset = -1;
+    const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
+    uint8_t superblock_height = mi_size_high[mbmi->sb_type];
 
-    MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
-    uint8_t n4_h = mi_size_high[mbmi->sb_type];
-
-    if (xd->n4_h <= n4_h) {
+    if (xd->height <= superblock_height) {
       // Handle "current block height <= above block height" case.
-      int row_offset = -mi_row % n4_h;
+      const int row_offset = -mi_row % superblock_height;
 
       if (row_offset < 0) do_tl = 0;
 
@@ -1209,11 +1186,11 @@
       }
     } else {
       // Handle "current block height > above block height" case.
-      for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) {
-        int mi_row_offset = i;
-        mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        n4_h = mi_size_high[mbmi->sb_type];
-        mi_step = AOMMIN(xd->n4_h, n4_h);
+      for (i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
+           i += mi_step) {
+        mbmi = xd->mi[mi_col_offset + i * mi_stride];
+        superblock_height = mi_size_high[mbmi->sb_type];
+        mi_step = AOMMIN(xd->height, superblock_height);
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1230,10 +1207,9 @@
 
   // Top-left block
   if (do_tl && left_available && up_available) {
-    int mi_row_offset = -1;
-    int mi_col_offset = -1;
-
-    MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+    const int mi_row_offset = -1;
+    const int mi_col_offset = -1;
+    MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride];
 
     if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
       record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
@@ -1247,18 +1223,17 @@
 
   // Top-right block
   if (do_tr &&
-      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) {
-    POSITION trb_pos = { -1, xd->n4_w };
-
+      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) {
+    const POSITION trb_pos = { -1, xd->width };
+    const TileInfo *const tile = &xd->tile;
     if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
-      int mi_row_offset = -1;
-      int mi_col_offset = xd->n4_w;
-
-      MB_MODE_INFO *mbmi =
-          xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+      const int mi_row_offset = -1;
+      const int mi_col_offset = xd->width;
+      const MB_MODE_INFO *mbmi =
+          xd->mi[mi_col_offset + mi_row_offset * mi_stride];
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-        record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1);
+        record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1);
         np++;
         if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
@@ -1362,8 +1337,8 @@
   return info_a->map_idx - info_b->map_idx;
 }
 
-static void set_ref_frame_info(int *remapped_ref_idx, int frame_idx,
-                               REF_FRAME_INFO *ref_info) {
+static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx,
+                                          REF_FRAME_INFO *ref_info) {
   assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
 
   remapped_ref_idx[frame_idx] = ref_info->map_idx;
@@ -1520,7 +1495,8 @@
     fwd_end_idx--;
   }
 
-  // Assign all the remaining frame(s), if any, to the earliest reference frame.
+  // Assign all the remaining frame(s), if any, to the earliest reference
+  // frame.
   for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
     const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
     if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
diff --git a/libaom/av1/common/mvref_common.h b/libaom/av1/common/mvref_common.h
index 0aa9d38..05a0dbc 100644
--- a/libaom/av1/common/mvref_common.h
+++ b/libaom/av1/common/mvref_common.h
@@ -11,7 +11,7 @@
 #ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
 #define AOM_AV1_COMMON_MVREF_COMMON_H_
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 
 #ifdef __cplusplus
@@ -50,23 +50,16 @@
 }
 
 static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
-  clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
-           xd->mb_to_right_edge + bw * 8 + MV_BORDER,
-           xd->mb_to_top_edge - bh * 8 - MV_BORDER,
-           xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
+  const SubpelMvLimits mv_limits = {
+    xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER,
+    xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER,
+    xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER,
+    xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER
+  };
+  clamp_mv(mv, &mv_limits);
 }
 
-// This function returns either the appropriate sub block or block's mv
-// on whether the block_size < 8x8 and we have check_sub_blocks set.
-static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate,
-                                      int which_mv, int search_col) {
-  (void)search_col;
-  return candidate->mv[which_mv];
-}
-
-static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate,
-                                           int which_mv, int search_col) {
-  (void)search_col;
+static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) {
   return candidate->mv[which_mv];
 }
 
@@ -189,18 +182,17 @@
   return comp_ctx;
 }
 
-static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
-                                  int ref_idx) {
-  if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
-      ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL)
+static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) {
+  if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
+      ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL)
     return 0;
 
-  if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
-      ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+  if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
+      ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
     return 1;
 
-  if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL &&
-      ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+  if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL &&
+      ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
     return 2;
 
   return 0;
@@ -251,9 +243,9 @@
                       MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
                       CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
                       int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
-                      int_mv *global_mvs, int mi_row, int mi_col,
-                      int16_t *mode_context);
+                      int_mv *global_mvs, int16_t *mode_context);
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
@@ -261,25 +253,24 @@
 void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
                            int_mv *near_mv, int is_integer);
 
-int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize);
-int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
-                int *pts, int *pts_inref);
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+                          BLOCK_SIZE bsize);
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref);
 
 #define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
 #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
 
 static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
-                                   int mib_size, int mi_row, int mi_col) {
-  (void)mi_col;
+                                   int mib_size, int mi_row) {
   if (mi_row - mib_size < tile->mi_row_start) {
-    ref_dv->as_mv.row = 0;
-    ref_dv->as_mv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
+    ref_dv->as_fullmv.row = 0;
+    ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
   } else {
-    ref_dv->as_mv.row = -MI_SIZE * mib_size;
-    ref_dv->as_mv.col = 0;
+    ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
+    ref_dv->as_fullmv.col = 0;
   }
-  ref_dv->as_mv.row *= 8;
-  ref_dv->as_mv.col *= 8;
+  convert_fullmv_to_mv(ref_dv);
 }
 
 static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
@@ -311,15 +302,12 @@
 
   // Special case for sub 8x8 chroma cases, to prevent referring to chroma
   // pixels outside current tile.
-  for (int plane = 1; plane < av1_num_planes(cm); ++plane) {
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                            pd->subsampling_y)) {
-      if (bw < 8 && pd->subsampling_x)
-        if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
-      if (bh < 8 && pd->subsampling_y)
-        if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
-    }
+  if (xd->is_chroma_ref && av1_num_planes(cm) > 1) {
+    const struct macroblockd_plane *const pd = &xd->plane[1];
+    if (bw < 8 && pd->subsampling_x)
+      if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
+    if (bh < 8 && pd->subsampling_y)
+      if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
   }
 
   // Is the bottom right within an already coded SB? Also consider additional
diff --git a/libaom/av1/common/obmc.h b/libaom/av1/common/obmc.h
index 1c90cd9..cc97b6b 100644
--- a/libaom/av1/common/obmc.h
+++ b/libaom/av1/common/obmc.h
@@ -12,25 +12,24 @@
 #ifndef AOM_AV1_COMMON_OBMC_H_
 #define AOM_AV1_COMMON_OBMC_H_
 
-typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
-                                          uint8_t nb_mi_size,
-                                          MB_MODE_INFO *nb_mi, void *fun_ctxt,
-                                          const int num_planes);
+typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row,
+                                          int rel_mi_col, uint8_t op_mi_size,
+                                          int dir, MB_MODE_INFO *nb_mi,
+                                          void *fun_ctxt, const int num_planes);
 
 static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
-                                                 MACROBLOCKD *xd, int mi_col,
-                                                 int nb_max,
+                                                 MACROBLOCKD *xd, int nb_max,
                                                  overlappable_nb_visitor_t fun,
                                                  void *fun_ctxt) {
-  const int num_planes = av1_num_planes(cm);
   if (!xd->up_available) return;
 
+  const int num_planes = av1_num_planes(cm);
   int nb_count = 0;
-
+  const int mi_col = xd->mi_col;
   // prev_row_mi points into the mi array, starting at the beginning of the
   // previous row.
   MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
-  const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);
+  const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols);
   uint8_t mi_step;
   for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
        above_mi_col += mi_step) {
@@ -49,26 +48,25 @@
     }
     if (is_neighbor_overlappable(*above_mi)) {
       ++nb_count;
-      fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi,
-          fun_ctxt, num_planes);
+      fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0,
+          *above_mi, fun_ctxt, num_planes);
     }
   }
 }
 
 static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
-                                                MACROBLOCKD *xd, int mi_row,
-                                                int nb_max,
+                                                MACROBLOCKD *xd, int nb_max,
                                                 overlappable_nb_visitor_t fun,
                                                 void *fun_ctxt) {
-  const int num_planes = av1_num_planes(cm);
   if (!xd->left_available) return;
 
+  const int num_planes = av1_num_planes(cm);
   int nb_count = 0;
-
   // prev_col_mi points into the mi array, starting at the top of the
   // previous column
+  const int mi_row = xd->mi_row;
   MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
-  const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);
+  const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows);
   uint8_t mi_step;
   for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
        left_mi_row += mi_step) {
@@ -82,7 +80,7 @@
     }
     if (is_neighbor_overlappable(*left_mi)) {
       ++nb_count;
-      fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi,
+      fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi,
           fun_ctxt, num_planes);
     }
   }
diff --git a/libaom/av1/common/ppc/cfl_ppc.c b/libaom/av1/common/ppc/cfl_ppc.c
index 61d8dc1..6f88768 100644
--- a/libaom/av1/common/ppc/cfl_ppc.c
+++ b/libaom/av1/common/ppc/cfl_ppc.c
@@ -124,27 +124,27 @@
 
 // Based on observation, for small blocks VSX does not outperform C (no 64bit
 // load and store intrinsics). So we call the C code for block widths 4.
-cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) {
+cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
   static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
-    subtract_average_4x4_c,     /* 4x4 */
-    subtract_average_8x8_vsx,   /* 8x8 */
-    subtract_average_16x16_vsx, /* 16x16 */
-    subtract_average_32x32_vsx, /* 32x32 */
-    NULL,                       /* 64x64 (invalid CFL size) */
-    subtract_average_4x8_c,     /* 4x8 */
-    subtract_average_8x4_vsx,   /* 8x4 */
-    subtract_average_8x16_vsx,  /* 8x16 */
-    subtract_average_16x8_vsx,  /* 16x8 */
-    subtract_average_16x32_vsx, /* 16x32 */
-    subtract_average_32x16_vsx, /* 32x16 */
-    NULL,                       /* 32x64 (invalid CFL size) */
-    NULL,                       /* 64x32 (invalid CFL size) */
-    subtract_average_4x16_c,    /* 4x16 */
-    subtract_average_16x4_vsx,  /* 16x4 */
-    subtract_average_8x32_vsx,  /* 8x32 */
-    subtract_average_32x8_vsx,  /* 32x8 */
-    NULL,                       /* 16x64 (invalid CFL size) */
-    NULL,                       /* 64x16 (invalid CFL size) */
+    cfl_subtract_average_4x4_c,     /* 4x4 */
+    cfl_subtract_average_8x8_vsx,   /* 8x8 */
+    cfl_subtract_average_16x16_vsx, /* 16x16 */
+    cfl_subtract_average_32x32_vsx, /* 32x32 */
+    NULL,                           /* 64x64 (invalid CFL size) */
+    cfl_subtract_average_4x8_c,     /* 4x8 */
+    cfl_subtract_average_8x4_vsx,   /* 8x4 */
+    cfl_subtract_average_8x16_vsx,  /* 8x16 */
+    cfl_subtract_average_16x8_vsx,  /* 16x8 */
+    cfl_subtract_average_16x32_vsx, /* 16x32 */
+    cfl_subtract_average_32x16_vsx, /* 32x16 */
+    NULL,                           /* 32x64 (invalid CFL size) */
+    NULL,                           /* 64x32 (invalid CFL size) */
+    cfl_subtract_average_4x16_c,    /* 4x16 */
+    cfl_subtract_average_16x4_vsx,  /* 16x4 */
+    cfl_subtract_average_8x32_vsx,  /* 8x32 */
+    cfl_subtract_average_32x8_vsx,  /* 32x8 */
+    NULL,                           /* 16x64 (invalid CFL size) */
+    NULL,                           /* 64x16 (invalid CFL size) */
   };
   // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
   // index the function pointer array out of bounds.
diff --git a/libaom/av1/common/pred_common.h b/libaom/av1/common/pred_common.h
index d9b30a9..d1dab97 100644
--- a/libaom/av1/common/pred_common.h
+++ b/libaom/av1/common/pred_common.h
@@ -12,29 +12,31 @@
 #ifndef AOM_AV1_COMMON_PRED_COMMON_H_
 #define AOM_AV1_COMMON_PRED_COMMON_H_
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/mvref_common.h"
-#include "av1/common/onyxc_int.h"
 #include "aom_dsp/aom_dsp_common.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-static INLINE int get_segment_id(const AV1_COMMON *const cm,
+static INLINE int get_segment_id(const CommonModeInfoParams *const mi_params,
                                  const uint8_t *segment_ids, BLOCK_SIZE bsize,
                                  int mi_row, int mi_col) {
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
-  const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
-  const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
-  int x, y, segment_id = MAX_SEGMENTS;
+  const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+  int segment_id = MAX_SEGMENTS;
 
-  for (y = 0; y < ymis; ++y)
-    for (x = 0; x < xmis; ++x)
-      segment_id =
-          AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
+  for (int y = 0; y < ymis; ++y) {
+    for (int x = 0; x < xmis; ++x) {
+      segment_id = AOMMIN(segment_id,
+                          segment_ids[mi_offset + y * mi_params->mi_cols + x]);
+    }
+  }
 
   assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
   return segment_id;
@@ -42,22 +44,25 @@
 
 static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
                                            const MACROBLOCKD *const xd,
-                                           int mi_row, int mi_col,
                                            int *cdf_index) {
   int prev_ul = -1;  // top left segment_id
   int prev_l = -1;   // left segment_id
   int prev_u = -1;   // top segment_id
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const uint8_t *seg_map = cm->cur_frame->seg_map;
   if ((xd->up_available) && (xd->left_available)) {
-    prev_ul = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 1,
-                             mi_col - 1);
+    prev_ul =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 1);
   }
   if (xd->up_available) {
-    prev_u = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 1,
-                            mi_col - 0);
+    prev_u =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 0);
   }
   if (xd->left_available) {
-    prev_l = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 0,
-                            mi_col - 1);
+    prev_l =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, mi_col - 1);
   }
   // This property follows from the fact that get_segment_id() returns a
   // nonnegative value. This allows us to test for all edge cases with a simple
@@ -182,6 +187,7 @@
                           uint16_t *cache);
 
 static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
   return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8];
 }
 
@@ -202,6 +208,10 @@
   return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)];
 }
 
+static INLINE aom_cdf_prob *av1_get_skip_cdf(const MACROBLOCKD *xd) {
+  return xd->tile_ctx->skip_cdfs[av1_get_skip_context(xd)];
+}
+
 int av1_get_comp_reference_type_context(const MACROBLOCKD *xd);
 
 // == Uni-directional contexts ==
diff --git a/libaom/av1/common/quant_common.c b/libaom/av1/common/quant_common.c
index d4bdb98..e96d71a 100644
--- a/libaom/av1/common/quant_common.c
+++ b/libaom/av1/common/quant_common.c
@@ -9,14 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
 #include "av1/common/common.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/entropy.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/seg_common.h"
-#include "av1/common/blockd.h"
 
-static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = {
   4,    8,    8,    9,    10,  11,  12,  12,  13,  14,  15,   16,   17,   18,
   19,   19,   20,   21,   22,  23,  24,  25,  26,  26,  27,   28,   29,   30,
   31,   32,   32,   33,   34,  35,  36,  37,  38,  38,  39,   40,   41,   42,
@@ -38,7 +38,7 @@
   1184, 1232, 1282, 1336,
 };
 
-static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = {
   4,    9,    10,   13,   15,   17,   20,   22,   25,   28,   31,   34,   37,
   40,   43,   47,   50,   53,   57,   60,   64,   68,   71,   75,   78,   82,
   86,   90,   93,   97,   101,  105,  109,  113,  116,  120,  124,  128,  132,
@@ -61,7 +61,7 @@
   3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
 };
 
-static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = {
   4,     12,    18,    25,    33,    41,    50,    60,    70,    80,    91,
   103,   115,   127,   140,   153,   166,   180,   194,   208,   222,   237,
   251,   266,   281,   296,   312,   327,   343,   358,   374,   390,   405,
@@ -88,7 +88,7 @@
   19718, 20521, 21387,
 };
 
-static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = {
   4,    8,    9,    10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
   20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
   33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,   45,
@@ -111,7 +111,7 @@
   1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
 };
 
-static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = {
   4,    9,    11,   13,   16,   18,   21,   24,   27,   30,   33,   37,   40,
   44,   48,   51,   55,   59,   63,   67,   71,   75,   79,   83,   88,   92,
   96,   100,  105,  109,  114,  118,  122,  127,  131,  136,  140,  145,  149,
@@ -134,7 +134,7 @@
   6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
 };
 
-static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = {
   4,     13,    19,    27,    35,    44,    54,    64,    75,    87,    99,
   112,   126,   139,   154,   168,   183,   199,   214,   230,   247,   263,
   280,   297,   314,   331,   349,   366,   384,   402,   420,   438,   456,
@@ -190,39 +190,28 @@
 // addition, the minimum allowable quantizer is 4; smaller values will
 // underflow to 0 in the actual quantization routines.
 
-int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
-  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
-  switch (bit_depth) {
-    case AOM_BITS_8: return dc_qlookup_Q3[q_clamped];
-    case AOM_BITS_10: return dc_qlookup_10_Q3[q_clamped];
-    case AOM_BITS_12: return dc_qlookup_12_Q3[q_clamped];
-    default:
-      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
-      return -1;
-  }
-}
-
-int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
-  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
-  switch (bit_depth) {
-    case AOM_BITS_8: return ac_qlookup_Q3[q_clamped];
-    case AOM_BITS_10: return ac_qlookup_10_Q3[q_clamped];
-    case AOM_BITS_12: return ac_qlookup_12_Q3[q_clamped];
-    default:
-      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
-      return -1;
-  }
-}
-
-// In AV1 TX, the coefficients are always scaled up a factor of 8 (3
-// bits), so QTX == Q3.
-
 int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
-  return av1_dc_quant_Q3(qindex, delta, bit_depth);
+  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
+  switch (bit_depth) {
+    case AOM_BITS_8: return dc_qlookup_QTX[q_clamped];
+    case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped];
+    case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped];
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
 }
 
 int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
-  return av1_ac_quant_Q3(qindex, delta, bit_depth);
+  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
+  switch (bit_depth) {
+    case AOM_BITS_8: return ac_qlookup_QTX[q_clamped];
+    case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped];
+    case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped];
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
 }
 
 int av1_get_qindex(const struct segmentation *seg, int segment_id,
@@ -236,13 +225,53 @@
   }
 }
 
-const qm_val_t *av1_iqmatrix(AV1_COMMON *cm, int qmlevel, int plane,
-                             TX_SIZE tx_size) {
-  return &cm->giqmatrix[qmlevel][plane][tx_size][0];
+bool av1_use_qmatrix(const CommonQuantParams *quant_params,
+                     const struct macroblockd *xd, int segment_id) {
+  // True if explicit Q matrix levels and this is not a lossless segment.
+  return quant_params->using_qmatrix && !xd->lossless[segment_id];
 }
-const qm_val_t *av1_qmatrix(AV1_COMMON *cm, int qmlevel, int plane,
-                            TX_SIZE tx_size) {
-  return &cm->gqmatrix[qmlevel][plane][tx_size][0];
+
+const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel,
+                             int plane, TX_SIZE tx_size) {
+  assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL ||
+         qmlevel == NUM_QM_LEVELS - 1);
+  return quant_params->giqmatrix[qmlevel][plane][tx_size];
+}
+const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel,
+                            int plane, TX_SIZE tx_size) {
+  assert(quant_params->gqmatrix[qmlevel][plane][tx_size] != NULL ||
+         qmlevel == NUM_QM_LEVELS - 1);
+  return quant_params->gqmatrix[qmlevel][plane][tx_size];
+}
+
+// Returns true if the tx_type corresponds to non-identity transform in both
+// horizontal and vertical directions.
+static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); }
+
+const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params,
+                                 const MACROBLOCKD *xd, int plane,
+                                 TX_SIZE tx_size, TX_TYPE tx_type) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int seg_id = mbmi->segment_id;
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
+  return is_2d_transform(tx_type)
+             ? pd->seg_iqmatrix[seg_id][qm_tx_size]
+             : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+}
+
+const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params,
+                                const MACROBLOCKD *xd, int plane,
+                                TX_SIZE tx_size, TX_TYPE tx_type) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int seg_id = mbmi->segment_id;
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
+  return is_2d_transform(tx_type)
+             ? pd->seg_qmatrix[seg_id][qm_tx_size]
+             : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
 }
 
 #define QM_TOTAL_SIZE 3344
@@ -251,26 +280,27 @@
 static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE];
 static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE];
 
-void av1_qm_init(AV1_COMMON *cm) {
-  const int num_planes = av1_num_planes(cm);
-  int q, c, t;
-  int current;
-  for (q = 0; q < NUM_QM_LEVELS; ++q) {
-    for (c = 0; c < num_planes; ++c) {
-      current = 0;
-      for (t = 0; t < TX_SIZES_ALL; ++t) {
+void av1_qm_init(CommonQuantParams *quant_params, int num_planes) {
+  for (int q = 0; q < NUM_QM_LEVELS; ++q) {
+    for (int c = 0; c < num_planes; ++c) {
+      int current = 0;
+      for (int t = 0; t < TX_SIZES_ALL; ++t) {
         const int size = tx_size_2d[t];
         const int qm_tx_size = av1_get_adjusted_tx_size(t);
         if (q == NUM_QM_LEVELS - 1) {
-          cm->gqmatrix[q][c][t] = NULL;
-          cm->giqmatrix[q][c][t] = NULL;
+          quant_params->gqmatrix[q][c][t] = NULL;
+          quant_params->giqmatrix[q][c][t] = NULL;
         } else if (t != qm_tx_size) {  // Reuse matrices for 'qm_tx_size'
-          cm->gqmatrix[q][c][t] = cm->gqmatrix[q][c][qm_tx_size];
-          cm->giqmatrix[q][c][t] = cm->giqmatrix[q][c][qm_tx_size];
+          assert(t > qm_tx_size);
+          quant_params->gqmatrix[q][c][t] =
+              quant_params->gqmatrix[q][c][qm_tx_size];
+          quant_params->giqmatrix[q][c][t] =
+              quant_params->giqmatrix[q][c][qm_tx_size];
         } else {
           assert(current + size <= QM_TOTAL_SIZE);
-          cm->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
-          cm->giqmatrix[q][c][t] = &iwt_matrix_ref[q][c >= 1][current];
+          quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
+          quant_params->giqmatrix[q][c][t] =
+              &iwt_matrix_ref[q][c >= 1][current];
           current += size;
         }
       }
diff --git a/libaom/av1/common/quant_common.h b/libaom/av1/common/quant_common.h
index d1f52a6..9c30204 100644
--- a/libaom/av1/common/quant_common.h
+++ b/libaom/av1/common/quant_common.h
@@ -12,6 +12,7 @@
 #ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
 #define AOM_AV1_COMMON_QUANT_COMMON_H_
 
+#include <stdbool.h>
 #include "aom/aom_codec.h"
 #include "av1/common/seg_common.h"
 #include "av1/common/enums.h"
@@ -37,24 +38,43 @@
 #define DEFAULT_QM_LAST 9
 
 struct AV1Common;
+struct CommonQuantParams;
+struct macroblockd;
 
-int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
 int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
 int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
 
 int av1_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex);
+
+// Returns true if we are using quantization matrix.
+bool av1_use_qmatrix(const struct CommonQuantParams *quant_params,
+                     const struct macroblockd *xd, int segment_id);
+
 // Reduce the large number of quantizers to a smaller number of levels for which
 // different matrices may be defined
 static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
   return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
 }
-void av1_qm_init(struct AV1Common *cm);
-const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp,
-                             TX_SIZE tx_size);
-const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
-                            TX_SIZE tx_size);
+
+// Initialize all global quant/dequant matrices.
+void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes);
+
+// Get global dequant matrix.
+const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params,
+                             int qmlevel, int plane, TX_SIZE tx_size);
+// Get global quant matrix.
+const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params,
+                            int qmlevel, int plane, TX_SIZE tx_size);
+
+// Get either local / global dequant matrix as appropriate.
+const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params,
+                                 const struct macroblockd *xd, int plane,
+                                 TX_SIZE tx_size, TX_TYPE tx_type);
+// Get either local / global quant matrix as appropriate.
+const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params,
+                                const struct macroblockd *xd, int plane,
+                                TX_SIZE tx_size, TX_TYPE tx_type);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/common/reconinter.c b/libaom/av1/common/reconinter.c
index ea351cf..287addd 100644
--- a/libaom/av1/common/reconinter.c
+++ b/libaom/av1/common/reconinter.c
@@ -20,15 +20,12 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/obmc.h"
-
-#define USE_PRECOMPUTED_WEDGE_MASK 1
-#define USE_PRECOMPUTED_WEDGE_SIGN 1
 
 // This function will determine whether or not to create a warped
 // prediction.
@@ -59,46 +56,114 @@
   return 0;
 }
 
-void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                              int dst_stride, const SubpelParams *subpel_params,
-                              const struct scale_factors *sf, int w, int h,
-                              ConvolveParams *conv_params,
-                              InterpFilters interp_filters,
-                              const WarpTypesAllowed *warp_types, int p_col,
-                              int p_row, int plane, int ref,
-                              const MB_MODE_INFO *mi, int build_for_obmc,
-                              const MACROBLOCKD *xd, int can_use_previous) {
-  // Make sure the selected motion mode is valid for this configuration
-  assert_motion_mode_valid(mi->motion_mode, xd->global_motion, xd, mi,
-                           can_use_previous);
-  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
+                           int block_height, int pix_row, int pix_col,
+                           int subsampling_x, int subsampling_y, int bit_depth,
+                           int use_hbd_buf, int is_intrabc,
+                           const struct scale_factors *sf,
+                           const struct buf_2d *ref_buf,
+                           int_interpfilters interp_filters) {
+  inter_pred_params->block_width = block_width;
+  inter_pred_params->block_height = block_height;
+  inter_pred_params->pix_row = pix_row;
+  inter_pred_params->pix_col = pix_col;
+  inter_pred_params->subsampling_x = subsampling_x;
+  inter_pred_params->subsampling_y = subsampling_y;
+  inter_pred_params->bit_depth = bit_depth;
+  inter_pred_params->use_hbd_buf = use_hbd_buf;
+  inter_pred_params->is_intrabc = is_intrabc;
+  inter_pred_params->scale_factors = sf;
+  inter_pred_params->ref_frame_buf = *ref_buf;
+  inter_pred_params->mode = TRANSLATION_PRED;
+  inter_pred_params->comp_mode = UNIFORM_SINGLE;
 
-  WarpedMotionParams final_warp_params;
-  const int do_warp =
-      (w >= 8 && h >= 8 &&
-       av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]],
-                      build_for_obmc, sf, &final_warp_params));
-  const int is_intrabc = mi->use_intrabc;
-  assert(IMPLIES(is_intrabc, !do_warp));
-
-  if (do_warp && xd->cur_frame_force_integer_mv == 0) {
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const struct buf_2d *const pre_buf = &pd->pre[ref];
-    av1_warp_plane(&final_warp_params, is_cur_buf_hbd(xd), xd->bd,
-                   pre_buf->buf0, pre_buf->width, pre_buf->height,
-                   pre_buf->stride, dst, p_col, p_row, w, h, dst_stride,
-                   pd->subsampling_x, pd->subsampling_y, conv_params);
-  } else if (is_cur_buf_hbd(xd)) {
-    highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
-                           w, h, conv_params, interp_filters, is_intrabc,
-                           xd->bd);
+  if (is_intrabc) {
+    inter_pred_params->interp_filter_params[0] = &av1_intrabc_filter_params;
+    inter_pred_params->interp_filter_params[1] = &av1_intrabc_filter_params;
   } else {
-    inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
-                    conv_params, interp_filters, is_intrabc);
+    inter_pred_params->interp_filter_params[0] =
+        av1_get_interp_filter_params_with_block_size(
+            interp_filters.as_filters.x_filter, block_width);
+    inter_pred_params->interp_filter_params[1] =
+        av1_get_interp_filter_params_with_block_size(
+            interp_filters.as_filters.y_filter, block_height);
   }
 }
 
-#if USE_PRECOMPUTED_WEDGE_MASK
+void av1_init_comp_mode(InterPredParams *inter_pred_params) {
+  inter_pred_params->comp_mode = UNIFORM_COMP;
+}
+
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+                          const WarpTypesAllowed *warp_types, int ref,
+                          const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
+  if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8)
+    return;
+
+  if (xd->cur_frame_force_integer_mv) return;
+
+  if (av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
+                     inter_pred_params->scale_factors,
+                     &inter_pred_params->warp_params))
+    inter_pred_params->mode = WARP_PRED;
+}
+
+void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
+                        const INTERINTER_COMPOUND_DATA *mask_comp) {
+  inter_pred_params->sb_type = bsize;
+  inter_pred_params->mask_comp = *mask_comp;
+
+  if (inter_pred_params->conv_params.compound_index == 1) {
+    inter_pred_params->conv_params.do_average = 0;
+    inter_pred_params->comp_mode = MASK_COMP;
+  }
+}
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride,
+                              InterPredParams *inter_pred_params,
+                              const SubpelParams *subpel_params) {
+  assert(IMPLIES(inter_pred_params->conv_params.is_compound,
+                 inter_pred_params->conv_params.dst != NULL));
+
+  // TODO(jingning): av1_warp_plane() can be further cleaned up.
+  if (inter_pred_params->mode == WARP_PRED) {
+    av1_warp_plane(
+        &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
+        inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
+        inter_pred_params->ref_frame_buf.width,
+        inter_pred_params->ref_frame_buf.height,
+        inter_pred_params->ref_frame_buf.stride, dst,
+        inter_pred_params->pix_col, inter_pred_params->pix_row,
+        inter_pred_params->block_width, inter_pred_params->block_height,
+        dst_stride, inter_pred_params->subsampling_x,
+        inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
+  } else if (inter_pred_params->mode == TRANSLATION_PRED) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (inter_pred_params->use_hbd_buf) {
+      highbd_inter_predictor(
+          src, src_stride, dst, dst_stride, subpel_params,
+          inter_pred_params->scale_factors, inter_pred_params->block_width,
+          inter_pred_params->block_height, &inter_pred_params->conv_params,
+          inter_pred_params->interp_filter_params,
+          inter_pred_params->bit_depth);
+    } else {
+      inter_predictor(
+          src, src_stride, dst, dst_stride, subpel_params,
+          inter_pred_params->scale_factors, inter_pred_params->block_width,
+          inter_pred_params->block_height, &inter_pred_params->conv_params,
+          inter_pred_params->interp_filter_params);
+    }
+#else
+    inter_predictor(
+        src, src_stride, dst, dst_stride, subpel_params,
+        inter_pred_params->scale_factors, inter_pred_params->block_width,
+        inter_pred_params->block_height, &inter_pred_params->conv_params,
+        inter_pred_params->interp_filter_params);
+#endif
+  }
+}
+
 static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  6,  18,
@@ -118,7 +183,8 @@
   64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
 };
 
-static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
+static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift,
+                                  int width) {
   if (shift >= 0) {
     memcpy(dst + shift, src, width - shift);
     memset(dst, src[0], shift);
@@ -128,9 +194,7 @@
     memset(dst + width - shift, src[width - 1], shift);
   }
 }
-#endif  // USE_PRECOMPUTED_WEDGE_MASK
 
-#if USE_PRECOMPUTED_WEDGE_SIGN
 /* clang-format off */
 DECLARE_ALIGNED(16, static uint8_t,
                 wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
@@ -158,10 +222,6 @@
   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
 };
 /* clang-format on */
-#else
-DECLARE_ALIGNED(16, static uint8_t,
-                wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]);
-#endif  // USE_PRECOMPUTED_WEDGE_SIGN
 
 // [negative][direction]
 DECLARE_ALIGNED(
@@ -173,6 +233,10 @@
 DECLARE_ALIGNED(16, static uint8_t,
                 wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
 
+DECLARE_ALIGNED(16, static uint8_t,
+                smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL]
+                                          [MAX_WEDGE_SQUARE]);
+
 static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
 
 static const wedge_code_type wedge_codebook_16_hgtw[16] = {
@@ -208,23 +272,23 @@
   { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
 };
 
-const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = {
+const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = {
   { 0, NULL, NULL, NULL },
   { 0, NULL, NULL, NULL },
   { 0, NULL, NULL, NULL },
-  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
     wedge_masks[BLOCK_8X8] },
-  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
     wedge_masks[BLOCK_8X16] },
-  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
     wedge_masks[BLOCK_16X8] },
-  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
     wedge_masks[BLOCK_16X16] },
-  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
     wedge_masks[BLOCK_16X32] },
-  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
     wedge_masks[BLOCK_32X16] },
-  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
     wedge_masks[BLOCK_32X32] },
   { 0, NULL, NULL, NULL },
   { 0, NULL, NULL, NULL },
@@ -234,9 +298,9 @@
   { 0, NULL, NULL, NULL },
   { 0, NULL, NULL, NULL },
   { 0, NULL, NULL, NULL },
-  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
     wedge_masks[BLOCK_8X32] },
-  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
+  { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
     wedge_masks[BLOCK_32X8] },
   { 0, NULL, NULL, NULL },
   { 0, NULL, NULL, NULL },
@@ -248,12 +312,12 @@
   const int bh = block_size_high[sb_type];
   const int bw = block_size_wide[sb_type];
   const wedge_code_type *a =
-      wedge_params_lookup[sb_type].codebook + wedge_index;
+      av1_wedge_params_lookup[sb_type].codebook + wedge_index;
   int woff, hoff;
-  const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
+  const uint8_t wsignflip =
+      av1_wedge_params_lookup[sb_type].signflip[wedge_index];
 
-  assert(wedge_index >= 0 &&
-         wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
+  assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type));
   woff = (a->x_offset * bw) >> 3;
   hoff = (a->y_offset * bh) >> 3;
   master = wedge_mask_obl[neg ^ wsignflip][a->direction] +
@@ -275,10 +339,10 @@
   }
 }
 
-static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base,
-                             const CONV_BUF_TYPE *src0, int src0_stride,
-                             const CONV_BUF_TYPE *src1, int src1_stride, int h,
-                             int w, ConvolveParams *conv_params, int bd) {
+static AOM_INLINE void diffwtd_mask_d16(
+    uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
   int round =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
   int i, j, m, diff;
@@ -309,9 +373,10 @@
   }
 }
 
-static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
-                         const uint8_t *src0, int src0_stride,
-                         const uint8_t *src1, int src1_stride, int h, int w) {
+static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse,
+                                    int mask_base, const uint8_t *src0,
+                                    int src0_stride, const uint8_t *src1,
+                                    int src1_stride, int h, int w) {
   int i, j, m, diff;
   for (i = 0; i < h; ++i) {
     for (j = 0; j < w; ++j) {
@@ -419,13 +484,12 @@
   }
 }
 
-static void init_wedge_master_masks() {
+static AOM_INLINE void init_wedge_master_masks() {
   int i, j;
   const int w = MASK_MASTER_SIZE;
   const int h = MASK_MASTER_SIZE;
   const int stride = MASK_MASTER_STRIDE;
-// Note: index [0] stores the masters, and [1] its complement.
-#if USE_PRECOMPUTED_WEDGE_MASK
+  // Note: index [0] stores the masters, and [1] its complement.
   // Generate prototype by shifting the masters
   int shift = h / 4;
   for (i = 0; i < h; i += 2) {
@@ -443,22 +507,7 @@
            wedge_master_vertical,
            MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
   }
-#else
-  static const double smoother_param = 2.85;
-  const int a[2] = { 2, 1 };
-  const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; ++j) {
-      int x = (2 * j + 1 - w);
-      int y = (2 * i + 1 - h);
-      double d = (a[0] * x + a[1] * y) / asqrt;
-      const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32);
-      wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
-      const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32);
-      wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx;
-    }
-  }
-#endif  // USE_PRECOMPUTED_WEDGE_MASK
+
   for (i = 0; i < h; ++i) {
     for (j = 0; j < w; ++j) {
       const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
@@ -480,57 +529,18 @@
   }
 }
 
-#if !USE_PRECOMPUTED_WEDGE_SIGN
-// If the signs for the wedges for various blocksizes are
-// inconsistent flip the sign flag. Do it only once for every
-// wedge codebook.
-static void init_wedge_signs() {
-  BLOCK_SIZE sb_type;
-  memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
-  for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) {
-    const int bw = block_size_wide[sb_type];
-    const int bh = block_size_high[sb_type];
-    const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
-    const int wbits = wedge_params.bits;
-    const int wtypes = 1 << wbits;
-    int i, w;
-    if (wbits) {
-      for (w = 0; w < wtypes; ++w) {
-        // Get the mask master, i.e. index [0]
-        const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
-        int avg = 0;
-        for (i = 0; i < bw; ++i) avg += mask[i];
-        for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE];
-        avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
-        // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
-        // If default sign is 1:
-        //   If sign requested is 0, we need to flip the sign and return
-        //   the complement i.e. index [1] instead. If sign requested is 1
-        //   we need to flip the sign and return index [0] instead.
-        // If default sign is 0:
-        //   If sign requested is 0, we need to return index [0] the master
-        //   if sign requested is 1, we need to return the complement index [1]
-        //   instead.
-        wedge_params.signflip[w] = (avg < 32);
-      }
-    }
-  }
-}
-#endif  // !USE_PRECOMPUTED_WEDGE_SIGN
-
-static void init_wedge_masks() {
+static AOM_INLINE void init_wedge_masks() {
   uint8_t *dst = wedge_mask_buf;
   BLOCK_SIZE bsize;
   memset(wedge_masks, 0, sizeof(wedge_masks));
   for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize];
+    const int wtypes = wedge_params->wedge_types;
+    if (wtypes == 0) continue;
     const uint8_t *mask;
     const int bw = block_size_wide[bsize];
     const int bh = block_size_high[bsize];
-    const wedge_params_type *wedge_params = &wedge_params_lookup[bsize];
-    const int wbits = wedge_params->bits;
-    const int wtypes = 1 << wbits;
     int w;
-    if (wbits == 0) continue;
     for (w = 0; w < wtypes; ++w) {
       mask = get_wedge_mask_inplace(w, 0, bsize);
       aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
@@ -548,82 +558,356 @@
   }
 }
 
+/* clang-format off */
+static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
+  60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
+  31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
+  16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,  9,  9,  9,  8,
+  8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
+  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,
+  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
+};
+static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {
+    32, 16, 16, 16, 8, 8, 8, 4,
+    4,  4,  2,  2,  2, 1, 1, 1,
+    8,  8,  4,  4,  2, 2
+};
+/* clang-format on */
+
+static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride,
+                                                    BLOCK_SIZE plane_bsize,
+                                                    INTERINTRA_MODE mode) {
+  int i, j;
+  const int bw = block_size_wide[plane_bsize];
+  const int bh = block_size_high[plane_bsize];
+  const int size_scale = ii_size_scales[plane_bsize];
+
+  switch (mode) {
+    case II_V_PRED:
+      for (i = 0; i < bh; ++i) {
+        memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
+        mask += stride;
+      }
+      break;
+
+    case II_H_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
+        mask += stride;
+      }
+      break;
+
+    case II_SMOOTH_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j)
+          mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
+        mask += stride;
+      }
+      break;
+
+    case II_DC_PRED:
+    default:
+      for (i = 0; i < bh; ++i) {
+        memset(mask, 32, bw * sizeof(mask[0]));
+        mask += stride;
+      }
+      break;
+  }
+}
+
+static AOM_INLINE void init_smooth_interintra_masks() {
+  for (int m = 0; m < INTERINTRA_MODES; ++m) {
+    for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) {
+      const int bw = block_size_wide[bs];
+      const int bh = block_size_high[bs];
+      if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue;
+      build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs,
+                                   m);
+    }
+  }
+}
+
 // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
 void av1_init_wedge_masks() {
   init_wedge_master_masks();
-#if !USE_PRECOMPUTED_WEDGE_SIGN
-  init_wedge_signs();
-#endif  // !USE_PRECOMPUTED_WEDGE_SIGN
   init_wedge_masks();
+  init_smooth_interintra_masks();
 }
 
-static void build_masked_compound_no_round(
+static AOM_INLINE void build_masked_compound_no_round(
     uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
     const CONV_BUF_TYPE *src1, int src1_stride,
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
-    int w, ConvolveParams *conv_params, MACROBLOCKD *xd) {
-  // Derive subsampling from h and w passed in. May be refactored to
-  // pass in subsampling factors directly.
-  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
-  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+    int w, InterPredParams *inter_pred_params) {
+  const int ssy = inter_pred_params->subsampling_y;
+  const int ssx = inter_pred_params->subsampling_x;
   const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  if (is_cur_buf_hbd(xd)) {
+  const int mask_stride = block_size_wide[sb_type];
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (inter_pred_params->use_hbd_buf) {
     aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, block_size_wide[sb_type],
-                                  w, h, subw, subh, conv_params, xd->bd);
+                                  src1_stride, mask, mask_stride, w, h, ssx,
+                                  ssy, &inter_pred_params->conv_params,
+                                  inter_pred_params->bit_depth);
   } else {
     aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, block_size_wide[sb_type], w,
-                                 h, subw, subh, conv_params);
+                                 src1_stride, mask, mask_stride, w, h, ssx, ssy,
+                                 &inter_pred_params->conv_params);
+  }
+#else
+  aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, w, h, ssx, ssy,
+                               &inter_pred_params->conv_params);
+#endif
+}
+
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+                                     uint8_t *dst, int dst_stride,
+                                     InterPredParams *inter_pred_params,
+                                     const SubpelParams *subpel_params) {
+  const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp;
+  BLOCK_SIZE sb_type = inter_pred_params->sb_type;
+
+  // We're going to call av1_make_inter_predictor to generate a prediction into
+  // a temporary buffer, then will blend that temporary buffer with that from
+  // the other reference.
+  DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]);
+  uint8_t *tmp_dst =
+      inter_pred_params->use_hbd_buf ? CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf;
+
+  const int tmp_buf_stride = MAX_SB_SIZE;
+  CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst;
+  int org_dst_stride = inter_pred_params->conv_params.dst_stride;
+  CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
+  inter_pred_params->conv_params.dst = tmp_buf16;
+  inter_pred_params->conv_params.dst_stride = tmp_buf_stride;
+  assert(inter_pred_params->conv_params.do_average == 0);
+
+  // This will generate a prediction in tmp_buf for the second reference
+  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE,
+                           inter_pred_params, subpel_params);
+
+  if (!inter_pred_params->conv_params.plane &&
+      comp_data->type == COMPOUND_DIFFWTD) {
+    av1_build_compound_diffwtd_mask_d16(
+        comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
+        tmp_buf16, tmp_buf_stride, inter_pred_params->block_height,
+        inter_pred_params->block_width, &inter_pred_params->conv_params,
+        inter_pred_params->bit_depth);
+  }
+  build_masked_compound_no_round(
+      dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride,
+      comp_data, sb_type, inter_pred_params->block_height,
+      inter_pred_params->block_width, inter_pred_params);
+}
+
+void av1_build_one_inter_predictor(
+    uint8_t *dst, int dst_stride, const MV *const src_mv,
+    InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
+    int ref, CalcSubpelParamsFunc calc_subpel_params_func) {
+  SubpelParams subpel_params;
+  uint8_t *src;
+  int src_stride;
+  calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, &src,
+                          &subpel_params, &src_stride);
+
+  if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
+      inter_pred_params->comp_mode == UNIFORM_COMP) {
+    av1_make_inter_predictor(src, src_stride, dst, dst_stride,
+                             inter_pred_params, &subpel_params);
+  } else {
+    av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride,
+                                    inter_pred_params, &subpel_params);
   }
 }
 
-void av1_make_masked_inter_predictor(
-    const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
-    int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
-    const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
-    MACROBLOCKD *xd, int can_use_previous) {
-  MB_MODE_INFO *mi = xd->mi[0];
-  (void)dst;
-  (void)dst_stride;
-  mi->interinter_comp.seg_mask = xd->seg_mask;
-  const INTERINTER_COMPOUND_DATA *comp_data = &mi->interinter_comp;
-
-// We're going to call av1_make_inter_predictor to generate a prediction into
-// a temporary buffer, then will blend that temporary buffer with that from
-// the other reference.
-//
-#define INTER_PRED_BYTES_PER_PIXEL 2
-
-  DECLARE_ALIGNED(32, uint8_t,
-                  tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
-#undef INTER_PRED_BYTES_PER_PIXEL
-
-  uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf);
-
-  const int tmp_buf_stride = MAX_SB_SIZE;
-  CONV_BUF_TYPE *org_dst = conv_params->dst;
-  int org_dst_stride = conv_params->dst_stride;
-  CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
-  conv_params->dst = tmp_buf16;
-  conv_params->dst_stride = tmp_buf_stride;
-  assert(conv_params->do_average == 0);
-
-  // This will generate a prediction in tmp_buf for the second reference
-  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_params,
-                           sf, w, h, conv_params, interp_filters, warp_types,
-                           p_col, p_row, plane, ref, mi, 0, xd,
-                           can_use_previous);
-
-  if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
-    av1_build_compound_diffwtd_mask_d16(
-        comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
-        tmp_buf16, tmp_buf_stride, h, w, conv_params, xd->bd);
+// True if the following hold:
+//  1. Not intrabc and not build_for_obmc
+//  2. A U or V plane
+//  3. If the block size differs from the base block size
+//  4. If sub-sampled, none of the previous blocks around the sub-sample
+//     are intrabc or inter-blocks
+static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
+                            int is_intrabc, int build_for_obmc) {
+  if (is_intrabc || build_for_obmc) {
+    return false;
   }
-  build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride,
-                                 tmp_buf16, tmp_buf_stride, comp_data,
-                                 mi->sb_type, h, w, conv_params, xd);
+
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  if ((block_size_wide[bsize] >= 8 || !ss_x) &&
+      (block_size_high[bsize] >= 8 || !ss_y)) {
+    return false;
+  }
+
+  // For sub8x8 chroma blocks, we may be covering more than one luma block's
+  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+  // the top-left corner of the prediction source - the correct top-left corner
+  // is at (pre_x, pre_y).
+  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
+  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+
+  for (int row = row_start; row <= 0; ++row) {
+    for (int col = col_start; col <= 0; ++col) {
+      const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+      if (!is_inter_block(this_mbmi)) return false;
+      if (is_intrabc_block(this_mbmi)) return false;
+    }
+  }
+  return true;
+}
+
+static void build_inter_predictors_sub8x8(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+    int bw, int bh, int mi_x, int mi_y,
+    CalcSubpelParamsFunc calc_subpel_params_func) {
+  const BLOCK_SIZE bsize = mi->sb_type;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const bool ss_x = pd->subsampling_x;
+  const bool ss_y = pd->subsampling_y;
+  const int b4_w = block_size_wide[bsize] >> ss_x;
+  const int b4_h = block_size_high[bsize] >> ss_y;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  const int b8_w = block_size_wide[plane_bsize];
+  const int b8_h = block_size_high[plane_bsize];
+  const int is_compound = has_second_ref(mi);
+  assert(!is_compound);
+  assert(!is_intrabc_block(mi));
+
+  // For sub8x8 chroma blocks, we may be covering more than one luma block's
+  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+  // the top-left corner of the prediction source - the correct top-left corner
+  // is at (pre_x, pre_y).
+  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
+  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+  int row = row_start;
+  for (int y = 0; y < b8_h; y += b4_h) {
+    int col = col_start;
+    for (int x = 0; x < b8_w; x += b4_w) {
+      MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+      int tmp_dst_stride = 8;
+      assert(bw < 8 || bh < 8);
+      (void)bw;
+      (void)bh;
+      struct buf_2d *const dst_buf = &pd->dst;
+      uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+      int ref = 0;
+      const RefCntBuffer *ref_buf =
+          get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+      const struct scale_factors *ref_scale_factors =
+          get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
+      const struct scale_factors *const sf = ref_scale_factors;
+      const struct buf_2d pre_buf = {
+        NULL,
+        (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,
+        ref_buf->buf.uv_crop_width,
+        ref_buf->buf.uv_crop_height,
+        ref_buf->buf.uv_stride,
+      };
+
+      const MV mv = this_mbmi->mv[ref].as_mv;
+
+      InterPredParams inter_pred_params;
+      av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
+                            pre_x + x, pd->subsampling_x, pd->subsampling_y,
+                            xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
+                            &pre_buf, this_mbmi->interp_filters);
+      inter_pred_params.conv_params = get_conv_params_no_round(
+          ref, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+      inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0;
+
+      av1_build_one_inter_predictor(dst, dst_buf->stride, &mv,
+                                    &inter_pred_params, xd, mi_x + x, mi_y + y,
+                                    ref, calc_subpel_params_func);
+
+      ++col;
+    }
+    ++row;
+  }
+}
+
+static void build_inter_predictors_8x8_and_bigger(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+    int build_for_obmc, int bw, int bh, int mi_x, int mi_y,
+    CalcSubpelParamsFunc calc_subpel_params_func) {
+  const int is_compound = has_second_ref(mi);
+  const int is_intrabc = is_intrabc_block(mi);
+  assert(IMPLIES(is_intrabc, !is_compound));
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf;
+
+  int is_global[2] = { 0, 0 };
+  for (int ref = 0; ref < 1 + is_compound; ++ref) {
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+  }
+
+  const BLOCK_SIZE bsize = mi->sb_type;
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const int row_start =
+      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+  const int col_start =
+      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+  for (int ref = 0; ref < 1 + is_compound; ++ref) {
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
+    struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+    const MV mv = mi->mv[ref].as_mv;
+    const WarpTypesAllowed warp_types = { is_global[ref],
+                                          mi->motion_mode == WARPED_CAUSAL };
+
+    InterPredParams inter_pred_params;
+    av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
+                          pd->subsampling_x, pd->subsampling_y, xd->bd,
+                          is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
+                          mi->interp_filters);
+    if (is_compound) av1_init_comp_mode(&inter_pred_params);
+    inter_pred_params.conv_params = get_conv_params_no_round(
+        ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+    av1_dist_wtd_comp_weight_assign(
+        cm, mi, 0, &inter_pred_params.conv_params.fwd_offset,
+        &inter_pred_params.conv_params.bck_offset,
+        &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
+
+    if (!build_for_obmc)
+      av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+    if (is_masked_compound_type(mi->interinter_comp.type)) {
+      av1_init_mask_comp(&inter_pred_params, mi->sb_type, &mi->interinter_comp);
+      // Assign physical buffer.
+      inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
+    }
+
+    av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
+                                  xd, mi_x, mi_y, ref, calc_subpel_params_func);
+  }
+}
+
+void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                int plane, const MB_MODE_INFO *mi,
+                                int build_for_obmc, int bw, int bh, int mi_x,
+                                int mi_y,
+                                CalcSubpelParamsFunc calc_subpel_params_func) {
+  if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi),
+                      build_for_obmc)) {
+    build_inter_predictors_sub8x8(cm, xd, plane, mi, bw, bh, mi_x, mi_y,
+                                  calc_subpel_params_func);
+  } else {
+    build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
+                                          bh, mi_x, mi_y,
+                                          calc_subpel_params_func);
+  }
 }
 
 void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
@@ -742,19 +1026,21 @@
   }
 }
 
-static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc,
-                                     uint8_t mi_hw, MB_MODE_INFO *mi,
-                                     void *fun_ctxt, const int num_planes) {
+static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row,
+                                     int rel_mi_col, uint8_t op_mi_size,
+                                     int dir, MB_MODE_INFO *mi, void *fun_ctxt,
+                                     const int num_planes) {
   (void)xd;
-  (void)rel_mi_rc;
-  (void)mi_hw;
+  (void)rel_mi_row;
+  (void)rel_mi_col;
+  (void)op_mi_size;
+  (void)dir;
   (void)mi;
   ++*(int *)fun_ctxt;
   (void)num_planes;
 }
 
-void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col) {
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) {
   MB_MODE_INFO *mbmi = xd->mi[0];
 
   mbmi->overlappable_neighbors[0] = 0;
@@ -762,9 +1048,9 @@
 
   if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return;
 
-  foreach_overlappable_nb_above(cm, xd, mi_col, INT_MAX, increment_int_ptr,
+  foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr,
                                 &mbmi->overlappable_neighbors[0]);
-  foreach_overlappable_nb_left(cm, xd, mi_row, INT_MAX, increment_int_ptr,
+  foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr,
                                &mbmi->overlappable_neighbors[1]);
 }
 
@@ -805,21 +1091,20 @@
   int *adjacent_stride;
 };
 
-static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
-                                               uint8_t above_mi_width,
-                                               MB_MODE_INFO *above_mi,
-                                               void *fun_ctxt,
-                                               const int num_planes) {
+static INLINE void build_obmc_inter_pred_above(
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) {
   (void)above_mi;
+  (void)rel_mi_row;
+  (void)dir;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  const int is_hbd = is_cur_buf_hbd(xd);
   const int overlap =
       AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
 
   for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblockd_plane *pd = &xd->plane[plane];
-    const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
     const int bh = overlap >> pd->subsampling_y;
     const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
 
@@ -830,32 +1115,36 @@
     const int tmp_stride = ctxt->adjacent_stride[plane];
     const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
     const uint8_t *const mask = av1_get_obmc_mask(bh);
-
+#if CONFIG_AV1_HIGHBITDEPTH
+    const int is_hbd = is_cur_buf_hbd(xd);
     if (is_hbd)
       aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
                                  tmp_stride, mask, bw, bh, xd->bd);
     else
       aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
                           mask, bw, bh);
+#else
+    aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask,
+                        bw, bh);
+#endif
   }
 }
 
-static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
-                                              uint8_t left_mi_height,
-                                              MB_MODE_INFO *left_mi,
-                                              void *fun_ctxt,
-                                              const int num_planes) {
+static INLINE void build_obmc_inter_pred_left(
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) {
   (void)left_mi;
+  (void)rel_mi_col;
+  (void)dir;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const int overlap =
       AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
-  const int is_hbd = is_cur_buf_hbd(xd);
 
   for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblockd_plane *pd = &xd->plane[plane];
     const int bw = overlap >> pd->subsampling_x;
-    const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y;
+    const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y;
     const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
 
     if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
@@ -866,12 +1155,18 @@
     const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
     const uint8_t *const mask = av1_get_obmc_mask(bw);
 
+#if CONFIG_AV1_HIGHBITDEPTH
+    const int is_hbd = is_cur_buf_hbd(xd);
     if (is_hbd)
       aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
                                  tmp_stride, mask, bw, bh, xd->bd);
     else
       aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
                           mask, bw, bh);
+#else
+    aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask,
+                        bw, bh);
+#endif
   }
 }
 
@@ -880,7 +1175,6 @@
 // prediction. We assume the original prediction (bmc) is stored in
 // xd->plane[].dst.buf
 void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col,
                                      uint8_t *above[MAX_MB_PLANE],
                                      int above_stride[MAX_MB_PLANE],
                                      uint8_t *left[MAX_MB_PLANE],
@@ -889,23 +1183,54 @@
 
   // handle above row
   struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
-  foreach_overlappable_nb_above(cm, xd, mi_col,
+  foreach_overlappable_nb_above(cm, xd,
                                 max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                 build_obmc_inter_pred_above, &ctxt_above);
 
   // handle left column
   struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride };
-  foreach_overlappable_nb_left(cm, xd, mi_row,
+  foreach_overlappable_nb_left(cm, xd,
                                max_neighbor_obmc[mi_size_high_log2[bsize]],
                                build_obmc_inter_pred_left, &ctxt_left);
 }
 
+void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+                                int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+                                struct build_prediction_ctxt *ctxt,
+                                const int num_planes) {
+  const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type);
+  const int ref_mi_row = xd->mi_row + mi_row_offset;
+  const int ref_mi_col = xd->mi_col + mi_col_offset;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
+                     ctxt->tmp_width[plane], ctxt->tmp_height[plane],
+                     ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+
+  const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
+
+  const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+  const struct scale_factors *const sf =
+      get_ref_scale_factors_const(ctxt->cm, frame);
+
+  xd->block_ref_scale_factors[0] = sf;
+  if ((!av1_is_valid_scale(sf)))
+    aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Reference frame has invalid dimensions");
+
+  av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
+                       num_planes);
+}
+
 void av1_setup_build_prediction_by_above_pred(
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
     MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
     const int num_planes) {
   const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
-  const int above_mi_col = ctxt->mi_col + rel_mi_col;
+  const int above_mi_col = xd->mi_col + rel_mi_col;
 
   av1_modify_neighbor_predictor_for_obmc(above_mbmi);
 
@@ -928,13 +1253,14 @@
     if ((!av1_is_valid_scale(sf)))
       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
-    av1_setup_pre_planes(xd, ref, &ref_buf->buf, ctxt->mi_row, above_mi_col, sf,
+    av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf,
                          num_planes);
   }
 
   xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
-  xd->mb_to_right_edge = ctxt->mb_to_far_edge +
-                         (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
+  xd->mb_to_right_edge =
+      ctxt->mb_to_far_edge +
+      (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8;
 }
 
 void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
@@ -943,7 +1269,7 @@
                                              struct build_prediction_ctxt *ctxt,
                                              const int num_planes) {
   const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
-  const int left_mi_row = ctxt->mi_row + rel_mi_row;
+  const int left_mi_row = xd->mi_row + rel_mi_row;
 
   av1_modify_neighbor_predictor_for_obmc(left_mbmi);
 
@@ -967,86 +1293,26 @@
     if ((!av1_is_valid_scale(ref_scale_factors)))
       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
-    av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, ctxt->mi_col,
+    av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col,
                          ref_scale_factors, num_planes);
   }
 
-  xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
+  xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row));
   xd->mb_to_bottom_edge =
       ctxt->mb_to_far_edge +
-      (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
+      GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE);
 }
 
-/* clang-format off */
-static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
-  60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
-  31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
-  16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,  9,  9,  9,  8,
-  8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
-  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,
-  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,
-  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
-};
-static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {
-    32, 16, 16, 16, 8, 8, 8, 4,
-    4,  4,  2,  2,  2, 1, 1, 1,
-    8,  8,  4,  4,  2, 2
-};
-/* clang-format on */
-
-static void build_smooth_interintra_mask(uint8_t *mask, int stride,
-                                         BLOCK_SIZE plane_bsize,
-                                         INTERINTRA_MODE mode) {
-  int i, j;
-  const int bw = block_size_wide[plane_bsize];
-  const int bh = block_size_high[plane_bsize];
-  const int size_scale = ii_size_scales[plane_bsize];
-
-  switch (mode) {
-    case II_V_PRED:
-      for (i = 0; i < bh; ++i) {
-        memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
-        mask += stride;
-      }
-      break;
-
-    case II_H_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
-        mask += stride;
-      }
-      break;
-
-    case II_SMOOTH_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j)
-          mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
-        mask += stride;
-      }
-      break;
-
-    case II_DC_PRED:
-    default:
-      for (i = 0; i < bh; ++i) {
-        memset(mask, 32, bw * sizeof(mask[0]));
-        mask += stride;
-      }
-      break;
-  }
-}
-
-static void combine_interintra(INTERINTRA_MODE mode,
-                               int8_t use_wedge_interintra, int wedge_index,
-                               int wedge_sign, BLOCK_SIZE bsize,
-                               BLOCK_SIZE plane_bsize, uint8_t *comppred,
-                               int compstride, const uint8_t *interpred,
-                               int interstride, const uint8_t *intrapred,
-                               int intrastride) {
+static AOM_INLINE void combine_interintra(
+    INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index,
+    int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+    uint8_t *comppred, int compstride, const uint8_t *interpred,
+    int interstride, const uint8_t *intrapred, int intrastride) {
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
 
   if (use_wedge_interintra) {
-    if (is_interintra_wedge_used(bsize)) {
+    if (av1_is_wedge_used(bsize)) {
       const uint8_t *mask =
           av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
       const int subw = 2 * mi_size_wide[bsize] == bw;
@@ -1058,22 +1324,22 @@
     return;
   }
 
-  uint8_t mask[MAX_SB_SQUARE];
-  build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
+  const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize];
   aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
                      interstride, mask, bw, bw, bh, 0, 0);
 }
 
-static void combine_interintra_highbd(
-    INTERINTRA_MODE mode, int8_t use_wedge_interintra, int wedge_index,
-    int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+#if CONFIG_AV1_HIGHBITDEPTH
+static AOM_INLINE void combine_interintra_highbd(
+    INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index,
+    int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
     uint8_t *comppred8, int compstride, const uint8_t *interpred8,
     int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
 
   if (use_wedge_interintra) {
-    if (is_interintra_wedge_used(bsize)) {
+    if (av1_is_wedge_used(bsize)) {
       const uint8_t *mask =
           av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
       const int subh = 2 * mi_size_high[bsize] == bh;
@@ -1091,6 +1357,7 @@
                             interpred8, interstride, mask, bw, bw, bh, 0, 0,
                             bd);
 }
+#endif
 
 void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
                                                MACROBLOCKD *xd,
@@ -1119,27 +1386,29 @@
   const int ssx = xd->plane[plane].subsampling_x;
   const int ssy = xd->plane[plane].subsampling_y;
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+#if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     combine_interintra_highbd(
         xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
-        xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
-        bsize, plane_bsize, xd->plane[plane].dst.buf,
-        xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred,
-        intra_stride, xd->bd);
+        xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
+        plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+        inter_pred, inter_stride, intra_pred, intra_stride, xd->bd);
     return;
   }
+#endif
   combine_interintra(
       xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
-      xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
-      bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+      xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
+      plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
       inter_pred, inter_stride, intra_pred, intra_stride);
 }
 
 // build interintra_predictors for one plane
-void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         uint8_t *pred, int stride,
-                                         const BUFFER_SET *ctx, int plane,
-                                         BLOCK_SIZE bsize) {
+void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    uint8_t *pred, int stride,
+                                    const BUFFER_SET *ctx, int plane,
+                                    BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
   if (is_cur_buf_hbd(xd)) {
     DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
     av1_build_intra_predictors_for_interintra(
@@ -1155,12 +1424,3 @@
                            MAX_SB_SIZE);
   }
 }
-
-void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          uint8_t *upred, uint8_t *vpred,
-                                          int ustride, int vstride,
-                                          const BUFFER_SET *ctx,
-                                          BLOCK_SIZE bsize) {
-  av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
-  av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
-}
diff --git a/libaom/av1/common/reconinter.h b/libaom/av1/common/reconinter.h
index 9d562f9..fe3c6a6 100644
--- a/libaom/av1/common/reconinter.h
+++ b/libaom/av1/common/reconinter.h
@@ -12,9 +12,9 @@
 #ifndef AOM_AV1_COMMON_RECONINTER_H_
 #define AOM_AV1_COMMON_RECONINTER_H_
 
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/convolve.h"
+#include "av1/common/filter.h"
 #include "av1/common/warped_motion.h"
 #include "aom/aom_integer.h"
 
@@ -35,8 +35,7 @@
 extern "C" {
 #endif
 
-// Set to (1 << 5) if the 32-ary codebooks are used for any bock size
-#define MAX_WEDGE_TYPES (1 << 4)
+#define MAX_WEDGE_TYPES 16
 
 #define MAX_WEDGE_SIZE_LOG2 5  // 32x32
 #define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
@@ -67,13 +66,13 @@
 typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES];
 
 typedef struct {
-  int bits;
+  int wedge_types;
   const wedge_code_type *codebook;
   uint8_t *signflip;
   wedge_masks_type *masks;
 } wedge_params_type;
 
-extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL];
+extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL];
 
 typedef struct SubpelParams {
   int xs;
@@ -84,8 +83,6 @@
 
 struct build_prediction_ctxt {
   const AV1_COMMON *cm;
-  int mi_row;
-  int mi_col;
   uint8_t **tmp_buf;
   int *tmp_width;
   int *tmp_height;
@@ -93,6 +90,55 @@
   int mb_to_far_edge;
 };
 
+typedef enum InterPredMode {
+  TRANSLATION_PRED,
+  WARP_PRED,
+} InterPredMode;
+
+typedef enum InterCompMode {
+  UNIFORM_SINGLE,
+  UNIFORM_COMP,
+  MASK_COMP,
+} InterCompMode;
+
+typedef struct InterPredParams {
+  InterPredMode mode;
+  InterCompMode comp_mode;
+  WarpedMotionParams warp_params;
+  ConvolveParams conv_params;
+  const InterpFilterParams *interp_filter_params[2];
+  int block_width;
+  int block_height;
+  int pix_row;
+  int pix_col;
+  struct buf_2d ref_frame_buf;
+  int subsampling_x;
+  int subsampling_y;
+  const struct scale_factors *scale_factors;
+  int bit_depth;
+  int use_hbd_buf;
+  INTERINTER_COMPOUND_DATA mask_comp;
+  BLOCK_SIZE sb_type;
+  int is_intrabc;
+} InterPredParams;
+
+void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
+                           int block_height, int pix_row, int pix_col,
+                           int subsampling_x, int subsampling_y, int bit_depth,
+                           int use_hbd_buf, int is_intrabc,
+                           const struct scale_factors *sf,
+                           const struct buf_2d *ref_buf,
+                           int_interpfilters interp_filters);
+
+void av1_init_comp_mode(InterPredParams *inter_pred_params);
+
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+                          const WarpTypesAllowed *warp_types, int ref,
+                          const MACROBLOCKD *xd, const MB_MODE_INFO *mi);
+
+void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
+                        const INTERINTER_COMPOUND_DATA *mask_comp);
+
 static INLINE int has_scale(int xs, int ys) {
   return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
 }
@@ -108,53 +154,47 @@
   assert(sp->ys <= SUBPEL_SHIFTS);
 }
 
-static INLINE void inter_predictor(const uint8_t *src, int src_stride,
-                                   uint8_t *dst, int dst_stride,
-                                   const SubpelParams *subpel_params,
-                                   const struct scale_factors *sf, int w, int h,
-                                   ConvolveParams *conv_params,
-                                   InterpFilters interp_filters,
-                                   int is_intrabc) {
+static INLINE void inter_predictor(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+    int h, ConvolveParams *conv_params,
+    const InterpFilterParams *interp_filters[2]) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
   assert(sf);
   const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
-  assert(IMPLIES(is_intrabc, !is_scaled));
   if (is_scaled) {
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, subpel_params->subpel_x,
                            subpel_params->xs, subpel_params->subpel_y,
-                           subpel_params->ys, 1, conv_params, sf, is_intrabc);
+                           subpel_params->ys, 1, conv_params, sf);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
-                           sp.ys, 0, conv_params, sf, is_intrabc);
+                           sp.ys, 0, conv_params, sf);
   }
 }
 
-static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
-                                          uint8_t *dst, int dst_stride,
-                                          const SubpelParams *subpel_params,
-                                          const struct scale_factors *sf, int w,
-                                          int h, ConvolveParams *conv_params,
-                                          InterpFilters interp_filters,
-                                          int is_intrabc, int bd) {
+static INLINE void highbd_inter_predictor(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+    int h, ConvolveParams *conv_params,
+    const InterpFilterParams *interp_filters[2], int bd) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
   assert(sf);
   const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
-  assert(IMPLIES(is_intrabc, !is_scaled));
   if (is_scaled) {
-    av1_highbd_convolve_2d_facade(
-        src, src_stride, dst, dst_stride, w, h, interp_filters,
-        subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y,
-        subpel_params->ys, 1, conv_params, sf, is_intrabc, bd);
+    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                                  interp_filters, subpel_params->subpel_x,
+                                  subpel_params->xs, subpel_params->subpel_y,
+                                  subpel_params->ys, 1, conv_params, sf, bd);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
-    av1_highbd_convolve_2d_facade(
-        src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x,
-        sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd);
+    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                                  interp_filters, sp.subpel_x, sp.xs,
+                                  sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
   }
 }
 
@@ -170,7 +210,7 @@
     case COMPOUND_DISTWTD:
     case COMPOUND_DIFFWTD: return comp_allowed;
     case COMPOUND_WEDGE:
-      return comp_allowed && wedge_params_lookup[sb_type].bits > 0;
+      return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0;
     default: assert(0); return 0;
   }
 }
@@ -188,39 +228,41 @@
   return 0;
 }
 
-static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) {
-  return wedge_params_lookup[sb_type].bits;
+static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) {
+  return av1_wedge_params_lookup[sb_type].wedge_types;
 }
 
-static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
-  const int wbits = wedge_params_lookup[sb_type].bits;
-  return (wbits > 0) ? wbits + 1 : 0;
-}
-
-static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
-  return wedge_params_lookup[sb_type].bits > 0;
-}
-
-static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
-  return wedge_params_lookup[sb_type].bits;
+static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) {
+  return av1_wedge_params_lookup[sb_type].wedge_types > 0;
 }
 
 void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                              int dst_stride, const SubpelParams *subpel_params,
-                              const struct scale_factors *sf, int w, int h,
-                              ConvolveParams *conv_params,
-                              InterpFilters interp_filters,
-                              const WarpTypesAllowed *warp_types, int p_col,
-                              int p_row, int plane, int ref,
-                              const MB_MODE_INFO *mi, int build_for_obmc,
-                              const MACROBLOCKD *xd, int can_use_previous);
+                              int dst_stride,
+                              InterPredParams *inter_pred_params,
+                              const SubpelParams *subpel_params);
 
-void av1_make_masked_inter_predictor(
-    const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
-    int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
-    const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
-    MACROBLOCKD *xd, int can_use_previous);
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+                                     uint8_t *dst, int dst_stride,
+                                     InterPredParams *inter_pred_params,
+                                     const SubpelParams *subpel_params);
+
+typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv,
+                                     InterPredParams *const inter_pred_params,
+                                     MACROBLOCKD *xd, int mi_x, int mi_y,
+                                     int ref, uint8_t **pre,
+                                     SubpelParams *subpel_params,
+                                     int *src_stride);
+
+void av1_build_one_inter_predictor(
+    uint8_t *dst, int dst_stride, const MV *const src_mv,
+    InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
+    int ref, CalcSubpelParamsFunc calc_subpel_params_func);
+
+void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                int plane, const MB_MODE_INFO *mi,
+                                int build_for_obmc, int bw, int bh, int mi_x,
+                                int mi_y,
+                                CalcSubpelParamsFunc calc_subpel_params_func);
 
 // TODO(jkoleszar): yet another mv clamping function :-(
 static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
@@ -237,11 +279,14 @@
                     (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
   assert(ss_x <= 1);
   assert(ss_y <= 1);
+  const SubpelMvLimits mv_limits = {
+    xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+    xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+    xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+    xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom
+  };
 
-  clamp_mv(&clamped_mv, xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
-           xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
-           xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
-           xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
+  clamp_mv(&clamped_mv, &mv_limits);
 
   return clamped_mv;
 }
@@ -298,6 +343,11 @@
   return 1;
 }
 
+void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+                                int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+                                struct build_prediction_ctxt *ctxt,
+                                const int num_planes);
+
 void av1_setup_build_prediction_by_above_pred(
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
     MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
@@ -308,45 +358,39 @@
                                              struct build_prediction_ctxt *ctxt,
                                              const int num_planes);
 void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col,
                                      uint8_t *above[MAX_MB_PLANE],
                                      int above_stride[MAX_MB_PLANE],
                                      uint8_t *left[MAX_MB_PLANE],
                                      int left_stride[MAX_MB_PLANE]);
 
 const uint8_t *av1_get_obmc_mask(int length);
-void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col);
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd);
 
 #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
 #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
 
 void av1_init_wedge_masks();
 
-static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
-                                                          int wedge_sign,
+static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index,
+                                                          int8_t wedge_sign,
                                                           BLOCK_SIZE sb_type) {
-  return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
+  return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
 }
 
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
 
 // build interintra_predictors for one plane
-void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         uint8_t *pred, int stride,
-                                         const BUFFER_SET *ctx, int plane,
-                                         BLOCK_SIZE bsize);
+void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    uint8_t *pred, int stride,
+                                    const BUFFER_SET *ctx, int plane,
+                                    BLOCK_SIZE bsize);
 
-void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          uint8_t *upred, uint8_t *vpred,
-                                          int ustride, int vstride,
-                                          const BUFFER_SET *ctx,
-                                          BLOCK_SIZE bsize);
-
-void av1_build_intra_predictors_for_interintra(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
-    const BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
+                                               MACROBLOCKD *xd,
+                                               BLOCK_SIZE bsize, int plane,
+                                               const BUFFER_SET *ctx,
+                                               uint8_t *dst, int dst_stride);
 
 void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
diff --git a/libaom/av1/common/reconintra.c b/libaom/av1/common/reconintra.c
index 559e499..efb3794 100644
--- a/libaom/av1/common/reconintra.c
+++ b/libaom/av1/common/reconintra.c
@@ -20,9 +20,9 @@
 #include "aom_ports/aom_once.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
-#include "av1/common/reconintra.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
 
 enum {
   NEED_LEFT = 1 << 1,
@@ -198,7 +198,7 @@
                          int col_off, int ss_x, int ss_y) {
   if (!top_available || !right_available) return 0;
 
-  const int bw_unit = block_size_wide[bsize] >> tx_size_wide_log2[0];
+  const int bw_unit = mi_size_wide[bsize];
   const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);
   const int top_right_count_unit = tx_size_wide_unit[txsz];
 
@@ -405,7 +405,7 @@
     // Bottom-left pixels are in the bottom-left block, which is not available.
     return 0;
   } else {
-    const int bh_unit = block_size_high[bsize] >> tx_size_high_log2[0];
+    const int bh_unit = mi_size_high[bsize];
     const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1);
     const int bottom_left_count_unit = tx_size_high_unit[txsz];
 
@@ -422,10 +422,9 @@
     // and/or bottom-left superblocks. But only the left superblock is
     // available, so check if all required pixels fall in that superblock.
     if (blk_col_in_sb == 0) {
-      const int blk_start_row_off = blk_row_in_sb
-                                        << (bh_in_mi_log2 + MI_SIZE_LOG2 -
-                                            tx_size_wide_log2[0]) >>
-                                    ss_y;
+      const int blk_start_row_off =
+          blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
+          ss_y;
       const int row_off_in_sb = blk_start_row_off + row_off;
       const int sb_height_unit = sb_mi_size >> ss_y;
       return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
@@ -453,11 +452,13 @@
 static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];
 static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd);
 static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL];
 static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL];
+#endif
 
 static void init_intra_predictors_internal(void) {
   assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
@@ -499,7 +500,7 @@
   INIT_ALL_SIZES(dc_pred[0][1], dc_top);
   INIT_ALL_SIZES(dc_pred[1][0], dc_left);
   INIT_ALL_SIZES(dc_pred[1][1], dc);
-
+#if CONFIG_AV1_HIGHBITDEPTH
   INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
   INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
   INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth);
@@ -510,6 +511,7 @@
   INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
   INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
   INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+#endif
 #undef intra_pred_allsizes
 }
 
@@ -647,6 +649,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 // Directional prediction, zone 1: 0 < angle < 90
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint16_t *above,
@@ -785,6 +788,7 @@
     pred_high[H_PRED][tx_size](dst, stride, above, left, bd);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 DECLARE_ALIGNED(16, const int8_t,
                 av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = {
@@ -888,6 +892,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
                                           TX_SIZE tx_size,
                                           const uint16_t *above,
@@ -938,6 +943,7 @@
     dst += stride;
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
   if (plane == 0) {
@@ -1015,9 +1021,9 @@
 void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
   if (!strength) return;
 
-  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
-    { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
-  };
+  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
+                                                         { 0, 5, 6, 5, 0 },
+                                                         { 2, 4, 4, 4, 2 } };
   const int filt = strength - 1;
   uint8_t edge[129];
 
@@ -1048,9 +1054,9 @@
 void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
   if (!strength) return;
 
-  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
-    { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
-  };
+  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
+                                                         { 0, 5, 6, 5, 0 },
+                                                         { 2, 4, 4, 4, 2 } };
   const int filt = strength - 1;
   uint16_t edge[129];
 
@@ -1068,6 +1074,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
   const int kernel[3] = { 5, 6, 5 };
 
@@ -1077,6 +1084,7 @@
   p_above[-1] = s;
   p_left[-1] = s;
 }
+#endif
 
 void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
   // interpolate half-sample positions
@@ -1124,7 +1132,7 @@
     p[2 * i] = in[i + 2];
   }
 }
-
+#if CONFIG_AV1_HIGHBITDEPTH
 static void build_intra_predictors_high(
     const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
     int dst_stride, PREDICTION_MODE mode, int angle_delta,
@@ -1151,7 +1159,7 @@
   int base = 128 << (xd->bd - 8);
 
   // The default values if ref pixels are not available:
-  // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
+  // base   base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
   // base+1   A      B  ..     Y      Z
   // base+1   C      D  ..     W      X
   // base+1   E      F  ..     U      V
@@ -1189,7 +1197,7 @@
 
   // NEED_LEFT
   if (need_left) {
-    int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+    int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
     if (use_filter_intra) need_bottom = 0;
     if (is_dr_mode) need_bottom = p_angle > 180;
     const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
@@ -1214,7 +1222,7 @@
 
   // NEED_ABOVE
   if (need_above) {
-    int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+    int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
     if (use_filter_intra) need_right = 0;
     if (is_dr_mode) need_right = p_angle < 90;
     const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
@@ -1309,6 +1317,7 @@
     pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
                                    int ref_stride, uint8_t *dst, int dst_stride,
@@ -1335,7 +1344,7 @@
   const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
 
   // The default values if ref pixels are not available:
-  // 127 127 127 .. 127 127 127 127 127 127
+  // 128 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
   // 129  C   D  ..  W   X
   // 129  E   F  ..  U   V
@@ -1374,10 +1383,13 @@
 
   // NEED_LEFT
   if (need_left) {
-    int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+    int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
     if (use_filter_intra) need_bottom = 0;
     if (is_dr_mode) need_bottom = p_angle > 180;
-    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+    // The avx2 dr_prediction_z2 may read at most 3 extra bytes, because
+    // the avx2 masked load operates with dword granularity, so we
+    // initialize 3 extra bytes to silence the valgrind complaint.
+    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3);
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
@@ -1399,7 +1411,7 @@
 
   // NEED_ABOVE
   if (need_above) {
-    int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+    int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
     if (use_filter_intra) need_right = 0;
     if (is_dr_mode) need_right = p_angle < 90;
     const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
@@ -1493,6 +1505,57 @@
   }
 }
 
+static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
+                                            int subsampling_y) {
+  assert(subsampling_x >= 0 && subsampling_x < 2);
+  assert(subsampling_y >= 0 && subsampling_y < 2);
+  BLOCK_SIZE bs = bsize;
+  switch (bsize) {
+    case BLOCK_4X4:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_8X8;
+      else if (subsampling_x == 1)
+        bs = BLOCK_8X4;
+      else if (subsampling_y == 1)
+        bs = BLOCK_4X8;
+      break;
+    case BLOCK_4X8:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_8X8;
+      else if (subsampling_x == 1)
+        bs = BLOCK_8X8;
+      else if (subsampling_y == 1)
+        bs = BLOCK_4X8;
+      break;
+    case BLOCK_8X4:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_8X8;
+      else if (subsampling_x == 1)
+        bs = BLOCK_8X4;
+      else if (subsampling_y == 1)
+        bs = BLOCK_8X8;
+      break;
+    case BLOCK_4X16:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_8X16;
+      else if (subsampling_x == 1)
+        bs = BLOCK_8X16;
+      else if (subsampling_y == 1)
+        bs = BLOCK_4X16;
+      break;
+    case BLOCK_16X4:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_16X8;
+      else if (subsampling_x == 1)
+        bs = BLOCK_16X4;
+      else if (subsampling_y == 1)
+        bs = BLOCK_16X8;
+      break;
+    default: break;
+  }
+  return bs;
+}
+
 void av1_predict_intra_block(
     const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
     TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
@@ -1501,8 +1564,8 @@
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int txwpx = tx_size_wide[tx_size];
   const int txhpx = tx_size_high[tx_size];
-  const int x = col_off << tx_size_wide_log2[0];
-  const int y = row_off << tx_size_high_log2[0];
+  const int x = col_off << MI_SIZE_LOG2;
+  const int y = row_off << MI_SIZE_LOG2;
 
   if (use_palette) {
     int r, c;
@@ -1528,15 +1591,15 @@
     return;
   }
 
-  BLOCK_SIZE bsize = mbmi->sb_type;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int txw = tx_size_wide_unit[tx_size];
   const int txh = tx_size_high_unit[tx_size];
-  const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available
-                                                     : xd->up_available);
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const int have_top =
+      row_off || (ss_y ? xd->chroma_up_available : xd->up_available);
   const int have_left =
-      col_off ||
-      (pd->subsampling_x ? xd->chroma_left_available : xd->left_available);
+      col_off || (ss_x ? xd->chroma_left_available : xd->left_available);
   const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
   const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
   const int xr_chr_offset = 0;
@@ -1544,31 +1607,34 @@
 
   // Distance between the right edge of this prediction block to
   // the frame right edge
-  const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
-                 (wpx - x - txwpx) - xr_chr_offset;
+  const int xr =
+      (xd->mb_to_right_edge >> (3 + ss_x)) + (wpx - x - txwpx) - xr_chr_offset;
   // Distance between the bottom edge of this prediction block to
   // the frame bottom edge
-  const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
-                 (hpx - y - txhpx) - yd_chr_offset;
+  const int yd =
+      (xd->mb_to_bottom_edge >> (3 + ss_y)) + (hpx - y - txhpx) - yd_chr_offset;
   const int right_available =
-      mi_col + ((col_off + txw) << pd->subsampling_x) < xd->tile.mi_col_end;
+      mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end;
   const int bottom_available =
-      (yd > 0) &&
-      (mi_row + ((row_off + txh) << pd->subsampling_y) < xd->tile.mi_row_end);
+      (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end);
 
   const PARTITION_TYPE partition = mbmi->partition;
 
+  BLOCK_SIZE bsize = mbmi->sb_type;
   // force 4x4 chroma component block size.
-  bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+  if (ss_x || ss_y) {
+    bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+  }
 
-  const int have_top_right = has_top_right(
-      cm, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size,
-      row_off, col_off, pd->subsampling_x, pd->subsampling_y);
-  const int have_bottom_left = has_bottom_left(
-      cm, bsize, mi_row, mi_col, bottom_available, have_left, partition,
-      tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y);
+  const int have_top_right =
+      has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available,
+                    partition, tx_size, row_off, col_off, ss_x, ss_y);
+  const int have_bottom_left =
+      has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left,
+                      partition, tx_size, row_off, col_off, ss_x, ss_y);
 
   const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
+#if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     build_intra_predictors_high(
         xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
@@ -1579,7 +1645,7 @@
         have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
     return;
   }
-
+#endif
   build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
                          angle_delta, filter_intra_mode, tx_size,
                          disable_edge_filter,
@@ -1595,8 +1661,7 @@
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int dst_stride = pd->dst.stride;
-  uint8_t *dst =
-      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
   const PREDICTION_MODE mode =
       (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
   const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0;
diff --git a/libaom/av1/common/reconintra.h b/libaom/av1/common/reconintra.h
index 3c781ab..9d20356 100644
--- a/libaom/av1/common/reconintra.h
+++ b/libaom/av1/common/reconintra.h
@@ -15,8 +15,8 @@
 #include <stdlib.h>
 
 #include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -26,13 +26,11 @@
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                     int plane, int blk_col, int blk_row,
                                     TX_SIZE tx_size);
-void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             int bw, int bh, TX_SIZE tx_size,
-                             PREDICTION_MODE mode, int angle_delta,
-                             int use_palette,
-                             FILTER_INTRA_MODE filter_intra_mode,
-                             const uint8_t *ref, int ref_stride, uint8_t *dst,
-                             int dst_stride, int aoff, int loff, int plane);
+void av1_predict_intra_block(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
+    TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
+    FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
+    uint8_t *dst, int dst_stride, int col_off, int row_off, int plane);
 
 // Mapping of interintra to intra mode for use in the intra component
 static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
@@ -56,8 +54,8 @@
 }
 
 static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {
-  return frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
-         cm->allow_intrabc;
+  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
+         cm->features.allow_intrabc;
 }
 
 static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
@@ -76,6 +74,40 @@
 
 extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
 
+static const int16_t dr_intra_derivative[90] = {
+  // More evenly spread out angles and limited to 10-bit
+  // Values that are 0 will never be used
+  //                    Approx angle
+  0,    0, 0,        //
+  1023, 0, 0,        // 3, ...
+  547,  0, 0,        // 6, ...
+  372,  0, 0, 0, 0,  // 9, ...
+  273,  0, 0,        // 14, ...
+  215,  0, 0,        // 17, ...
+  178,  0, 0,        // 20, ...
+  151,  0, 0,        // 23, ... (113 & 203 are base angles)
+  132,  0, 0,        // 26, ...
+  116,  0, 0,        // 29, ...
+  102,  0, 0, 0,     // 32, ...
+  90,   0, 0,        // 36, ...
+  80,   0, 0,        // 39, ...
+  71,   0, 0,        // 42, ...
+  64,   0, 0,        // 45, ... (45 & 135 are base angles)
+  57,   0, 0,        // 48, ...
+  51,   0, 0,        // 51, ...
+  45,   0, 0, 0,     // 54, ...
+  40,   0, 0,        // 58, ...
+  35,   0, 0,        // 61, ...
+  31,   0, 0,        // 64, ...
+  27,   0, 0,        // 67, ... (67 & 157 are base angles)
+  23,   0, 0,        // 70, ...
+  19,   0, 0,        // 73, ...
+  15,   0, 0, 0, 0,  // 76, ...
+  11,   0, 0,        // 81, ...
+  7,    0, 0,        // 84, ...
+  3,    0, 0,        // 87, ...
+};
+
 // Get the shift (up-scaled by 256) in X w.r.t a unit change in Y.
 // If angle > 0 && angle < 90, dx = -((int)(256 / t));
 // If angle > 90 && angle < 180, dx = (int)(256 / t);
diff --git a/libaom/av1/common/resize.c b/libaom/av1/common/resize.c
index d668eae..98f28f7 100644
--- a/libaom/av1/common/resize.c
+++ b/libaom/av1/common/resize.c
@@ -758,6 +758,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void highbd_interpolate_core(const uint16_t *const input, int in_length,
                                     uint16_t *output, int out_length, int bd,
                                     const int16_t *interp_filters,
@@ -1105,6 +1106,7 @@
     aom_free(tmp_right);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void av1_resize_frame420(const uint8_t *const y, int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
@@ -1142,6 +1144,7 @@
                    ouv_stride);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
                                 const uint8_t *const u, const uint8_t *const v,
                                 int uv_stride, int height, int width,
@@ -1183,6 +1186,7 @@
   av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
                           ouv_stride, bd);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                  YV12_BUFFER_CONFIG *dst, int bd,
@@ -1193,6 +1197,7 @@
   // the static analysis warnings.
   for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
     const int is_uv = i > 0;
+#if CONFIG_AV1_HIGHBITDEPTH
     if (src->flags & YV12_FLAG_HIGHBITDEPTH)
       av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
                               src->crop_widths[is_uv], src->strides[is_uv],
@@ -1203,6 +1208,13 @@
                        src->crop_widths[is_uv], src->strides[is_uv],
                        dst->buffers[i], dst->crop_heights[is_uv],
                        dst->crop_widths[is_uv], dst->strides[is_uv]);
+#else
+    (void)bd;
+    av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+                     src->crop_widths[is_uv], src->strides[is_uv],
+                     dst->buffers[i], dst->crop_heights[is_uv],
+                     dst->crop_widths[is_uv], dst->strides[is_uv]);
+#endif
   }
   aom_extend_frame_borders(dst, num_planes);
 }
@@ -1223,7 +1235,7 @@
   int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width,
                                           upscaled_plane_width, x_step_qn);
 
-  for (int j = 0; j < cm->tile_cols; j++) {
+  for (int j = 0; j < cm->tiles.cols; j++) {
     av1_tile_set_col(&tile_col, cm, j);
     // Determine the limits of this tile column in both the source
     // and destination images.
@@ -1236,7 +1248,7 @@
 
     const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR;
     int upscaled_x1;
-    if (j == cm->tile_cols - 1) {
+    if (j == cm->tiles.cols - 1) {
       // Note that we can't just use AOMMIN here - due to rounding,
       // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than
       // upscaled_plane_width.
@@ -1250,8 +1262,9 @@
     const int dst_width = upscaled_x1 - upscaled_x0;
 
     const int pad_left = (j == 0);
-    const int pad_right = (j == cm->tile_cols - 1);
+    const int pad_right = (j == cm->tiles.cols - 1);
 
+#if CONFIG_AV1_HIGHBITDEPTH
     if (cm->seq_params.use_highbitdepth)
       highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
                                     dst_ptr, rows, dst_width, dst_stride,
@@ -1261,7 +1274,11 @@
       upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
                              rows, dst_width, dst_stride, x_step_qn, x0_qn,
                              pad_left, pad_right);
-
+#else
+    upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows,
+                           dst_width, dst_stride, x_step_qn, x0_qn, pad_left,
+                           pad_right);
+#endif
     // Update the fractional pixel offset to prepare for the next tile column.
     x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
   }
@@ -1354,6 +1371,7 @@
   const int num_planes = av1_num_planes(cm);
   if (!av1_superres_scaled(cm)) return;
   const SequenceHeader *const seq_params = &cm->seq_params;
+  const int byte_alignment = cm->features.byte_alignment;
 
   YV12_BUFFER_CONFIG copy_buffer;
   memset(&copy_buffer, 0, sizeof(copy_buffer));
@@ -1364,7 +1382,7 @@
   if (aom_alloc_frame_buffer(
           &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+          AOM_BORDER_IN_PIXELS, byte_alignment))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate copy buffer for superres upscaling");
 
@@ -1383,21 +1401,26 @@
     aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
     void *cb_priv = pool->cb_priv;
 
+    lock_buffer_pool(pool);
     // Realloc with callback does not release the frame buffer - release first.
-    if (release_fb_cb(cb_priv, fb))
+    if (release_fb_cb(cb_priv, fb)) {
+      unlock_buffer_pool(pool);
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to free current frame buffer before superres upscaling");
-
+    }
     // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
     if (aom_realloc_frame_buffer(
             frame_to_show, cm->superres_upscaled_width,
             cm->superres_upscaled_height, seq_params->subsampling_x,
             seq_params->subsampling_y, seq_params->use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv))
+            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv)) {
+      unlock_buffer_pool(pool);
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to allocate current frame buffer for superres upscaling");
+    }
+    unlock_buffer_pool(pool);
   } else {
     // Make a copy of the config data for frame_to_show in copy_buffer
     copy_buffer_config(frame_to_show, &copy_buffer);
@@ -1408,7 +1431,7 @@
             frame_to_show, cm->superres_upscaled_width,
             cm->superres_upscaled_height, seq_params->subsampling_x,
             seq_params->subsampling_y, seq_params->use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+            AOM_BORDER_IN_PIXELS, byte_alignment))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate current frame buffer for superres upscaling");
diff --git a/libaom/av1/common/resize.h b/libaom/av1/common/resize.h
index 43bea58..8ee859e 100644
--- a/libaom/av1/common/resize.h
+++ b/libaom/av1/common/resize.h
@@ -14,7 +14,7 @@
 
 #include <stdio.h>
 #include "aom/aom_integer.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libaom/av1/common/restoration.c b/libaom/av1/common/restoration.c
index 9e472b8..a0f37ad 100644
--- a/libaom/av1/common/restoration.c
+++ b/libaom/av1/common/restoration.c
@@ -17,7 +17,7 @@
 #include "config/aom_scale_rtcd.h"
 
 #include "aom_mem/aom_mem.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/resize.h"
 #include "av1/common/restoration.h"
 #include "aom_dsp/aom_dsp_common.h"
@@ -28,7 +28,7 @@
 // The 's' values are calculated based on original 'r' and 'e' values in the
 // spec using GenSgrprojVtable().
 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
-const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
+const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
   { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
   { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
   { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
@@ -111,7 +111,7 @@
 
 static void GenSgrprojVtable() {
   for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
-    const sgr_params_type *const params = &sgr_params[i];
+    const sgr_params_type *const params = &av1_sgr_params[i];
     for (int j = 0; j < 2; ++j) {
       const int e = params->e[j];
       const int r = params->r[j];
@@ -153,6 +153,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void extend_frame_highbd(uint16_t *data, int width, int height,
                                 int stride, int border_horz, int border_vert) {
   uint16_t *data_p;
@@ -173,13 +174,24 @@
   }
 }
 
-void extend_frame(uint8_t *data, int width, int height, int stride,
-                  int border_horz, int border_vert, int highbd) {
-  if (highbd)
+static void copy_tile_highbd(int width, int height, const uint16_t *src,
+                             int src_stride, uint16_t *dst, int dst_stride) {
+  for (int i = 0; i < height; ++i)
+    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
+}
+#endif
+
+void av1_extend_frame(uint8_t *data, int width, int height, int stride,
+                      int border_horz, int border_vert, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (highbd) {
     extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                         border_horz, border_vert);
-  else
-    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
+    return;
+  }
+#endif
+  (void)highbd;
+  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
 }
 
 static void copy_tile_lowbd(int width, int height, const uint8_t *src,
@@ -188,19 +200,17 @@
     memcpy(dst + i * dst_stride, src + i * src_stride, width);
 }
 
-static void copy_tile_highbd(int width, int height, const uint16_t *src,
-                             int src_stride, uint16_t *dst, int dst_stride) {
-  for (int i = 0; i < height; ++i)
-    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
-}
-
 static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                       uint8_t *dst, int dst_stride, int highbd) {
-  if (highbd)
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (highbd) {
     copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                      CONVERT_TO_SHORTPTR(dst), dst_stride);
-  else
-    copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
+    return;
+  }
+#endif
+  (void)highbd;
+  copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
 }
 
 #define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
@@ -212,11 +222,10 @@
 // rules:
 //
 // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
-//   This extension is done by a call to extend_frame() at the start of the loop
-//   restoration process, so the value of copy_above/copy_below doesn't strictly
-//   matter.
-//   However, by setting *copy_above = *copy_below = 1 whenever loop filtering
-//   across tiles is disabled, we can allow
+//   This extension is done by a call to av1_extend_frame() at the start of the
+//   loop restoration process, so the value of copy_above/copy_below doesn't
+//   strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
+//   loop filtering across tiles is disabled, we can allow
 //   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
 //   data has always been copied, simplifying the behaviour at the left and
 //   right edges of tiles.
@@ -620,7 +629,7 @@
     assert(0 && "Invalid value of r in self-guided filter");
 }
 
-void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
   if (params->r[0] == 0) {
     xq[0] = 0;
     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
@@ -633,7 +642,7 @@
   }
 }
 
-const int32_t x_by_xplus1[256] = {
+const int32_t av1_x_by_xplus1[256] = {
   // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
   // instead of 0. See comments in selfguided_restoration_internal() for why
   1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
@@ -656,7 +665,7 @@
   256,
 };
 
-const int32_t one_by_x[MAX_NELEM] = {
+const int32_t av1_one_by_x[MAX_NELEM] = {
   4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
 };
@@ -665,7 +674,7 @@
                                           int dgd_stride, int bit_depth,
                                           int sgr_params_idx, int radius_idx,
                                           int pass, int32_t *A, int32_t *B) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -726,7 +735,7 @@
       // Further, in the calculation of B[k] below, if z == 0 and r == 2,
       // then A[k] "should be" 0. But then we can end up setting B[k] to a value
       // slightly above 2^(8 + bit depth), due to rounding in the value of
-      // one_by_x[25-1].
+      // av1_one_by_x[25-1].
       //
       // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
       // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
@@ -738,17 +747,17 @@
       // would be a bad idea, as that corresponds to the case where the image
       // is very variable, when we want to preserve the local pixel value as
       // much as possible.
-      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
+      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
 
       // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
-      // one_by_x[n - 1] = round(2^12 / n)
+      // av1_one_by_x[n - 1] = round(2^12 / n)
       // => the product here is < 2^(20 + bit_depth) <= 2^32,
       // and B[k] is set to a value < 2^(8 + bit depth)
-      // This holds even with the rounding in one_by_x and in the overall
+      // This holds even with the rounding in av1_one_by_x and in the overall
       // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
       B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                              (uint32_t)B[k] *
-                                             (uint32_t)one_by_x[n - 1],
+                                             (uint32_t)av1_one_by_x[n - 1],
                                          SGRPROJ_RECIP_BITS);
     }
   }
@@ -757,7 +766,7 @@
 static void selfguided_restoration_fast_internal(
     int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
     int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   // Adjusting the stride of A and B here appears to avoid bad cache effects,
@@ -883,7 +892,7 @@
     }
   }
 
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   // If params->r == 0 we skip the corresponding filter. We only allow one of
   // the radii to be 0, as having both equal to 0 would be equivalent to
   // skipping SGR entirely.
@@ -899,11 +908,11 @@
   return 0;
 }
 
-void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
-                                    int stride, int eps, const int *xqd,
-                                    uint8_t *dst8, int dst_stride,
-                                    int32_t *tmpbuf, int bit_depth,
-                                    int highbd) {
+void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
+                                        int height, int stride, int eps,
+                                        const int *xqd, uint8_t *dst8,
+                                        int dst_stride, int32_t *tmpbuf,
+                                        int bit_depth, int highbd) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -912,9 +921,9 @@
       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
   (void)ret;
   assert(!ret);
-  const sgr_params_type *const params = &sgr_params[eps];
+  const sgr_params_type *const params = &av1_sgr_params[eps];
   int xq[2];
-  decode_xq(xqd, xq, params);
+  av1_decode_xq(xqd, xq, params);
   for (int i = 0; i < height; ++i) {
     for (int j = 0; j < width; ++j) {
       const int k = i * width + j;
@@ -950,12 +959,13 @@
 
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, stripe_width - j);
-    apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
-                                 rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
-                                 dst + j, dst_stride, tmpbuf, bit_depth, 0);
+    av1_apply_selfguided_restoration(
+        src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+        rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
                                         int stripe_width, int stripe_height,
                                         int procunit_width, const uint8_t *src8,
@@ -984,11 +994,12 @@
                                          int32_t *tmpbuf, int bit_depth) {
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, stripe_width - j);
-    apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
-                                 rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
-                                 dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+    av1_apply_selfguided_restoration(
+        src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
                                   int stripe_width, int stripe_height,
@@ -996,12 +1007,18 @@
                                   int src_stride, uint8_t *dst, int dst_stride,
                                   int32_t *tmpbuf, int bit_depth);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 #define NUM_STRIPE_FILTERS 4
-
 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
   wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
   sgrproj_filter_stripe_highbd
 };
+#else
+#define NUM_STRIPE_FILTERS 2
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+  wiener_filter_stripe, sgrproj_filter_stripe
+};
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 // Filter one restoration unit
 void av1_loop_restoration_filter_unit(
@@ -1100,7 +1117,7 @@
   if (aom_realloc_frame_buffer(
           lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
           seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
-          cm->byte_alignment, NULL, NULL, NULL) < 0)
+          cm->features.byte_alignment, NULL, NULL, NULL) < 0)
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate restoration dst buffer");
 
@@ -1120,9 +1137,9 @@
     const int plane_height = frame->crop_heights[is_uv];
     FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
 
-    extend_frame(frame->buffers[plane], plane_width, plane_height,
-                 frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER,
-                 highbd);
+    av1_extend_frame(frame->buffers[plane], plane_width, plane_height,
+                     frame->strides[is_uv], RESTORATION_BORDER,
+                     RESTORATION_BORDER, highbd);
 
     lr_plane_ctxt->rsi = rsi;
     lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
@@ -1146,7 +1163,7 @@
   static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
                                          aom_yv12_partial_coloc_copy_u,
                                          aom_yv12_partial_coloc_copy_v };
-
+  assert(num_planes <= 3);
   for (int plane = 0; plane < num_planes; ++plane) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
     AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
@@ -1173,7 +1190,7 @@
 void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                        AV1_COMMON *cm, int optimized_lr,
                                        void *lr_ctxt) {
-  assert(!cm->all_lossless);
+  assert(!cm->features.all_lossless);
   const int num_planes = av1_num_planes(cm);
 
   AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
@@ -1301,7 +1318,7 @@
   if (bsize != cm->seq_params.sb_size) return 0;
   if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
 
-  assert(!cm->all_lossless);
+  assert(!cm->features.all_lossless);
 
   const int is_uv = plane > 0;
 
diff --git a/libaom/av1/common/restoration.h b/libaom/av1/common/restoration.h
index 6d6ba37..3b80dd5 100644
--- a/libaom/av1/common/restoration.h
+++ b/libaom/av1/common/restoration.h
@@ -122,6 +122,7 @@
 // If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
 // chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
 #define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN_REDUCED (WIENER_WIN - 2)
 #define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
 
 #define WIENER_FILT_PREC_BITS 7
@@ -277,18 +278,18 @@
   YV12_BUFFER_CONFIG *dst;
 } AV1LrStruct;
 
-extern const sgr_params_type sgr_params[SGRPROJ_PARAMS];
+extern const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS];
 extern int sgrproj_mtable[SGRPROJ_PARAMS][2];
-extern const int32_t x_by_xplus1[256];
-extern const int32_t one_by_x[MAX_NELEM];
+extern const int32_t av1_x_by_xplus1[256];
+extern const int32_t av1_one_by_x[MAX_NELEM];
 
 void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi,
                                   int is_uv);
 void av1_free_restoration_struct(RestorationInfo *rst_info);
 
-void extend_frame(uint8_t *data, int width, int height, int stride,
-                  int border_horz, int border_vert, int highbd);
-void decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
+void av1_extend_frame(uint8_t *data, int width, int height, int stride,
+                      int border_horz, int border_vert, int highbd);
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
 
 // Filter a single loop restoration unit.
 //
diff --git a/libaom/av1/common/scale.c b/libaom/av1/common/scale.c
index bac7bd9..3b14c0a 100644
--- a/libaom/av1/common/scale.c
+++ b/libaom/av1/common/scale.c
@@ -37,7 +37,7 @@
 // Note: Expect val to be in q4 precision
 static int unscaled_value(int val, const struct scale_factors *sf) {
   (void)sf;
-  return val << SCALE_EXTRA_BITS;
+  return val * (1 << SCALE_EXTRA_BITS);
 }
 
 static int get_fixed_point_scale_factor(int other_size, int this_size) {
@@ -88,39 +88,41 @@
   // AV1 convolve functions
   // Special case convolve functions should produce the same result as
   // av1_convolve_2d.
-  // subpel_x_q4 == 0 && subpel_y_q4 == 0
+  // subpel_x_qn == 0 && subpel_y_qn == 0
   sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
-  // subpel_x_q4 == 0
+  // subpel_x_qn == 0
   sf->convolve[0][1][0] = av1_convolve_y_sr;
-  // subpel_y_q4 == 0
+  // subpel_y_qn == 0
   sf->convolve[1][0][0] = av1_convolve_x_sr;
-  // subpel_x_q4 != 0 && subpel_y_q4 != 0
+  // subpel_x_qn != 0 && subpel_y_qn != 0
   sf->convolve[1][1][0] = av1_convolve_2d_sr;
-  // subpel_x_q4 == 0 && subpel_y_q4 == 0
+  // subpel_x_qn == 0 && subpel_y_qn == 0
   sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy;
-  // subpel_x_q4 == 0
+  // subpel_x_qn == 0
   sf->convolve[0][1][1] = av1_dist_wtd_convolve_y;
-  // subpel_y_q4 == 0
+  // subpel_y_qn == 0
   sf->convolve[1][0][1] = av1_dist_wtd_convolve_x;
-  // subpel_x_q4 != 0 && subpel_y_q4 != 0
+  // subpel_x_qn != 0 && subpel_y_qn != 0
   sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d;
+#if CONFIG_AV1_HIGHBITDEPTH
   // AV1 High BD convolve functions
   // Special case convolve functions should produce the same result as
   // av1_highbd_convolve_2d.
-  // subpel_x_q4 == 0 && subpel_y_q4 == 0
+  // subpel_x_qn == 0 && subpel_y_qn == 0
   sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr;
-  // subpel_x_q4 == 0
+  // subpel_x_qn == 0
   sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr;
-  // subpel_y_q4 == 0
+  // subpel_y_qn == 0
   sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr;
-  // subpel_x_q4 != 0 && subpel_y_q4 != 0
+  // subpel_x_qn != 0 && subpel_y_qn != 0
   sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
-  // subpel_x_q4 == 0 && subpel_y_q4 == 0
+  // subpel_x_qn == 0 && subpel_y_qn == 0
   sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy;
-  // subpel_x_q4 == 0
+  // subpel_x_qn == 0
   sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y;
-  // subpel_y_q4 == 0
+  // subpel_y_qn == 0
   sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x;
-  // subpel_x_q4 != 0 && subpel_y_q4 != 0
+  // subpel_x_qn != 0 && subpel_y_qn != 0
   sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d;
+#endif
 }
diff --git a/libaom/av1/common/scale.h b/libaom/av1/common/scale.h
index 748e958..16b40bd 100644
--- a/libaom/av1/common/scale.h
+++ b/libaom/av1/common/scale.h
@@ -45,11 +45,13 @@
                                        int other_h, int this_w, int this_h);
 
 static INLINE int av1_is_valid_scale(const struct scale_factors *sf) {
+  assert(sf != NULL);
   return sf->x_scale_fp != REF_INVALID_SCALE &&
          sf->y_scale_fp != REF_INVALID_SCALE;
 }
 
 static INLINE int av1_is_scaled(const struct scale_factors *sf) {
+  assert(sf != NULL);
   return av1_is_valid_scale(sf) &&
          (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
 }
diff --git a/libaom/av1/common/scan.c b/libaom/av1/common/scan.c
index 31a787b..c1d4f35 100644
--- a/libaom/av1/common/scan.c
+++ b/libaom/av1/common/scan.c
@@ -14,9 +14,9 @@
 #include "av1/common/common_data.h"
 #include "av1/common/scan.h"
 
-DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
-  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
-};
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_4x4[16]) = { 0, 1,  4,  8,  5, 2,  3,  6,
+                                          9, 12, 13, 10, 7, 11, 14, 15 };
 
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
   0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
@@ -839,1546 +839,9 @@
   927,  958,  989,  1020, 1021, 990, 959,  991,  1022, 1023
 };
 
-// Neighborhood 2-tuples for various scans and blocksizes,
-// in {top, left} order for each position in corresponding scan order.
 DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0,  0, 4, 4, 1, 4, 1,  1,  2,  2,  2,  5, 5,
-  8, 8, 8, 9, 12, 6, 9, 3, 6, 7, 10, 10, 13, 11, 14, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 4, 4,  8,  8, 0, 0, 1, 4, 5,  8,  9,  12, 1,
-  1, 2, 5, 6, 9, 10, 13, 2, 2, 3, 6, 7, 10, 11, 14, 0,  0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 1, 1, 2,  2, 0, 0, 1,  4,  2,  5,  3,  6, 4,
-  4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  0,  0,  1,  4,  1,  1,  4,  4,  2,  5,  5,  8,  6,
-  9,  2,  2,  8,  8,  3,  6,  9,  12, 7,  10, 10, 13, 12, 12, 13, 16,
-  11, 14, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
-  24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
-  0, 0, 0,  0,  4,  4,  8,  8,  12, 12, 16, 16, 20, 20, 24, 24, 0,
-  0, 1, 4,  5,  8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 1,  1,
-  2, 5, 6,  9,  10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 2,  2,  3,
-  6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  1,  1,  2,  2,  0,  0,  1,  4,  2,  5,  3,  6,  4,
-  4,  5,  8,  6,  9,  7,  10, 8,  8,  9,  12, 10, 13, 11, 14, 12, 12,
-  13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
-  24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  0,  0, 1,  8,  1,  1,  8,  8,  2,  9,  9, 16, 10,
-  17, 2,  2,  16, 16, 3, 10, 17, 24, 11, 18, 18, 25, 3,  3, 4,  11,
-  19, 26, 12, 19, 4,  4, 20, 27, 5,  12, 13, 20, 21, 28, 5, 5,  6,
-  13, 14, 21, 22, 29, 6, 6,  7,  14, 15, 22, 23, 30, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  8,  8,  16, 16, 0,  0,  1,  8,  9,  16, 17, 24, 1,
-  1,  2,  9,  10, 17, 18, 25, 2,  2,  3,  10, 11, 18, 19, 26, 3,  3,
-  4,  11, 12, 19, 20, 27, 4,  4,  5,  12, 13, 20, 21, 28, 5,  5,  6,
-  13, 14, 21, 22, 29, 6,  6,  7,  14, 15, 22, 23, 30, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  0,
-  0,  1,  8,  2,  9,  3,  10, 4,  11, 5,  12, 6,  13, 7,  14, 8,  8,
-  9,  16, 10, 17, 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17,
-  24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29, 23, 30, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  0,  0,  1,  4,  1,  1,  4,  4,  2,  5,  5,  8,  6,  9,  2,
-  2,  8,  8,  3,  6,  9,  12, 7,  10, 10, 13, 12, 12, 13, 16, 11, 14, 14, 17,
-  15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24,
-  24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36,
-  34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43,
-  46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52,
-  53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  0,  0,  1,  16, 1,  1,  16, 16, 2,  17, 17, 32, 18, 33, 2,
-  2,  32, 32, 3,  18, 33, 48, 19, 34, 34, 49, 3,  3,  4,  19, 35, 50, 20, 35,
-  4,  4,  36, 51, 5,  20, 21, 36, 37, 52, 5,  5,  6,  21, 22, 37, 38, 53, 6,
-  6,  7,  22, 23, 38, 39, 54, 7,  7,  8,  23, 24, 39, 40, 55, 8,  8,  9,  24,
-  25, 40, 41, 56, 9,  9,  10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
-  58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
-  14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  1,  1,  2,  2,  0,  0,  1,  4,  2,  5,  3,  6,  4,  4,  5,
-  8,  6,  9,  7,  10, 8,  8,  9,  12, 10, 13, 11, 14, 12, 12, 13, 16, 14, 17,
-  15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24,
-  24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36,
-  34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43,
-  46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52,
-  53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,
-  8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 0,  0,  1,  16, 2,  17,
-  3,  18, 4,  19, 5,  20, 6,  21, 7,  22, 8,  23, 9,  24, 10, 25, 11, 26, 12,
-  27, 13, 28, 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36,
-  22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31,
-  46, 32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54, 40, 55,
-  41, 56, 42, 57, 43, 58, 44, 59, 45, 60, 46, 61, 47, 62, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  4,  4,  8,  8,  12, 12, 16, 16, 20, 20, 24, 24, 28, 28, 32,
-  32, 36, 36, 40, 40, 44, 44, 48, 48, 52, 52, 56, 56, 0,  0,  1,  4,  5,  8,
-  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, 33, 36, 37, 40, 41, 44, 45,
-  48, 49, 52, 53, 56, 57, 60, 1,  1,  2,  5,  6,  9,  10, 13, 14, 17, 18, 21,
-  22, 25, 26, 29, 30, 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58,
-  61, 2,  2,  3,  6,  7,  10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, 34,
-  35, 38, 39, 42, 43, 46, 47, 50, 51, 54, 55, 58, 59, 62, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  16, 16, 32, 32, 0,  0,  1,  16, 17, 32, 33, 48, 1,  1,  2,
-  17, 18, 33, 34, 49, 2,  2,  3,  18, 19, 34, 35, 50, 3,  3,  4,  19, 20, 35,
-  36, 51, 4,  4,  5,  20, 21, 36, 37, 52, 5,  5,  6,  21, 22, 37, 38, 53, 6,
-  6,  7,  22, 23, 38, 39, 54, 7,  7,  8,  23, 24, 39, 40, 55, 8,  8,  9,  24,
-  25, 40, 41, 56, 9,  9,  10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
-  58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
-  14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   0,   0,   1,   1,   1,   8,   8,   8,   2,   2,   2,
-  9,   9,   16,  16,  16,  3,   3,   3,   10,  10,  17,  17,  24,  24,  24,
-  4,   4,   4,   11,  11,  18,  18,  25,  25,  32,  32,  32,  5,   5,   5,
-  12,  12,  19,  19,  26,  26,  33,  33,  40,  40,  40,  6,   6,   6,   13,
-  13,  20,  20,  27,  27,  34,  34,  41,  41,  48,  48,  48,  7,   14,  14,
-  21,  21,  28,  28,  35,  35,  42,  42,  49,  49,  56,  56,  56,  15,  22,
-  22,  29,  29,  36,  36,  43,  43,  50,  50,  57,  57,  64,  64,  64,  23,
-  30,  30,  37,  37,  44,  44,  51,  51,  58,  58,  65,  65,  72,  72,  72,
-  31,  38,  38,  45,  45,  52,  52,  59,  59,  66,  66,  73,  73,  80,  80,
-  80,  39,  46,  46,  53,  53,  60,  60,  67,  67,  74,  74,  81,  81,  88,
-  88,  88,  47,  54,  54,  61,  61,  68,  68,  75,  75,  82,  82,  89,  89,
-  96,  96,  96,  55,  62,  62,  69,  69,  76,  76,  83,  83,  90,  90,  97,
-  97,  104, 104, 104, 63,  70,  70,  77,  77,  84,  84,  91,  91,  98,  98,
-  105, 105, 112, 112, 112, 71,  78,  78,  85,  85,  92,  92,  99,  99,  106,
-  106, 113, 113, 120, 120, 120, 79,  86,  86,  93,  93,  100, 100, 107, 107,
-  114, 114, 121, 121, 128, 128, 128, 87,  94,  94,  101, 101, 108, 108, 115,
-  115, 122, 122, 129, 129, 136, 136, 136, 95,  102, 102, 109, 109, 116, 116,
-  123, 123, 130, 130, 137, 137, 144, 144, 144, 103, 110, 110, 117, 117, 124,
-  124, 131, 131, 138, 138, 145, 145, 152, 152, 152, 111, 118, 118, 125, 125,
-  132, 132, 139, 139, 146, 146, 153, 153, 160, 160, 160, 119, 126, 126, 133,
-  133, 140, 140, 147, 147, 154, 154, 161, 161, 168, 168, 168, 127, 134, 134,
-  141, 141, 148, 148, 155, 155, 162, 162, 169, 169, 176, 176, 176, 135, 142,
-  142, 149, 149, 156, 156, 163, 163, 170, 170, 177, 177, 184, 184, 184, 143,
-  150, 150, 157, 157, 164, 164, 171, 171, 178, 178, 185, 185, 192, 192, 192,
-  151, 158, 158, 165, 165, 172, 172, 179, 179, 186, 186, 193, 193, 200, 200,
-  200, 159, 166, 166, 173, 173, 180, 180, 187, 187, 194, 194, 201, 201, 208,
-  208, 208, 167, 174, 174, 181, 181, 188, 188, 195, 195, 202, 202, 209, 209,
-  216, 216, 216, 175, 182, 182, 189, 189, 196, 196, 203, 203, 210, 210, 217,
-  217, 224, 224, 224, 183, 190, 190, 197, 197, 204, 204, 211, 211, 218, 218,
-  225, 225, 232, 232, 232, 191, 198, 198, 205, 205, 212, 212, 219, 219, 226,
-  226, 233, 233, 240, 240, 240, 199, 206, 206, 213, 213, 220, 220, 227, 227,
-  234, 234, 241, 241, 248, 207, 214, 214, 221, 221, 228, 228, 235, 235, 242,
-  242, 249, 215, 222, 222, 229, 229, 236, 236, 243, 243, 250, 223, 230, 230,
-  237, 237, 244, 244, 251, 231, 238, 238, 245, 245, 252, 239, 246, 246, 253,
-  247, 254, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   0,   0,   1,   1,   1,   32,  32,  32,  2,   2,   2,
-  33,  33,  64,  64,  64,  3,   3,   3,   34,  34,  65,  65,  96,  96,  96,
-  4,   4,   4,   35,  35,  66,  66,  97,  97,  128, 128, 128, 5,   5,   5,
-  36,  36,  67,  67,  98,  98,  129, 129, 160, 160, 160, 6,   6,   6,   37,
-  37,  68,  68,  99,  99,  130, 130, 161, 161, 192, 192, 192, 7,   7,   7,
-  38,  38,  69,  69,  100, 100, 131, 131, 162, 162, 193, 193, 224, 8,   8,
-  8,   39,  39,  70,  70,  101, 101, 132, 132, 163, 163, 194, 194, 225, 9,
-  9,   9,   40,  40,  71,  71,  102, 102, 133, 133, 164, 164, 195, 195, 226,
-  10,  10,  10,  41,  41,  72,  72,  103, 103, 134, 134, 165, 165, 196, 196,
-  227, 11,  11,  11,  42,  42,  73,  73,  104, 104, 135, 135, 166, 166, 197,
-  197, 228, 12,  12,  12,  43,  43,  74,  74,  105, 105, 136, 136, 167, 167,
-  198, 198, 229, 13,  13,  13,  44,  44,  75,  75,  106, 106, 137, 137, 168,
-  168, 199, 199, 230, 14,  14,  14,  45,  45,  76,  76,  107, 107, 138, 138,
-  169, 169, 200, 200, 231, 15,  15,  15,  46,  46,  77,  77,  108, 108, 139,
-  139, 170, 170, 201, 201, 232, 16,  16,  16,  47,  47,  78,  78,  109, 109,
-  140, 140, 171, 171, 202, 202, 233, 17,  17,  17,  48,  48,  79,  79,  110,
-  110, 141, 141, 172, 172, 203, 203, 234, 18,  18,  18,  49,  49,  80,  80,
-  111, 111, 142, 142, 173, 173, 204, 204, 235, 19,  19,  19,  50,  50,  81,
-  81,  112, 112, 143, 143, 174, 174, 205, 205, 236, 20,  20,  20,  51,  51,
-  82,  82,  113, 113, 144, 144, 175, 175, 206, 206, 237, 21,  21,  21,  52,
-  52,  83,  83,  114, 114, 145, 145, 176, 176, 207, 207, 238, 22,  22,  22,
-  53,  53,  84,  84,  115, 115, 146, 146, 177, 177, 208, 208, 239, 23,  23,
-  23,  54,  54,  85,  85,  116, 116, 147, 147, 178, 178, 209, 209, 240, 24,
-  24,  24,  55,  55,  86,  86,  117, 117, 148, 148, 179, 179, 210, 210, 241,
-  25,  25,  25,  56,  56,  87,  87,  118, 118, 149, 149, 180, 180, 211, 211,
-  242, 26,  26,  26,  57,  57,  88,  88,  119, 119, 150, 150, 181, 181, 212,
-  212, 243, 27,  27,  27,  58,  58,  89,  89,  120, 120, 151, 151, 182, 182,
-  213, 213, 244, 28,  28,  28,  59,  59,  90,  90,  121, 121, 152, 152, 183,
-  183, 214, 214, 245, 29,  29,  29,  60,  60,  91,  91,  122, 122, 153, 153,
-  184, 184, 215, 215, 246, 30,  30,  30,  61,  61,  92,  92,  123, 123, 154,
-  154, 185, 185, 216, 216, 247, 31,  62,  62,  93,  93,  124, 124, 155, 155,
-  186, 186, 217, 217, 248, 63,  94,  94,  125, 125, 156, 156, 187, 187, 218,
-  218, 249, 95,  126, 126, 157, 157, 188, 188, 219, 219, 250, 127, 158, 158,
-  189, 189, 220, 220, 251, 159, 190, 190, 221, 221, 252, 191, 222, 222, 253,
-  223, 254, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,
-  6,   0,   0,   1,   8,   2,   9,   3,   10,  4,   11,  5,   12,  6,   13,
-  7,   14,  8,   8,   9,   16,  10,  17,  11,  18,  12,  19,  13,  20,  14,
-  21,  15,  22,  16,  16,  17,  24,  18,  25,  19,  26,  20,  27,  21,  28,
-  22,  29,  23,  30,  24,  24,  25,  32,  26,  33,  27,  34,  28,  35,  29,
-  36,  30,  37,  31,  38,  32,  32,  33,  40,  34,  41,  35,  42,  36,  43,
-  37,  44,  38,  45,  39,  46,  40,  40,  41,  48,  42,  49,  43,  50,  44,
-  51,  45,  52,  46,  53,  47,  54,  48,  48,  49,  56,  50,  57,  51,  58,
-  52,  59,  53,  60,  54,  61,  55,  62,  56,  56,  57,  64,  58,  65,  59,
-  66,  60,  67,  61,  68,  62,  69,  63,  70,  64,  64,  65,  72,  66,  73,
-  67,  74,  68,  75,  69,  76,  70,  77,  71,  78,  72,  72,  73,  80,  74,
-  81,  75,  82,  76,  83,  77,  84,  78,  85,  79,  86,  80,  80,  81,  88,
-  82,  89,  83,  90,  84,  91,  85,  92,  86,  93,  87,  94,  88,  88,  89,
-  96,  90,  97,  91,  98,  92,  99,  93,  100, 94,  101, 95,  102, 96,  96,
-  97,  104, 98,  105, 99,  106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
-  104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
-  112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
-  126, 120, 120, 121, 128, 122, 129, 123, 130, 124, 131, 125, 132, 126, 133,
-  127, 134, 128, 128, 129, 136, 130, 137, 131, 138, 132, 139, 133, 140, 134,
-  141, 135, 142, 136, 136, 137, 144, 138, 145, 139, 146, 140, 147, 141, 148,
-  142, 149, 143, 150, 144, 144, 145, 152, 146, 153, 147, 154, 148, 155, 149,
-  156, 150, 157, 151, 158, 152, 152, 153, 160, 154, 161, 155, 162, 156, 163,
-  157, 164, 158, 165, 159, 166, 160, 160, 161, 168, 162, 169, 163, 170, 164,
-  171, 165, 172, 166, 173, 167, 174, 168, 168, 169, 176, 170, 177, 171, 178,
-  172, 179, 173, 180, 174, 181, 175, 182, 176, 176, 177, 184, 178, 185, 179,
-  186, 180, 187, 181, 188, 182, 189, 183, 190, 184, 184, 185, 192, 186, 193,
-  187, 194, 188, 195, 189, 196, 190, 197, 191, 198, 192, 192, 193, 200, 194,
-  201, 195, 202, 196, 203, 197, 204, 198, 205, 199, 206, 200, 200, 201, 208,
-  202, 209, 203, 210, 204, 211, 205, 212, 206, 213, 207, 214, 208, 208, 209,
-  216, 210, 217, 211, 218, 212, 219, 213, 220, 214, 221, 215, 222, 216, 216,
-  217, 224, 218, 225, 219, 226, 220, 227, 221, 228, 222, 229, 223, 230, 224,
-  224, 225, 232, 226, 233, 227, 234, 228, 235, 229, 236, 230, 237, 231, 238,
-  232, 232, 233, 240, 234, 241, 235, 242, 236, 243, 237, 244, 238, 245, 239,
-  246, 240, 240, 241, 248, 242, 249, 243, 250, 244, 251, 245, 252, 246, 253,
-  247, 254, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,
-  6,   7,   7,   8,   8,   9,   9,   10,  10,  11,  11,  12,  12,  13,  13,
-  14,  14,  15,  15,  16,  16,  17,  17,  18,  18,  19,  19,  20,  20,  21,
-  21,  22,  22,  23,  23,  24,  24,  25,  25,  26,  26,  27,  27,  28,  28,
-  29,  29,  30,  30,  0,   0,   1,   32,  2,   33,  3,   34,  4,   35,  5,
-  36,  6,   37,  7,   38,  8,   39,  9,   40,  10,  41,  11,  42,  12,  43,
-  13,  44,  14,  45,  15,  46,  16,  47,  17,  48,  18,  49,  19,  50,  20,
-  51,  21,  52,  22,  53,  23,  54,  24,  55,  25,  56,  26,  57,  27,  58,
-  28,  59,  29,  60,  30,  61,  31,  62,  32,  32,  33,  64,  34,  65,  35,
-  66,  36,  67,  37,  68,  38,  69,  39,  70,  40,  71,  41,  72,  42,  73,
-  43,  74,  44,  75,  45,  76,  46,  77,  47,  78,  48,  79,  49,  80,  50,
-  81,  51,  82,  52,  83,  53,  84,  54,  85,  55,  86,  56,  87,  57,  88,
-  58,  89,  59,  90,  60,  91,  61,  92,  62,  93,  63,  94,  64,  64,  65,
-  96,  66,  97,  67,  98,  68,  99,  69,  100, 70,  101, 71,  102, 72,  103,
-  73,  104, 74,  105, 75,  106, 76,  107, 77,  108, 78,  109, 79,  110, 80,
-  111, 81,  112, 82,  113, 83,  114, 84,  115, 85,  116, 86,  117, 87,  118,
-  88,  119, 89,  120, 90,  121, 91,  122, 92,  123, 93,  124, 94,  125, 95,
-  126, 96,  96,  97,  128, 98,  129, 99,  130, 100, 131, 101, 132, 102, 133,
-  103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
-  141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
-  118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
-  156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
-  133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
-  171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
-  148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
-  186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
-  163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
-  201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
-  178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
-  216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
-  193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
-  231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
-  208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
-  246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
-  223, 254, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   8,   8,   16,  16,  24,  24,  32,  32,  40,  40,  48,
-  48,  56,  56,  64,  64,  72,  72,  80,  80,  88,  88,  96,  96,  104, 104,
-  112, 112, 120, 120, 128, 128, 136, 136, 144, 144, 152, 152, 160, 160, 168,
-  168, 176, 176, 184, 184, 192, 192, 200, 200, 208, 208, 216, 216, 224, 224,
-  232, 232, 240, 240, 0,   0,   1,   8,   9,   16,  17,  24,  25,  32,  33,
-  40,  41,  48,  49,  56,  57,  64,  65,  72,  73,  80,  81,  88,  89,  96,
-  97,  104, 105, 112, 113, 120, 121, 128, 129, 136, 137, 144, 145, 152, 153,
-  160, 161, 168, 169, 176, 177, 184, 185, 192, 193, 200, 201, 208, 209, 216,
-  217, 224, 225, 232, 233, 240, 241, 248, 1,   1,   2,   9,   10,  17,  18,
-  25,  26,  33,  34,  41,  42,  49,  50,  57,  58,  65,  66,  73,  74,  81,
-  82,  89,  90,  97,  98,  105, 106, 113, 114, 121, 122, 129, 130, 137, 138,
-  145, 146, 153, 154, 161, 162, 169, 170, 177, 178, 185, 186, 193, 194, 201,
-  202, 209, 210, 217, 218, 225, 226, 233, 234, 241, 242, 249, 2,   2,   3,
-  10,  11,  18,  19,  26,  27,  34,  35,  42,  43,  50,  51,  58,  59,  66,
-  67,  74,  75,  82,  83,  90,  91,  98,  99,  106, 107, 114, 115, 122, 123,
-  130, 131, 138, 139, 146, 147, 154, 155, 162, 163, 170, 171, 178, 179, 186,
-  187, 194, 195, 202, 203, 210, 211, 218, 219, 226, 227, 234, 235, 242, 243,
-  250, 3,   3,   4,   11,  12,  19,  20,  27,  28,  35,  36,  43,  44,  51,
-  52,  59,  60,  67,  68,  75,  76,  83,  84,  91,  92,  99,  100, 107, 108,
-  115, 116, 123, 124, 131, 132, 139, 140, 147, 148, 155, 156, 163, 164, 171,
-  172, 179, 180, 187, 188, 195, 196, 203, 204, 211, 212, 219, 220, 227, 228,
-  235, 236, 243, 244, 251, 4,   4,   5,   12,  13,  20,  21,  28,  29,  36,
-  37,  44,  45,  52,  53,  60,  61,  68,  69,  76,  77,  84,  85,  92,  93,
-  100, 101, 108, 109, 116, 117, 124, 125, 132, 133, 140, 141, 148, 149, 156,
-  157, 164, 165, 172, 173, 180, 181, 188, 189, 196, 197, 204, 205, 212, 213,
-  220, 221, 228, 229, 236, 237, 244, 245, 252, 5,   5,   6,   13,  14,  21,
-  22,  29,  30,  37,  38,  45,  46,  53,  54,  61,  62,  69,  70,  77,  78,
-  85,  86,  93,  94,  101, 102, 109, 110, 117, 118, 125, 126, 133, 134, 141,
-  142, 149, 150, 157, 158, 165, 166, 173, 174, 181, 182, 189, 190, 197, 198,
-  205, 206, 213, 214, 221, 222, 229, 230, 237, 238, 245, 246, 253, 6,   6,
-  7,   14,  15,  22,  23,  30,  31,  38,  39,  46,  47,  54,  55,  62,  63,
-  70,  71,  78,  79,  86,  87,  94,  95,  102, 103, 110, 111, 118, 119, 126,
-  127, 134, 135, 142, 143, 150, 151, 158, 159, 166, 167, 174, 175, 182, 183,
-  190, 191, 198, 199, 206, 207, 214, 215, 222, 223, 230, 231, 238, 239, 246,
-  247, 254, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  32, 32, 64, 64,  96,  96,  128, 128, 160, 160, 192, 192,
-  0,  0,  1,  32, 33, 64, 65, 96,  97,  128, 129, 160, 161, 192, 193, 224,
-  1,  1,  2,  33, 34, 65, 66, 97,  98,  129, 130, 161, 162, 193, 194, 225,
-  2,  2,  3,  34, 35, 66, 67, 98,  99,  130, 131, 162, 163, 194, 195, 226,
-  3,  3,  4,  35, 36, 67, 68, 99,  100, 131, 132, 163, 164, 195, 196, 227,
-  4,  4,  5,  36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
-  5,  5,  6,  37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229,
-  6,  6,  7,  38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
-  7,  7,  8,  39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200, 231,
-  8,  8,  9,  40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201, 232,
-  9,  9,  10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233,
-  10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202, 203, 234,
-  11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
-  12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236,
-  13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237,
-  14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, 238,
-  15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207, 208, 239,
-  16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240,
-  17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209, 210, 241,
-  18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
-  19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243,
-  20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244,
-  21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
-  22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, 215, 246,
-  23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247,
-  24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216, 217, 248,
-  25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
-  26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218, 219, 250,
-  27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251,
-  28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252,
-  29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, 253,
-  30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254,
-  0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  8,  8,  16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0,  0,  1,
-  8,  9,  16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1,  1,  2,  9,  10, 17,
-  18, 25, 26, 33, 34, 41, 42, 49, 50, 57, 2,  2,  3,  10, 11, 18, 19, 26, 27,
-  34, 35, 42, 43, 50, 51, 58, 3,  3,  4,  11, 12, 19, 20, 27, 28, 35, 36, 43,
-  44, 51, 52, 59, 4,  4,  5,  12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53,
-  60, 5,  5,  6,  13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6,  6,
-  7,  14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0,  0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  0,  0,  1,
-  8,  2,  9,  3,  10, 4,  11, 5,  12, 6,  13, 7,  14, 8,  8,  9,  16, 10, 17,
-  11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20,
-  27, 21, 28, 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36,
-  30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39,
-  46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48,
-  49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0,  0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  0,  0,  8,  8,  1,  8,  1,  1,  2,  2,  2,  9,  9,  16, 16,
-  16, 24, 24, 17, 24, 10, 17, 3,  10, 3,  3,  4,  4,  4,  11, 11, 18, 18, 25,
-  25, 32, 32, 32, 40, 40, 33, 40, 26, 33, 19, 26, 12, 19, 5,  12, 5,  5,  6,
-  6,  6,  13, 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 49, 56, 42, 49,
-  35, 42, 28, 35, 21, 28, 14, 21, 7,  14, 15, 22, 22, 29, 29, 36, 36, 43, 43,
-  50, 50, 57, 51, 58, 44, 51, 37, 44, 30, 37, 23, 30, 31, 38, 38, 45, 45, 52,
-  52, 59, 53, 60, 46, 53, 39, 46, 47, 54, 54, 61, 55, 62, 0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   0,   0,   1,   1,   1,   8,   8,   8,   2,   2,   2,
-  9,   9,   16,  16,  16,  3,   3,   3,   10,  10,  17,  17,  24,  24,  24,
-  4,   4,   4,   11,  11,  18,  18,  25,  25,  32,  32,  32,  5,   5,   5,
-  12,  12,  19,  19,  26,  26,  33,  33,  40,  40,  40,  6,   6,   6,   13,
-  13,  20,  20,  27,  27,  34,  34,  41,  41,  48,  48,  48,  7,   14,  14,
-  21,  21,  28,  28,  35,  35,  42,  42,  49,  49,  56,  56,  56,  15,  22,
-  22,  29,  29,  36,  36,  43,  43,  50,  50,  57,  57,  64,  64,  64,  23,
-  30,  30,  37,  37,  44,  44,  51,  51,  58,  58,  65,  65,  72,  72,  72,
-  31,  38,  38,  45,  45,  52,  52,  59,  59,  66,  66,  73,  73,  80,  80,
-  80,  39,  46,  46,  53,  53,  60,  60,  67,  67,  74,  74,  81,  81,  88,
-  88,  88,  47,  54,  54,  61,  61,  68,  68,  75,  75,  82,  82,  89,  89,
-  96,  96,  96,  55,  62,  62,  69,  69,  76,  76,  83,  83,  90,  90,  97,
-  97,  104, 104, 104, 63,  70,  70,  77,  77,  84,  84,  91,  91,  98,  98,
-  105, 105, 112, 112, 112, 71,  78,  78,  85,  85,  92,  92,  99,  99,  106,
-  106, 113, 113, 120, 79,  86,  86,  93,  93,  100, 100, 107, 107, 114, 114,
-  121, 87,  94,  94,  101, 101, 108, 108, 115, 115, 122, 95,  102, 102, 109,
-  109, 116, 116, 123, 103, 110, 110, 117, 117, 124, 111, 118, 118, 125, 119,
-  126, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   0,   0,  1,  1,   1,   16,  16,  16,  2,   2,   2,
-  17,  17,  32,  32,  32,  3,  3,  3,   18,  18,  33,  33,  48,  48,  48,
-  4,   4,   4,   19,  19,  34, 34, 49,  49,  64,  64,  64,  5,   5,   5,
-  20,  20,  35,  35,  50,  50, 65, 65,  80,  80,  80,  6,   6,   6,   21,
-  21,  36,  36,  51,  51,  66, 66, 81,  81,  96,  96,  96,  7,   7,   7,
-  22,  22,  37,  37,  52,  52, 67, 67,  82,  82,  97,  97,  112, 8,   8,
-  8,   23,  23,  38,  38,  53, 53, 68,  68,  83,  83,  98,  98,  113, 9,
-  9,   9,   24,  24,  39,  39, 54, 54,  69,  69,  84,  84,  99,  99,  114,
-  10,  10,  10,  25,  25,  40, 40, 55,  55,  70,  70,  85,  85,  100, 100,
-  115, 11,  11,  11,  26,  26, 41, 41,  56,  56,  71,  71,  86,  86,  101,
-  101, 116, 12,  12,  12,  27, 27, 42,  42,  57,  57,  72,  72,  87,  87,
-  102, 102, 117, 13,  13,  13, 28, 28,  43,  43,  58,  58,  73,  73,  88,
-  88,  103, 103, 118, 14,  14, 14, 29,  29,  44,  44,  59,  59,  74,  74,
-  89,  89,  104, 104, 119, 15, 30, 30,  45,  45,  60,  60,  75,  75,  90,
-  90,  105, 105, 120, 31,  46, 46, 61,  61,  76,  76,  91,  91,  106, 106,
-  121, 47,  62,  62,  77,  77, 92, 92,  107, 107, 122, 63,  78,  78,  93,
-  93,  108, 108, 123, 79,  94, 94, 109, 109, 124, 95,  110, 110, 125, 111,
-  126, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  8,  8,  16, 16, 24, 24,  32,  32,  40,  40,  48,  48,
-  56, 56, 64, 64, 72, 72, 80, 80, 88, 88,  96,  96,  104, 104, 112, 112,
-  0,  0,  1,  8,  9,  16, 17, 24, 25, 32,  33,  40,  41,  48,  49,  56,
-  57, 64, 65, 72, 73, 80, 81, 88, 89, 96,  97,  104, 105, 112, 113, 120,
-  1,  1,  2,  9,  10, 17, 18, 25, 26, 33,  34,  41,  42,  49,  50,  57,
-  58, 65, 66, 73, 74, 81, 82, 89, 90, 97,  98,  105, 106, 113, 114, 121,
-  2,  2,  3,  10, 11, 18, 19, 26, 27, 34,  35,  42,  43,  50,  51,  58,
-  59, 66, 67, 74, 75, 82, 83, 90, 91, 98,  99,  106, 107, 114, 115, 122,
-  3,  3,  4,  11, 12, 19, 20, 27, 28, 35,  36,  43,  44,  51,  52,  59,
-  60, 67, 68, 75, 76, 83, 84, 91, 92, 99,  100, 107, 108, 115, 116, 123,
-  4,  4,  5,  12, 13, 20, 21, 28, 29, 36,  37,  44,  45,  52,  53,  60,
-  61, 68, 69, 76, 77, 84, 85, 92, 93, 100, 101, 108, 109, 116, 117, 124,
-  5,  5,  6,  13, 14, 21, 22, 29, 30, 37,  38,  45,  46,  53,  54,  61,
-  62, 69, 70, 77, 78, 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125,
-  6,  6,  7,  14, 15, 22, 23, 30, 31, 38,  39,  46,  47,  54,  55,  62,
-  63, 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
-  0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  16, 16, 32, 32, 48, 48, 64, 64, 80, 80,  96,  96,
-  0,  0,  1,  16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96,  97,  112,
-  1,  1,  2,  17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97,  98,  113,
-  2,  2,  3,  18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98,  99,  114,
-  3,  3,  4,  19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99,  100, 115,
-  4,  4,  5,  20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116,
-  5,  5,  6,  21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117,
-  6,  6,  7,  22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
-  7,  7,  8,  23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119,
-  8,  8,  9,  24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120,
-  9,  9,  10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121,
-  10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
-  11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123,
-  12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124,
-  13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125,
-  14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126,
-  0,  0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,
-  6,   0,   0,   1,   8,   2,   9,   3,   10,  4,   11,  5,   12,  6,   13,
-  7,   14,  8,   8,   9,   16,  10,  17,  11,  18,  12,  19,  13,  20,  14,
-  21,  15,  22,  16,  16,  17,  24,  18,  25,  19,  26,  20,  27,  21,  28,
-  22,  29,  23,  30,  24,  24,  25,  32,  26,  33,  27,  34,  28,  35,  29,
-  36,  30,  37,  31,  38,  32,  32,  33,  40,  34,  41,  35,  42,  36,  43,
-  37,  44,  38,  45,  39,  46,  40,  40,  41,  48,  42,  49,  43,  50,  44,
-  51,  45,  52,  46,  53,  47,  54,  48,  48,  49,  56,  50,  57,  51,  58,
-  52,  59,  53,  60,  54,  61,  55,  62,  56,  56,  57,  64,  58,  65,  59,
-  66,  60,  67,  61,  68,  62,  69,  63,  70,  64,  64,  65,  72,  66,  73,
-  67,  74,  68,  75,  69,  76,  70,  77,  71,  78,  72,  72,  73,  80,  74,
-  81,  75,  82,  76,  83,  77,  84,  78,  85,  79,  86,  80,  80,  81,  88,
-  82,  89,  83,  90,  84,  91,  85,  92,  86,  93,  87,  94,  88,  88,  89,
-  96,  90,  97,  91,  98,  92,  99,  93,  100, 94,  101, 95,  102, 96,  96,
-  97,  104, 98,  105, 99,  106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
-  104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
-  112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
-  126, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,
-  6,   7,   7,   8,   8,   9,   9,   10,  10,  11,  11,  12,  12,  13,  13,
-  14,  14,  0,   0,   1,   16,  2,   17,  3,   18,  4,   19,  5,   20,  6,
-  21,  7,   22,  8,   23,  9,   24,  10,  25,  11,  26,  12,  27,  13,  28,
-  14,  29,  15,  30,  16,  16,  17,  32,  18,  33,  19,  34,  20,  35,  21,
-  36,  22,  37,  23,  38,  24,  39,  25,  40,  26,  41,  27,  42,  28,  43,
-  29,  44,  30,  45,  31,  46,  32,  32,  33,  48,  34,  49,  35,  50,  36,
-  51,  37,  52,  38,  53,  39,  54,  40,  55,  41,  56,  42,  57,  43,  58,
-  44,  59,  45,  60,  46,  61,  47,  62,  48,  48,  49,  64,  50,  65,  51,
-  66,  52,  67,  53,  68,  54,  69,  55,  70,  56,  71,  57,  72,  58,  73,
-  59,  74,  60,  75,  61,  76,  62,  77,  63,  78,  64,  64,  65,  80,  66,
-  81,  67,  82,  68,  83,  69,  84,  70,  85,  71,  86,  72,  87,  73,  88,
-  74,  89,  75,  90,  76,  91,  77,  92,  78,  93,  79,  94,  80,  80,  81,
-  96,  82,  97,  83,  98,  84,  99,  85,  100, 86,  101, 87,  102, 88,  103,
-  89,  104, 90,  105, 91,  106, 92,  107, 93,  108, 94,  109, 95,  110, 96,
-  96,  97,  112, 98,  113, 99,  114, 100, 115, 101, 116, 102, 117, 103, 118,
-  104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
-  126, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   0,   0,   1,   1,   1,   16,  16,  16,  2,   2,   2,
-  17,  17,  32,  32,  32,  3,   3,   3,   18,  18,  33,  33,  48,  48,  48,
-  4,   4,   4,   19,  19,  34,  34,  49,  49,  64,  64,  64,  5,   5,   5,
-  20,  20,  35,  35,  50,  50,  65,  65,  80,  80,  80,  6,   6,   6,   21,
-  21,  36,  36,  51,  51,  66,  66,  81,  81,  96,  96,  96,  7,   7,   7,
-  22,  22,  37,  37,  52,  52,  67,  67,  82,  82,  97,  97,  112, 112, 112,
-  8,   8,   8,   23,  23,  38,  38,  53,  53,  68,  68,  83,  83,  98,  98,
-  113, 113, 128, 128, 128, 9,   9,   9,   24,  24,  39,  39,  54,  54,  69,
-  69,  84,  84,  99,  99,  114, 114, 129, 129, 144, 144, 144, 10,  10,  10,
-  25,  25,  40,  40,  55,  55,  70,  70,  85,  85,  100, 100, 115, 115, 130,
-  130, 145, 145, 160, 160, 160, 11,  11,  11,  26,  26,  41,  41,  56,  56,
-  71,  71,  86,  86,  101, 101, 116, 116, 131, 131, 146, 146, 161, 161, 176,
-  176, 176, 12,  12,  12,  27,  27,  42,  42,  57,  57,  72,  72,  87,  87,
-  102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
-  13,  13,  13,  28,  28,  43,  43,  58,  58,  73,  73,  88,  88,  103, 103,
-  118, 118, 133, 133, 148, 148, 163, 163, 178, 178, 193, 193, 208, 208, 208,
-  14,  14,  14,  29,  29,  44,  44,  59,  59,  74,  74,  89,  89,  104, 104,
-  119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
-  224, 224, 15,  30,  30,  45,  45,  60,  60,  75,  75,  90,  90,  105, 105,
-  120, 120, 135, 135, 150, 150, 165, 165, 180, 180, 195, 195, 210, 210, 225,
-  225, 240, 240, 240, 31,  46,  46,  61,  61,  76,  76,  91,  91,  106, 106,
-  121, 121, 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226,
-  226, 241, 241, 256, 256, 256, 47,  62,  62,  77,  77,  92,  92,  107, 107,
-  122, 122, 137, 137, 152, 152, 167, 167, 182, 182, 197, 197, 212, 212, 227,
-  227, 242, 242, 257, 257, 272, 272, 272, 63,  78,  78,  93,  93,  108, 108,
-  123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, 213, 213, 228,
-  228, 243, 243, 258, 258, 273, 273, 288, 288, 288, 79,  94,  94,  109, 109,
-  124, 124, 139, 139, 154, 154, 169, 169, 184, 184, 199, 199, 214, 214, 229,
-  229, 244, 244, 259, 259, 274, 274, 289, 289, 304, 304, 304, 95,  110, 110,
-  125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
-  230, 245, 245, 260, 260, 275, 275, 290, 290, 305, 305, 320, 320, 320, 111,
-  126, 126, 141, 141, 156, 156, 171, 171, 186, 186, 201, 201, 216, 216, 231,
-  231, 246, 246, 261, 261, 276, 276, 291, 291, 306, 306, 321, 321, 336, 336,
-  336, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, 202, 217, 217, 232,
-  232, 247, 247, 262, 262, 277, 277, 292, 292, 307, 307, 322, 322, 337, 337,
-  352, 352, 352, 143, 158, 158, 173, 173, 188, 188, 203, 203, 218, 218, 233,
-  233, 248, 248, 263, 263, 278, 278, 293, 293, 308, 308, 323, 323, 338, 338,
-  353, 353, 368, 368, 368, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
-  234, 249, 249, 264, 264, 279, 279, 294, 294, 309, 309, 324, 324, 339, 339,
-  354, 354, 369, 369, 384, 384, 384, 175, 190, 190, 205, 205, 220, 220, 235,
-  235, 250, 250, 265, 265, 280, 280, 295, 295, 310, 310, 325, 325, 340, 340,
-  355, 355, 370, 370, 385, 385, 400, 400, 400, 191, 206, 206, 221, 221, 236,
-  236, 251, 251, 266, 266, 281, 281, 296, 296, 311, 311, 326, 326, 341, 341,
-  356, 356, 371, 371, 386, 386, 401, 401, 416, 416, 416, 207, 222, 222, 237,
-  237, 252, 252, 267, 267, 282, 282, 297, 297, 312, 312, 327, 327, 342, 342,
-  357, 357, 372, 372, 387, 387, 402, 402, 417, 417, 432, 432, 432, 223, 238,
-  238, 253, 253, 268, 268, 283, 283, 298, 298, 313, 313, 328, 328, 343, 343,
-  358, 358, 373, 373, 388, 388, 403, 403, 418, 418, 433, 433, 448, 448, 448,
-  239, 254, 254, 269, 269, 284, 284, 299, 299, 314, 314, 329, 329, 344, 344,
-  359, 359, 374, 374, 389, 389, 404, 404, 419, 419, 434, 434, 449, 449, 464,
-  464, 464, 255, 270, 270, 285, 285, 300, 300, 315, 315, 330, 330, 345, 345,
-  360, 360, 375, 375, 390, 390, 405, 405, 420, 420, 435, 435, 450, 450, 465,
-  465, 480, 480, 480, 271, 286, 286, 301, 301, 316, 316, 331, 331, 346, 346,
-  361, 361, 376, 376, 391, 391, 406, 406, 421, 421, 436, 436, 451, 451, 466,
-  466, 481, 481, 496, 287, 302, 302, 317, 317, 332, 332, 347, 347, 362, 362,
-  377, 377, 392, 392, 407, 407, 422, 422, 437, 437, 452, 452, 467, 467, 482,
-  482, 497, 303, 318, 318, 333, 333, 348, 348, 363, 363, 378, 378, 393, 393,
-  408, 408, 423, 423, 438, 438, 453, 453, 468, 468, 483, 483, 498, 319, 334,
-  334, 349, 349, 364, 364, 379, 379, 394, 394, 409, 409, 424, 424, 439, 439,
-  454, 454, 469, 469, 484, 484, 499, 335, 350, 350, 365, 365, 380, 380, 395,
-  395, 410, 410, 425, 425, 440, 440, 455, 455, 470, 470, 485, 485, 500, 351,
-  366, 366, 381, 381, 396, 396, 411, 411, 426, 426, 441, 441, 456, 456, 471,
-  471, 486, 486, 501, 367, 382, 382, 397, 397, 412, 412, 427, 427, 442, 442,
-  457, 457, 472, 472, 487, 487, 502, 383, 398, 398, 413, 413, 428, 428, 443,
-  443, 458, 458, 473, 473, 488, 488, 503, 399, 414, 414, 429, 429, 444, 444,
-  459, 459, 474, 474, 489, 489, 504, 415, 430, 430, 445, 445, 460, 460, 475,
-  475, 490, 490, 505, 431, 446, 446, 461, 461, 476, 476, 491, 491, 506, 447,
-  462, 462, 477, 477, 492, 492, 507, 463, 478, 478, 493, 493, 508, 479, 494,
-  494, 509, 495, 510, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   0,   0,   1,   1,   1,   32,  32,  32,  2,   2,   2,
-  33,  33,  64,  64,  64,  3,   3,   3,   34,  34,  65,  65,  96,  96,  96,
-  4,   4,   4,   35,  35,  66,  66,  97,  97,  128, 128, 128, 5,   5,   5,
-  36,  36,  67,  67,  98,  98,  129, 129, 160, 160, 160, 6,   6,   6,   37,
-  37,  68,  68,  99,  99,  130, 130, 161, 161, 192, 192, 192, 7,   7,   7,
-  38,  38,  69,  69,  100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224,
-  8,   8,   8,   39,  39,  70,  70,  101, 101, 132, 132, 163, 163, 194, 194,
-  225, 225, 256, 256, 256, 9,   9,   9,   40,  40,  71,  71,  102, 102, 133,
-  133, 164, 164, 195, 195, 226, 226, 257, 257, 288, 288, 288, 10,  10,  10,
-  41,  41,  72,  72,  103, 103, 134, 134, 165, 165, 196, 196, 227, 227, 258,
-  258, 289, 289, 320, 320, 320, 11,  11,  11,  42,  42,  73,  73,  104, 104,
-  135, 135, 166, 166, 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352,
-  352, 352, 12,  12,  12,  43,  43,  74,  74,  105, 105, 136, 136, 167, 167,
-  198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
-  13,  13,  13,  44,  44,  75,  75,  106, 106, 137, 137, 168, 168, 199, 199,
-  230, 230, 261, 261, 292, 292, 323, 323, 354, 354, 385, 385, 416, 416, 416,
-  14,  14,  14,  45,  45,  76,  76,  107, 107, 138, 138, 169, 169, 200, 200,
-  231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, 386, 417, 417, 448,
-  448, 448, 15,  15,  15,  46,  46,  77,  77,  108, 108, 139, 139, 170, 170,
-  201, 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, 418,
-  418, 449, 449, 480, 16,  16,  16,  47,  47,  78,  78,  109, 109, 140, 140,
-  171, 171, 202, 202, 233, 233, 264, 264, 295, 295, 326, 326, 357, 357, 388,
-  388, 419, 419, 450, 450, 481, 17,  17,  17,  48,  48,  79,  79,  110, 110,
-  141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, 327, 358,
-  358, 389, 389, 420, 420, 451, 451, 482, 18,  18,  18,  49,  49,  80,  80,
-  111, 111, 142, 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328,
-  328, 359, 359, 390, 390, 421, 421, 452, 452, 483, 19,  19,  19,  50,  50,
-  81,  81,  112, 112, 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298,
-  298, 329, 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 20,  20,  20,
-  51,  51,  82,  82,  113, 113, 144, 144, 175, 175, 206, 206, 237, 237, 268,
-  268, 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, 21,
-  21,  21,  52,  52,  83,  83,  114, 114, 145, 145, 176, 176, 207, 207, 238,
-  238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, 424, 424, 455, 455,
-  486, 22,  22,  22,  53,  53,  84,  84,  115, 115, 146, 146, 177, 177, 208,
-  208, 239, 239, 270, 270, 301, 301, 332, 332, 363, 363, 394, 394, 425, 425,
-  456, 456, 487, 23,  23,  23,  54,  54,  85,  85,  116, 116, 147, 147, 178,
-  178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, 364, 395, 395,
-  426, 426, 457, 457, 488, 24,  24,  24,  55,  55,  86,  86,  117, 117, 148,
-  148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365,
-  396, 396, 427, 427, 458, 458, 489, 25,  25,  25,  56,  56,  87,  87,  118,
-  118, 149, 149, 180, 180, 211, 211, 242, 242, 273, 273, 304, 304, 335, 335,
-  366, 366, 397, 397, 428, 428, 459, 459, 490, 26,  26,  26,  57,  57,  88,
-  88,  119, 119, 150, 150, 181, 181, 212, 212, 243, 243, 274, 274, 305, 305,
-  336, 336, 367, 367, 398, 398, 429, 429, 460, 460, 491, 27,  27,  27,  58,
-  58,  89,  89,  120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275,
-  306, 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 28,  28,
-  28,  59,  59,  90,  90,  121, 121, 152, 152, 183, 183, 214, 214, 245, 245,
-  276, 276, 307, 307, 338, 338, 369, 369, 400, 400, 431, 431, 462, 462, 493,
-  29,  29,  29,  60,  60,  91,  91,  122, 122, 153, 153, 184, 184, 215, 215,
-  246, 246, 277, 277, 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463,
-  463, 494, 30,  30,  30,  61,  61,  92,  92,  123, 123, 154, 154, 185, 185,
-  216, 216, 247, 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433,
-  433, 464, 464, 495, 31,  62,  62,  93,  93,  124, 124, 155, 155, 186, 186,
-  217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, 403, 403, 434,
-  434, 465, 465, 496, 63,  94,  94,  125, 125, 156, 156, 187, 187, 218, 218,
-  249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, 435, 466,
-  466, 497, 95,  126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, 281,
-  312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, 498, 127, 158,
-  158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, 344, 344, 375, 375,
-  406, 406, 437, 437, 468, 468, 499, 159, 190, 190, 221, 221, 252, 252, 283,
-  283, 314, 314, 345, 345, 376, 376, 407, 407, 438, 438, 469, 469, 500, 191,
-  222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, 439,
-  439, 470, 470, 501, 223, 254, 254, 285, 285, 316, 316, 347, 347, 378, 378,
-  409, 409, 440, 440, 471, 471, 502, 255, 286, 286, 317, 317, 348, 348, 379,
-  379, 410, 410, 441, 441, 472, 472, 503, 287, 318, 318, 349, 349, 380, 380,
-  411, 411, 442, 442, 473, 473, 504, 319, 350, 350, 381, 381, 412, 412, 443,
-  443, 474, 474, 505, 351, 382, 382, 413, 413, 444, 444, 475, 475, 506, 383,
-  414, 414, 445, 445, 476, 476, 507, 415, 446, 446, 477, 477, 508, 447, 478,
-  478, 509, 479, 510, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   16,  16,  32,  32,  48,  48,  64,  64,  80,  80,  96,
-  96,  112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
-  224, 224, 240, 240, 256, 256, 272, 272, 288, 288, 304, 304, 320, 320, 336,
-  336, 352, 352, 368, 368, 384, 384, 400, 400, 416, 416, 432, 432, 448, 448,
-  464, 464, 480, 480, 0,   0,   1,   16,  17,  32,  33,  48,  49,  64,  65,
-  80,  81,  96,  97,  112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192,
-  193, 208, 209, 224, 225, 240, 241, 256, 257, 272, 273, 288, 289, 304, 305,
-  320, 321, 336, 337, 352, 353, 368, 369, 384, 385, 400, 401, 416, 417, 432,
-  433, 448, 449, 464, 465, 480, 481, 496, 1,   1,   2,   17,  18,  33,  34,
-  49,  50,  65,  66,  81,  82,  97,  98,  113, 114, 129, 130, 145, 146, 161,
-  162, 177, 178, 193, 194, 209, 210, 225, 226, 241, 242, 257, 258, 273, 274,
-  289, 290, 305, 306, 321, 322, 337, 338, 353, 354, 369, 370, 385, 386, 401,
-  402, 417, 418, 433, 434, 449, 450, 465, 466, 481, 482, 497, 2,   2,   3,
-  18,  19,  34,  35,  50,  51,  66,  67,  82,  83,  98,  99,  114, 115, 130,
-  131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211, 226, 227, 242, 243,
-  258, 259, 274, 275, 290, 291, 306, 307, 322, 323, 338, 339, 354, 355, 370,
-  371, 386, 387, 402, 403, 418, 419, 434, 435, 450, 451, 466, 467, 482, 483,
-  498, 3,   3,   4,   19,  20,  35,  36,  51,  52,  67,  68,  83,  84,  99,
-  100, 115, 116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212,
-  227, 228, 243, 244, 259, 260, 275, 276, 291, 292, 307, 308, 323, 324, 339,
-  340, 355, 356, 371, 372, 387, 388, 403, 404, 419, 420, 435, 436, 451, 452,
-  467, 468, 483, 484, 499, 4,   4,   5,   20,  21,  36,  37,  52,  53,  68,
-  69,  84,  85,  100, 101, 116, 117, 132, 133, 148, 149, 164, 165, 180, 181,
-  196, 197, 212, 213, 228, 229, 244, 245, 260, 261, 276, 277, 292, 293, 308,
-  309, 324, 325, 340, 341, 356, 357, 372, 373, 388, 389, 404, 405, 420, 421,
-  436, 437, 452, 453, 468, 469, 484, 485, 500, 5,   5,   6,   21,  22,  37,
-  38,  53,  54,  69,  70,  85,  86,  101, 102, 117, 118, 133, 134, 149, 150,
-  165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 246, 261, 262, 277,
-  278, 293, 294, 309, 310, 325, 326, 341, 342, 357, 358, 373, 374, 389, 390,
-  405, 406, 421, 422, 437, 438, 453, 454, 469, 470, 485, 486, 501, 6,   6,
-  7,   22,  23,  38,  39,  54,  55,  70,  71,  86,  87,  102, 103, 118, 119,
-  134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, 246,
-  247, 262, 263, 278, 279, 294, 295, 310, 311, 326, 327, 342, 343, 358, 359,
-  374, 375, 390, 391, 406, 407, 422, 423, 438, 439, 454, 455, 470, 471, 486,
-  487, 502, 7,   7,   8,   23,  24,  39,  40,  55,  56,  71,  72,  87,  88,
-  103, 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215,
-  216, 231, 232, 247, 248, 263, 264, 279, 280, 295, 296, 311, 312, 327, 328,
-  343, 344, 359, 360, 375, 376, 391, 392, 407, 408, 423, 424, 439, 440, 455,
-  456, 471, 472, 487, 488, 503, 8,   8,   9,   24,  25,  40,  41,  56,  57,
-  72,  73,  88,  89,  104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184,
-  185, 200, 201, 216, 217, 232, 233, 248, 249, 264, 265, 280, 281, 296, 297,
-  312, 313, 328, 329, 344, 345, 360, 361, 376, 377, 392, 393, 408, 409, 424,
-  425, 440, 441, 456, 457, 472, 473, 488, 489, 504, 9,   9,   10,  25,  26,
-  41,  42,  57,  58,  73,  74,  89,  90,  105, 106, 121, 122, 137, 138, 153,
-  154, 169, 170, 185, 186, 201, 202, 217, 218, 233, 234, 249, 250, 265, 266,
-  281, 282, 297, 298, 313, 314, 329, 330, 345, 346, 361, 362, 377, 378, 393,
-  394, 409, 410, 425, 426, 441, 442, 457, 458, 473, 474, 489, 490, 505, 10,
-  10,  11,  26,  27,  42,  43,  58,  59,  74,  75,  90,  91,  106, 107, 122,
-  123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219, 234, 235,
-  250, 251, 266, 267, 282, 283, 298, 299, 314, 315, 330, 331, 346, 347, 362,
-  363, 378, 379, 394, 395, 410, 411, 426, 427, 442, 443, 458, 459, 474, 475,
-  490, 491, 506, 11,  11,  12,  27,  28,  43,  44,  59,  60,  75,  76,  91,
-  92,  107, 108, 123, 124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204,
-  219, 220, 235, 236, 251, 252, 267, 268, 283, 284, 299, 300, 315, 316, 331,
-  332, 347, 348, 363, 364, 379, 380, 395, 396, 411, 412, 427, 428, 443, 444,
-  459, 460, 475, 476, 491, 492, 507, 12,  12,  13,  28,  29,  44,  45,  60,
-  61,  76,  77,  92,  93,  108, 109, 124, 125, 140, 141, 156, 157, 172, 173,
-  188, 189, 204, 205, 220, 221, 236, 237, 252, 253, 268, 269, 284, 285, 300,
-  301, 316, 317, 332, 333, 348, 349, 364, 365, 380, 381, 396, 397, 412, 413,
-  428, 429, 444, 445, 460, 461, 476, 477, 492, 493, 508, 13,  13,  14,  29,
-  30,  45,  46,  61,  62,  77,  78,  93,  94,  109, 110, 125, 126, 141, 142,
-  157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, 254, 269,
-  270, 285, 286, 301, 302, 317, 318, 333, 334, 349, 350, 365, 366, 381, 382,
-  397, 398, 413, 414, 429, 430, 445, 446, 461, 462, 477, 478, 493, 494, 509,
-  14,  14,  15,  30,  31,  46,  47,  62,  63,  78,  79,  94,  95,  110, 111,
-  126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
-  239, 254, 255, 270, 271, 286, 287, 302, 303, 318, 319, 334, 335, 350, 351,
-  366, 367, 382, 383, 398, 399, 414, 415, 430, 431, 446, 447, 462, 463, 478,
-  479, 494, 495, 510, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   32,  32,  64,  64,  96,  96,  128, 128, 160, 160, 192,
-  192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416, 416,
-  448, 448, 0,   0,   1,   32,  33,  64,  65,  96,  97,  128, 129, 160, 161,
-  192, 193, 224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416,
-  417, 448, 449, 480, 1,   1,   2,   33,  34,  65,  66,  97,  98,  129, 130,
-  161, 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
-  386, 417, 418, 449, 450, 481, 2,   2,   3,   34,  35,  66,  67,  98,  99,
-  130, 131, 162, 163, 194, 195, 226, 227, 258, 259, 290, 291, 322, 323, 354,
-  355, 386, 387, 418, 419, 450, 451, 482, 3,   3,   4,   35,  36,  67,  68,
-  99,  100, 131, 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323,
-  324, 355, 356, 387, 388, 419, 420, 451, 452, 483, 4,   4,   5,   36,  37,
-  68,  69,  100, 101, 132, 133, 164, 165, 196, 197, 228, 229, 260, 261, 292,
-  293, 324, 325, 356, 357, 388, 389, 420, 421, 452, 453, 484, 5,   5,   6,
-  37,  38,  69,  70,  101, 102, 133, 134, 165, 166, 197, 198, 229, 230, 261,
-  262, 293, 294, 325, 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 6,
-  6,   7,   38,  39,  70,  71,  102, 103, 134, 135, 166, 167, 198, 199, 230,
-  231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, 423, 454, 455,
-  486, 7,   7,   8,   39,  40,  71,  72,  103, 104, 135, 136, 167, 168, 199,
-  200, 231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423, 424,
-  455, 456, 487, 8,   8,   9,   40,  41,  72,  73,  104, 105, 136, 137, 168,
-  169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, 393,
-  424, 425, 456, 457, 488, 9,   9,   10,  41,  42,  73,  74,  105, 106, 137,
-  138, 169, 170, 201, 202, 233, 234, 265, 266, 297, 298, 329, 330, 361, 362,
-  393, 394, 425, 426, 457, 458, 489, 10,  10,  11,  42,  43,  74,  75,  106,
-  107, 138, 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331,
-  362, 363, 394, 395, 426, 427, 458, 459, 490, 11,  11,  12,  43,  44,  75,
-  76,  107, 108, 139, 140, 171, 172, 203, 204, 235, 236, 267, 268, 299, 300,
-  331, 332, 363, 364, 395, 396, 427, 428, 459, 460, 491, 12,  12,  13,  44,
-  45,  76,  77,  108, 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269,
-  300, 301, 332, 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 13,  13,
-  14,  45,  46,  77,  78,  109, 110, 141, 142, 173, 174, 205, 206, 237, 238,
-  269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, 430, 461, 462, 493,
-  14,  14,  15,  46,  47,  78,  79,  110, 111, 142, 143, 174, 175, 206, 207,
-  238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399, 430, 431, 462,
-  463, 494, 15,  15,  16,  47,  48,  79,  80,  111, 112, 143, 144, 175, 176,
-  207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400, 431,
-  432, 463, 464, 495, 16,  16,  17,  48,  49,  80,  81,  112, 113, 144, 145,
-  176, 177, 208, 209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400,
-  401, 432, 433, 464, 465, 496, 17,  17,  18,  49,  50,  81,  82,  113, 114,
-  145, 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
-  370, 401, 402, 433, 434, 465, 466, 497, 18,  18,  19,  50,  51,  82,  83,
-  114, 115, 146, 147, 178, 179, 210, 211, 242, 243, 274, 275, 306, 307, 338,
-  339, 370, 371, 402, 403, 434, 435, 466, 467, 498, 19,  19,  20,  51,  52,
-  83,  84,  115, 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307,
-  308, 339, 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 20,  20,  21,
-  52,  53,  84,  85,  116, 117, 148, 149, 180, 181, 212, 213, 244, 245, 276,
-  277, 308, 309, 340, 341, 372, 373, 404, 405, 436, 437, 468, 469, 500, 21,
-  21,  22,  53,  54,  85,  86,  117, 118, 149, 150, 181, 182, 213, 214, 245,
-  246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470,
-  501, 22,  22,  23,  54,  55,  86,  87,  118, 119, 150, 151, 182, 183, 214,
-  215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407, 438, 439,
-  470, 471, 502, 23,  23,  24,  55,  56,  87,  88,  119, 120, 151, 152, 183,
-  184, 215, 216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408,
-  439, 440, 471, 472, 503, 24,  24,  25,  56,  57,  88,  89,  120, 121, 152,
-  153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377,
-  408, 409, 440, 441, 472, 473, 504, 25,  25,  26,  57,  58,  89,  90,  121,
-  122, 153, 154, 185, 186, 217, 218, 249, 250, 281, 282, 313, 314, 345, 346,
-  377, 378, 409, 410, 441, 442, 473, 474, 505, 26,  26,  27,  58,  59,  90,
-  91,  122, 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315,
-  346, 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 27,  27,  28,  59,
-  60,  91,  92,  123, 124, 155, 156, 187, 188, 219, 220, 251, 252, 283, 284,
-  315, 316, 347, 348, 379, 380, 411, 412, 443, 444, 475, 476, 507, 28,  28,
-  29,  60,  61,  92,  93,  124, 125, 156, 157, 188, 189, 220, 221, 252, 253,
-  284, 285, 316, 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508,
-  29,  29,  30,  61,  62,  93,  94,  125, 126, 157, 158, 189, 190, 221, 222,
-  253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414, 445, 446, 477,
-  478, 509, 30,  30,  31,  62,  63,  94,  95,  126, 127, 158, 159, 190, 191,
-  222, 223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415, 446,
-  447, 478, 479, 510, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,
-  6,   7,   7,   8,   8,   9,   9,   10,  10,  11,  11,  12,  12,  13,  13,
-  14,  14,  0,   0,   1,   16,  2,   17,  3,   18,  4,   19,  5,   20,  6,
-  21,  7,   22,  8,   23,  9,   24,  10,  25,  11,  26,  12,  27,  13,  28,
-  14,  29,  15,  30,  16,  16,  17,  32,  18,  33,  19,  34,  20,  35,  21,
-  36,  22,  37,  23,  38,  24,  39,  25,  40,  26,  41,  27,  42,  28,  43,
-  29,  44,  30,  45,  31,  46,  32,  32,  33,  48,  34,  49,  35,  50,  36,
-  51,  37,  52,  38,  53,  39,  54,  40,  55,  41,  56,  42,  57,  43,  58,
-  44,  59,  45,  60,  46,  61,  47,  62,  48,  48,  49,  64,  50,  65,  51,
-  66,  52,  67,  53,  68,  54,  69,  55,  70,  56,  71,  57,  72,  58,  73,
-  59,  74,  60,  75,  61,  76,  62,  77,  63,  78,  64,  64,  65,  80,  66,
-  81,  67,  82,  68,  83,  69,  84,  70,  85,  71,  86,  72,  87,  73,  88,
-  74,  89,  75,  90,  76,  91,  77,  92,  78,  93,  79,  94,  80,  80,  81,
-  96,  82,  97,  83,  98,  84,  99,  85,  100, 86,  101, 87,  102, 88,  103,
-  89,  104, 90,  105, 91,  106, 92,  107, 93,  108, 94,  109, 95,  110, 96,
-  96,  97,  112, 98,  113, 99,  114, 100, 115, 101, 116, 102, 117, 103, 118,
-  104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
-  126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
-  119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
-  141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
-  134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
-  156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
-  149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
-  171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
-  164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
-  186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
-  179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
-  201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
-  194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
-  216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
-  209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
-  231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
-  224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
-  246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
-  239, 254, 240, 240, 241, 256, 242, 257, 243, 258, 244, 259, 245, 260, 246,
-  261, 247, 262, 248, 263, 249, 264, 250, 265, 251, 266, 252, 267, 253, 268,
-  254, 269, 255, 270, 256, 256, 257, 272, 258, 273, 259, 274, 260, 275, 261,
-  276, 262, 277, 263, 278, 264, 279, 265, 280, 266, 281, 267, 282, 268, 283,
-  269, 284, 270, 285, 271, 286, 272, 272, 273, 288, 274, 289, 275, 290, 276,
-  291, 277, 292, 278, 293, 279, 294, 280, 295, 281, 296, 282, 297, 283, 298,
-  284, 299, 285, 300, 286, 301, 287, 302, 288, 288, 289, 304, 290, 305, 291,
-  306, 292, 307, 293, 308, 294, 309, 295, 310, 296, 311, 297, 312, 298, 313,
-  299, 314, 300, 315, 301, 316, 302, 317, 303, 318, 304, 304, 305, 320, 306,
-  321, 307, 322, 308, 323, 309, 324, 310, 325, 311, 326, 312, 327, 313, 328,
-  314, 329, 315, 330, 316, 331, 317, 332, 318, 333, 319, 334, 320, 320, 321,
-  336, 322, 337, 323, 338, 324, 339, 325, 340, 326, 341, 327, 342, 328, 343,
-  329, 344, 330, 345, 331, 346, 332, 347, 333, 348, 334, 349, 335, 350, 336,
-  336, 337, 352, 338, 353, 339, 354, 340, 355, 341, 356, 342, 357, 343, 358,
-  344, 359, 345, 360, 346, 361, 347, 362, 348, 363, 349, 364, 350, 365, 351,
-  366, 352, 352, 353, 368, 354, 369, 355, 370, 356, 371, 357, 372, 358, 373,
-  359, 374, 360, 375, 361, 376, 362, 377, 363, 378, 364, 379, 365, 380, 366,
-  381, 367, 382, 368, 368, 369, 384, 370, 385, 371, 386, 372, 387, 373, 388,
-  374, 389, 375, 390, 376, 391, 377, 392, 378, 393, 379, 394, 380, 395, 381,
-  396, 382, 397, 383, 398, 384, 384, 385, 400, 386, 401, 387, 402, 388, 403,
-  389, 404, 390, 405, 391, 406, 392, 407, 393, 408, 394, 409, 395, 410, 396,
-  411, 397, 412, 398, 413, 399, 414, 400, 400, 401, 416, 402, 417, 403, 418,
-  404, 419, 405, 420, 406, 421, 407, 422, 408, 423, 409, 424, 410, 425, 411,
-  426, 412, 427, 413, 428, 414, 429, 415, 430, 416, 416, 417, 432, 418, 433,
-  419, 434, 420, 435, 421, 436, 422, 437, 423, 438, 424, 439, 425, 440, 426,
-  441, 427, 442, 428, 443, 429, 444, 430, 445, 431, 446, 432, 432, 433, 448,
-  434, 449, 435, 450, 436, 451, 437, 452, 438, 453, 439, 454, 440, 455, 441,
-  456, 442, 457, 443, 458, 444, 459, 445, 460, 446, 461, 447, 462, 448, 448,
-  449, 464, 450, 465, 451, 466, 452, 467, 453, 468, 454, 469, 455, 470, 456,
-  471, 457, 472, 458, 473, 459, 474, 460, 475, 461, 476, 462, 477, 463, 478,
-  464, 464, 465, 480, 466, 481, 467, 482, 468, 483, 469, 484, 470, 485, 471,
-  486, 472, 487, 473, 488, 474, 489, 475, 490, 476, 491, 477, 492, 478, 493,
-  479, 494, 480, 480, 481, 496, 482, 497, 483, 498, 484, 499, 485, 500, 486,
-  501, 487, 502, 488, 503, 489, 504, 490, 505, 491, 506, 492, 507, 493, 508,
-  494, 509, 495, 510, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,
-  6,   7,   7,   8,   8,   9,   9,   10,  10,  11,  11,  12,  12,  13,  13,
-  14,  14,  15,  15,  16,  16,  17,  17,  18,  18,  19,  19,  20,  20,  21,
-  21,  22,  22,  23,  23,  24,  24,  25,  25,  26,  26,  27,  27,  28,  28,
-  29,  29,  30,  30,  0,   0,   1,   32,  2,   33,  3,   34,  4,   35,  5,
-  36,  6,   37,  7,   38,  8,   39,  9,   40,  10,  41,  11,  42,  12,  43,
-  13,  44,  14,  45,  15,  46,  16,  47,  17,  48,  18,  49,  19,  50,  20,
-  51,  21,  52,  22,  53,  23,  54,  24,  55,  25,  56,  26,  57,  27,  58,
-  28,  59,  29,  60,  30,  61,  31,  62,  32,  32,  33,  64,  34,  65,  35,
-  66,  36,  67,  37,  68,  38,  69,  39,  70,  40,  71,  41,  72,  42,  73,
-  43,  74,  44,  75,  45,  76,  46,  77,  47,  78,  48,  79,  49,  80,  50,
-  81,  51,  82,  52,  83,  53,  84,  54,  85,  55,  86,  56,  87,  57,  88,
-  58,  89,  59,  90,  60,  91,  61,  92,  62,  93,  63,  94,  64,  64,  65,
-  96,  66,  97,  67,  98,  68,  99,  69,  100, 70,  101, 71,  102, 72,  103,
-  73,  104, 74,  105, 75,  106, 76,  107, 77,  108, 78,  109, 79,  110, 80,
-  111, 81,  112, 82,  113, 83,  114, 84,  115, 85,  116, 86,  117, 87,  118,
-  88,  119, 89,  120, 90,  121, 91,  122, 92,  123, 93,  124, 94,  125, 95,
-  126, 96,  96,  97,  128, 98,  129, 99,  130, 100, 131, 101, 132, 102, 133,
-  103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
-  141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
-  118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
-  156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
-  133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
-  171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
-  148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
-  186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
-  163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
-  201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
-  178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
-  216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
-  193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
-  231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
-  208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
-  246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
-  223, 254, 224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260, 230,
-  261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266, 236, 267, 237, 268,
-  238, 269, 239, 270, 240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245,
-  276, 246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282, 252, 283,
-  253, 284, 254, 285, 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260,
-  291, 261, 292, 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298,
-  268, 299, 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275,
-  306, 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
-  283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, 290,
-  321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, 297, 328,
-  298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, 304, 335, 305,
-  336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, 311, 342, 312, 343,
-  313, 344, 314, 345, 315, 346, 316, 347, 317, 348, 318, 349, 319, 350, 320,
-  320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356, 326, 357, 327, 358,
-  328, 359, 329, 360, 330, 361, 331, 362, 332, 363, 333, 364, 334, 365, 335,
-  366, 336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372, 342, 373,
-  343, 374, 344, 375, 345, 376, 346, 377, 347, 378, 348, 379, 349, 380, 350,
-  381, 351, 382, 352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388,
-  358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365,
-  396, 366, 397, 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403,
-  373, 404, 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380,
-  411, 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
-  388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395,
-  426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, 402, 433,
-  403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, 409, 440, 410,
-  441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, 416, 416, 417, 448,
-  418, 449, 419, 450, 420, 451, 421, 452, 422, 453, 423, 454, 424, 455, 425,
-  456, 426, 457, 427, 458, 428, 459, 429, 460, 430, 461, 431, 462, 432, 463,
-  433, 464, 434, 465, 435, 466, 436, 467, 437, 468, 438, 469, 439, 470, 440,
-  471, 441, 472, 442, 473, 443, 474, 444, 475, 445, 476, 446, 477, 447, 478,
-  448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484, 454, 485, 455,
-  486, 456, 487, 457, 488, 458, 489, 459, 490, 460, 491, 461, 492, 462, 493,
-  463, 494, 464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470,
-  501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508,
-  478, 509, 479, 510, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   16,  16,  32,  32,  48,  48,  64,  64,  80,  80,  96,
-  96,  112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
-  224, 224, 0,   0,   1,   16,  17,  32,  33,  48,  49,  64,  65,  80,  81,
-  96,  97,  112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208,
-  209, 224, 225, 240, 1,   1,   2,   17,  18,  33,  34,  49,  50,  65,  66,
-  81,  82,  97,  98,  113, 114, 129, 130, 145, 146, 161, 162, 177, 178, 193,
-  194, 209, 210, 225, 226, 241, 2,   2,   3,   18,  19,  34,  35,  50,  51,
-  66,  67,  82,  83,  98,  99,  114, 115, 130, 131, 146, 147, 162, 163, 178,
-  179, 194, 195, 210, 211, 226, 227, 242, 3,   3,   4,   19,  20,  35,  36,
-  51,  52,  67,  68,  83,  84,  99,  100, 115, 116, 131, 132, 147, 148, 163,
-  164, 179, 180, 195, 196, 211, 212, 227, 228, 243, 4,   4,   5,   20,  21,
-  36,  37,  52,  53,  68,  69,  84,  85,  100, 101, 116, 117, 132, 133, 148,
-  149, 164, 165, 180, 181, 196, 197, 212, 213, 228, 229, 244, 5,   5,   6,
-  21,  22,  37,  38,  53,  54,  69,  70,  85,  86,  101, 102, 117, 118, 133,
-  134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 6,
-  6,   7,   22,  23,  38,  39,  54,  55,  70,  71,  86,  87,  102, 103, 118,
-  119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231,
-  246, 7,   7,   8,   23,  24,  39,  40,  55,  56,  71,  72,  87,  88,  103,
-  104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216,
-  231, 232, 247, 8,   8,   9,   24,  25,  40,  41,  56,  57,  72,  73,  88,
-  89,  104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201,
-  216, 217, 232, 233, 248, 9,   9,   10,  25,  26,  41,  42,  57,  58,  73,
-  74,  89,  90,  105, 106, 121, 122, 137, 138, 153, 154, 169, 170, 185, 186,
-  201, 202, 217, 218, 233, 234, 249, 10,  10,  11,  26,  27,  42,  43,  58,
-  59,  74,  75,  90,  91,  106, 107, 122, 123, 138, 139, 154, 155, 170, 171,
-  186, 187, 202, 203, 218, 219, 234, 235, 250, 11,  11,  12,  27,  28,  43,
-  44,  59,  60,  75,  76,  91,  92,  107, 108, 123, 124, 139, 140, 155, 156,
-  171, 172, 187, 188, 203, 204, 219, 220, 235, 236, 251, 12,  12,  13,  28,
-  29,  44,  45,  60,  61,  76,  77,  92,  93,  108, 109, 124, 125, 140, 141,
-  156, 157, 172, 173, 188, 189, 204, 205, 220, 221, 236, 237, 252, 13,  13,
-  14,  29,  30,  45,  46,  61,  62,  77,  78,  93,  94,  109, 110, 125, 126,
-  141, 142, 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253,
-  14,  14,  15,  30,  31,  46,  47,  62,  63,  78,  79,  94,  95,  110, 111,
-  126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
-  239, 254, 0,   0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,
-  6,   7,   7,   8,   8,   9,   9,   10,  10,  11,  11,  12,  12,  13,  13,
-  14,  14,  0,   0,   1,   16,  2,   17,  3,   18,  4,   19,  5,   20,  6,
-  21,  7,   22,  8,   23,  9,   24,  10,  25,  11,  26,  12,  27,  13,  28,
-  14,  29,  15,  30,  16,  16,  17,  32,  18,  33,  19,  34,  20,  35,  21,
-  36,  22,  37,  23,  38,  24,  39,  25,  40,  26,  41,  27,  42,  28,  43,
-  29,  44,  30,  45,  31,  46,  32,  32,  33,  48,  34,  49,  35,  50,  36,
-  51,  37,  52,  38,  53,  39,  54,  40,  55,  41,  56,  42,  57,  43,  58,
-  44,  59,  45,  60,  46,  61,  47,  62,  48,  48,  49,  64,  50,  65,  51,
-  66,  52,  67,  53,  68,  54,  69,  55,  70,  56,  71,  57,  72,  58,  73,
-  59,  74,  60,  75,  61,  76,  62,  77,  63,  78,  64,  64,  65,  80,  66,
-  81,  67,  82,  68,  83,  69,  84,  70,  85,  71,  86,  72,  87,  73,  88,
-  74,  89,  75,  90,  76,  91,  77,  92,  78,  93,  79,  94,  80,  80,  81,
-  96,  82,  97,  83,  98,  84,  99,  85,  100, 86,  101, 87,  102, 88,  103,
-  89,  104, 90,  105, 91,  106, 92,  107, 93,  108, 94,  109, 95,  110, 96,
-  96,  97,  112, 98,  113, 99,  114, 100, 115, 101, 116, 102, 117, 103, 118,
-  104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
-  126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
-  119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
-  141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
-  134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
-  156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
-  149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
-  171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
-  164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
-  186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
-  179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
-  201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
-  194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
-  216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
-  209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
-  231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
-  224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
-  246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
-  239, 254, 0,   0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   0,   0,   16,  16,  1,   16,  1,   1,   2,   2,   2,
-  17,  17,  32,  32,  32,  48,  48,  33,  48,  18,  33,  3,   18,  3,   3,
-  4,   4,   4,   19,  19,  34,  34,  49,  49,  64,  64,  64,  80,  80,  65,
-  80,  50,  65,  35,  50,  20,  35,  5,   20,  5,   5,   6,   6,   6,   21,
-  21,  36,  36,  51,  51,  66,  66,  81,  81,  96,  96,  96,  112, 112, 97,
-  112, 82,  97,  67,  82,  52,  67,  37,  52,  22,  37,  7,   22,  7,   7,
-  8,   8,   8,   23,  23,  38,  38,  53,  53,  68,  68,  83,  83,  98,  98,
-  113, 113, 128, 128, 128, 144, 144, 129, 144, 114, 129, 99,  114, 84,  99,
-  69,  84,  54,  69,  39,  54,  24,  39,  9,   24,  9,   9,   10,  10,  10,
-  25,  25,  40,  40,  55,  55,  70,  70,  85,  85,  100, 100, 115, 115, 130,
-  130, 145, 145, 160, 160, 160, 176, 176, 161, 176, 146, 161, 131, 146, 116,
-  131, 101, 116, 86,  101, 71,  86,  56,  71,  41,  56,  26,  41,  11,  26,
-  11,  11,  12,  12,  12,  27,  27,  42,  42,  57,  57,  72,  72,  87,  87,
-  102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
-  208, 208, 193, 208, 178, 193, 163, 178, 148, 163, 133, 148, 118, 133, 103,
-  118, 88,  103, 73,  88,  58,  73,  43,  58,  28,  43,  13,  28,  13,  13,
-  14,  14,  14,  29,  29,  44,  44,  59,  59,  74,  74,  89,  89,  104, 104,
-  119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
-  224, 224, 225, 240, 210, 225, 195, 210, 180, 195, 165, 180, 150, 165, 135,
-  150, 120, 135, 105, 120, 90,  105, 75,  90,  60,  75,  45,  60,  30,  45,
-  15,  30,  31,  46,  46,  61,  61,  76,  76,  91,  91,  106, 106, 121, 121,
-  136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, 226, 241,
-  227, 242, 212, 227, 197, 212, 182, 197, 167, 182, 152, 167, 137, 152, 122,
-  137, 107, 122, 92,  107, 77,  92,  62,  77,  47,  62,  63,  78,  78,  93,
-  93,  108, 108, 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198,
-  213, 213, 228, 228, 243, 229, 244, 214, 229, 199, 214, 184, 199, 169, 184,
-  154, 169, 139, 154, 124, 139, 109, 124, 94,  109, 79,  94,  95,  110, 110,
-  125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
-  230, 245, 231, 246, 216, 231, 201, 216, 186, 201, 171, 186, 156, 171, 141,
-  156, 126, 141, 111, 126, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202,
-  202, 217, 217, 232, 232, 247, 233, 248, 218, 233, 203, 218, 188, 203, 173,
-  188, 158, 173, 143, 158, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
-  234, 249, 235, 250, 220, 235, 205, 220, 190, 205, 175, 190, 191, 206, 206,
-  221, 221, 236, 236, 251, 237, 252, 222, 237, 207, 222, 223, 238, 238, 253,
-  239, 254, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0,   0,    0,   0,    32,  32,   64,  64,   96,  96,   128, 128,  160, 160,
-  192, 192,  224, 224,  256, 256,  288, 288,  320, 320,  352, 352,  384, 384,
-  416, 416,  448, 448,  480, 480,  512, 512,  544, 544,  576, 576,  608, 608,
-  640, 640,  672, 672,  704, 704,  736, 736,  768, 768,  800, 800,  832, 832,
-  864, 864,  896, 896,  928, 928,  960, 960,  0,   0,    1,   32,   33,  64,
-  65,  96,   97,  128,  129, 160,  161, 192,  193, 224,  225, 256,  257, 288,
-  289, 320,  321, 352,  353, 384,  385, 416,  417, 448,  449, 480,  481, 512,
-  513, 544,  545, 576,  577, 608,  609, 640,  641, 672,  673, 704,  705, 736,
-  737, 768,  769, 800,  801, 832,  833, 864,  865, 896,  897, 928,  929, 960,
-  961, 992,  1,   1,    2,   33,   34,  65,   66,  97,   98,  129,  130, 161,
-  162, 193,  194, 225,  226, 257,  258, 289,  290, 321,  322, 353,  354, 385,
-  386, 417,  418, 449,  450, 481,  482, 513,  514, 545,  546, 577,  578, 609,
-  610, 641,  642, 673,  674, 705,  706, 737,  738, 769,  770, 801,  802, 833,
-  834, 865,  866, 897,  898, 929,  930, 961,  962, 993,  2,   2,    3,   34,
-  35,  66,   67,  98,   99,  130,  131, 162,  163, 194,  195, 226,  227, 258,
-  259, 290,  291, 322,  323, 354,  355, 386,  387, 418,  419, 450,  451, 482,
-  483, 514,  515, 546,  547, 578,  579, 610,  611, 642,  643, 674,  675, 706,
-  707, 738,  739, 770,  771, 802,  803, 834,  835, 866,  867, 898,  899, 930,
-  931, 962,  963, 994,  3,   3,    4,   35,   36,  67,   68,  99,   100, 131,
-  132, 163,  164, 195,  196, 227,  228, 259,  260, 291,  292, 323,  324, 355,
-  356, 387,  388, 419,  420, 451,  452, 483,  484, 515,  516, 547,  548, 579,
-  580, 611,  612, 643,  644, 675,  676, 707,  708, 739,  740, 771,  772, 803,
-  804, 835,  836, 867,  868, 899,  900, 931,  932, 963,  964, 995,  4,   4,
-  5,   36,   37,  68,   69,  100,  101, 132,  133, 164,  165, 196,  197, 228,
-  229, 260,  261, 292,  293, 324,  325, 356,  357, 388,  389, 420,  421, 452,
-  453, 484,  485, 516,  517, 548,  549, 580,  581, 612,  613, 644,  645, 676,
-  677, 708,  709, 740,  741, 772,  773, 804,  805, 836,  837, 868,  869, 900,
-  901, 932,  933, 964,  965, 996,  5,   5,    6,   37,   38,  69,   70,  101,
-  102, 133,  134, 165,  166, 197,  198, 229,  230, 261,  262, 293,  294, 325,
-  326, 357,  358, 389,  390, 421,  422, 453,  454, 485,  486, 517,  518, 549,
-  550, 581,  582, 613,  614, 645,  646, 677,  678, 709,  710, 741,  742, 773,
-  774, 805,  806, 837,  838, 869,  870, 901,  902, 933,  934, 965,  966, 997,
-  6,   6,    7,   38,   39,  70,   71,  102,  103, 134,  135, 166,  167, 198,
-  199, 230,  231, 262,  263, 294,  295, 326,  327, 358,  359, 390,  391, 422,
-  423, 454,  455, 486,  487, 518,  519, 550,  551, 582,  583, 614,  615, 646,
-  647, 678,  679, 710,  711, 742,  743, 774,  775, 806,  807, 838,  839, 870,
-  871, 902,  903, 934,  935, 966,  967, 998,  7,   7,    8,   39,   40,  71,
-  72,  103,  104, 135,  136, 167,  168, 199,  200, 231,  232, 263,  264, 295,
-  296, 327,  328, 359,  360, 391,  392, 423,  424, 455,  456, 487,  488, 519,
-  520, 551,  552, 583,  584, 615,  616, 647,  648, 679,  680, 711,  712, 743,
-  744, 775,  776, 807,  808, 839,  840, 871,  872, 903,  904, 935,  936, 967,
-  968, 999,  8,   8,    9,   40,   41,  72,   73,  104,  105, 136,  137, 168,
-  169, 200,  201, 232,  233, 264,  265, 296,  297, 328,  329, 360,  361, 392,
-  393, 424,  425, 456,  457, 488,  489, 520,  521, 552,  553, 584,  585, 616,
-  617, 648,  649, 680,  681, 712,  713, 744,  745, 776,  777, 808,  809, 840,
-  841, 872,  873, 904,  905, 936,  937, 968,  969, 1000, 9,   9,    10,  41,
-  42,  73,   74,  105,  106, 137,  138, 169,  170, 201,  202, 233,  234, 265,
-  266, 297,  298, 329,  330, 361,  362, 393,  394, 425,  426, 457,  458, 489,
-  490, 521,  522, 553,  554, 585,  586, 617,  618, 649,  650, 681,  682, 713,
-  714, 745,  746, 777,  778, 809,  810, 841,  842, 873,  874, 905,  906, 937,
-  938, 969,  970, 1001, 10,  10,   11,  42,   43,  74,   75,  106,  107, 138,
-  139, 170,  171, 202,  203, 234,  235, 266,  267, 298,  299, 330,  331, 362,
-  363, 394,  395, 426,  427, 458,  459, 490,  491, 522,  523, 554,  555, 586,
-  587, 618,  619, 650,  651, 682,  683, 714,  715, 746,  747, 778,  779, 810,
-  811, 842,  843, 874,  875, 906,  907, 938,  939, 970,  971, 1002, 11,  11,
-  12,  43,   44,  75,   76,  107,  108, 139,  140, 171,  172, 203,  204, 235,
-  236, 267,  268, 299,  300, 331,  332, 363,  364, 395,  396, 427,  428, 459,
-  460, 491,  492, 523,  524, 555,  556, 587,  588, 619,  620, 651,  652, 683,
-  684, 715,  716, 747,  748, 779,  780, 811,  812, 843,  844, 875,  876, 907,
-  908, 939,  940, 971,  972, 1003, 12,  12,   13,  44,   45,  76,   77,  108,
-  109, 140,  141, 172,  173, 204,  205, 236,  237, 268,  269, 300,  301, 332,
-  333, 364,  365, 396,  397, 428,  429, 460,  461, 492,  493, 524,  525, 556,
-  557, 588,  589, 620,  621, 652,  653, 684,  685, 716,  717, 748,  749, 780,
-  781, 812,  813, 844,  845, 876,  877, 908,  909, 940,  941, 972,  973, 1004,
-  13,  13,   14,  45,   46,  77,   78,  109,  110, 141,  142, 173,  174, 205,
-  206, 237,  238, 269,  270, 301,  302, 333,  334, 365,  366, 397,  398, 429,
-  430, 461,  462, 493,  494, 525,  526, 557,  558, 589,  590, 621,  622, 653,
-  654, 685,  686, 717,  718, 749,  750, 781,  782, 813,  814, 845,  846, 877,
-  878, 909,  910, 941,  942, 973,  974, 1005, 14,  14,   15,  46,   47,  78,
-  79,  110,  111, 142,  143, 174,  175, 206,  207, 238,  239, 270,  271, 302,
-  303, 334,  335, 366,  367, 398,  399, 430,  431, 462,  463, 494,  495, 526,
-  527, 558,  559, 590,  591, 622,  623, 654,  655, 686,  687, 718,  719, 750,
-  751, 782,  783, 814,  815, 846,  847, 878,  879, 910,  911, 942,  943, 974,
-  975, 1006, 15,  15,   16,  47,   48,  79,   80,  111,  112, 143,  144, 175,
-  176, 207,  208, 239,  240, 271,  272, 303,  304, 335,  336, 367,  368, 399,
-  400, 431,  432, 463,  464, 495,  496, 527,  528, 559,  560, 591,  592, 623,
-  624, 655,  656, 687,  688, 719,  720, 751,  752, 783,  784, 815,  816, 847,
-  848, 879,  880, 911,  912, 943,  944, 975,  976, 1007, 16,  16,   17,  48,
-  49,  80,   81,  112,  113, 144,  145, 176,  177, 208,  209, 240,  241, 272,
-  273, 304,  305, 336,  337, 368,  369, 400,  401, 432,  433, 464,  465, 496,
-  497, 528,  529, 560,  561, 592,  593, 624,  625, 656,  657, 688,  689, 720,
-  721, 752,  753, 784,  785, 816,  817, 848,  849, 880,  881, 912,  913, 944,
-  945, 976,  977, 1008, 17,  17,   18,  49,   50,  81,   82,  113,  114, 145,
-  146, 177,  178, 209,  210, 241,  242, 273,  274, 305,  306, 337,  338, 369,
-  370, 401,  402, 433,  434, 465,  466, 497,  498, 529,  530, 561,  562, 593,
-  594, 625,  626, 657,  658, 689,  690, 721,  722, 753,  754, 785,  786, 817,
-  818, 849,  850, 881,  882, 913,  914, 945,  946, 977,  978, 1009, 18,  18,
-  19,  50,   51,  82,   83,  114,  115, 146,  147, 178,  179, 210,  211, 242,
-  243, 274,  275, 306,  307, 338,  339, 370,  371, 402,  403, 434,  435, 466,
-  467, 498,  499, 530,  531, 562,  563, 594,  595, 626,  627, 658,  659, 690,
-  691, 722,  723, 754,  755, 786,  787, 818,  819, 850,  851, 882,  883, 914,
-  915, 946,  947, 978,  979, 1010, 19,  19,   20,  51,   52,  83,   84,  115,
-  116, 147,  148, 179,  180, 211,  212, 243,  244, 275,  276, 307,  308, 339,
-  340, 371,  372, 403,  404, 435,  436, 467,  468, 499,  500, 531,  532, 563,
-  564, 595,  596, 627,  628, 659,  660, 691,  692, 723,  724, 755,  756, 787,
-  788, 819,  820, 851,  852, 883,  884, 915,  916, 947,  948, 979,  980, 1011,
-  20,  20,   21,  52,   53,  84,   85,  116,  117, 148,  149, 180,  181, 212,
-  213, 244,  245, 276,  277, 308,  309, 340,  341, 372,  373, 404,  405, 436,
-  437, 468,  469, 500,  501, 532,  533, 564,  565, 596,  597, 628,  629, 660,
-  661, 692,  693, 724,  725, 756,  757, 788,  789, 820,  821, 852,  853, 884,
-  885, 916,  917, 948,  949, 980,  981, 1012, 21,  21,   22,  53,   54,  85,
-  86,  117,  118, 149,  150, 181,  182, 213,  214, 245,  246, 277,  278, 309,
-  310, 341,  342, 373,  374, 405,  406, 437,  438, 469,  470, 501,  502, 533,
-  534, 565,  566, 597,  598, 629,  630, 661,  662, 693,  694, 725,  726, 757,
-  758, 789,  790, 821,  822, 853,  854, 885,  886, 917,  918, 949,  950, 981,
-  982, 1013, 22,  22,   23,  54,   55,  86,   87,  118,  119, 150,  151, 182,
-  183, 214,  215, 246,  247, 278,  279, 310,  311, 342,  343, 374,  375, 406,
-  407, 438,  439, 470,  471, 502,  503, 534,  535, 566,  567, 598,  599, 630,
-  631, 662,  663, 694,  695, 726,  727, 758,  759, 790,  791, 822,  823, 854,
-  855, 886,  887, 918,  919, 950,  951, 982,  983, 1014, 23,  23,   24,  55,
-  56,  87,   88,  119,  120, 151,  152, 183,  184, 215,  216, 247,  248, 279,
-  280, 311,  312, 343,  344, 375,  376, 407,  408, 439,  440, 471,  472, 503,
-  504, 535,  536, 567,  568, 599,  600, 631,  632, 663,  664, 695,  696, 727,
-  728, 759,  760, 791,  792, 823,  824, 855,  856, 887,  888, 919,  920, 951,
-  952, 983,  984, 1015, 24,  24,   25,  56,   57,  88,   89,  120,  121, 152,
-  153, 184,  185, 216,  217, 248,  249, 280,  281, 312,  313, 344,  345, 376,
-  377, 408,  409, 440,  441, 472,  473, 504,  505, 536,  537, 568,  569, 600,
-  601, 632,  633, 664,  665, 696,  697, 728,  729, 760,  761, 792,  793, 824,
-  825, 856,  857, 888,  889, 920,  921, 952,  953, 984,  985, 1016, 25,  25,
-  26,  57,   58,  89,   90,  121,  122, 153,  154, 185,  186, 217,  218, 249,
-  250, 281,  282, 313,  314, 345,  346, 377,  378, 409,  410, 441,  442, 473,
-  474, 505,  506, 537,  538, 569,  570, 601,  602, 633,  634, 665,  666, 697,
-  698, 729,  730, 761,  762, 793,  794, 825,  826, 857,  858, 889,  890, 921,
-  922, 953,  954, 985,  986, 1017, 26,  26,   27,  58,   59,  90,   91,  122,
-  123, 154,  155, 186,  187, 218,  219, 250,  251, 282,  283, 314,  315, 346,
-  347, 378,  379, 410,  411, 442,  443, 474,  475, 506,  507, 538,  539, 570,
-  571, 602,  603, 634,  635, 666,  667, 698,  699, 730,  731, 762,  763, 794,
-  795, 826,  827, 858,  859, 890,  891, 922,  923, 954,  955, 986,  987, 1018,
-  27,  27,   28,  59,   60,  91,   92,  123,  124, 155,  156, 187,  188, 219,
-  220, 251,  252, 283,  284, 315,  316, 347,  348, 379,  380, 411,  412, 443,
-  444, 475,  476, 507,  508, 539,  540, 571,  572, 603,  604, 635,  636, 667,
-  668, 699,  700, 731,  732, 763,  764, 795,  796, 827,  828, 859,  860, 891,
-  892, 923,  924, 955,  956, 987,  988, 1019, 28,  28,   29,  60,   61,  92,
-  93,  124,  125, 156,  157, 188,  189, 220,  221, 252,  253, 284,  285, 316,
-  317, 348,  349, 380,  381, 412,  413, 444,  445, 476,  477, 508,  509, 540,
-  541, 572,  573, 604,  605, 636,  637, 668,  669, 700,  701, 732,  733, 764,
-  765, 796,  797, 828,  829, 860,  861, 892,  893, 924,  925, 956,  957, 988,
-  989, 1020, 29,  29,   30,  61,   62,  93,   94,  125,  126, 157,  158, 189,
-  190, 221,  222, 253,  254, 285,  286, 317,  318, 349,  350, 381,  382, 413,
-  414, 445,  446, 477,  478, 509,  510, 541,  542, 573,  574, 605,  606, 637,
-  638, 669,  670, 701,  702, 733,  734, 765,  766, 797,  798, 829,  830, 861,
-  862, 893,  894, 925,  926, 957,  958, 989,  990, 1021, 30,  30,   31,  62,
-  63,  94,   95,  126,  127, 158,  159, 190,  191, 222,  223, 254,  255, 286,
-  287, 318,  319, 350,  351, 382,  383, 414,  415, 446,  447, 478,  479, 510,
-  511, 542,  543, 574,  575, 606,  607, 638,  639, 670,  671, 702,  703, 734,
-  735, 766,  767, 798,  799, 830,  831, 862,  863, 894,  895, 926,  927, 958,
-  959, 990,  991, 1022, 0,   0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0,   0,    0,   0,    1,   1,    2,   2,    3,   3,    4,   4,    5,   5,
-  6,   6,    7,   7,    8,   8,    9,   9,    10,  10,   11,  11,   12,  12,
-  13,  13,   14,  14,   15,  15,   16,  16,   17,  17,   18,  18,   19,  19,
-  20,  20,   21,  21,   22,  22,   23,  23,   24,  24,   25,  25,   26,  26,
-  27,  27,   28,  28,   29,  29,   30,  30,   0,   0,    1,   32,   2,   33,
-  3,   34,   4,   35,   5,   36,   6,   37,   7,   38,   8,   39,   9,   40,
-  10,  41,   11,  42,   12,  43,   13,  44,   14,  45,   15,  46,   16,  47,
-  17,  48,   18,  49,   19,  50,   20,  51,   21,  52,   22,  53,   23,  54,
-  24,  55,   25,  56,   26,  57,   27,  58,   28,  59,   29,  60,   30,  61,
-  31,  62,   32,  32,   33,  64,   34,  65,   35,  66,   36,  67,   37,  68,
-  38,  69,   39,  70,   40,  71,   41,  72,   42,  73,   43,  74,   44,  75,
-  45,  76,   46,  77,   47,  78,   48,  79,   49,  80,   50,  81,   51,  82,
-  52,  83,   53,  84,   54,  85,   55,  86,   56,  87,   57,  88,   58,  89,
-  59,  90,   60,  91,   61,  92,   62,  93,   63,  94,   64,  64,   65,  96,
-  66,  97,   67,  98,   68,  99,   69,  100,  70,  101,  71,  102,  72,  103,
-  73,  104,  74,  105,  75,  106,  76,  107,  77,  108,  78,  109,  79,  110,
-  80,  111,  81,  112,  82,  113,  83,  114,  84,  115,  85,  116,  86,  117,
-  87,  118,  88,  119,  89,  120,  90,  121,  91,  122,  92,  123,  93,  124,
-  94,  125,  95,  126,  96,  96,   97,  128,  98,  129,  99,  130,  100, 131,
-  101, 132,  102, 133,  103, 134,  104, 135,  105, 136,  106, 137,  107, 138,
-  108, 139,  109, 140,  110, 141,  111, 142,  112, 143,  113, 144,  114, 145,
-  115, 146,  116, 147,  117, 148,  118, 149,  119, 150,  120, 151,  121, 152,
-  122, 153,  123, 154,  124, 155,  125, 156,  126, 157,  127, 158,  128, 128,
-  129, 160,  130, 161,  131, 162,  132, 163,  133, 164,  134, 165,  135, 166,
-  136, 167,  137, 168,  138, 169,  139, 170,  140, 171,  141, 172,  142, 173,
-  143, 174,  144, 175,  145, 176,  146, 177,  147, 178,  148, 179,  149, 180,
-  150, 181,  151, 182,  152, 183,  153, 184,  154, 185,  155, 186,  156, 187,
-  157, 188,  158, 189,  159, 190,  160, 160,  161, 192,  162, 193,  163, 194,
-  164, 195,  165, 196,  166, 197,  167, 198,  168, 199,  169, 200,  170, 201,
-  171, 202,  172, 203,  173, 204,  174, 205,  175, 206,  176, 207,  177, 208,
-  178, 209,  179, 210,  180, 211,  181, 212,  182, 213,  183, 214,  184, 215,
-  185, 216,  186, 217,  187, 218,  188, 219,  189, 220,  190, 221,  191, 222,
-  192, 192,  193, 224,  194, 225,  195, 226,  196, 227,  197, 228,  198, 229,
-  199, 230,  200, 231,  201, 232,  202, 233,  203, 234,  204, 235,  205, 236,
-  206, 237,  207, 238,  208, 239,  209, 240,  210, 241,  211, 242,  212, 243,
-  213, 244,  214, 245,  215, 246,  216, 247,  217, 248,  218, 249,  219, 250,
-  220, 251,  221, 252,  222, 253,  223, 254,  224, 224,  225, 256,  226, 257,
-  227, 258,  228, 259,  229, 260,  230, 261,  231, 262,  232, 263,  233, 264,
-  234, 265,  235, 266,  236, 267,  237, 268,  238, 269,  239, 270,  240, 271,
-  241, 272,  242, 273,  243, 274,  244, 275,  245, 276,  246, 277,  247, 278,
-  248, 279,  249, 280,  250, 281,  251, 282,  252, 283,  253, 284,  254, 285,
-  255, 286,  256, 256,  257, 288,  258, 289,  259, 290,  260, 291,  261, 292,
-  262, 293,  263, 294,  264, 295,  265, 296,  266, 297,  267, 298,  268, 299,
-  269, 300,  270, 301,  271, 302,  272, 303,  273, 304,  274, 305,  275, 306,
-  276, 307,  277, 308,  278, 309,  279, 310,  280, 311,  281, 312,  282, 313,
-  283, 314,  284, 315,  285, 316,  286, 317,  287, 318,  288, 288,  289, 320,
-  290, 321,  291, 322,  292, 323,  293, 324,  294, 325,  295, 326,  296, 327,
-  297, 328,  298, 329,  299, 330,  300, 331,  301, 332,  302, 333,  303, 334,
-  304, 335,  305, 336,  306, 337,  307, 338,  308, 339,  309, 340,  310, 341,
-  311, 342,  312, 343,  313, 344,  314, 345,  315, 346,  316, 347,  317, 348,
-  318, 349,  319, 350,  320, 320,  321, 352,  322, 353,  323, 354,  324, 355,
-  325, 356,  326, 357,  327, 358,  328, 359,  329, 360,  330, 361,  331, 362,
-  332, 363,  333, 364,  334, 365,  335, 366,  336, 367,  337, 368,  338, 369,
-  339, 370,  340, 371,  341, 372,  342, 373,  343, 374,  344, 375,  345, 376,
-  346, 377,  347, 378,  348, 379,  349, 380,  350, 381,  351, 382,  352, 352,
-  353, 384,  354, 385,  355, 386,  356, 387,  357, 388,  358, 389,  359, 390,
-  360, 391,  361, 392,  362, 393,  363, 394,  364, 395,  365, 396,  366, 397,
-  367, 398,  368, 399,  369, 400,  370, 401,  371, 402,  372, 403,  373, 404,
-  374, 405,  375, 406,  376, 407,  377, 408,  378, 409,  379, 410,  380, 411,
-  381, 412,  382, 413,  383, 414,  384, 384,  385, 416,  386, 417,  387, 418,
-  388, 419,  389, 420,  390, 421,  391, 422,  392, 423,  393, 424,  394, 425,
-  395, 426,  396, 427,  397, 428,  398, 429,  399, 430,  400, 431,  401, 432,
-  402, 433,  403, 434,  404, 435,  405, 436,  406, 437,  407, 438,  408, 439,
-  409, 440,  410, 441,  411, 442,  412, 443,  413, 444,  414, 445,  415, 446,
-  416, 416,  417, 448,  418, 449,  419, 450,  420, 451,  421, 452,  422, 453,
-  423, 454,  424, 455,  425, 456,  426, 457,  427, 458,  428, 459,  429, 460,
-  430, 461,  431, 462,  432, 463,  433, 464,  434, 465,  435, 466,  436, 467,
-  437, 468,  438, 469,  439, 470,  440, 471,  441, 472,  442, 473,  443, 474,
-  444, 475,  445, 476,  446, 477,  447, 478,  448, 448,  449, 480,  450, 481,
-  451, 482,  452, 483,  453, 484,  454, 485,  455, 486,  456, 487,  457, 488,
-  458, 489,  459, 490,  460, 491,  461, 492,  462, 493,  463, 494,  464, 495,
-  465, 496,  466, 497,  467, 498,  468, 499,  469, 500,  470, 501,  471, 502,
-  472, 503,  473, 504,  474, 505,  475, 506,  476, 507,  477, 508,  478, 509,
-  479, 510,  480, 480,  481, 512,  482, 513,  483, 514,  484, 515,  485, 516,
-  486, 517,  487, 518,  488, 519,  489, 520,  490, 521,  491, 522,  492, 523,
-  493, 524,  494, 525,  495, 526,  496, 527,  497, 528,  498, 529,  499, 530,
-  500, 531,  501, 532,  502, 533,  503, 534,  504, 535,  505, 536,  506, 537,
-  507, 538,  508, 539,  509, 540,  510, 541,  511, 542,  512, 512,  513, 544,
-  514, 545,  515, 546,  516, 547,  517, 548,  518, 549,  519, 550,  520, 551,
-  521, 552,  522, 553,  523, 554,  524, 555,  525, 556,  526, 557,  527, 558,
-  528, 559,  529, 560,  530, 561,  531, 562,  532, 563,  533, 564,  534, 565,
-  535, 566,  536, 567,  537, 568,  538, 569,  539, 570,  540, 571,  541, 572,
-  542, 573,  543, 574,  544, 544,  545, 576,  546, 577,  547, 578,  548, 579,
-  549, 580,  550, 581,  551, 582,  552, 583,  553, 584,  554, 585,  555, 586,
-  556, 587,  557, 588,  558, 589,  559, 590,  560, 591,  561, 592,  562, 593,
-  563, 594,  564, 595,  565, 596,  566, 597,  567, 598,  568, 599,  569, 600,
-  570, 601,  571, 602,  572, 603,  573, 604,  574, 605,  575, 606,  576, 576,
-  577, 608,  578, 609,  579, 610,  580, 611,  581, 612,  582, 613,  583, 614,
-  584, 615,  585, 616,  586, 617,  587, 618,  588, 619,  589, 620,  590, 621,
-  591, 622,  592, 623,  593, 624,  594, 625,  595, 626,  596, 627,  597, 628,
-  598, 629,  599, 630,  600, 631,  601, 632,  602, 633,  603, 634,  604, 635,
-  605, 636,  606, 637,  607, 638,  608, 608,  609, 640,  610, 641,  611, 642,
-  612, 643,  613, 644,  614, 645,  615, 646,  616, 647,  617, 648,  618, 649,
-  619, 650,  620, 651,  621, 652,  622, 653,  623, 654,  624, 655,  625, 656,
-  626, 657,  627, 658,  628, 659,  629, 660,  630, 661,  631, 662,  632, 663,
-  633, 664,  634, 665,  635, 666,  636, 667,  637, 668,  638, 669,  639, 670,
-  640, 640,  641, 672,  642, 673,  643, 674,  644, 675,  645, 676,  646, 677,
-  647, 678,  648, 679,  649, 680,  650, 681,  651, 682,  652, 683,  653, 684,
-  654, 685,  655, 686,  656, 687,  657, 688,  658, 689,  659, 690,  660, 691,
-  661, 692,  662, 693,  663, 694,  664, 695,  665, 696,  666, 697,  667, 698,
-  668, 699,  669, 700,  670, 701,  671, 702,  672, 672,  673, 704,  674, 705,
-  675, 706,  676, 707,  677, 708,  678, 709,  679, 710,  680, 711,  681, 712,
-  682, 713,  683, 714,  684, 715,  685, 716,  686, 717,  687, 718,  688, 719,
-  689, 720,  690, 721,  691, 722,  692, 723,  693, 724,  694, 725,  695, 726,
-  696, 727,  697, 728,  698, 729,  699, 730,  700, 731,  701, 732,  702, 733,
-  703, 734,  704, 704,  705, 736,  706, 737,  707, 738,  708, 739,  709, 740,
-  710, 741,  711, 742,  712, 743,  713, 744,  714, 745,  715, 746,  716, 747,
-  717, 748,  718, 749,  719, 750,  720, 751,  721, 752,  722, 753,  723, 754,
-  724, 755,  725, 756,  726, 757,  727, 758,  728, 759,  729, 760,  730, 761,
-  731, 762,  732, 763,  733, 764,  734, 765,  735, 766,  736, 736,  737, 768,
-  738, 769,  739, 770,  740, 771,  741, 772,  742, 773,  743, 774,  744, 775,
-  745, 776,  746, 777,  747, 778,  748, 779,  749, 780,  750, 781,  751, 782,
-  752, 783,  753, 784,  754, 785,  755, 786,  756, 787,  757, 788,  758, 789,
-  759, 790,  760, 791,  761, 792,  762, 793,  763, 794,  764, 795,  765, 796,
-  766, 797,  767, 798,  768, 768,  769, 800,  770, 801,  771, 802,  772, 803,
-  773, 804,  774, 805,  775, 806,  776, 807,  777, 808,  778, 809,  779, 810,
-  780, 811,  781, 812,  782, 813,  783, 814,  784, 815,  785, 816,  786, 817,
-  787, 818,  788, 819,  789, 820,  790, 821,  791, 822,  792, 823,  793, 824,
-  794, 825,  795, 826,  796, 827,  797, 828,  798, 829,  799, 830,  800, 800,
-  801, 832,  802, 833,  803, 834,  804, 835,  805, 836,  806, 837,  807, 838,
-  808, 839,  809, 840,  810, 841,  811, 842,  812, 843,  813, 844,  814, 845,
-  815, 846,  816, 847,  817, 848,  818, 849,  819, 850,  820, 851,  821, 852,
-  822, 853,  823, 854,  824, 855,  825, 856,  826, 857,  827, 858,  828, 859,
-  829, 860,  830, 861,  831, 862,  832, 832,  833, 864,  834, 865,  835, 866,
-  836, 867,  837, 868,  838, 869,  839, 870,  840, 871,  841, 872,  842, 873,
-  843, 874,  844, 875,  845, 876,  846, 877,  847, 878,  848, 879,  849, 880,
-  850, 881,  851, 882,  852, 883,  853, 884,  854, 885,  855, 886,  856, 887,
-  857, 888,  858, 889,  859, 890,  860, 891,  861, 892,  862, 893,  863, 894,
-  864, 864,  865, 896,  866, 897,  867, 898,  868, 899,  869, 900,  870, 901,
-  871, 902,  872, 903,  873, 904,  874, 905,  875, 906,  876, 907,  877, 908,
-  878, 909,  879, 910,  880, 911,  881, 912,  882, 913,  883, 914,  884, 915,
-  885, 916,  886, 917,  887, 918,  888, 919,  889, 920,  890, 921,  891, 922,
-  892, 923,  893, 924,  894, 925,  895, 926,  896, 896,  897, 928,  898, 929,
-  899, 930,  900, 931,  901, 932,  902, 933,  903, 934,  904, 935,  905, 936,
-  906, 937,  907, 938,  908, 939,  909, 940,  910, 941,  911, 942,  912, 943,
-  913, 944,  914, 945,  915, 946,  916, 947,  917, 948,  918, 949,  919, 950,
-  920, 951,  921, 952,  922, 953,  923, 954,  924, 955,  925, 956,  926, 957,
-  927, 958,  928, 928,  929, 960,  930, 961,  931, 962,  932, 963,  933, 964,
-  934, 965,  935, 966,  936, 967,  937, 968,  938, 969,  939, 970,  940, 971,
-  941, 972,  942, 973,  943, 974,  944, 975,  945, 976,  946, 977,  947, 978,
-  948, 979,  949, 980,  950, 981,  951, 982,  952, 983,  953, 984,  954, 985,
-  955, 986,  956, 987,  957, 988,  958, 989,  959, 990,  960, 960,  961, 992,
-  962, 993,  963, 994,  964, 995,  965, 996,  966, 997,  967, 998,  968, 999,
-  969, 1000, 970, 1001, 971, 1002, 972, 1003, 973, 1004, 974, 1005, 975, 1006,
-  976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981, 1012, 982, 1013,
-  983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020,
-  990, 1021, 991, 1022, 0,   0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0,   0,    0,   0,    0,   0,    32,  32,   1,   32,  1,   1,    2,   2,
-  2,   33,   33,  64,   64,  64,   96,  96,   65,  96,  34,  65,   3,   34,
-  3,   3,    4,   4,    4,   35,   35,  66,   66,  97,  97,  128,  128, 128,
-  160, 160,  129, 160,  98,  129,  67,  98,   36,  67,  5,   36,   5,   5,
-  6,   6,    6,   37,   37,  68,   68,  99,   99,  130, 130, 161,  161, 192,
-  192, 192,  224, 224,  193, 224,  162, 193,  131, 162, 100, 131,  69,  100,
-  38,  69,   7,   38,   7,   7,    8,   8,    8,   39,  39,  70,   70,  101,
-  101, 132,  132, 163,  163, 194,  194, 225,  225, 256, 256, 256,  288, 288,
-  257, 288,  226, 257,  195, 226,  164, 195,  133, 164, 102, 133,  71,  102,
-  40,  71,   9,   40,   9,   9,    10,  10,   10,  41,  41,  72,   72,  103,
-  103, 134,  134, 165,  165, 196,  196, 227,  227, 258, 258, 289,  289, 320,
-  320, 320,  352, 352,  321, 352,  290, 321,  259, 290, 228, 259,  197, 228,
-  166, 197,  135, 166,  104, 135,  73,  104,  42,  73,  11,  42,   11,  11,
-  12,  12,   12,  43,   43,  74,   74,  105,  105, 136, 136, 167,  167, 198,
-  198, 229,  229, 260,  260, 291,  291, 322,  322, 353, 353, 384,  384, 384,
-  416, 416,  385, 416,  354, 385,  323, 354,  292, 323, 261, 292,  230, 261,
-  199, 230,  168, 199,  137, 168,  106, 137,  75,  106, 44,  75,   13,  44,
-  13,  13,   14,  14,   14,  45,   45,  76,   76,  107, 107, 138,  138, 169,
-  169, 200,  200, 231,  231, 262,  262, 293,  293, 324, 324, 355,  355, 386,
-  386, 417,  417, 448,  448, 448,  480, 480,  449, 480, 418, 449,  387, 418,
-  356, 387,  325, 356,  294, 325,  263, 294,  232, 263, 201, 232,  170, 201,
-  139, 170,  108, 139,  77,  108,  46,  77,   15,  46,  15,  15,   16,  16,
-  16,  47,   47,  78,   78,  109,  109, 140,  140, 171, 171, 202,  202, 233,
-  233, 264,  264, 295,  295, 326,  326, 357,  357, 388, 388, 419,  419, 450,
-  450, 481,  481, 512,  512, 512,  544, 544,  513, 544, 482, 513,  451, 482,
-  420, 451,  389, 420,  358, 389,  327, 358,  296, 327, 265, 296,  234, 265,
-  203, 234,  172, 203,  141, 172,  110, 141,  79,  110, 48,  79,   17,  48,
-  17,  17,   18,  18,   18,  49,   49,  80,   80,  111, 111, 142,  142, 173,
-  173, 204,  204, 235,  235, 266,  266, 297,  297, 328, 328, 359,  359, 390,
-  390, 421,  421, 452,  452, 483,  483, 514,  514, 545, 545, 576,  576, 576,
-  608, 608,  577, 608,  546, 577,  515, 546,  484, 515, 453, 484,  422, 453,
-  391, 422,  360, 391,  329, 360,  298, 329,  267, 298, 236, 267,  205, 236,
-  174, 205,  143, 174,  112, 143,  81,  112,  50,  81,  19,  50,   19,  19,
-  20,  20,   20,  51,   51,  82,   82,  113,  113, 144, 144, 175,  175, 206,
-  206, 237,  237, 268,  268, 299,  299, 330,  330, 361, 361, 392,  392, 423,
-  423, 454,  454, 485,  485, 516,  516, 547,  547, 578, 578, 609,  609, 640,
-  640, 640,  672, 672,  641, 672,  610, 641,  579, 610, 548, 579,  517, 548,
-  486, 517,  455, 486,  424, 455,  393, 424,  362, 393, 331, 362,  300, 331,
-  269, 300,  238, 269,  207, 238,  176, 207,  145, 176, 114, 145,  83,  114,
-  52,  83,   21,  52,   21,  21,   22,  22,   22,  53,  53,  84,   84,  115,
-  115, 146,  146, 177,  177, 208,  208, 239,  239, 270, 270, 301,  301, 332,
-  332, 363,  363, 394,  394, 425,  425, 456,  456, 487, 487, 518,  518, 549,
-  549, 580,  580, 611,  611, 642,  642, 673,  673, 704, 704, 704,  736, 736,
-  705, 736,  674, 705,  643, 674,  612, 643,  581, 612, 550, 581,  519, 550,
-  488, 519,  457, 488,  426, 457,  395, 426,  364, 395, 333, 364,  302, 333,
-  271, 302,  240, 271,  209, 240,  178, 209,  147, 178, 116, 147,  85,  116,
-  54,  85,   23,  54,   23,  23,   24,  24,   24,  55,  55,  86,   86,  117,
-  117, 148,  148, 179,  179, 210,  210, 241,  241, 272, 272, 303,  303, 334,
-  334, 365,  365, 396,  396, 427,  427, 458,  458, 489, 489, 520,  520, 551,
-  551, 582,  582, 613,  613, 644,  644, 675,  675, 706, 706, 737,  737, 768,
-  768, 768,  800, 800,  769, 800,  738, 769,  707, 738, 676, 707,  645, 676,
-  614, 645,  583, 614,  552, 583,  521, 552,  490, 521, 459, 490,  428, 459,
-  397, 428,  366, 397,  335, 366,  304, 335,  273, 304, 242, 273,  211, 242,
-  180, 211,  149, 180,  118, 149,  87,  118,  56,  87,  25,  56,   25,  25,
-  26,  26,   26,  57,   57,  88,   88,  119,  119, 150, 150, 181,  181, 212,
-  212, 243,  243, 274,  274, 305,  305, 336,  336, 367, 367, 398,  398, 429,
-  429, 460,  460, 491,  491, 522,  522, 553,  553, 584, 584, 615,  615, 646,
-  646, 677,  677, 708,  708, 739,  739, 770,  770, 801, 801, 832,  832, 832,
-  864, 864,  833, 864,  802, 833,  771, 802,  740, 771, 709, 740,  678, 709,
-  647, 678,  616, 647,  585, 616,  554, 585,  523, 554, 492, 523,  461, 492,
-  430, 461,  399, 430,  368, 399,  337, 368,  306, 337, 275, 306,  244, 275,
-  213, 244,  182, 213,  151, 182,  120, 151,  89,  120, 58,  89,   27,  58,
-  27,  27,   28,  28,   28,  59,   59,  90,   90,  121, 121, 152,  152, 183,
-  183, 214,  214, 245,  245, 276,  276, 307,  307, 338, 338, 369,  369, 400,
-  400, 431,  431, 462,  462, 493,  493, 524,  524, 555, 555, 586,  586, 617,
-  617, 648,  648, 679,  679, 710,  710, 741,  741, 772, 772, 803,  803, 834,
-  834, 865,  865, 896,  896, 896,  928, 928,  897, 928, 866, 897,  835, 866,
-  804, 835,  773, 804,  742, 773,  711, 742,  680, 711, 649, 680,  618, 649,
-  587, 618,  556, 587,  525, 556,  494, 525,  463, 494, 432, 463,  401, 432,
-  370, 401,  339, 370,  308, 339,  277, 308,  246, 277, 215, 246,  184, 215,
-  153, 184,  122, 153,  91,  122,  60,  91,   29,  60,  29,  29,   30,  30,
-  30,  61,   61,  92,   92,  123,  123, 154,  154, 185, 185, 216,  216, 247,
-  247, 278,  278, 309,  309, 340,  340, 371,  371, 402, 402, 433,  433, 464,
-  464, 495,  495, 526,  526, 557,  557, 588,  588, 619, 619, 650,  650, 681,
-  681, 712,  712, 743,  743, 774,  774, 805,  805, 836, 836, 867,  867, 898,
-  898, 929,  929, 960,  960, 960,  961, 992,  930, 961, 899, 930,  868, 899,
-  837, 868,  806, 837,  775, 806,  744, 775,  713, 744, 682, 713,  651, 682,
-  620, 651,  589, 620,  558, 589,  527, 558,  496, 527, 465, 496,  434, 465,
-  403, 434,  372, 403,  341, 372,  310, 341,  279, 310, 248, 279,  217, 248,
-  186, 217,  155, 186,  124, 155,  93,  124,  62,  93,  31,  62,   63,  94,
-  94,  125,  125, 156,  156, 187,  187, 218,  218, 249, 249, 280,  280, 311,
-  311, 342,  342, 373,  373, 404,  404, 435,  435, 466, 466, 497,  497, 528,
-  528, 559,  559, 590,  590, 621,  621, 652,  652, 683, 683, 714,  714, 745,
-  745, 776,  776, 807,  807, 838,  838, 869,  869, 900, 900, 931,  931, 962,
-  962, 993,  963, 994,  932, 963,  901, 932,  870, 901, 839, 870,  808, 839,
-  777, 808,  746, 777,  715, 746,  684, 715,  653, 684, 622, 653,  591, 622,
-  560, 591,  529, 560,  498, 529,  467, 498,  436, 467, 405, 436,  374, 405,
-  343, 374,  312, 343,  281, 312,  250, 281,  219, 250, 188, 219,  157, 188,
-  126, 157,  95,  126,  127, 158,  158, 189,  189, 220, 220, 251,  251, 282,
-  282, 313,  313, 344,  344, 375,  375, 406,  406, 437, 437, 468,  468, 499,
-  499, 530,  530, 561,  561, 592,  592, 623,  623, 654, 654, 685,  685, 716,
-  716, 747,  747, 778,  778, 809,  809, 840,  840, 871, 871, 902,  902, 933,
-  933, 964,  964, 995,  965, 996,  934, 965,  903, 934, 872, 903,  841, 872,
-  810, 841,  779, 810,  748, 779,  717, 748,  686, 717, 655, 686,  624, 655,
-  593, 624,  562, 593,  531, 562,  500, 531,  469, 500, 438, 469,  407, 438,
-  376, 407,  345, 376,  314, 345,  283, 314,  252, 283, 221, 252,  190, 221,
-  159, 190,  191, 222,  222, 253,  253, 284,  284, 315, 315, 346,  346, 377,
-  377, 408,  408, 439,  439, 470,  470, 501,  501, 532, 532, 563,  563, 594,
-  594, 625,  625, 656,  656, 687,  687, 718,  718, 749, 749, 780,  780, 811,
-  811, 842,  842, 873,  873, 904,  904, 935,  935, 966, 966, 997,  967, 998,
-  936, 967,  905, 936,  874, 905,  843, 874,  812, 843, 781, 812,  750, 781,
-  719, 750,  688, 719,  657, 688,  626, 657,  595, 626, 564, 595,  533, 564,
-  502, 533,  471, 502,  440, 471,  409, 440,  378, 409, 347, 378,  316, 347,
-  285, 316,  254, 285,  223, 254,  255, 286,  286, 317, 317, 348,  348, 379,
-  379, 410,  410, 441,  441, 472,  472, 503,  503, 534, 534, 565,  565, 596,
-  596, 627,  627, 658,  658, 689,  689, 720,  720, 751, 751, 782,  782, 813,
-  813, 844,  844, 875,  875, 906,  906, 937,  937, 968, 968, 999,  969, 1000,
-  938, 969,  907, 938,  876, 907,  845, 876,  814, 845, 783, 814,  752, 783,
-  721, 752,  690, 721,  659, 690,  628, 659,  597, 628, 566, 597,  535, 566,
-  504, 535,  473, 504,  442, 473,  411, 442,  380, 411, 349, 380,  318, 349,
-  287, 318,  319, 350,  350, 381,  381, 412,  412, 443, 443, 474,  474, 505,
-  505, 536,  536, 567,  567, 598,  598, 629,  629, 660, 660, 691,  691, 722,
-  722, 753,  753, 784,  784, 815,  815, 846,  846, 877, 877, 908,  908, 939,
-  939, 970,  970, 1001, 971, 1002, 940, 971,  909, 940, 878, 909,  847, 878,
-  816, 847,  785, 816,  754, 785,  723, 754,  692, 723, 661, 692,  630, 661,
-  599, 630,  568, 599,  537, 568,  506, 537,  475, 506, 444, 475,  413, 444,
-  382, 413,  351, 382,  383, 414,  414, 445,  445, 476, 476, 507,  507, 538,
-  538, 569,  569, 600,  600, 631,  631, 662,  662, 693, 693, 724,  724, 755,
-  755, 786,  786, 817,  817, 848,  848, 879,  879, 910, 910, 941,  941, 972,
-  972, 1003, 973, 1004, 942, 973,  911, 942,  880, 911, 849, 880,  818, 849,
-  787, 818,  756, 787,  725, 756,  694, 725,  663, 694, 632, 663,  601, 632,
-  570, 601,  539, 570,  508, 539,  477, 508,  446, 477, 415, 446,  447, 478,
-  478, 509,  509, 540,  540, 571,  571, 602,  602, 633, 633, 664,  664, 695,
-  695, 726,  726, 757,  757, 788,  788, 819,  819, 850, 850, 881,  881, 912,
-  912, 943,  943, 974,  974, 1005, 975, 1006, 944, 975, 913, 944,  882, 913,
-  851, 882,  820, 851,  789, 820,  758, 789,  727, 758, 696, 727,  665, 696,
-  634, 665,  603, 634,  572, 603,  541, 572,  510, 541, 479, 510,  511, 542,
-  542, 573,  573, 604,  604, 635,  635, 666,  666, 697, 697, 728,  728, 759,
-  759, 790,  790, 821,  821, 852,  852, 883,  883, 914, 914, 945,  945, 976,
-  976, 1007, 977, 1008, 946, 977,  915, 946,  884, 915, 853, 884,  822, 853,
-  791, 822,  760, 791,  729, 760,  698, 729,  667, 698, 636, 667,  605, 636,
-  574, 605,  543, 574,  575, 606,  606, 637,  637, 668, 668, 699,  699, 730,
-  730, 761,  761, 792,  792, 823,  823, 854,  854, 885, 885, 916,  916, 947,
-  947, 978,  978, 1009, 979, 1010, 948, 979,  917, 948, 886, 917,  855, 886,
-  824, 855,  793, 824,  762, 793,  731, 762,  700, 731, 669, 700,  638, 669,
-  607, 638,  639, 670,  670, 701,  701, 732,  732, 763, 763, 794,  794, 825,
-  825, 856,  856, 887,  887, 918,  918, 949,  949, 980, 980, 1011, 981, 1012,
-  950, 981,  919, 950,  888, 919,  857, 888,  826, 857, 795, 826,  764, 795,
-  733, 764,  702, 733,  671, 702,  703, 734,  734, 765, 765, 796,  796, 827,
-  827, 858,  858, 889,  889, 920,  920, 951,  951, 982, 982, 1013, 983, 1014,
-  952, 983,  921, 952,  890, 921,  859, 890,  828, 859, 797, 828,  766, 797,
-  735, 766,  767, 798,  798, 829,  829, 860,  860, 891, 891, 922,  922, 953,
-  953, 984,  984, 1015, 985, 1016, 954, 985,  923, 954, 892, 923,  861, 892,
-  830, 861,  799, 830,  831, 862,  862, 893,  893, 924, 924, 955,  955, 986,
-  986, 1017, 987, 1018, 956, 987,  925, 956,  894, 925, 863, 894,  895, 926,
-  926, 957,  957, 988,  988, 1019, 989, 1020, 958, 989, 927, 958,  959, 990,
-  990, 1021, 991, 1022, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = {
-  0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15
-};
+                av1_default_iscan_4x4[16]) = { 0, 1, 5,  6,  2, 4,  7,  12,
+                                               3, 8, 11, 13, 9, 10, 14, 15 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
   0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
@@ -3201,535 +1664,385 @@
 };
 
 const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
-  { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-  { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-  { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors },
-  { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+  { default_scan_4x4, av1_default_iscan_4x4 },
+  { default_scan_8x8, av1_default_iscan_8x8 },
+  { default_scan_16x16, av1_default_iscan_16x16 },
+  { default_scan_32x32, av1_default_iscan_32x32 },
   // Half of the coefficients of tx64 at higher frequencies are set to
   // zeros. So tx32's scan order is used.
-  { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+  { default_scan_32x32, av1_default_iscan_32x32 },
 };
 
 const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
   {
       // TX_4X4
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { default_scan_4x4, av1_default_iscan_4x4 },
+      { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+      { mcol_scan_4x4, av1_mcol_iscan_4x4 },
+      { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+      { mcol_scan_4x4, av1_mcol_iscan_4x4 },
+      { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+      { mcol_scan_4x4, av1_mcol_iscan_4x4 },
   },
   {
       // TX_8X8
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
-      { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
-      { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
-      { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
-      { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
-      { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { default_scan_8x8, av1_default_iscan_8x8 },
+      { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+      { mcol_scan_8x8, av1_mcol_iscan_8x8 },
+      { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+      { mcol_scan_8x8, av1_mcol_iscan_8x8 },
+      { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+      { mcol_scan_8x8, av1_mcol_iscan_8x8 },
   },
   {
       // TX_16X16
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
-      { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
-      { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
-      { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
-      { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
-      { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { default_scan_16x16, av1_default_iscan_16x16 },
+      { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+      { mcol_scan_16x16, av1_mcol_iscan_16x16 },
+      { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+      { mcol_scan_16x16, av1_mcol_iscan_16x16 },
+      { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+      { mcol_scan_16x16, av1_mcol_iscan_16x16 },
   },
   {
       // TX_32X32
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
   },
   {
       // TX_64X64
       // Half of the coefficients of tx64 at higher frequencies are set to
       // zeros. So tx32's scan order is used.
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
   },
   {
       // TX_4X8
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
-      { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
-      { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
-      { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { default_scan_4x8, av1_default_iscan_4x8 },
+      { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+      { mcol_scan_4x8, av1_mcol_iscan_4x8 },
+      { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+      { mcol_scan_4x8, av1_mcol_iscan_4x8 },
+      { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+      { mcol_scan_4x8, av1_mcol_iscan_4x8 },
   },
   {
       // TX_8X4
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
-      { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
-      { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
-      { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { default_scan_8x4, av1_default_iscan_8x4 },
+      { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+      { mcol_scan_8x4, av1_mcol_iscan_8x4 },
+      { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+      { mcol_scan_8x4, av1_mcol_iscan_8x4 },
+      { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+      { mcol_scan_8x4, av1_mcol_iscan_8x4 },
   },
   {
       // TX_8X16
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
-      { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
-      { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
-      { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { default_scan_8x16, av1_default_iscan_8x16 },
+      { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+      { mcol_scan_8x16, av1_mcol_iscan_8x16 },
+      { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+      { mcol_scan_8x16, av1_mcol_iscan_8x16 },
+      { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+      { mcol_scan_8x16, av1_mcol_iscan_8x16 },
   },
   {
       // TX_16X8
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
-      { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
-      { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
-      { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { default_scan_16x8, av1_default_iscan_16x8 },
+      { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+      { mcol_scan_16x8, av1_mcol_iscan_16x8 },
+      { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+      { mcol_scan_16x8, av1_mcol_iscan_16x8 },
+      { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+      { mcol_scan_16x8, av1_mcol_iscan_16x8 },
   },
   {
       // TX_16X32
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32 },
   },
   {
       // TX_32X16
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16 },
   },
   {
       // TX_32X64
       // Half of the coefficients of tx64 at higher frequencies are set to
       // zeros. So tx32's scan order is used.
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
   },
   {
       // TX_64X32
       // Half of the coefficients of tx64 at higher frequencies are set to
       // zeros. So tx32's scan order is used.
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { default_scan_32x32, av1_default_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32 },
   },
   {
       // TX_4X16
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { default_scan_4x16, av1_default_iscan_4x16,
-        default_scan_4x16_neighbors },
-      { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
-      { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
-      { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
-      { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
-      { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
-      { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { default_scan_4x16, av1_default_iscan_4x16 },
+      { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+      { mcol_scan_4x16, av1_mcol_iscan_4x16 },
+      { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+      { mcol_scan_4x16, av1_mcol_iscan_4x16 },
+      { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+      { mcol_scan_4x16, av1_mcol_iscan_4x16 },
   },
   {
       // TX_16X4
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { default_scan_16x4, av1_default_iscan_16x4,
-        default_scan_16x4_neighbors },
-      { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
-      { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
-      { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
-      { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
-      { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
-      { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { default_scan_16x4, av1_default_iscan_16x4 },
+      { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+      { mcol_scan_16x4, av1_mcol_iscan_16x4 },
+      { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+      { mcol_scan_16x4, av1_mcol_iscan_16x4 },
+      { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+      { mcol_scan_16x4, av1_mcol_iscan_16x4 },
   },
   {
       // TX_8X32
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { default_scan_8x32, av1_default_iscan_8x32,
-        default_scan_8x32_neighbors },
-      { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
-      { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
-      { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
-      { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
-      { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
-      { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { default_scan_8x32, av1_default_iscan_8x32 },
+      { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+      { mcol_scan_8x32, av1_mcol_iscan_8x32 },
+      { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+      { mcol_scan_8x32, av1_mcol_iscan_8x32 },
+      { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+      { mcol_scan_8x32, av1_mcol_iscan_8x32 },
   },
   {
       // TX_32X8
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { default_scan_32x8, av1_default_iscan_32x8,
-        default_scan_32x8_neighbors },
-      { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
-      { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
-      { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
-      { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
-      { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
-      { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { default_scan_32x8, av1_default_iscan_32x8 },
+      { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+      { mcol_scan_32x8, av1_mcol_iscan_32x8 },
+      { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+      { mcol_scan_32x8, av1_mcol_iscan_32x8 },
+      { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+      { mcol_scan_32x8, av1_mcol_iscan_32x8 },
   },
   {
       // TX_16X64
       // Half of the coefficients of tx64 at higher frequencies are set to
       // zeros. So tx32's scan order is used.
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { default_scan_16x32, av1_default_iscan_16x32 },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32 },
   },
   {
       // TX_64X16
       // Half of the coefficients of tx64 at higher frequencies are set to
       // zeros. So tx32's scan order is used.
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { default_scan_32x16, av1_default_iscan_32x16 },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16 },
   },
 };
diff --git a/libaom/av1/common/scan.h b/libaom/av1/common/scan.h
index f9c3392..d9620e1 100644
--- a/libaom/av1/common/scan.h
+++ b/libaom/av1/common/scan.h
@@ -15,9 +15,9 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
-#include "av1/common/enums.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
+#include "av1/common/enums.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libaom/av1/common/seg_common.c b/libaom/av1/common/seg_common.c
index 4650903..60b1851 100644
--- a/libaom/av1/common/seg_common.c
+++ b/libaom/av1/common/seg_common.c
@@ -39,7 +39,7 @@
   av1_zero(seg->feature_mask);
 }
 
-void calculate_segdata(struct segmentation *seg) {
+void av1_calculate_segdata(struct segmentation *seg) {
   seg->segid_preskip = 0;
   seg->last_active_segid = 0;
   for (int i = 0; i < MAX_SEGMENTS; i++) {
diff --git a/libaom/av1/common/seg_common.h b/libaom/av1/common/seg_common.h
index fa7894c..aeb9c17 100644
--- a/libaom/av1/common/seg_common.h
+++ b/libaom/av1/common/seg_common.h
@@ -83,7 +83,7 @@
 void av1_enable_segfeature(struct segmentation *seg, int segment_id,
                            SEG_LVL_FEATURES feature_id);
 
-void calculate_segdata(struct segmentation *seg);
+void av1_calculate_segdata(struct segmentation *seg);
 
 int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
 
diff --git a/libaom/av1/common/thread_common.c b/libaom/av1/common/thread_common.c
index 14406e6..f3c8795 100644
--- a/libaom/av1/common/thread_common.c
+++ b/libaom/av1/common/thread_common.c
@@ -206,7 +206,7 @@
 
 static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
                             int stop,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                             int is_decoding,
 #endif
                             int plane_start, int plane_end) {
@@ -223,7 +223,7 @@
         continue;
       else if (plane == 2 && !(cm->lf.filter_level_v))
         continue;
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
       int step = MAX_MIB_SIZE;
       if (is_decoding) {
         step = MI_SIZE_64X64;
@@ -268,7 +268,8 @@
     struct macroblockd_plane *planes, MACROBLOCKD *xd,
     AV1LfSync *const lf_sync) {
   const int sb_cols =
-      ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >>
+      MAX_MIB_SIZE_LOG2;
   int mi_row, mi_col, plane, dir;
   int r, c;
 
@@ -282,7 +283,8 @@
       r = mi_row >> MAX_MIB_SIZE_LOG2;
 
       if (dir == 0) {
-        for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
+             mi_col += MAX_MIB_SIZE) {
           c = mi_col >> MAX_MIB_SIZE_LOG2;
 
           av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
@@ -293,7 +295,8 @@
           sync_write(lf_sync, r, c, sb_cols, plane);
         }
       } else if (dir == 1) {
-        for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
+             mi_col += MAX_MIB_SIZE) {
           c = mi_col >> MAX_MIB_SIZE_LOG2;
 
           // Wait for vertical edge filtering of the top-right block to be
@@ -325,13 +328,14 @@
   return 1;
 }
 
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
 static INLINE void thread_loop_filter_bitmask_rows(
     const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
     struct macroblockd_plane *planes, MACROBLOCKD *xd,
     AV1LfSync *const lf_sync) {
   const int sb_cols =
-      ALIGN_POWER_OF_TWO(cm->mi_cols, MIN_MIB_SIZE_LOG2) >> MIN_MIB_SIZE_LOG2;
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MIN_MIB_SIZE_LOG2) >>
+      MIN_MIB_SIZE_LOG2;
   int mi_row, mi_col, plane, dir;
   int r, c;
   (void)xd;
@@ -346,7 +350,8 @@
       r = mi_row >> MIN_MIB_SIZE_LOG2;
 
       if (dir == 0) {
-        for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_SIZE_64X64) {
+        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
+             mi_col += MI_SIZE_64X64) {
           c = mi_col >> MIN_MIB_SIZE_LOG2;
 
           av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
@@ -357,7 +362,8 @@
           sync_write(lf_sync, r, c, sb_cols, plane);
         }
       } else if (dir == 1) {
-        for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_SIZE_64X64) {
+        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
+             mi_col += MI_SIZE_64X64) {
           c = mi_col >> MIN_MIB_SIZE_LOG2;
 
           // Wait for vertical edge filtering of the top-right block to be
@@ -388,30 +394,31 @@
                                   lf_data->planes, lf_data->xd, lf_sync);
   return 1;
 }
-#endif  // LOOP_FILTER_BITMASK
+#endif  // CONFIG_LPF_MASK
 
 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                 MACROBLOCKD *xd, int start, int stop,
                                 int plane_start, int plane_end,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                                 int is_decoding,
 #endif
                                 AVxWorker *workers, int nworkers,
                                 AV1LfSync *lf_sync) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
   int sb_rows;
   if (is_decoding) {
-    sb_rows =
-        ALIGN_POWER_OF_TWO(cm->mi_rows, MIN_MIB_SIZE_LOG2) >> MIN_MIB_SIZE_LOG2;
+    sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MIN_MIB_SIZE_LOG2) >>
+              MIN_MIB_SIZE_LOG2;
   } else {
-    sb_rows =
-        ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+    sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >>
+              MAX_MIB_SIZE_LOG2;
   }
 #else
   // Number of superblock rows and cols
   const int sb_rows =
-      ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >>
+      MAX_MIB_SIZE_LOG2;
 #endif
   const int num_workers = nworkers;
   int i;
@@ -429,7 +436,7 @@
   }
 
   enqueue_lf_jobs(lf_sync, cm, start, stop,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                   is_decoding,
 #endif
                   plane_start, plane_end);
@@ -439,7 +446,7 @@
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
     if (is_decoding) {
       worker->hook = loop_filter_bitmask_row_worker;
     } else {
@@ -471,7 +478,7 @@
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                               MACROBLOCKD *xd, int plane_start, int plane_end,
                               int partial_frame,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                               int is_decoding,
 #endif
                               AVxWorker *workers, int num_workers,
@@ -479,16 +486,16 @@
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 
   start_mi_row = 0;
-  mi_rows_to_filter = cm->mi_rows;
-  if (partial_frame && cm->mi_rows > 8) {
-    start_mi_row = cm->mi_rows >> 1;
+  mi_rows_to_filter = cm->mi_params.mi_rows;
+  if (partial_frame && cm->mi_params.mi_rows > 8) {
+    start_mi_row = cm->mi_params.mi_rows >> 1;
     start_mi_row &= 0xfffffff8;
-    mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+    mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8);
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
   av1_loop_filter_frame_init(cm, plane_start, plane_end);
 
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
   if (is_decoding) {
     cm->is_decoding = is_decoding;
     // TODO(chengchen): currently use one thread to build bitmasks for the
@@ -909,7 +916,7 @@
                                           AV1_COMMON *cm, int optimized_lr,
                                           AVxWorker *workers, int num_workers,
                                           AV1LrSync *lr_sync, void *lr_ctxt) {
-  assert(!cm->all_lossless);
+  assert(!cm->features.all_lossless);
 
   const int num_planes = av1_num_planes(cm);
 
diff --git a/libaom/av1/common/thread_common.h b/libaom/av1/common/thread_common.h
index e7dbb8b..7397f1c 100644
--- a/libaom/av1/common/thread_common.h
+++ b/libaom/av1/common/thread_common.h
@@ -101,9 +101,9 @@
 void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
 
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                              struct macroblockd *mbd, int plane_start,
+                              struct macroblockd *xd, int plane_start,
                               int plane_end, int partial_frame,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                               int is_decoding,
 #endif
                               AVxWorker *workers, int num_workers,
diff --git a/libaom/av1/common/tile_common.c b/libaom/av1/common/tile_common.c
index 02f50f5..1b11bd7 100644
--- a/libaom/av1/common/tile_common.c
+++ b/libaom/av1/common/tile_common.c
@@ -9,9 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/common/tile_common.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
 #include "aom_dsp/aom_dsp_common.h"
 
 void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
@@ -28,117 +28,126 @@
 }
 
 void av1_get_tile_limits(AV1_COMMON *const cm) {
-  int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
-  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
-  int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
-  int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  CommonTileParams *const tiles = &cm->tiles;
+  const int mi_cols =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
+  const int mi_rows =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
+  const int sb_cols = mi_cols >> seq_params->mib_size_log2;
+  const int sb_rows = mi_rows >> seq_params->mib_size_log2;
 
-  int sb_size_log2 = cm->seq_params.mib_size_log2 + MI_SIZE_LOG2;
-  cm->max_tile_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
-  int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
+  const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2;
+  tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
+  const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
 
-  cm->min_log2_tile_cols = tile_log2(cm->max_tile_width_sb, sb_cols);
-  cm->max_log2_tile_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
-  cm->max_log2_tile_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
-  cm->min_log2_tiles = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
-  cm->min_log2_tiles = AOMMAX(cm->min_log2_tiles, cm->min_log2_tile_cols);
+  tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols);
+  tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
+  tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
+  tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
+  tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols);
 }
 
-void av1_calculate_tile_cols(AV1_COMMON *const cm) {
-  int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
-  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
-  int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
-  int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+void av1_calculate_tile_cols(const SequenceHeader *const seq_params,
+                             int cm_mi_rows, int cm_mi_cols,
+                             CommonTileParams *const tiles) {
+  int mi_cols = ALIGN_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2);
+  int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
+  int sb_cols = mi_cols >> seq_params->mib_size_log2;
+  int sb_rows = mi_rows >> seq_params->mib_size_log2;
   int i;
 
   // This will be overridden if there is at least two columns of tiles
   // (otherwise there is no inner tile width)
-  cm->min_inner_tile_width = -1;
+  tiles->min_inner_width = -1;
 
-  if (cm->uniform_tile_spacing_flag) {
+  if (tiles->uniform_spacing) {
     int start_sb;
-    int size_sb = ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols);
-    size_sb >>= cm->log2_tile_cols;
+    int size_sb = ALIGN_POWER_OF_TWO(sb_cols, tiles->log2_cols);
+    size_sb >>= tiles->log2_cols;
     assert(size_sb > 0);
     for (i = 0, start_sb = 0; start_sb < sb_cols; i++) {
-      cm->tile_col_start_sb[i] = start_sb;
+      tiles->col_start_sb[i] = start_sb;
       start_sb += size_sb;
     }
-    cm->tile_cols = i;
-    cm->tile_col_start_sb[i] = sb_cols;
-    cm->min_log2_tile_rows = AOMMAX(cm->min_log2_tiles - cm->log2_tile_cols, 0);
-    cm->max_tile_height_sb = sb_rows >> cm->min_log2_tile_rows;
+    tiles->cols = i;
+    tiles->col_start_sb[i] = sb_cols;
+    tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0);
+    tiles->max_height_sb = sb_rows >> tiles->min_log2_rows;
 
-    cm->tile_width = size_sb << cm->seq_params.mib_size_log2;
-    cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
-    if (cm->tile_cols > 1) {
-      cm->min_inner_tile_width = cm->tile_width;
+    tiles->width = size_sb << seq_params->mib_size_log2;
+    tiles->width = AOMMIN(tiles->width, cm_mi_cols);
+    if (tiles->cols > 1) {
+      tiles->min_inner_width = tiles->width;
     }
   } else {
     int max_tile_area_sb = (sb_rows * sb_cols);
     int widest_tile_sb = 1;
     int narrowest_inner_tile_sb = 65536;
-    cm->log2_tile_cols = tile_log2(1, cm->tile_cols);
-    for (i = 0; i < cm->tile_cols; i++) {
-      int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
+    tiles->log2_cols = tile_log2(1, tiles->cols);
+    for (i = 0; i < tiles->cols; i++) {
+      int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
       widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
       // ignore the rightmost tile in frame for determining the narrowest
-      if (i < cm->tile_cols - 1)
+      if (i < tiles->cols - 1)
         narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb);
     }
-    if (cm->min_log2_tiles) {
-      max_tile_area_sb >>= (cm->min_log2_tiles + 1);
+    if (tiles->min_log2) {
+      max_tile_area_sb >>= (tiles->min_log2 + 1);
     }
-    cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
-    if (cm->tile_cols > 1) {
-      cm->min_inner_tile_width = narrowest_inner_tile_sb
-                                 << cm->seq_params.mib_size_log2;
+    tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
+    if (tiles->cols > 1) {
+      tiles->min_inner_width = narrowest_inner_tile_sb
+                               << seq_params->mib_size_log2;
     }
   }
 }
 
-void av1_calculate_tile_rows(AV1_COMMON *const cm) {
-  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
-  int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+void av1_calculate_tile_rows(const SequenceHeader *const seq_params,
+                             int cm_mi_rows, CommonTileParams *const tiles) {
+  int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
+  int sb_rows = mi_rows >> seq_params->mib_size_log2;
   int start_sb, size_sb, i;
 
-  if (cm->uniform_tile_spacing_flag) {
-    size_sb = ALIGN_POWER_OF_TWO(sb_rows, cm->log2_tile_rows);
-    size_sb >>= cm->log2_tile_rows;
+  if (tiles->uniform_spacing) {
+    size_sb = ALIGN_POWER_OF_TWO(sb_rows, tiles->log2_rows);
+    size_sb >>= tiles->log2_rows;
     assert(size_sb > 0);
     for (i = 0, start_sb = 0; start_sb < sb_rows; i++) {
-      cm->tile_row_start_sb[i] = start_sb;
+      tiles->row_start_sb[i] = start_sb;
       start_sb += size_sb;
     }
-    cm->tile_rows = i;
-    cm->tile_row_start_sb[i] = sb_rows;
+    tiles->rows = i;
+    tiles->row_start_sb[i] = sb_rows;
 
-    cm->tile_height = size_sb << cm->seq_params.mib_size_log2;
-    cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
+    tiles->height = size_sb << seq_params->mib_size_log2;
+    tiles->height = AOMMIN(tiles->height, cm_mi_rows);
   } else {
-    cm->log2_tile_rows = tile_log2(1, cm->tile_rows);
+    tiles->log2_rows = tile_log2(1, tiles->rows);
   }
 }
 
 void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
-  assert(row < cm->tile_rows);
-  int mi_row_start = cm->tile_row_start_sb[row] << cm->seq_params.mib_size_log2;
-  int mi_row_end = cm->tile_row_start_sb[row + 1]
+  assert(row < cm->tiles.rows);
+  int mi_row_start = cm->tiles.row_start_sb[row]
+                     << cm->seq_params.mib_size_log2;
+  int mi_row_end = cm->tiles.row_start_sb[row + 1]
                    << cm->seq_params.mib_size_log2;
   tile->tile_row = row;
   tile->mi_row_start = mi_row_start;
-  tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_rows);
+  tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows);
   assert(tile->mi_row_end > tile->mi_row_start);
 }
 
 void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
-  assert(col < cm->tile_cols);
-  int mi_col_start = cm->tile_col_start_sb[col] << cm->seq_params.mib_size_log2;
-  int mi_col_end = cm->tile_col_start_sb[col + 1]
+  assert(col < cm->tiles.cols);
+  int mi_col_start = cm->tiles.col_start_sb[col]
+                     << cm->seq_params.mib_size_log2;
+  int mi_col_end = cm->tiles.col_start_sb[col + 1]
                    << cm->seq_params.mib_size_log2;
   tile->tile_col = col;
   tile->mi_col_start = mi_col_start;
-  tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_cols);
+  tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols);
   assert(tile->mi_col_end > tile->mi_col_start);
 }
 
@@ -198,21 +207,22 @@
 }
 
 void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
-  if (cm->uniform_tile_spacing_flag) {
-    *w = cm->tile_width;
-    *h = cm->tile_height;
+  const CommonTileParams *const tiles = &cm->tiles;
+  if (tiles->uniform_spacing) {
+    *w = tiles->width;
+    *h = tiles->height;
   } else {
-    for (int i = 0; i < cm->tile_cols; ++i) {
+    for (int i = 0; i < tiles->cols; ++i) {
       const int tile_width_sb =
-          cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
+          tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
       const int tile_w = tile_width_sb * cm->seq_params.mib_size;
       assert(i == 0 || tile_w == *w);  // ensure all tiles have same dimension
       *w = tile_w;
     }
 
-    for (int i = 0; i < cm->tile_rows; ++i) {
+    for (int i = 0; i < tiles->rows; ++i) {
       const int tile_height_sb =
-          cm->tile_row_start_sb[i + 1] - cm->tile_row_start_sb[i];
+          tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
       const int tile_h = tile_height_sb * cm->seq_params.mib_size;
       assert(i == 0 || tile_h == *h);  // ensure all tiles have same dimension
       *h = tile_h;
@@ -220,10 +230,10 @@
   }
 }
 
-int is_min_tile_width_satisfied(const AV1_COMMON *cm) {
+int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) {
   // Disable check if there is a single tile col in the frame
-  if (cm->tile_cols == 1) return 1;
+  if (cm->tiles.cols == 1) return 1;
 
-  return ((cm->min_inner_tile_width << MI_SIZE_LOG2) >=
+  return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >=
           (64 << av1_superres_scaled(cm)));
 }
diff --git a/libaom/av1/common/tile_common.h b/libaom/av1/common/tile_common.h
index a235f2d..ca7c5f4 100644
--- a/libaom/av1/common/tile_common.h
+++ b/libaom/av1/common/tile_common.h
@@ -19,6 +19,8 @@
 #include "config/aom_config.h"
 
 struct AV1Common;
+struct SequenceHeader;
+struct CommonTileParams;
 
 #define DEFAULT_MAX_NUM_TG 1
 
@@ -56,11 +58,15 @@
 
 void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h);
 void av1_get_tile_limits(struct AV1Common *const cm);
-void av1_calculate_tile_cols(struct AV1Common *const cm);
-void av1_calculate_tile_rows(struct AV1Common *const cm);
+void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params,
+                             int cm_mi_rows, int cm_mi_cols,
+                             struct CommonTileParams *const tiles);
+void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params,
+                             int cm_mi_rows,
+                             struct CommonTileParams *const tiles);
 
 // Checks if the minimum tile_width requirement is satisfied
-int is_min_tile_width_satisfied(const struct AV1Common *cm);
+int av1_is_min_tile_width_satisfied(const struct AV1Common *cm);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/common/timing.c b/libaom/av1/common/timing.c
index 49dbde7..a959cdf 100644
--- a/libaom/av1/common/timing.c
+++ b/libaom/av1/common/timing.c
@@ -15,22 +15,35 @@
  * The tables are in Kbps instead of Mbps in the specification.
  * Note that depending on the profile, a multiplier is needed.
  */
+#define UNDEFINED_RATE \
+  (1 << 21)  // Placeholder rate for levels with undefined rate
+#define INVALID_RATE \
+  (0)  // For invalid profile-level configuration, set rate to 0
 
 /* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */
 /* is a dummy value. The decoder model is not applicable for level 31. */
 static int32_t main_kbps[1 << LEVEL_BITS] = {
-  1500, 3000,  0,     0,     6000,  10000, 0,      0,      12000,  20000,    0,
-  0,    30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, 0,        0,
-  0,    0,     0,     0,     0,     0,     0,      0,      0,      (1 << 26)
+  1500,           3000,           UNDEFINED_RATE, UNDEFINED_RATE,
+  6000,           10000,          UNDEFINED_RATE, UNDEFINED_RATE,
+  12000,          20000,          UNDEFINED_RATE, UNDEFINED_RATE,
+  30000,          40000,          60000,          60000,
+  60000,          100000,         160000,         160000,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE
 };
 
 /* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */
 /* is a dummy value. The decoder model is not applicable for level 31. */
 static int32_t high_kbps[1 << LEVEL_BITS] = {
-  0,      0,      0,      0,      0,      0,      0,      0,
-  30000,  50000,  0,      0,      100000, 160000, 240000, 240000,
-  240000, 480000, 800000, 800000, 0,      0,      0,      0,
-  0,      0,      0,      0,      0,      0,      0,      (1 << 26)
+  INVALID_RATE,   INVALID_RATE,   INVALID_RATE,   INVALID_RATE,
+  INVALID_RATE,   INVALID_RATE,   INVALID_RATE,   INVALID_RATE,
+  30000,          50000,          UNDEFINED_RATE, UNDEFINED_RATE,
+  100000,         160000,         240000,         240000,
+  240000,         480000,         800000,         800000,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE
 };
 
 /* BitrateProfileFactor */
@@ -38,8 +51,8 @@
   1, 2, 3, 0, 0, 0, 0, 0
 };
 
-int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
-                          int seq_tier) {
+int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+                              int seq_tier) {
   int64_t bitrate;
 
   if (seq_tier) {
@@ -51,13 +64,13 @@
   return bitrate * 1000;
 }
 
-void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
+void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
   decoder_model->encoder_decoder_buffer_delay_length = 16;
   decoder_model->buffer_removal_time_length = 10;
   decoder_model->frame_presentation_time_length = 10;
 }
 
-void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
+void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
   op_params->decoder_model_param_present_flag = 1;
   op_params->decoder_buffer_delay = 90000 >> 1;  //  0.5 s
   op_params->encoder_buffer_delay = 90000 >> 1;  //  0.5 s
@@ -66,7 +79,7 @@
   op_params->initial_display_delay = 8;  // 8 frames delay
 }
 
-void set_resource_availability_parameters(
+void av1_set_resource_availability_parameters(
     aom_dec_model_op_parameters_t *op_params) {
   op_params->decoder_model_param_present_flag = 0;
   op_params->decoder_buffer_delay =
diff --git a/libaom/av1/common/timing.h b/libaom/av1/common/timing.h
index 06939ae..9192124 100644
--- a/libaom/av1/common/timing.h
+++ b/libaom/av1/common/timing.h
@@ -42,18 +42,14 @@
   int initial_display_delay;
 } aom_dec_model_op_parameters_t;
 
-typedef struct aom_op_timing_info_t {
-  uint32_t buffer_removal_time;
-} aom_op_timing_info_t;
+void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
 
-void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
+void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);
 
-void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);
-
-void set_resource_availability_parameters(
+void av1_set_resource_availability_parameters(
     aom_dec_model_op_parameters_t *op_params);
 
-int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
-                          int seq_tier);
+int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+                              int seq_tier);
 
 #endif  // AOM_AV1_COMMON_TIMING_H_
diff --git a/libaom/av1/common/token_cdfs.h b/libaom/av1/common/token_cdfs.h
index 53e9564..f1edda5 100644
--- a/libaom/av1/common/token_cdfs.h
+++ b/libaom/av1/common/token_cdfs.h
@@ -1707,1687 +1707,1687 @@
 
 static const aom_cdf_prob av1_default_coeff_base_multi_cdfs
     [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
-    [CDF_SIZE(NUM_BASE_LEVELS + 2)] =
-        { { { { { AOM_CDF4(4034, 8930, 12727) },
-                { AOM_CDF4(18082, 29741, 31877) },
-                { AOM_CDF4(12596, 26124, 30493) },
-                { AOM_CDF4(9446, 21118, 27005) },
-                { AOM_CDF4(6308, 15141, 21279) },
-                { AOM_CDF4(2463, 6357, 9783) },
-                { AOM_CDF4(20667, 30546, 31929) },
-                { AOM_CDF4(13043, 26123, 30134) },
-                { AOM_CDF4(8151, 18757, 24778) },
-                { AOM_CDF4(5255, 12839, 18632) },
-                { AOM_CDF4(2820, 7206, 11161) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(15736, 27553, 30604) },
-                { AOM_CDF4(11210, 23794, 28787) },
-                { AOM_CDF4(5947, 13874, 19701) },
-                { AOM_CDF4(4215, 9323, 13891) },
-                { AOM_CDF4(2833, 6462, 10059) },
-                { AOM_CDF4(19605, 30393, 31582) },
-                { AOM_CDF4(13523, 26252, 30248) },
-                { AOM_CDF4(8446, 18622, 24512) },
-                { AOM_CDF4(3818, 10343, 15974) },
-                { AOM_CDF4(1481, 4117, 6796) },
-                { AOM_CDF4(22649, 31302, 32190) },
-                { AOM_CDF4(14829, 27127, 30449) },
-                { AOM_CDF4(8313, 17702, 23304) },
-                { AOM_CDF4(3022, 8301, 12786) },
-                { AOM_CDF4(1536, 4412, 7184) },
-                { AOM_CDF4(22354, 29774, 31372) },
-                { AOM_CDF4(14723, 25472, 29214) },
-                { AOM_CDF4(6673, 13745, 18662) },
-                { AOM_CDF4(2068, 5766, 9322) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(6302, 16444, 21761) },
-                { AOM_CDF4(23040, 31538, 32475) },
-                { AOM_CDF4(15196, 28452, 31496) },
-                { AOM_CDF4(10020, 22946, 28514) },
-                { AOM_CDF4(6533, 16862, 23501) },
-                { AOM_CDF4(3538, 9816, 15076) },
-                { AOM_CDF4(24444, 31875, 32525) },
-                { AOM_CDF4(15881, 28924, 31635) },
-                { AOM_CDF4(9922, 22873, 28466) },
-                { AOM_CDF4(6527, 16966, 23691) },
-                { AOM_CDF4(4114, 11303, 17220) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(20201, 30770, 32209) },
-                { AOM_CDF4(14754, 28071, 31258) },
-                { AOM_CDF4(8378, 20186, 26517) },
-                { AOM_CDF4(5916, 15299, 21978) },
-                { AOM_CDF4(4268, 11583, 17901) },
-                { AOM_CDF4(24361, 32025, 32581) },
-                { AOM_CDF4(18673, 30105, 31943) },
-                { AOM_CDF4(10196, 22244, 27576) },
-                { AOM_CDF4(5495, 14349, 20417) },
-                { AOM_CDF4(2676, 7415, 11498) },
-                { AOM_CDF4(24678, 31958, 32585) },
-                { AOM_CDF4(18629, 29906, 31831) },
-                { AOM_CDF4(9364, 20724, 26315) },
-                { AOM_CDF4(4641, 12318, 18094) },
-                { AOM_CDF4(2758, 7387, 11579) },
-                { AOM_CDF4(25433, 31842, 32469) },
-                { AOM_CDF4(18795, 29289, 31411) },
-                { AOM_CDF4(7644, 17584, 23592) },
-                { AOM_CDF4(3408, 9014, 15047) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(4536, 10072, 14001) },
-                { AOM_CDF4(25459, 31416, 32206) },
-                { AOM_CDF4(16605, 28048, 30818) },
-                { AOM_CDF4(11008, 22857, 27719) },
-                { AOM_CDF4(6915, 16268, 22315) },
-                { AOM_CDF4(2625, 6812, 10537) },
-                { AOM_CDF4(24257, 31788, 32499) },
-                { AOM_CDF4(16880, 29454, 31879) },
-                { AOM_CDF4(11958, 25054, 29778) },
-                { AOM_CDF4(7916, 18718, 25084) },
-                { AOM_CDF4(3383, 8777, 13446) },
-                { AOM_CDF4(22720, 31603, 32393) },
-                { AOM_CDF4(14960, 28125, 31335) },
-                { AOM_CDF4(9731, 22210, 27928) },
-                { AOM_CDF4(6304, 15832, 22277) },
-                { AOM_CDF4(2910, 7818, 12166) },
-                { AOM_CDF4(20375, 30627, 32131) },
-                { AOM_CDF4(13904, 27284, 30887) },
-                { AOM_CDF4(9368, 21558, 27144) },
-                { AOM_CDF4(5937, 14966, 21119) },
-                { AOM_CDF4(2667, 7225, 11319) },
-                { AOM_CDF4(23970, 31470, 32378) },
-                { AOM_CDF4(17173, 29734, 32018) },
-                { AOM_CDF4(12795, 25441, 29965) },
-                { AOM_CDF4(8981, 19680, 25893) },
-                { AOM_CDF4(4728, 11372, 16902) },
-                { AOM_CDF4(24287, 31797, 32439) },
-                { AOM_CDF4(16703, 29145, 31696) },
-                { AOM_CDF4(10833, 23554, 28725) },
-                { AOM_CDF4(6468, 16566, 23057) },
-                { AOM_CDF4(2415, 6562, 10278) },
-                { AOM_CDF4(26610, 32395, 32659) },
-                { AOM_CDF4(18590, 30498, 32117) },
-                { AOM_CDF4(12420, 25756, 29950) },
-                { AOM_CDF4(7639, 18746, 24710) },
-                { AOM_CDF4(3001, 8086, 12347) },
-                { AOM_CDF4(25076, 32064, 32580) },
-                { AOM_CDF4(17946, 30128, 32028) },
-                { AOM_CDF4(12024, 24985, 29378) },
-                { AOM_CDF4(7517, 18390, 24304) },
-                { AOM_CDF4(3243, 8781, 13331) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(6037, 16771, 21957) },
-                { AOM_CDF4(24774, 31704, 32426) },
-                { AOM_CDF4(16830, 28589, 31056) },
-                { AOM_CDF4(10602, 22828, 27760) },
-                { AOM_CDF4(6733, 16829, 23071) },
-                { AOM_CDF4(3250, 8914, 13556) },
-                { AOM_CDF4(25582, 32220, 32668) },
-                { AOM_CDF4(18659, 30342, 32223) },
-                { AOM_CDF4(12546, 26149, 30515) },
-                { AOM_CDF4(8420, 20451, 26801) },
-                { AOM_CDF4(4636, 12420, 18344) },
-                { AOM_CDF4(27581, 32362, 32639) },
-                { AOM_CDF4(18987, 30083, 31978) },
-                { AOM_CDF4(11327, 24248, 29084) },
-                { AOM_CDF4(7264, 17719, 24120) },
-                { AOM_CDF4(3995, 10768, 16169) },
-                { AOM_CDF4(25893, 31831, 32487) },
-                { AOM_CDF4(16577, 28587, 31379) },
-                { AOM_CDF4(10189, 22748, 28182) },
-                { AOM_CDF4(6832, 17094, 23556) },
-                { AOM_CDF4(3708, 10110, 15334) },
-                { AOM_CDF4(25904, 32282, 32656) },
-                { AOM_CDF4(19721, 30792, 32276) },
-                { AOM_CDF4(12819, 26243, 30411) },
-                { AOM_CDF4(8572, 20614, 26891) },
-                { AOM_CDF4(5364, 14059, 20467) },
-                { AOM_CDF4(26580, 32438, 32677) },
-                { AOM_CDF4(20852, 31225, 32340) },
-                { AOM_CDF4(12435, 25700, 29967) },
-                { AOM_CDF4(8691, 20825, 26976) },
-                { AOM_CDF4(4446, 12209, 17269) },
-                { AOM_CDF4(27350, 32429, 32696) },
-                { AOM_CDF4(21372, 30977, 32272) },
-                { AOM_CDF4(12673, 25270, 29853) },
-                { AOM_CDF4(9208, 20925, 26640) },
-                { AOM_CDF4(5018, 13351, 18732) },
-                { AOM_CDF4(27351, 32479, 32713) },
-                { AOM_CDF4(21398, 31209, 32387) },
-                { AOM_CDF4(12162, 25047, 29842) },
-                { AOM_CDF4(7896, 18691, 25319) },
-                { AOM_CDF4(4670, 12882, 18881) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(5487, 10460, 13708) },
-                { AOM_CDF4(21597, 28303, 30674) },
-                { AOM_CDF4(11037, 21953, 26476) },
-                { AOM_CDF4(8147, 17962, 22952) },
-                { AOM_CDF4(5242, 13061, 18532) },
-                { AOM_CDF4(1889, 5208, 8182) },
-                { AOM_CDF4(26774, 32133, 32590) },
-                { AOM_CDF4(17844, 29564, 31767) },
-                { AOM_CDF4(11690, 24438, 29171) },
-                { AOM_CDF4(7542, 18215, 24459) },
-                { AOM_CDF4(2993, 8050, 12319) },
-                { AOM_CDF4(28023, 32328, 32591) },
-                { AOM_CDF4(18651, 30126, 31954) },
-                { AOM_CDF4(12164, 25146, 29589) },
-                { AOM_CDF4(7762, 18530, 24771) },
-                { AOM_CDF4(3492, 9183, 13920) },
-                { AOM_CDF4(27591, 32008, 32491) },
-                { AOM_CDF4(17149, 28853, 31510) },
-                { AOM_CDF4(11485, 24003, 28860) },
-                { AOM_CDF4(7697, 18086, 24210) },
-                { AOM_CDF4(3075, 7999, 12218) },
-                { AOM_CDF4(28268, 32482, 32654) },
-                { AOM_CDF4(19631, 31051, 32404) },
-                { AOM_CDF4(13860, 27260, 31020) },
-                { AOM_CDF4(9605, 21613, 27594) },
-                { AOM_CDF4(4876, 12162, 17908) },
-                { AOM_CDF4(27248, 32316, 32576) },
-                { AOM_CDF4(18955, 30457, 32075) },
-                { AOM_CDF4(11824, 23997, 28795) },
-                { AOM_CDF4(7346, 18196, 24647) },
-                { AOM_CDF4(3403, 9247, 14111) },
-                { AOM_CDF4(29711, 32655, 32735) },
-                { AOM_CDF4(21169, 31394, 32417) },
-                { AOM_CDF4(13487, 27198, 30957) },
-                { AOM_CDF4(8828, 21683, 27614) },
-                { AOM_CDF4(4270, 11451, 17038) },
-                { AOM_CDF4(28708, 32578, 32731) },
-                { AOM_CDF4(20120, 31241, 32482) },
-                { AOM_CDF4(13692, 27550, 31321) },
-                { AOM_CDF4(9418, 22514, 28439) },
-                { AOM_CDF4(4999, 13283, 19462) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(5673, 14302, 19711) },
-                { AOM_CDF4(26251, 30701, 31834) },
-                { AOM_CDF4(12782, 23783, 27803) },
-                { AOM_CDF4(9127, 20657, 25808) },
-                { AOM_CDF4(6368, 16208, 21462) },
-                { AOM_CDF4(2465, 7177, 10822) },
-                { AOM_CDF4(29961, 32563, 32719) },
-                { AOM_CDF4(18318, 29891, 31949) },
-                { AOM_CDF4(11361, 24514, 29357) },
-                { AOM_CDF4(7900, 19603, 25607) },
-                { AOM_CDF4(4002, 10590, 15546) },
-                { AOM_CDF4(29637, 32310, 32595) },
-                { AOM_CDF4(18296, 29913, 31809) },
-                { AOM_CDF4(10144, 21515, 26871) },
-                { AOM_CDF4(5358, 14322, 20394) },
-                { AOM_CDF4(3067, 8362, 13346) },
-                { AOM_CDF4(28652, 32470, 32676) },
-                { AOM_CDF4(17538, 30771, 32209) },
-                { AOM_CDF4(13924, 26882, 30494) },
-                { AOM_CDF4(10496, 22837, 27869) },
-                { AOM_CDF4(7236, 16396, 21621) },
-                { AOM_CDF4(30743, 32687, 32746) },
-                { AOM_CDF4(23006, 31676, 32489) },
-                { AOM_CDF4(14494, 27828, 31120) },
-                { AOM_CDF4(10174, 22801, 28352) },
-                { AOM_CDF4(6242, 15281, 21043) },
-                { AOM_CDF4(25817, 32243, 32720) },
-                { AOM_CDF4(18618, 31367, 32325) },
-                { AOM_CDF4(13997, 28318, 31878) },
-                { AOM_CDF4(12255, 26534, 31383) },
-                { AOM_CDF4(9561, 21588, 28450) },
-                { AOM_CDF4(28188, 32635, 32724) },
-                { AOM_CDF4(22060, 32365, 32728) },
-                { AOM_CDF4(18102, 30690, 32528) },
-                { AOM_CDF4(14196, 28864, 31999) },
-                { AOM_CDF4(12262, 25792, 30865) },
-                { AOM_CDF4(24176, 32109, 32628) },
-                { AOM_CDF4(18280, 29681, 31963) },
-                { AOM_CDF4(10205, 23703, 29664) },
-                { AOM_CDF4(7889, 20025, 27676) },
-                { AOM_CDF4(6060, 16743, 23970) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(5141, 7096, 8260) },
-                { AOM_CDF4(27186, 29022, 29789) },
-                { AOM_CDF4(6668, 12568, 15682) },
-                { AOM_CDF4(2172, 6181, 8638) },
-                { AOM_CDF4(1126, 3379, 4531) },
-                { AOM_CDF4(443, 1361, 2254) },
-                { AOM_CDF4(26083, 31153, 32436) },
-                { AOM_CDF4(13486, 24603, 28483) },
-                { AOM_CDF4(6508, 14840, 19910) },
-                { AOM_CDF4(3386, 8800, 13286) },
-                { AOM_CDF4(1530, 4322, 7054) },
-                { AOM_CDF4(29639, 32080, 32548) },
-                { AOM_CDF4(15897, 27552, 30290) },
-                { AOM_CDF4(8588, 20047, 25383) },
-                { AOM_CDF4(4889, 13339, 19269) },
-                { AOM_CDF4(2240, 6871, 10498) },
-                { AOM_CDF4(28165, 32197, 32517) },
-                { AOM_CDF4(20735, 30427, 31568) },
-                { AOM_CDF4(14325, 24671, 27692) },
-                { AOM_CDF4(5119, 12554, 17805) },
-                { AOM_CDF4(1810, 5441, 8261) },
-                { AOM_CDF4(31212, 32724, 32748) },
-                { AOM_CDF4(23352, 31766, 32545) },
-                { AOM_CDF4(14669, 27570, 31059) },
-                { AOM_CDF4(8492, 20894, 27272) },
-                { AOM_CDF4(3644, 10194, 15204) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(2461, 7013, 9371) },
-                { AOM_CDF4(24749, 29600, 30986) },
-                { AOM_CDF4(9466, 19037, 22417) },
-                { AOM_CDF4(3584, 9280, 14400) },
-                { AOM_CDF4(1505, 3929, 5433) },
-                { AOM_CDF4(677, 1500, 2736) },
-                { AOM_CDF4(23987, 30702, 32117) },
-                { AOM_CDF4(13554, 24571, 29263) },
-                { AOM_CDF4(6211, 14556, 21155) },
-                { AOM_CDF4(3135, 10972, 15625) },
-                { AOM_CDF4(2435, 7127, 11427) },
-                { AOM_CDF4(31300, 32532, 32550) },
-                { AOM_CDF4(14757, 30365, 31954) },
-                { AOM_CDF4(4405, 11612, 18553) },
-                { AOM_CDF4(580, 4132, 7322) },
-                { AOM_CDF4(1695, 10169, 14124) },
-                { AOM_CDF4(30008, 32282, 32591) },
-                { AOM_CDF4(19244, 30108, 31748) },
-                { AOM_CDF4(11180, 24158, 29555) },
-                { AOM_CDF4(5650, 14972, 19209) },
-                { AOM_CDF4(2114, 5109, 8456) },
-                { AOM_CDF4(31856, 32716, 32748) },
-                { AOM_CDF4(23012, 31664, 32572) },
-                { AOM_CDF4(13694, 26656, 30636) },
-                { AOM_CDF4(8142, 19508, 26093) },
-                { AOM_CDF4(4253, 10955, 16724) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(601, 983, 1311) },
-                { AOM_CDF4(18725, 23406, 28087) },
-                { AOM_CDF4(5461, 8192, 10923) },
-                { AOM_CDF4(3781, 15124, 21425) },
-                { AOM_CDF4(2587, 7761, 12072) },
-                { AOM_CDF4(106, 458, 810) },
-                { AOM_CDF4(22282, 29710, 31894) },
-                { AOM_CDF4(8508, 20926, 25984) },
-                { AOM_CDF4(3726, 12713, 18083) },
-                { AOM_CDF4(1620, 7112, 10893) },
-                { AOM_CDF4(729, 2236, 3495) },
-                { AOM_CDF4(30163, 32474, 32684) },
-                { AOM_CDF4(18304, 30464, 32000) },
-                { AOM_CDF4(11443, 26526, 29647) },
-                { AOM_CDF4(6007, 15292, 21299) },
-                { AOM_CDF4(2234, 6703, 8937) },
-                { AOM_CDF4(30954, 32177, 32571) },
-                { AOM_CDF4(17363, 29562, 31076) },
-                { AOM_CDF4(9686, 22464, 27410) },
-                { AOM_CDF4(8192, 16384, 21390) },
-                { AOM_CDF4(1755, 8046, 11264) },
-                { AOM_CDF4(31168, 32734, 32748) },
-                { AOM_CDF4(22486, 31441, 32471) },
-                { AOM_CDF4(12833, 25627, 29738) },
-                { AOM_CDF4(6980, 17379, 23122) },
-                { AOM_CDF4(3111, 8887, 13479) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } } },
-          { { { { AOM_CDF4(6041, 11854, 15927) },
-                { AOM_CDF4(20326, 30905, 32251) },
-                { AOM_CDF4(14164, 26831, 30725) },
-                { AOM_CDF4(9760, 20647, 26585) },
-                { AOM_CDF4(6416, 14953, 21219) },
-                { AOM_CDF4(2966, 7151, 10891) },
-                { AOM_CDF4(23567, 31374, 32254) },
-                { AOM_CDF4(14978, 27416, 30946) },
-                { AOM_CDF4(9434, 20225, 26254) },
-                { AOM_CDF4(6658, 14558, 20535) },
-                { AOM_CDF4(3916, 8677, 12989) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(18088, 29545, 31587) },
-                { AOM_CDF4(13062, 25843, 30073) },
-                { AOM_CDF4(8940, 16827, 22251) },
-                { AOM_CDF4(7654, 13220, 17973) },
-                { AOM_CDF4(5733, 10316, 14456) },
-                { AOM_CDF4(22879, 31388, 32114) },
-                { AOM_CDF4(15215, 27993, 30955) },
-                { AOM_CDF4(9397, 19445, 24978) },
-                { AOM_CDF4(3442, 9813, 15344) },
-                { AOM_CDF4(1368, 3936, 6532) },
-                { AOM_CDF4(25494, 32033, 32406) },
-                { AOM_CDF4(16772, 27963, 30718) },
-                { AOM_CDF4(9419, 18165, 23260) },
-                { AOM_CDF4(2677, 7501, 11797) },
-                { AOM_CDF4(1516, 4344, 7170) },
-                { AOM_CDF4(26556, 31454, 32101) },
-                { AOM_CDF4(17128, 27035, 30108) },
-                { AOM_CDF4(8324, 15344, 20249) },
-                { AOM_CDF4(1903, 5696, 9469) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(8455, 19003, 24368) },
-                { AOM_CDF4(23563, 32021, 32604) },
-                { AOM_CDF4(16237, 29446, 31935) },
-                { AOM_CDF4(10724, 23999, 29358) },
-                { AOM_CDF4(6725, 17528, 24416) },
-                { AOM_CDF4(3927, 10927, 16825) },
-                { AOM_CDF4(26313, 32288, 32634) },
-                { AOM_CDF4(17430, 30095, 32095) },
-                { AOM_CDF4(11116, 24606, 29679) },
-                { AOM_CDF4(7195, 18384, 25269) },
-                { AOM_CDF4(4726, 12852, 19315) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(22822, 31648, 32483) },
-                { AOM_CDF4(16724, 29633, 31929) },
-                { AOM_CDF4(10261, 23033, 28725) },
-                { AOM_CDF4(7029, 17840, 24528) },
-                { AOM_CDF4(4867, 13886, 21502) },
-                { AOM_CDF4(25298, 31892, 32491) },
-                { AOM_CDF4(17809, 29330, 31512) },
-                { AOM_CDF4(9668, 21329, 26579) },
-                { AOM_CDF4(4774, 12956, 18976) },
-                { AOM_CDF4(2322, 7030, 11540) },
-                { AOM_CDF4(25472, 31920, 32543) },
-                { AOM_CDF4(17957, 29387, 31632) },
-                { AOM_CDF4(9196, 20593, 26400) },
-                { AOM_CDF4(4680, 12705, 19202) },
-                { AOM_CDF4(2917, 8456, 13436) },
-                { AOM_CDF4(26471, 32059, 32574) },
-                { AOM_CDF4(18458, 29783, 31909) },
-                { AOM_CDF4(8400, 19464, 25956) },
-                { AOM_CDF4(3812, 10973, 17206) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(6779, 13743, 17678) },
-                { AOM_CDF4(24806, 31797, 32457) },
-                { AOM_CDF4(17616, 29047, 31372) },
-                { AOM_CDF4(11063, 23175, 28003) },
-                { AOM_CDF4(6521, 16110, 22324) },
-                { AOM_CDF4(2764, 7504, 11654) },
-                { AOM_CDF4(25266, 32367, 32637) },
-                { AOM_CDF4(19054, 30553, 32175) },
-                { AOM_CDF4(12139, 25212, 29807) },
-                { AOM_CDF4(7311, 18162, 24704) },
-                { AOM_CDF4(3397, 9164, 14074) },
-                { AOM_CDF4(25988, 32208, 32522) },
-                { AOM_CDF4(16253, 28912, 31526) },
-                { AOM_CDF4(9151, 21387, 27372) },
-                { AOM_CDF4(5688, 14915, 21496) },
-                { AOM_CDF4(2717, 7627, 12004) },
-                { AOM_CDF4(23144, 31855, 32443) },
-                { AOM_CDF4(16070, 28491, 31325) },
-                { AOM_CDF4(8702, 20467, 26517) },
-                { AOM_CDF4(5243, 13956, 20367) },
-                { AOM_CDF4(2621, 7335, 11567) },
-                { AOM_CDF4(26636, 32340, 32630) },
-                { AOM_CDF4(19990, 31050, 32341) },
-                { AOM_CDF4(13243, 26105, 30315) },
-                { AOM_CDF4(8588, 19521, 25918) },
-                { AOM_CDF4(4717, 11585, 17304) },
-                { AOM_CDF4(25844, 32292, 32582) },
-                { AOM_CDF4(19090, 30635, 32097) },
-                { AOM_CDF4(11963, 24546, 28939) },
-                { AOM_CDF4(6218, 16087, 22354) },
-                { AOM_CDF4(2340, 6608, 10426) },
-                { AOM_CDF4(28046, 32576, 32694) },
-                { AOM_CDF4(21178, 31313, 32296) },
-                { AOM_CDF4(13486, 26184, 29870) },
-                { AOM_CDF4(7149, 17871, 23723) },
-                { AOM_CDF4(2833, 7958, 12259) },
-                { AOM_CDF4(27710, 32528, 32686) },
-                { AOM_CDF4(20674, 31076, 32268) },
-                { AOM_CDF4(12413, 24955, 29243) },
-                { AOM_CDF4(6676, 16927, 23097) },
-                { AOM_CDF4(2966, 8333, 12919) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(8639, 19339, 24429) },
-                { AOM_CDF4(24404, 31837, 32525) },
-                { AOM_CDF4(16997, 29425, 31784) },
-                { AOM_CDF4(11253, 24234, 29149) },
-                { AOM_CDF4(6751, 17394, 24028) },
-                { AOM_CDF4(3490, 9830, 15191) },
-                { AOM_CDF4(26283, 32471, 32714) },
-                { AOM_CDF4(19599, 31168, 32442) },
-                { AOM_CDF4(13146, 26954, 30893) },
-                { AOM_CDF4(8214, 20588, 26890) },
-                { AOM_CDF4(4699, 13081, 19300) },
-                { AOM_CDF4(28212, 32458, 32669) },
-                { AOM_CDF4(18594, 30316, 32100) },
-                { AOM_CDF4(11219, 24408, 29234) },
-                { AOM_CDF4(6865, 17656, 24149) },
-                { AOM_CDF4(3678, 10362, 16006) },
-                { AOM_CDF4(25825, 32136, 32616) },
-                { AOM_CDF4(17313, 29853, 32021) },
-                { AOM_CDF4(11197, 24471, 29472) },
-                { AOM_CDF4(6947, 17781, 24405) },
-                { AOM_CDF4(3768, 10660, 16261) },
-                { AOM_CDF4(27352, 32500, 32706) },
-                { AOM_CDF4(20850, 31468, 32469) },
-                { AOM_CDF4(14021, 27707, 31133) },
-                { AOM_CDF4(8964, 21748, 27838) },
-                { AOM_CDF4(5437, 14665, 21187) },
-                { AOM_CDF4(26304, 32492, 32698) },
-                { AOM_CDF4(20409, 31380, 32385) },
-                { AOM_CDF4(13682, 27222, 30632) },
-                { AOM_CDF4(8974, 21236, 26685) },
-                { AOM_CDF4(4234, 11665, 16934) },
-                { AOM_CDF4(26273, 32357, 32711) },
-                { AOM_CDF4(20672, 31242, 32441) },
-                { AOM_CDF4(14172, 27254, 30902) },
-                { AOM_CDF4(9870, 21898, 27275) },
-                { AOM_CDF4(5164, 13506, 19270) },
-                { AOM_CDF4(26725, 32459, 32728) },
-                { AOM_CDF4(20991, 31442, 32527) },
-                { AOM_CDF4(13071, 26434, 30811) },
-                { AOM_CDF4(8184, 20090, 26742) },
-                { AOM_CDF4(4803, 13255, 19895) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(7555, 14942, 18501) },
-                { AOM_CDF4(24410, 31178, 32287) },
-                { AOM_CDF4(14394, 26738, 30253) },
-                { AOM_CDF4(8413, 19554, 25195) },
-                { AOM_CDF4(4766, 12924, 18785) },
-                { AOM_CDF4(2029, 5806, 9207) },
-                { AOM_CDF4(26776, 32364, 32663) },
-                { AOM_CDF4(18732, 29967, 31931) },
-                { AOM_CDF4(11005, 23786, 28852) },
-                { AOM_CDF4(6466, 16909, 23510) },
-                { AOM_CDF4(3044, 8638, 13419) },
-                { AOM_CDF4(29208, 32582, 32704) },
-                { AOM_CDF4(20068, 30857, 32208) },
-                { AOM_CDF4(12003, 25085, 29595) },
-                { AOM_CDF4(6947, 17750, 24189) },
-                { AOM_CDF4(3245, 9103, 14007) },
-                { AOM_CDF4(27359, 32465, 32669) },
-                { AOM_CDF4(19421, 30614, 32174) },
-                { AOM_CDF4(11915, 25010, 29579) },
-                { AOM_CDF4(6950, 17676, 24074) },
-                { AOM_CDF4(3007, 8473, 13096) },
-                { AOM_CDF4(29002, 32676, 32735) },
-                { AOM_CDF4(22102, 31849, 32576) },
-                { AOM_CDF4(14408, 28009, 31405) },
-                { AOM_CDF4(9027, 21679, 27931) },
-                { AOM_CDF4(4694, 12678, 18748) },
-                { AOM_CDF4(28216, 32528, 32682) },
-                { AOM_CDF4(20849, 31264, 32318) },
-                { AOM_CDF4(12756, 25815, 29751) },
-                { AOM_CDF4(7565, 18801, 24923) },
-                { AOM_CDF4(3509, 9533, 14477) },
-                { AOM_CDF4(30133, 32687, 32739) },
-                { AOM_CDF4(23063, 31910, 32515) },
-                { AOM_CDF4(14588, 28051, 31132) },
-                { AOM_CDF4(9085, 21649, 27457) },
-                { AOM_CDF4(4261, 11654, 17264) },
-                { AOM_CDF4(29518, 32691, 32748) },
-                { AOM_CDF4(22451, 31959, 32613) },
-                { AOM_CDF4(14864, 28722, 31700) },
-                { AOM_CDF4(9695, 22964, 28716) },
-                { AOM_CDF4(4932, 13358, 19502) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(6465, 16958, 21688) },
-                { AOM_CDF4(25199, 31514, 32360) },
-                { AOM_CDF4(14774, 27149, 30607) },
-                { AOM_CDF4(9257, 21438, 26972) },
-                { AOM_CDF4(5723, 15183, 21882) },
-                { AOM_CDF4(3150, 8879, 13731) },
-                { AOM_CDF4(26989, 32262, 32682) },
-                { AOM_CDF4(17396, 29937, 32085) },
-                { AOM_CDF4(11387, 24901, 29784) },
-                { AOM_CDF4(7289, 18821, 25548) },
-                { AOM_CDF4(3734, 10577, 16086) },
-                { AOM_CDF4(29728, 32501, 32695) },
-                { AOM_CDF4(17431, 29701, 31903) },
-                { AOM_CDF4(9921, 22826, 28300) },
-                { AOM_CDF4(5896, 15434, 22068) },
-                { AOM_CDF4(3430, 9646, 14757) },
-                { AOM_CDF4(28614, 32511, 32705) },
-                { AOM_CDF4(19364, 30638, 32263) },
-                { AOM_CDF4(13129, 26254, 30402) },
-                { AOM_CDF4(8754, 20484, 26440) },
-                { AOM_CDF4(4378, 11607, 17110) },
-                { AOM_CDF4(30292, 32671, 32744) },
-                { AOM_CDF4(21780, 31603, 32501) },
-                { AOM_CDF4(14314, 27829, 31291) },
-                { AOM_CDF4(9611, 22327, 28263) },
-                { AOM_CDF4(4890, 13087, 19065) },
-                { AOM_CDF4(25862, 32567, 32733) },
-                { AOM_CDF4(20794, 32050, 32567) },
-                { AOM_CDF4(17243, 30625, 32254) },
-                { AOM_CDF4(13283, 27628, 31474) },
-                { AOM_CDF4(9669, 22532, 28918) },
-                { AOM_CDF4(27435, 32697, 32748) },
-                { AOM_CDF4(24922, 32390, 32714) },
-                { AOM_CDF4(21449, 31504, 32536) },
-                { AOM_CDF4(16392, 29729, 31832) },
-                { AOM_CDF4(11692, 24884, 29076) },
-                { AOM_CDF4(24193, 32290, 32735) },
-                { AOM_CDF4(18909, 31104, 32563) },
-                { AOM_CDF4(12236, 26841, 31403) },
-                { AOM_CDF4(8171, 21840, 29082) },
-                { AOM_CDF4(7224, 17280, 25275) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(3078, 6839, 9890) },
-                { AOM_CDF4(13837, 20450, 24479) },
-                { AOM_CDF4(5914, 14222, 19328) },
-                { AOM_CDF4(3866, 10267, 14762) },
-                { AOM_CDF4(2612, 7208, 11042) },
-                { AOM_CDF4(1067, 2991, 4776) },
-                { AOM_CDF4(25817, 31646, 32529) },
-                { AOM_CDF4(13708, 26338, 30385) },
-                { AOM_CDF4(7328, 18585, 24870) },
-                { AOM_CDF4(4691, 13080, 19276) },
-                { AOM_CDF4(1825, 5253, 8352) },
-                { AOM_CDF4(29386, 32315, 32624) },
-                { AOM_CDF4(17160, 29001, 31360) },
-                { AOM_CDF4(9602, 21862, 27396) },
-                { AOM_CDF4(5915, 15772, 22148) },
-                { AOM_CDF4(2786, 7779, 12047) },
-                { AOM_CDF4(29246, 32450, 32663) },
-                { AOM_CDF4(18696, 29929, 31818) },
-                { AOM_CDF4(10510, 23369, 28560) },
-                { AOM_CDF4(6229, 16499, 23125) },
-                { AOM_CDF4(2608, 7448, 11705) },
-                { AOM_CDF4(30753, 32710, 32748) },
-                { AOM_CDF4(21638, 31487, 32503) },
-                { AOM_CDF4(12937, 26854, 30870) },
-                { AOM_CDF4(8182, 20596, 26970) },
-                { AOM_CDF4(3637, 10269, 15497) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(5244, 12150, 16906) },
-                { AOM_CDF4(20486, 26858, 29701) },
-                { AOM_CDF4(7756, 18317, 23735) },
-                { AOM_CDF4(3452, 9256, 13146) },
-                { AOM_CDF4(2020, 5206, 8229) },
-                { AOM_CDF4(1801, 4993, 7903) },
-                { AOM_CDF4(27051, 31858, 32531) },
-                { AOM_CDF4(15988, 27531, 30619) },
-                { AOM_CDF4(9188, 21484, 26719) },
-                { AOM_CDF4(6273, 17186, 23800) },
-                { AOM_CDF4(3108, 9355, 14764) },
-                { AOM_CDF4(31076, 32520, 32680) },
-                { AOM_CDF4(18119, 30037, 31850) },
-                { AOM_CDF4(10244, 22969, 27472) },
-                { AOM_CDF4(4692, 14077, 19273) },
-                { AOM_CDF4(3694, 11677, 17556) },
-                { AOM_CDF4(30060, 32581, 32720) },
-                { AOM_CDF4(21011, 30775, 32120) },
-                { AOM_CDF4(11931, 24820, 29289) },
-                { AOM_CDF4(7119, 17662, 24356) },
-                { AOM_CDF4(3833, 10706, 16304) },
-                { AOM_CDF4(31954, 32731, 32748) },
-                { AOM_CDF4(23913, 31724, 32489) },
-                { AOM_CDF4(15520, 28060, 31286) },
-                { AOM_CDF4(11517, 23008, 28571) },
-                { AOM_CDF4(6193, 14508, 20629) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(1035, 2807, 4156) },
-                { AOM_CDF4(13162, 18138, 20939) },
-                { AOM_CDF4(2696, 6633, 8755) },
-                { AOM_CDF4(1373, 4161, 6853) },
-                { AOM_CDF4(1099, 2746, 4716) },
-                { AOM_CDF4(340, 1021, 1599) },
-                { AOM_CDF4(22826, 30419, 32135) },
-                { AOM_CDF4(10395, 21762, 26942) },
-                { AOM_CDF4(4726, 12407, 17361) },
-                { AOM_CDF4(2447, 7080, 10593) },
-                { AOM_CDF4(1227, 3717, 6011) },
-                { AOM_CDF4(28156, 31424, 31934) },
-                { AOM_CDF4(16915, 27754, 30373) },
-                { AOM_CDF4(9148, 20990, 26431) },
-                { AOM_CDF4(5950, 15515, 21148) },
-                { AOM_CDF4(2492, 7327, 11526) },
-                { AOM_CDF4(30602, 32477, 32670) },
-                { AOM_CDF4(20026, 29955, 31568) },
-                { AOM_CDF4(11220, 23628, 28105) },
-                { AOM_CDF4(6652, 17019, 22973) },
-                { AOM_CDF4(3064, 8536, 13043) },
-                { AOM_CDF4(31769, 32724, 32748) },
-                { AOM_CDF4(22230, 30887, 32373) },
-                { AOM_CDF4(12234, 25079, 29731) },
-                { AOM_CDF4(7326, 18816, 25353) },
-                { AOM_CDF4(3933, 10907, 16616) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } } },
-          { { { { AOM_CDF4(8896, 16227, 20630) },
-                { AOM_CDF4(23629, 31782, 32527) },
-                { AOM_CDF4(15173, 27755, 31321) },
-                { AOM_CDF4(10158, 21233, 27382) },
-                { AOM_CDF4(6420, 14857, 21558) },
-                { AOM_CDF4(3269, 8155, 12646) },
-                { AOM_CDF4(24835, 32009, 32496) },
-                { AOM_CDF4(16509, 28421, 31579) },
-                { AOM_CDF4(10957, 21514, 27418) },
-                { AOM_CDF4(7881, 15930, 22096) },
-                { AOM_CDF4(5388, 10960, 15918) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(20745, 30773, 32093) },
-                { AOM_CDF4(15200, 27221, 30861) },
-                { AOM_CDF4(13032, 20873, 25667) },
-                { AOM_CDF4(12285, 18663, 23494) },
-                { AOM_CDF4(11563, 17481, 21489) },
-                { AOM_CDF4(26260, 31982, 32320) },
-                { AOM_CDF4(15397, 28083, 31100) },
-                { AOM_CDF4(9742, 19217, 24824) },
-                { AOM_CDF4(3261, 9629, 15362) },
-                { AOM_CDF4(1480, 4322, 7499) },
-                { AOM_CDF4(27599, 32256, 32460) },
-                { AOM_CDF4(16857, 27659, 30774) },
-                { AOM_CDF4(9551, 18290, 23748) },
-                { AOM_CDF4(3052, 8933, 14103) },
-                { AOM_CDF4(2021, 5910, 9787) },
-                { AOM_CDF4(29005, 32015, 32392) },
-                { AOM_CDF4(17677, 27694, 30863) },
-                { AOM_CDF4(9204, 17356, 23219) },
-                { AOM_CDF4(2403, 7516, 12814) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(10808, 22056, 26896) },
-                { AOM_CDF4(25739, 32313, 32676) },
-                { AOM_CDF4(17288, 30203, 32221) },
-                { AOM_CDF4(11359, 24878, 29896) },
-                { AOM_CDF4(6949, 17767, 24893) },
-                { AOM_CDF4(4287, 11796, 18071) },
-                { AOM_CDF4(27880, 32521, 32705) },
-                { AOM_CDF4(19038, 31004, 32414) },
-                { AOM_CDF4(12564, 26345, 30768) },
-                { AOM_CDF4(8269, 19947, 26779) },
-                { AOM_CDF4(5674, 14657, 21674) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(25742, 32319, 32671) },
-                { AOM_CDF4(19557, 31164, 32454) },
-                { AOM_CDF4(13381, 26381, 30755) },
-                { AOM_CDF4(10101, 21466, 26722) },
-                { AOM_CDF4(9209, 19650, 26825) },
-                { AOM_CDF4(27107, 31917, 32432) },
-                { AOM_CDF4(18056, 28893, 31203) },
-                { AOM_CDF4(10200, 21434, 26764) },
-                { AOM_CDF4(4660, 12913, 19502) },
-                { AOM_CDF4(2368, 6930, 12504) },
-                { AOM_CDF4(26960, 32158, 32613) },
-                { AOM_CDF4(18628, 30005, 32031) },
-                { AOM_CDF4(10233, 22442, 28232) },
-                { AOM_CDF4(5471, 14630, 21516) },
-                { AOM_CDF4(3235, 10767, 17109) },
-                { AOM_CDF4(27696, 32440, 32692) },
-                { AOM_CDF4(20032, 31167, 32438) },
-                { AOM_CDF4(8700, 21341, 28442) },
-                { AOM_CDF4(5662, 14831, 21795) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(9704, 17294, 21132) },
-                { AOM_CDF4(26762, 32278, 32633) },
-                { AOM_CDF4(18382, 29620, 31819) },
-                { AOM_CDF4(10891, 23475, 28723) },
-                { AOM_CDF4(6358, 16583, 23309) },
-                { AOM_CDF4(3248, 9118, 14141) },
-                { AOM_CDF4(27204, 32573, 32699) },
-                { AOM_CDF4(19818, 30824, 32329) },
-                { AOM_CDF4(11772, 25120, 30041) },
-                { AOM_CDF4(6995, 18033, 25039) },
-                { AOM_CDF4(3752, 10442, 16098) },
-                { AOM_CDF4(27222, 32256, 32559) },
-                { AOM_CDF4(15356, 28399, 31475) },
-                { AOM_CDF4(8821, 20635, 27057) },
-                { AOM_CDF4(5511, 14404, 21239) },
-                { AOM_CDF4(2935, 8222, 13051) },
-                { AOM_CDF4(24875, 32120, 32529) },
-                { AOM_CDF4(15233, 28265, 31445) },
-                { AOM_CDF4(8605, 20570, 26932) },
-                { AOM_CDF4(5431, 14413, 21196) },
-                { AOM_CDF4(2994, 8341, 13223) },
-                { AOM_CDF4(28201, 32604, 32700) },
-                { AOM_CDF4(21041, 31446, 32456) },
-                { AOM_CDF4(13221, 26213, 30475) },
-                { AOM_CDF4(8255, 19385, 26037) },
-                { AOM_CDF4(4930, 12585, 18830) },
-                { AOM_CDF4(28768, 32448, 32627) },
-                { AOM_CDF4(19705, 30561, 32021) },
-                { AOM_CDF4(11572, 23589, 28220) },
-                { AOM_CDF4(5532, 15034, 21446) },
-                { AOM_CDF4(2460, 7150, 11456) },
-                { AOM_CDF4(29874, 32619, 32699) },
-                { AOM_CDF4(21621, 31071, 32201) },
-                { AOM_CDF4(12511, 24747, 28992) },
-                { AOM_CDF4(6281, 16395, 22748) },
-                { AOM_CDF4(3246, 9278, 14497) },
-                { AOM_CDF4(29715, 32625, 32712) },
-                { AOM_CDF4(20958, 31011, 32283) },
-                { AOM_CDF4(11233, 23671, 28806) },
-                { AOM_CDF4(6012, 16128, 22868) },
-                { AOM_CDF4(3427, 9851, 15414) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(11016, 22111, 26794) },
-                { AOM_CDF4(25946, 32357, 32677) },
-                { AOM_CDF4(17890, 30452, 32252) },
-                { AOM_CDF4(11678, 25142, 29816) },
-                { AOM_CDF4(6720, 17534, 24584) },
-                { AOM_CDF4(4230, 11665, 17820) },
-                { AOM_CDF4(28400, 32623, 32747) },
-                { AOM_CDF4(21164, 31668, 32575) },
-                { AOM_CDF4(13572, 27388, 31182) },
-                { AOM_CDF4(8234, 20750, 27358) },
-                { AOM_CDF4(5065, 14055, 20897) },
-                { AOM_CDF4(28981, 32547, 32705) },
-                { AOM_CDF4(18681, 30543, 32239) },
-                { AOM_CDF4(10919, 24075, 29286) },
-                { AOM_CDF4(6431, 17199, 24077) },
-                { AOM_CDF4(3819, 10464, 16618) },
-                { AOM_CDF4(26870, 32467, 32693) },
-                { AOM_CDF4(19041, 30831, 32347) },
-                { AOM_CDF4(11794, 25211, 30016) },
-                { AOM_CDF4(6888, 18019, 24970) },
-                { AOM_CDF4(4370, 12363, 18992) },
-                { AOM_CDF4(29578, 32670, 32744) },
-                { AOM_CDF4(23159, 32007, 32613) },
-                { AOM_CDF4(15315, 28669, 31676) },
-                { AOM_CDF4(9298, 22607, 28782) },
-                { AOM_CDF4(6144, 15913, 22968) },
-                { AOM_CDF4(28110, 32499, 32669) },
-                { AOM_CDF4(21574, 30937, 32015) },
-                { AOM_CDF4(12759, 24818, 28727) },
-                { AOM_CDF4(6545, 16761, 23042) },
-                { AOM_CDF4(3649, 10597, 16833) },
-                { AOM_CDF4(28163, 32552, 32728) },
-                { AOM_CDF4(22101, 31469, 32464) },
-                { AOM_CDF4(13160, 25472, 30143) },
-                { AOM_CDF4(7303, 18684, 25468) },
-                { AOM_CDF4(5241, 13975, 20955) },
-                { AOM_CDF4(28400, 32631, 32744) },
-                { AOM_CDF4(22104, 31793, 32603) },
-                { AOM_CDF4(13557, 26571, 30846) },
-                { AOM_CDF4(7749, 19861, 26675) },
-                { AOM_CDF4(4873, 14030, 21234) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(9800, 17635, 21073) },
-                { AOM_CDF4(26153, 31885, 32527) },
-                { AOM_CDF4(15038, 27852, 31006) },
-                { AOM_CDF4(8718, 20564, 26486) },
-                { AOM_CDF4(5128, 14076, 20514) },
-                { AOM_CDF4(2636, 7566, 11925) },
-                { AOM_CDF4(27551, 32504, 32701) },
-                { AOM_CDF4(18310, 30054, 32100) },
-                { AOM_CDF4(10211, 23420, 29082) },
-                { AOM_CDF4(6222, 16876, 23916) },
-                { AOM_CDF4(3462, 9954, 15498) },
-                { AOM_CDF4(29991, 32633, 32721) },
-                { AOM_CDF4(19883, 30751, 32201) },
-                { AOM_CDF4(11141, 24184, 29285) },
-                { AOM_CDF4(6420, 16940, 23774) },
-                { AOM_CDF4(3392, 9753, 15118) },
-                { AOM_CDF4(28465, 32616, 32712) },
-                { AOM_CDF4(19850, 30702, 32244) },
-                { AOM_CDF4(10983, 24024, 29223) },
-                { AOM_CDF4(6294, 16770, 23582) },
-                { AOM_CDF4(3244, 9283, 14509) },
-                { AOM_CDF4(30023, 32717, 32748) },
-                { AOM_CDF4(22940, 32032, 32626) },
-                { AOM_CDF4(14282, 27928, 31473) },
-                { AOM_CDF4(8562, 21327, 27914) },
-                { AOM_CDF4(4846, 13393, 19919) },
-                { AOM_CDF4(29981, 32590, 32695) },
-                { AOM_CDF4(20465, 30963, 32166) },
-                { AOM_CDF4(11479, 23579, 28195) },
-                { AOM_CDF4(5916, 15648, 22073) },
-                { AOM_CDF4(3031, 8605, 13398) },
-                { AOM_CDF4(31146, 32691, 32739) },
-                { AOM_CDF4(23106, 31724, 32444) },
-                { AOM_CDF4(13783, 26738, 30439) },
-                { AOM_CDF4(7852, 19468, 25807) },
-                { AOM_CDF4(3860, 11124, 16853) },
-                { AOM_CDF4(31014, 32724, 32748) },
-                { AOM_CDF4(23629, 32109, 32628) },
-                { AOM_CDF4(14747, 28115, 31403) },
-                { AOM_CDF4(8545, 21242, 27478) },
-                { AOM_CDF4(4574, 12781, 19067) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(9185, 19694, 24688) },
-                { AOM_CDF4(26081, 31985, 32621) },
-                { AOM_CDF4(16015, 29000, 31787) },
-                { AOM_CDF4(10542, 23690, 29206) },
-                { AOM_CDF4(6732, 17945, 24677) },
-                { AOM_CDF4(3916, 11039, 16722) },
-                { AOM_CDF4(28224, 32566, 32744) },
-                { AOM_CDF4(19100, 31138, 32485) },
-                { AOM_CDF4(12528, 26620, 30879) },
-                { AOM_CDF4(7741, 20277, 26885) },
-                { AOM_CDF4(4566, 12845, 18990) },
-                { AOM_CDF4(29933, 32593, 32718) },
-                { AOM_CDF4(17670, 30333, 32155) },
-                { AOM_CDF4(10385, 23600, 28909) },
-                { AOM_CDF4(6243, 16236, 22407) },
-                { AOM_CDF4(3976, 10389, 16017) },
-                { AOM_CDF4(28377, 32561, 32738) },
-                { AOM_CDF4(19366, 31175, 32482) },
-                { AOM_CDF4(13327, 27175, 31094) },
-                { AOM_CDF4(8258, 20769, 27143) },
-                { AOM_CDF4(4703, 13198, 19527) },
-                { AOM_CDF4(31086, 32706, 32748) },
-                { AOM_CDF4(22853, 31902, 32583) },
-                { AOM_CDF4(14759, 28186, 31419) },
-                { AOM_CDF4(9284, 22382, 28348) },
-                { AOM_CDF4(5585, 15192, 21868) },
-                { AOM_CDF4(28291, 32652, 32746) },
-                { AOM_CDF4(19849, 32107, 32571) },
-                { AOM_CDF4(14834, 26818, 29214) },
-                { AOM_CDF4(10306, 22594, 28672) },
-                { AOM_CDF4(6615, 17384, 23384) },
-                { AOM_CDF4(28947, 32604, 32745) },
-                { AOM_CDF4(25625, 32289, 32646) },
-                { AOM_CDF4(18758, 28672, 31403) },
-                { AOM_CDF4(10017, 23430, 28523) },
-                { AOM_CDF4(6862, 15269, 22131) },
-                { AOM_CDF4(23933, 32509, 32739) },
-                { AOM_CDF4(19927, 31495, 32631) },
-                { AOM_CDF4(11903, 26023, 30621) },
-                { AOM_CDF4(7026, 20094, 27252) },
-                { AOM_CDF4(5998, 18106, 24437) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(4456, 11274, 15533) },
-                { AOM_CDF4(21219, 29079, 31616) },
-                { AOM_CDF4(11173, 23774, 28567) },
-                { AOM_CDF4(7282, 18293, 24263) },
-                { AOM_CDF4(4890, 13286, 19115) },
-                { AOM_CDF4(1890, 5508, 8659) },
-                { AOM_CDF4(26651, 32136, 32647) },
-                { AOM_CDF4(14630, 28254, 31455) },
-                { AOM_CDF4(8716, 21287, 27395) },
-                { AOM_CDF4(5615, 15331, 22008) },
-                { AOM_CDF4(2675, 7700, 12150) },
-                { AOM_CDF4(29954, 32526, 32690) },
-                { AOM_CDF4(16126, 28982, 31633) },
-                { AOM_CDF4(9030, 21361, 27352) },
-                { AOM_CDF4(5411, 14793, 21271) },
-                { AOM_CDF4(2943, 8422, 13163) },
-                { AOM_CDF4(29539, 32601, 32730) },
-                { AOM_CDF4(18125, 30385, 32201) },
-                { AOM_CDF4(10422, 24090, 29468) },
-                { AOM_CDF4(6468, 17487, 24438) },
-                { AOM_CDF4(2970, 8653, 13531) },
-                { AOM_CDF4(30912, 32715, 32748) },
-                { AOM_CDF4(20666, 31373, 32497) },
-                { AOM_CDF4(12509, 26640, 30917) },
-                { AOM_CDF4(8058, 20629, 27290) },
-                { AOM_CDF4(4231, 12006, 18052) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(10202, 20633, 25484) },
-                { AOM_CDF4(27336, 31445, 32352) },
-                { AOM_CDF4(12420, 24384, 28552) },
-                { AOM_CDF4(7648, 18115, 23856) },
-                { AOM_CDF4(5662, 14341, 19902) },
-                { AOM_CDF4(3611, 10328, 15390) },
-                { AOM_CDF4(30945, 32616, 32736) },
-                { AOM_CDF4(18682, 30505, 32253) },
-                { AOM_CDF4(11513, 25336, 30203) },
-                { AOM_CDF4(7449, 19452, 26148) },
-                { AOM_CDF4(4482, 13051, 18886) },
-                { AOM_CDF4(32022, 32690, 32747) },
-                { AOM_CDF4(18578, 30501, 32146) },
-                { AOM_CDF4(11249, 23368, 28631) },
-                { AOM_CDF4(5645, 16958, 22158) },
-                { AOM_CDF4(5009, 11444, 16637) },
-                { AOM_CDF4(31357, 32710, 32748) },
-                { AOM_CDF4(21552, 31494, 32504) },
-                { AOM_CDF4(13891, 27677, 31340) },
-                { AOM_CDF4(9051, 22098, 28172) },
-                { AOM_CDF4(5190, 13377, 19486) },
-                { AOM_CDF4(32364, 32740, 32748) },
-                { AOM_CDF4(24839, 31907, 32551) },
-                { AOM_CDF4(17160, 28779, 31696) },
-                { AOM_CDF4(12452, 24137, 29602) },
-                { AOM_CDF4(6165, 15389, 22477) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(2575, 7281, 11077) },
-                { AOM_CDF4(14002, 20866, 25402) },
-                { AOM_CDF4(6343, 15056, 19658) },
-                { AOM_CDF4(4474, 11858, 17041) },
-                { AOM_CDF4(2865, 8299, 12534) },
-                { AOM_CDF4(1344, 3949, 6391) },
-                { AOM_CDF4(24720, 31239, 32459) },
-                { AOM_CDF4(12585, 25356, 29968) },
-                { AOM_CDF4(7181, 18246, 24444) },
-                { AOM_CDF4(5025, 13667, 19885) },
-                { AOM_CDF4(2521, 7304, 11605) },
-                { AOM_CDF4(29908, 32252, 32584) },
-                { AOM_CDF4(17421, 29156, 31575) },
-                { AOM_CDF4(9889, 22188, 27782) },
-                { AOM_CDF4(5878, 15647, 22123) },
-                { AOM_CDF4(2814, 8665, 13323) },
-                { AOM_CDF4(30183, 32568, 32713) },
-                { AOM_CDF4(18528, 30195, 32049) },
-                { AOM_CDF4(10982, 24606, 29657) },
-                { AOM_CDF4(6957, 18165, 25231) },
-                { AOM_CDF4(3508, 10118, 15468) },
-                { AOM_CDF4(31761, 32736, 32748) },
-                { AOM_CDF4(21041, 31328, 32546) },
-                { AOM_CDF4(12568, 26732, 31166) },
-                { AOM_CDF4(8052, 20720, 27733) },
-                { AOM_CDF4(4336, 12192, 18396) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } } },
-          { { { { AOM_CDF4(7062, 16472, 22319) },
-                { AOM_CDF4(24538, 32261, 32674) },
-                { AOM_CDF4(13675, 28041, 31779) },
-                { AOM_CDF4(8590, 20674, 27631) },
-                { AOM_CDF4(5685, 14675, 22013) },
-                { AOM_CDF4(3655, 9898, 15731) },
-                { AOM_CDF4(26493, 32418, 32658) },
-                { AOM_CDF4(16376, 29342, 32090) },
-                { AOM_CDF4(10594, 22649, 28970) },
-                { AOM_CDF4(8176, 17170, 24303) },
-                { AOM_CDF4(5605, 12694, 19139) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(23888, 31902, 32542) },
-                { AOM_CDF4(18612, 29687, 31987) },
-                { AOM_CDF4(16245, 24852, 29249) },
-                { AOM_CDF4(15765, 22608, 27559) },
-                { AOM_CDF4(19895, 24699, 27510) },
-                { AOM_CDF4(28401, 32212, 32457) },
-                { AOM_CDF4(15274, 27825, 30980) },
-                { AOM_CDF4(9364, 18128, 24332) },
-                { AOM_CDF4(2283, 8193, 15082) },
-                { AOM_CDF4(1228, 3972, 7881) },
-                { AOM_CDF4(29455, 32469, 32620) },
-                { AOM_CDF4(17981, 28245, 31388) },
-                { AOM_CDF4(10921, 20098, 26240) },
-                { AOM_CDF4(3743, 11829, 18657) },
-                { AOM_CDF4(2374, 9593, 15715) },
-                { AOM_CDF4(31068, 32466, 32635) },
-                { AOM_CDF4(20321, 29572, 31971) },
-                { AOM_CDF4(10771, 20255, 27119) },
-                { AOM_CDF4(2795, 10410, 17361) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(9320, 22102, 27840) },
-                { AOM_CDF4(27057, 32464, 32724) },
-                { AOM_CDF4(16331, 30268, 32309) },
-                { AOM_CDF4(10319, 23935, 29720) },
-                { AOM_CDF4(6189, 16448, 24106) },
-                { AOM_CDF4(3589, 10884, 18808) },
-                { AOM_CDF4(29026, 32624, 32748) },
-                { AOM_CDF4(19226, 31507, 32587) },
-                { AOM_CDF4(12692, 26921, 31203) },
-                { AOM_CDF4(7049, 19532, 27635) },
-                { AOM_CDF4(7727, 15669, 23252) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(28056, 32625, 32748) },
-                { AOM_CDF4(22383, 32075, 32669) },
-                { AOM_CDF4(15417, 27098, 31749) },
-                { AOM_CDF4(18127, 26493, 27190) },
-                { AOM_CDF4(5461, 16384, 21845) },
-                { AOM_CDF4(27982, 32091, 32584) },
-                { AOM_CDF4(19045, 29868, 31972) },
-                { AOM_CDF4(10397, 22266, 27932) },
-                { AOM_CDF4(5990, 13697, 21500) },
-                { AOM_CDF4(1792, 6912, 15104) },
-                { AOM_CDF4(28198, 32501, 32718) },
-                { AOM_CDF4(21534, 31521, 32569) },
-                { AOM_CDF4(11109, 25217, 30017) },
-                { AOM_CDF4(5671, 15124, 26151) },
-                { AOM_CDF4(4681, 14043, 18725) },
-                { AOM_CDF4(28688, 32580, 32741) },
-                { AOM_CDF4(22576, 32079, 32661) },
-                { AOM_CDF4(10627, 22141, 28340) },
-                { AOM_CDF4(9362, 14043, 28087) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(7754, 16948, 22142) },
-                { AOM_CDF4(25670, 32330, 32691) },
-                { AOM_CDF4(15663, 29225, 31994) },
-                { AOM_CDF4(9878, 23288, 29158) },
-                { AOM_CDF4(6419, 17088, 24336) },
-                { AOM_CDF4(3859, 11003, 17039) },
-                { AOM_CDF4(27562, 32595, 32725) },
-                { AOM_CDF4(17575, 30588, 32399) },
-                { AOM_CDF4(10819, 24838, 30309) },
-                { AOM_CDF4(7124, 18686, 25916) },
-                { AOM_CDF4(4479, 12688, 19340) },
-                { AOM_CDF4(28385, 32476, 32673) },
-                { AOM_CDF4(15306, 29005, 31938) },
-                { AOM_CDF4(8937, 21615, 28322) },
-                { AOM_CDF4(5982, 15603, 22786) },
-                { AOM_CDF4(3620, 10267, 16136) },
-                { AOM_CDF4(27280, 32464, 32667) },
-                { AOM_CDF4(15607, 29160, 32004) },
-                { AOM_CDF4(9091, 22135, 28740) },
-                { AOM_CDF4(6232, 16632, 24020) },
-                { AOM_CDF4(4047, 11377, 17672) },
-                { AOM_CDF4(29220, 32630, 32718) },
-                { AOM_CDF4(19650, 31220, 32462) },
-                { AOM_CDF4(13050, 26312, 30827) },
-                { AOM_CDF4(9228, 20870, 27468) },
-                { AOM_CDF4(6146, 15149, 21971) },
-                { AOM_CDF4(30169, 32481, 32623) },
-                { AOM_CDF4(17212, 29311, 31554) },
-                { AOM_CDF4(9911, 21311, 26882) },
-                { AOM_CDF4(4487, 13314, 20372) },
-                { AOM_CDF4(2570, 7772, 12889) },
-                { AOM_CDF4(30924, 32613, 32708) },
-                { AOM_CDF4(19490, 30206, 32107) },
-                { AOM_CDF4(11232, 23998, 29276) },
-                { AOM_CDF4(6769, 17955, 25035) },
-                { AOM_CDF4(4398, 12623, 19214) },
-                { AOM_CDF4(30609, 32627, 32722) },
-                { AOM_CDF4(19370, 30582, 32287) },
-                { AOM_CDF4(10457, 23619, 29409) },
-                { AOM_CDF4(6443, 17637, 24834) },
-                { AOM_CDF4(4645, 13236, 20106) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(8626, 20271, 26216) },
-                { AOM_CDF4(26707, 32406, 32711) },
-                { AOM_CDF4(16999, 30329, 32286) },
-                { AOM_CDF4(11445, 25123, 30286) },
-                { AOM_CDF4(6411, 18828, 25601) },
-                { AOM_CDF4(6801, 12458, 20248) },
-                { AOM_CDF4(29918, 32682, 32748) },
-                { AOM_CDF4(20649, 31739, 32618) },
-                { AOM_CDF4(12879, 27773, 31581) },
-                { AOM_CDF4(7896, 21751, 28244) },
-                { AOM_CDF4(5260, 14870, 23698) },
-                { AOM_CDF4(29252, 32593, 32731) },
-                { AOM_CDF4(17072, 30460, 32294) },
-                { AOM_CDF4(10653, 24143, 29365) },
-                { AOM_CDF4(6536, 17490, 23983) },
-                { AOM_CDF4(4929, 13170, 20085) },
-                { AOM_CDF4(28137, 32518, 32715) },
-                { AOM_CDF4(18171, 30784, 32407) },
-                { AOM_CDF4(11437, 25436, 30459) },
-                { AOM_CDF4(7252, 18534, 26176) },
-                { AOM_CDF4(4126, 13353, 20978) },
-                { AOM_CDF4(31162, 32726, 32748) },
-                { AOM_CDF4(23017, 32222, 32701) },
-                { AOM_CDF4(15629, 29233, 32046) },
-                { AOM_CDF4(9387, 22621, 29480) },
-                { AOM_CDF4(6922, 17616, 25010) },
-                { AOM_CDF4(28838, 32265, 32614) },
-                { AOM_CDF4(19701, 30206, 31920) },
-                { AOM_CDF4(11214, 22410, 27933) },
-                { AOM_CDF4(5320, 14177, 23034) },
-                { AOM_CDF4(5049, 12881, 17827) },
-                { AOM_CDF4(27484, 32471, 32734) },
-                { AOM_CDF4(21076, 31526, 32561) },
-                { AOM_CDF4(12707, 26303, 31211) },
-                { AOM_CDF4(8169, 21722, 28219) },
-                { AOM_CDF4(6045, 19406, 27042) },
-                { AOM_CDF4(27753, 32572, 32745) },
-                { AOM_CDF4(20832, 31878, 32653) },
-                { AOM_CDF4(13250, 27356, 31674) },
-                { AOM_CDF4(7718, 21508, 29858) },
-                { AOM_CDF4(7209, 18350, 25559) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(7876, 16901, 21741) },
-                { AOM_CDF4(24001, 31898, 32625) },
-                { AOM_CDF4(14529, 27959, 31451) },
-                { AOM_CDF4(8273, 20818, 27258) },
-                { AOM_CDF4(5278, 14673, 21510) },
-                { AOM_CDF4(2983, 8843, 14039) },
-                { AOM_CDF4(28016, 32574, 32732) },
-                { AOM_CDF4(17471, 30306, 32301) },
-                { AOM_CDF4(10224, 24063, 29728) },
-                { AOM_CDF4(6602, 17954, 25052) },
-                { AOM_CDF4(4002, 11585, 17759) },
-                { AOM_CDF4(30190, 32634, 32739) },
-                { AOM_CDF4(17497, 30282, 32270) },
-                { AOM_CDF4(10229, 23729, 29538) },
-                { AOM_CDF4(6344, 17211, 24440) },
-                { AOM_CDF4(3849, 11189, 17108) },
-                { AOM_CDF4(28570, 32583, 32726) },
-                { AOM_CDF4(17521, 30161, 32238) },
-                { AOM_CDF4(10153, 23565, 29378) },
-                { AOM_CDF4(6455, 17341, 24443) },
-                { AOM_CDF4(3907, 11042, 17024) },
-                { AOM_CDF4(30689, 32715, 32748) },
-                { AOM_CDF4(21546, 31840, 32610) },
-                { AOM_CDF4(13547, 27581, 31459) },
-                { AOM_CDF4(8912, 21757, 28309) },
-                { AOM_CDF4(5548, 15080, 22046) },
-                { AOM_CDF4(30783, 32540, 32685) },
-                { AOM_CDF4(17540, 29528, 31668) },
-                { AOM_CDF4(10160, 21468, 26783) },
-                { AOM_CDF4(4724, 13393, 20054) },
-                { AOM_CDF4(2702, 8174, 13102) },
-                { AOM_CDF4(31648, 32686, 32742) },
-                { AOM_CDF4(20954, 31094, 32337) },
-                { AOM_CDF4(12420, 25698, 30179) },
-                { AOM_CDF4(7304, 19320, 26248) },
-                { AOM_CDF4(4366, 12261, 18864) },
-                { AOM_CDF4(31581, 32723, 32748) },
-                { AOM_CDF4(21373, 31586, 32525) },
-                { AOM_CDF4(12744, 26625, 30885) },
-                { AOM_CDF4(7431, 20322, 26950) },
-                { AOM_CDF4(4692, 13323, 20111) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(7833, 18369, 24095) },
-                { AOM_CDF4(26650, 32273, 32702) },
-                { AOM_CDF4(16371, 29961, 32191) },
-                { AOM_CDF4(11055, 24082, 29629) },
-                { AOM_CDF4(6892, 18644, 25400) },
-                { AOM_CDF4(5006, 13057, 19240) },
-                { AOM_CDF4(29834, 32666, 32748) },
-                { AOM_CDF4(19577, 31335, 32570) },
-                { AOM_CDF4(12253, 26509, 31122) },
-                { AOM_CDF4(7991, 20772, 27711) },
-                { AOM_CDF4(5677, 15910, 23059) },
-                { AOM_CDF4(30109, 32532, 32720) },
-                { AOM_CDF4(16747, 30166, 32252) },
-                { AOM_CDF4(10134, 23542, 29184) },
-                { AOM_CDF4(5791, 16176, 23556) },
-                { AOM_CDF4(4362, 10414, 17284) },
-                { AOM_CDF4(29492, 32626, 32748) },
-                { AOM_CDF4(19894, 31402, 32525) },
-                { AOM_CDF4(12942, 27071, 30869) },
-                { AOM_CDF4(8346, 21216, 27405) },
-                { AOM_CDF4(6572, 17087, 23859) },
-                { AOM_CDF4(32035, 32735, 32748) },
-                { AOM_CDF4(22957, 31838, 32618) },
-                { AOM_CDF4(14724, 28572, 31772) },
-                { AOM_CDF4(10364, 23999, 29553) },
-                { AOM_CDF4(7004, 18433, 25655) },
-                { AOM_CDF4(27528, 32277, 32681) },
-                { AOM_CDF4(16959, 31171, 32096) },
-                { AOM_CDF4(10486, 23593, 27962) },
-                { AOM_CDF4(8192, 16384, 23211) },
-                { AOM_CDF4(8937, 17873, 20852) },
-                { AOM_CDF4(27715, 32002, 32615) },
-                { AOM_CDF4(15073, 29491, 31676) },
-                { AOM_CDF4(11264, 24576, 28672) },
-                { AOM_CDF4(2341, 18725, 23406) },
-                { AOM_CDF4(7282, 18204, 25486) },
-                { AOM_CDF4(28547, 32213, 32657) },
-                { AOM_CDF4(20788, 29773, 32239) },
-                { AOM_CDF4(6780, 21469, 30508) },
-                { AOM_CDF4(5958, 14895, 23831) },
-                { AOM_CDF4(16384, 21845, 27307) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(5992, 14304, 19765) },
-                { AOM_CDF4(22612, 31238, 32456) },
-                { AOM_CDF4(13456, 27162, 31087) },
-                { AOM_CDF4(8001, 20062, 26504) },
-                { AOM_CDF4(5168, 14105, 20764) },
-                { AOM_CDF4(2632, 7771, 12385) },
-                { AOM_CDF4(27034, 32344, 32709) },
-                { AOM_CDF4(15850, 29415, 31997) },
-                { AOM_CDF4(9494, 22776, 28841) },
-                { AOM_CDF4(6151, 16830, 23969) },
-                { AOM_CDF4(3461, 10039, 15722) },
-                { AOM_CDF4(30134, 32569, 32731) },
-                { AOM_CDF4(15638, 29422, 31945) },
-                { AOM_CDF4(9150, 21865, 28218) },
-                { AOM_CDF4(5647, 15719, 22676) },
-                { AOM_CDF4(3402, 9772, 15477) },
-                { AOM_CDF4(28530, 32586, 32735) },
-                { AOM_CDF4(17139, 30298, 32292) },
-                { AOM_CDF4(10200, 24039, 29685) },
-                { AOM_CDF4(6419, 17674, 24786) },
-                { AOM_CDF4(3544, 10225, 15824) },
-                { AOM_CDF4(31333, 32726, 32748) },
-                { AOM_CDF4(20618, 31487, 32544) },
-                { AOM_CDF4(12901, 27217, 31232) },
-                { AOM_CDF4(8624, 21734, 28171) },
-                { AOM_CDF4(5104, 14191, 20748) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(11206, 21090, 26561) },
-                { AOM_CDF4(28759, 32279, 32671) },
-                { AOM_CDF4(14171, 27952, 31569) },
-                { AOM_CDF4(9743, 22907, 29141) },
-                { AOM_CDF4(6871, 17886, 24868) },
-                { AOM_CDF4(4960, 13152, 19315) },
-                { AOM_CDF4(31077, 32661, 32748) },
-                { AOM_CDF4(19400, 31195, 32515) },
-                { AOM_CDF4(12752, 26858, 31040) },
-                { AOM_CDF4(8370, 22098, 28591) },
-                { AOM_CDF4(5457, 15373, 22298) },
-                { AOM_CDF4(31697, 32706, 32748) },
-                { AOM_CDF4(17860, 30657, 32333) },
-                { AOM_CDF4(12510, 24812, 29261) },
-                { AOM_CDF4(6180, 19124, 24722) },
-                { AOM_CDF4(5041, 13548, 17959) },
-                { AOM_CDF4(31552, 32716, 32748) },
-                { AOM_CDF4(21908, 31769, 32623) },
-                { AOM_CDF4(14470, 28201, 31565) },
-                { AOM_CDF4(9493, 22982, 28608) },
-                { AOM_CDF4(6858, 17240, 24137) },
-                { AOM_CDF4(32543, 32752, 32756) },
-                { AOM_CDF4(24286, 32097, 32666) },
-                { AOM_CDF4(15958, 29217, 32024) },
-                { AOM_CDF4(10207, 24234, 29958) },
-                { AOM_CDF4(6929, 18305, 25652) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } },
-            { { { AOM_CDF4(4137, 10847, 15682) },
-                { AOM_CDF4(17824, 27001, 30058) },
-                { AOM_CDF4(10204, 22796, 28291) },
-                { AOM_CDF4(6076, 15935, 22125) },
-                { AOM_CDF4(3852, 10937, 16816) },
-                { AOM_CDF4(2252, 6324, 10131) },
-                { AOM_CDF4(25840, 32016, 32662) },
-                { AOM_CDF4(15109, 28268, 31531) },
-                { AOM_CDF4(9385, 22231, 28340) },
-                { AOM_CDF4(6082, 16672, 23479) },
-                { AOM_CDF4(3318, 9427, 14681) },
-                { AOM_CDF4(30594, 32574, 32718) },
-                { AOM_CDF4(16836, 29552, 31859) },
-                { AOM_CDF4(9556, 22542, 28356) },
-                { AOM_CDF4(6305, 16725, 23540) },
-                { AOM_CDF4(3376, 9895, 15184) },
-                { AOM_CDF4(29383, 32617, 32745) },
-                { AOM_CDF4(18891, 30809, 32401) },
-                { AOM_CDF4(11688, 25942, 30687) },
-                { AOM_CDF4(7468, 19469, 26651) },
-                { AOM_CDF4(3909, 11358, 17012) },
-                { AOM_CDF4(31564, 32736, 32748) },
-                { AOM_CDF4(20906, 31611, 32600) },
-                { AOM_CDF4(13191, 27621, 31537) },
-                { AOM_CDF4(8768, 22029, 28676) },
-                { AOM_CDF4(5079, 14109, 20906) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } },
-              { { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) },
-                { AOM_CDF4(8192, 16384, 24576) } } } } };
+    [CDF_SIZE(NUM_BASE_LEVELS +
+              2)] = { { { { { AOM_CDF4(4034, 8930, 12727) },
+                            { AOM_CDF4(18082, 29741, 31877) },
+                            { AOM_CDF4(12596, 26124, 30493) },
+                            { AOM_CDF4(9446, 21118, 27005) },
+                            { AOM_CDF4(6308, 15141, 21279) },
+                            { AOM_CDF4(2463, 6357, 9783) },
+                            { AOM_CDF4(20667, 30546, 31929) },
+                            { AOM_CDF4(13043, 26123, 30134) },
+                            { AOM_CDF4(8151, 18757, 24778) },
+                            { AOM_CDF4(5255, 12839, 18632) },
+                            { AOM_CDF4(2820, 7206, 11161) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(15736, 27553, 30604) },
+                            { AOM_CDF4(11210, 23794, 28787) },
+                            { AOM_CDF4(5947, 13874, 19701) },
+                            { AOM_CDF4(4215, 9323, 13891) },
+                            { AOM_CDF4(2833, 6462, 10059) },
+                            { AOM_CDF4(19605, 30393, 31582) },
+                            { AOM_CDF4(13523, 26252, 30248) },
+                            { AOM_CDF4(8446, 18622, 24512) },
+                            { AOM_CDF4(3818, 10343, 15974) },
+                            { AOM_CDF4(1481, 4117, 6796) },
+                            { AOM_CDF4(22649, 31302, 32190) },
+                            { AOM_CDF4(14829, 27127, 30449) },
+                            { AOM_CDF4(8313, 17702, 23304) },
+                            { AOM_CDF4(3022, 8301, 12786) },
+                            { AOM_CDF4(1536, 4412, 7184) },
+                            { AOM_CDF4(22354, 29774, 31372) },
+                            { AOM_CDF4(14723, 25472, 29214) },
+                            { AOM_CDF4(6673, 13745, 18662) },
+                            { AOM_CDF4(2068, 5766, 9322) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(6302, 16444, 21761) },
+                            { AOM_CDF4(23040, 31538, 32475) },
+                            { AOM_CDF4(15196, 28452, 31496) },
+                            { AOM_CDF4(10020, 22946, 28514) },
+                            { AOM_CDF4(6533, 16862, 23501) },
+                            { AOM_CDF4(3538, 9816, 15076) },
+                            { AOM_CDF4(24444, 31875, 32525) },
+                            { AOM_CDF4(15881, 28924, 31635) },
+                            { AOM_CDF4(9922, 22873, 28466) },
+                            { AOM_CDF4(6527, 16966, 23691) },
+                            { AOM_CDF4(4114, 11303, 17220) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(20201, 30770, 32209) },
+                            { AOM_CDF4(14754, 28071, 31258) },
+                            { AOM_CDF4(8378, 20186, 26517) },
+                            { AOM_CDF4(5916, 15299, 21978) },
+                            { AOM_CDF4(4268, 11583, 17901) },
+                            { AOM_CDF4(24361, 32025, 32581) },
+                            { AOM_CDF4(18673, 30105, 31943) },
+                            { AOM_CDF4(10196, 22244, 27576) },
+                            { AOM_CDF4(5495, 14349, 20417) },
+                            { AOM_CDF4(2676, 7415, 11498) },
+                            { AOM_CDF4(24678, 31958, 32585) },
+                            { AOM_CDF4(18629, 29906, 31831) },
+                            { AOM_CDF4(9364, 20724, 26315) },
+                            { AOM_CDF4(4641, 12318, 18094) },
+                            { AOM_CDF4(2758, 7387, 11579) },
+                            { AOM_CDF4(25433, 31842, 32469) },
+                            { AOM_CDF4(18795, 29289, 31411) },
+                            { AOM_CDF4(7644, 17584, 23592) },
+                            { AOM_CDF4(3408, 9014, 15047) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(4536, 10072, 14001) },
+                            { AOM_CDF4(25459, 31416, 32206) },
+                            { AOM_CDF4(16605, 28048, 30818) },
+                            { AOM_CDF4(11008, 22857, 27719) },
+                            { AOM_CDF4(6915, 16268, 22315) },
+                            { AOM_CDF4(2625, 6812, 10537) },
+                            { AOM_CDF4(24257, 31788, 32499) },
+                            { AOM_CDF4(16880, 29454, 31879) },
+                            { AOM_CDF4(11958, 25054, 29778) },
+                            { AOM_CDF4(7916, 18718, 25084) },
+                            { AOM_CDF4(3383, 8777, 13446) },
+                            { AOM_CDF4(22720, 31603, 32393) },
+                            { AOM_CDF4(14960, 28125, 31335) },
+                            { AOM_CDF4(9731, 22210, 27928) },
+                            { AOM_CDF4(6304, 15832, 22277) },
+                            { AOM_CDF4(2910, 7818, 12166) },
+                            { AOM_CDF4(20375, 30627, 32131) },
+                            { AOM_CDF4(13904, 27284, 30887) },
+                            { AOM_CDF4(9368, 21558, 27144) },
+                            { AOM_CDF4(5937, 14966, 21119) },
+                            { AOM_CDF4(2667, 7225, 11319) },
+                            { AOM_CDF4(23970, 31470, 32378) },
+                            { AOM_CDF4(17173, 29734, 32018) },
+                            { AOM_CDF4(12795, 25441, 29965) },
+                            { AOM_CDF4(8981, 19680, 25893) },
+                            { AOM_CDF4(4728, 11372, 16902) },
+                            { AOM_CDF4(24287, 31797, 32439) },
+                            { AOM_CDF4(16703, 29145, 31696) },
+                            { AOM_CDF4(10833, 23554, 28725) },
+                            { AOM_CDF4(6468, 16566, 23057) },
+                            { AOM_CDF4(2415, 6562, 10278) },
+                            { AOM_CDF4(26610, 32395, 32659) },
+                            { AOM_CDF4(18590, 30498, 32117) },
+                            { AOM_CDF4(12420, 25756, 29950) },
+                            { AOM_CDF4(7639, 18746, 24710) },
+                            { AOM_CDF4(3001, 8086, 12347) },
+                            { AOM_CDF4(25076, 32064, 32580) },
+                            { AOM_CDF4(17946, 30128, 32028) },
+                            { AOM_CDF4(12024, 24985, 29378) },
+                            { AOM_CDF4(7517, 18390, 24304) },
+                            { AOM_CDF4(3243, 8781, 13331) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(6037, 16771, 21957) },
+                            { AOM_CDF4(24774, 31704, 32426) },
+                            { AOM_CDF4(16830, 28589, 31056) },
+                            { AOM_CDF4(10602, 22828, 27760) },
+                            { AOM_CDF4(6733, 16829, 23071) },
+                            { AOM_CDF4(3250, 8914, 13556) },
+                            { AOM_CDF4(25582, 32220, 32668) },
+                            { AOM_CDF4(18659, 30342, 32223) },
+                            { AOM_CDF4(12546, 26149, 30515) },
+                            { AOM_CDF4(8420, 20451, 26801) },
+                            { AOM_CDF4(4636, 12420, 18344) },
+                            { AOM_CDF4(27581, 32362, 32639) },
+                            { AOM_CDF4(18987, 30083, 31978) },
+                            { AOM_CDF4(11327, 24248, 29084) },
+                            { AOM_CDF4(7264, 17719, 24120) },
+                            { AOM_CDF4(3995, 10768, 16169) },
+                            { AOM_CDF4(25893, 31831, 32487) },
+                            { AOM_CDF4(16577, 28587, 31379) },
+                            { AOM_CDF4(10189, 22748, 28182) },
+                            { AOM_CDF4(6832, 17094, 23556) },
+                            { AOM_CDF4(3708, 10110, 15334) },
+                            { AOM_CDF4(25904, 32282, 32656) },
+                            { AOM_CDF4(19721, 30792, 32276) },
+                            { AOM_CDF4(12819, 26243, 30411) },
+                            { AOM_CDF4(8572, 20614, 26891) },
+                            { AOM_CDF4(5364, 14059, 20467) },
+                            { AOM_CDF4(26580, 32438, 32677) },
+                            { AOM_CDF4(20852, 31225, 32340) },
+                            { AOM_CDF4(12435, 25700, 29967) },
+                            { AOM_CDF4(8691, 20825, 26976) },
+                            { AOM_CDF4(4446, 12209, 17269) },
+                            { AOM_CDF4(27350, 32429, 32696) },
+                            { AOM_CDF4(21372, 30977, 32272) },
+                            { AOM_CDF4(12673, 25270, 29853) },
+                            { AOM_CDF4(9208, 20925, 26640) },
+                            { AOM_CDF4(5018, 13351, 18732) },
+                            { AOM_CDF4(27351, 32479, 32713) },
+                            { AOM_CDF4(21398, 31209, 32387) },
+                            { AOM_CDF4(12162, 25047, 29842) },
+                            { AOM_CDF4(7896, 18691, 25319) },
+                            { AOM_CDF4(4670, 12882, 18881) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(5487, 10460, 13708) },
+                            { AOM_CDF4(21597, 28303, 30674) },
+                            { AOM_CDF4(11037, 21953, 26476) },
+                            { AOM_CDF4(8147, 17962, 22952) },
+                            { AOM_CDF4(5242, 13061, 18532) },
+                            { AOM_CDF4(1889, 5208, 8182) },
+                            { AOM_CDF4(26774, 32133, 32590) },
+                            { AOM_CDF4(17844, 29564, 31767) },
+                            { AOM_CDF4(11690, 24438, 29171) },
+                            { AOM_CDF4(7542, 18215, 24459) },
+                            { AOM_CDF4(2993, 8050, 12319) },
+                            { AOM_CDF4(28023, 32328, 32591) },
+                            { AOM_CDF4(18651, 30126, 31954) },
+                            { AOM_CDF4(12164, 25146, 29589) },
+                            { AOM_CDF4(7762, 18530, 24771) },
+                            { AOM_CDF4(3492, 9183, 13920) },
+                            { AOM_CDF4(27591, 32008, 32491) },
+                            { AOM_CDF4(17149, 28853, 31510) },
+                            { AOM_CDF4(11485, 24003, 28860) },
+                            { AOM_CDF4(7697, 18086, 24210) },
+                            { AOM_CDF4(3075, 7999, 12218) },
+                            { AOM_CDF4(28268, 32482, 32654) },
+                            { AOM_CDF4(19631, 31051, 32404) },
+                            { AOM_CDF4(13860, 27260, 31020) },
+                            { AOM_CDF4(9605, 21613, 27594) },
+                            { AOM_CDF4(4876, 12162, 17908) },
+                            { AOM_CDF4(27248, 32316, 32576) },
+                            { AOM_CDF4(18955, 30457, 32075) },
+                            { AOM_CDF4(11824, 23997, 28795) },
+                            { AOM_CDF4(7346, 18196, 24647) },
+                            { AOM_CDF4(3403, 9247, 14111) },
+                            { AOM_CDF4(29711, 32655, 32735) },
+                            { AOM_CDF4(21169, 31394, 32417) },
+                            { AOM_CDF4(13487, 27198, 30957) },
+                            { AOM_CDF4(8828, 21683, 27614) },
+                            { AOM_CDF4(4270, 11451, 17038) },
+                            { AOM_CDF4(28708, 32578, 32731) },
+                            { AOM_CDF4(20120, 31241, 32482) },
+                            { AOM_CDF4(13692, 27550, 31321) },
+                            { AOM_CDF4(9418, 22514, 28439) },
+                            { AOM_CDF4(4999, 13283, 19462) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(5673, 14302, 19711) },
+                            { AOM_CDF4(26251, 30701, 31834) },
+                            { AOM_CDF4(12782, 23783, 27803) },
+                            { AOM_CDF4(9127, 20657, 25808) },
+                            { AOM_CDF4(6368, 16208, 21462) },
+                            { AOM_CDF4(2465, 7177, 10822) },
+                            { AOM_CDF4(29961, 32563, 32719) },
+                            { AOM_CDF4(18318, 29891, 31949) },
+                            { AOM_CDF4(11361, 24514, 29357) },
+                            { AOM_CDF4(7900, 19603, 25607) },
+                            { AOM_CDF4(4002, 10590, 15546) },
+                            { AOM_CDF4(29637, 32310, 32595) },
+                            { AOM_CDF4(18296, 29913, 31809) },
+                            { AOM_CDF4(10144, 21515, 26871) },
+                            { AOM_CDF4(5358, 14322, 20394) },
+                            { AOM_CDF4(3067, 8362, 13346) },
+                            { AOM_CDF4(28652, 32470, 32676) },
+                            { AOM_CDF4(17538, 30771, 32209) },
+                            { AOM_CDF4(13924, 26882, 30494) },
+                            { AOM_CDF4(10496, 22837, 27869) },
+                            { AOM_CDF4(7236, 16396, 21621) },
+                            { AOM_CDF4(30743, 32687, 32746) },
+                            { AOM_CDF4(23006, 31676, 32489) },
+                            { AOM_CDF4(14494, 27828, 31120) },
+                            { AOM_CDF4(10174, 22801, 28352) },
+                            { AOM_CDF4(6242, 15281, 21043) },
+                            { AOM_CDF4(25817, 32243, 32720) },
+                            { AOM_CDF4(18618, 31367, 32325) },
+                            { AOM_CDF4(13997, 28318, 31878) },
+                            { AOM_CDF4(12255, 26534, 31383) },
+                            { AOM_CDF4(9561, 21588, 28450) },
+                            { AOM_CDF4(28188, 32635, 32724) },
+                            { AOM_CDF4(22060, 32365, 32728) },
+                            { AOM_CDF4(18102, 30690, 32528) },
+                            { AOM_CDF4(14196, 28864, 31999) },
+                            { AOM_CDF4(12262, 25792, 30865) },
+                            { AOM_CDF4(24176, 32109, 32628) },
+                            { AOM_CDF4(18280, 29681, 31963) },
+                            { AOM_CDF4(10205, 23703, 29664) },
+                            { AOM_CDF4(7889, 20025, 27676) },
+                            { AOM_CDF4(6060, 16743, 23970) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(5141, 7096, 8260) },
+                            { AOM_CDF4(27186, 29022, 29789) },
+                            { AOM_CDF4(6668, 12568, 15682) },
+                            { AOM_CDF4(2172, 6181, 8638) },
+                            { AOM_CDF4(1126, 3379, 4531) },
+                            { AOM_CDF4(443, 1361, 2254) },
+                            { AOM_CDF4(26083, 31153, 32436) },
+                            { AOM_CDF4(13486, 24603, 28483) },
+                            { AOM_CDF4(6508, 14840, 19910) },
+                            { AOM_CDF4(3386, 8800, 13286) },
+                            { AOM_CDF4(1530, 4322, 7054) },
+                            { AOM_CDF4(29639, 32080, 32548) },
+                            { AOM_CDF4(15897, 27552, 30290) },
+                            { AOM_CDF4(8588, 20047, 25383) },
+                            { AOM_CDF4(4889, 13339, 19269) },
+                            { AOM_CDF4(2240, 6871, 10498) },
+                            { AOM_CDF4(28165, 32197, 32517) },
+                            { AOM_CDF4(20735, 30427, 31568) },
+                            { AOM_CDF4(14325, 24671, 27692) },
+                            { AOM_CDF4(5119, 12554, 17805) },
+                            { AOM_CDF4(1810, 5441, 8261) },
+                            { AOM_CDF4(31212, 32724, 32748) },
+                            { AOM_CDF4(23352, 31766, 32545) },
+                            { AOM_CDF4(14669, 27570, 31059) },
+                            { AOM_CDF4(8492, 20894, 27272) },
+                            { AOM_CDF4(3644, 10194, 15204) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(2461, 7013, 9371) },
+                            { AOM_CDF4(24749, 29600, 30986) },
+                            { AOM_CDF4(9466, 19037, 22417) },
+                            { AOM_CDF4(3584, 9280, 14400) },
+                            { AOM_CDF4(1505, 3929, 5433) },
+                            { AOM_CDF4(677, 1500, 2736) },
+                            { AOM_CDF4(23987, 30702, 32117) },
+                            { AOM_CDF4(13554, 24571, 29263) },
+                            { AOM_CDF4(6211, 14556, 21155) },
+                            { AOM_CDF4(3135, 10972, 15625) },
+                            { AOM_CDF4(2435, 7127, 11427) },
+                            { AOM_CDF4(31300, 32532, 32550) },
+                            { AOM_CDF4(14757, 30365, 31954) },
+                            { AOM_CDF4(4405, 11612, 18553) },
+                            { AOM_CDF4(580, 4132, 7322) },
+                            { AOM_CDF4(1695, 10169, 14124) },
+                            { AOM_CDF4(30008, 32282, 32591) },
+                            { AOM_CDF4(19244, 30108, 31748) },
+                            { AOM_CDF4(11180, 24158, 29555) },
+                            { AOM_CDF4(5650, 14972, 19209) },
+                            { AOM_CDF4(2114, 5109, 8456) },
+                            { AOM_CDF4(31856, 32716, 32748) },
+                            { AOM_CDF4(23012, 31664, 32572) },
+                            { AOM_CDF4(13694, 26656, 30636) },
+                            { AOM_CDF4(8142, 19508, 26093) },
+                            { AOM_CDF4(4253, 10955, 16724) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(601, 983, 1311) },
+                            { AOM_CDF4(18725, 23406, 28087) },
+                            { AOM_CDF4(5461, 8192, 10923) },
+                            { AOM_CDF4(3781, 15124, 21425) },
+                            { AOM_CDF4(2587, 7761, 12072) },
+                            { AOM_CDF4(106, 458, 810) },
+                            { AOM_CDF4(22282, 29710, 31894) },
+                            { AOM_CDF4(8508, 20926, 25984) },
+                            { AOM_CDF4(3726, 12713, 18083) },
+                            { AOM_CDF4(1620, 7112, 10893) },
+                            { AOM_CDF4(729, 2236, 3495) },
+                            { AOM_CDF4(30163, 32474, 32684) },
+                            { AOM_CDF4(18304, 30464, 32000) },
+                            { AOM_CDF4(11443, 26526, 29647) },
+                            { AOM_CDF4(6007, 15292, 21299) },
+                            { AOM_CDF4(2234, 6703, 8937) },
+                            { AOM_CDF4(30954, 32177, 32571) },
+                            { AOM_CDF4(17363, 29562, 31076) },
+                            { AOM_CDF4(9686, 22464, 27410) },
+                            { AOM_CDF4(8192, 16384, 21390) },
+                            { AOM_CDF4(1755, 8046, 11264) },
+                            { AOM_CDF4(31168, 32734, 32748) },
+                            { AOM_CDF4(22486, 31441, 32471) },
+                            { AOM_CDF4(12833, 25627, 29738) },
+                            { AOM_CDF4(6980, 17379, 23122) },
+                            { AOM_CDF4(3111, 8887, 13479) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } } },
+                      { { { { AOM_CDF4(6041, 11854, 15927) },
+                            { AOM_CDF4(20326, 30905, 32251) },
+                            { AOM_CDF4(14164, 26831, 30725) },
+                            { AOM_CDF4(9760, 20647, 26585) },
+                            { AOM_CDF4(6416, 14953, 21219) },
+                            { AOM_CDF4(2966, 7151, 10891) },
+                            { AOM_CDF4(23567, 31374, 32254) },
+                            { AOM_CDF4(14978, 27416, 30946) },
+                            { AOM_CDF4(9434, 20225, 26254) },
+                            { AOM_CDF4(6658, 14558, 20535) },
+                            { AOM_CDF4(3916, 8677, 12989) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(18088, 29545, 31587) },
+                            { AOM_CDF4(13062, 25843, 30073) },
+                            { AOM_CDF4(8940, 16827, 22251) },
+                            { AOM_CDF4(7654, 13220, 17973) },
+                            { AOM_CDF4(5733, 10316, 14456) },
+                            { AOM_CDF4(22879, 31388, 32114) },
+                            { AOM_CDF4(15215, 27993, 30955) },
+                            { AOM_CDF4(9397, 19445, 24978) },
+                            { AOM_CDF4(3442, 9813, 15344) },
+                            { AOM_CDF4(1368, 3936, 6532) },
+                            { AOM_CDF4(25494, 32033, 32406) },
+                            { AOM_CDF4(16772, 27963, 30718) },
+                            { AOM_CDF4(9419, 18165, 23260) },
+                            { AOM_CDF4(2677, 7501, 11797) },
+                            { AOM_CDF4(1516, 4344, 7170) },
+                            { AOM_CDF4(26556, 31454, 32101) },
+                            { AOM_CDF4(17128, 27035, 30108) },
+                            { AOM_CDF4(8324, 15344, 20249) },
+                            { AOM_CDF4(1903, 5696, 9469) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(8455, 19003, 24368) },
+                            { AOM_CDF4(23563, 32021, 32604) },
+                            { AOM_CDF4(16237, 29446, 31935) },
+                            { AOM_CDF4(10724, 23999, 29358) },
+                            { AOM_CDF4(6725, 17528, 24416) },
+                            { AOM_CDF4(3927, 10927, 16825) },
+                            { AOM_CDF4(26313, 32288, 32634) },
+                            { AOM_CDF4(17430, 30095, 32095) },
+                            { AOM_CDF4(11116, 24606, 29679) },
+                            { AOM_CDF4(7195, 18384, 25269) },
+                            { AOM_CDF4(4726, 12852, 19315) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(22822, 31648, 32483) },
+                            { AOM_CDF4(16724, 29633, 31929) },
+                            { AOM_CDF4(10261, 23033, 28725) },
+                            { AOM_CDF4(7029, 17840, 24528) },
+                            { AOM_CDF4(4867, 13886, 21502) },
+                            { AOM_CDF4(25298, 31892, 32491) },
+                            { AOM_CDF4(17809, 29330, 31512) },
+                            { AOM_CDF4(9668, 21329, 26579) },
+                            { AOM_CDF4(4774, 12956, 18976) },
+                            { AOM_CDF4(2322, 7030, 11540) },
+                            { AOM_CDF4(25472, 31920, 32543) },
+                            { AOM_CDF4(17957, 29387, 31632) },
+                            { AOM_CDF4(9196, 20593, 26400) },
+                            { AOM_CDF4(4680, 12705, 19202) },
+                            { AOM_CDF4(2917, 8456, 13436) },
+                            { AOM_CDF4(26471, 32059, 32574) },
+                            { AOM_CDF4(18458, 29783, 31909) },
+                            { AOM_CDF4(8400, 19464, 25956) },
+                            { AOM_CDF4(3812, 10973, 17206) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(6779, 13743, 17678) },
+                            { AOM_CDF4(24806, 31797, 32457) },
+                            { AOM_CDF4(17616, 29047, 31372) },
+                            { AOM_CDF4(11063, 23175, 28003) },
+                            { AOM_CDF4(6521, 16110, 22324) },
+                            { AOM_CDF4(2764, 7504, 11654) },
+                            { AOM_CDF4(25266, 32367, 32637) },
+                            { AOM_CDF4(19054, 30553, 32175) },
+                            { AOM_CDF4(12139, 25212, 29807) },
+                            { AOM_CDF4(7311, 18162, 24704) },
+                            { AOM_CDF4(3397, 9164, 14074) },
+                            { AOM_CDF4(25988, 32208, 32522) },
+                            { AOM_CDF4(16253, 28912, 31526) },
+                            { AOM_CDF4(9151, 21387, 27372) },
+                            { AOM_CDF4(5688, 14915, 21496) },
+                            { AOM_CDF4(2717, 7627, 12004) },
+                            { AOM_CDF4(23144, 31855, 32443) },
+                            { AOM_CDF4(16070, 28491, 31325) },
+                            { AOM_CDF4(8702, 20467, 26517) },
+                            { AOM_CDF4(5243, 13956, 20367) },
+                            { AOM_CDF4(2621, 7335, 11567) },
+                            { AOM_CDF4(26636, 32340, 32630) },
+                            { AOM_CDF4(19990, 31050, 32341) },
+                            { AOM_CDF4(13243, 26105, 30315) },
+                            { AOM_CDF4(8588, 19521, 25918) },
+                            { AOM_CDF4(4717, 11585, 17304) },
+                            { AOM_CDF4(25844, 32292, 32582) },
+                            { AOM_CDF4(19090, 30635, 32097) },
+                            { AOM_CDF4(11963, 24546, 28939) },
+                            { AOM_CDF4(6218, 16087, 22354) },
+                            { AOM_CDF4(2340, 6608, 10426) },
+                            { AOM_CDF4(28046, 32576, 32694) },
+                            { AOM_CDF4(21178, 31313, 32296) },
+                            { AOM_CDF4(13486, 26184, 29870) },
+                            { AOM_CDF4(7149, 17871, 23723) },
+                            { AOM_CDF4(2833, 7958, 12259) },
+                            { AOM_CDF4(27710, 32528, 32686) },
+                            { AOM_CDF4(20674, 31076, 32268) },
+                            { AOM_CDF4(12413, 24955, 29243) },
+                            { AOM_CDF4(6676, 16927, 23097) },
+                            { AOM_CDF4(2966, 8333, 12919) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(8639, 19339, 24429) },
+                            { AOM_CDF4(24404, 31837, 32525) },
+                            { AOM_CDF4(16997, 29425, 31784) },
+                            { AOM_CDF4(11253, 24234, 29149) },
+                            { AOM_CDF4(6751, 17394, 24028) },
+                            { AOM_CDF4(3490, 9830, 15191) },
+                            { AOM_CDF4(26283, 32471, 32714) },
+                            { AOM_CDF4(19599, 31168, 32442) },
+                            { AOM_CDF4(13146, 26954, 30893) },
+                            { AOM_CDF4(8214, 20588, 26890) },
+                            { AOM_CDF4(4699, 13081, 19300) },
+                            { AOM_CDF4(28212, 32458, 32669) },
+                            { AOM_CDF4(18594, 30316, 32100) },
+                            { AOM_CDF4(11219, 24408, 29234) },
+                            { AOM_CDF4(6865, 17656, 24149) },
+                            { AOM_CDF4(3678, 10362, 16006) },
+                            { AOM_CDF4(25825, 32136, 32616) },
+                            { AOM_CDF4(17313, 29853, 32021) },
+                            { AOM_CDF4(11197, 24471, 29472) },
+                            { AOM_CDF4(6947, 17781, 24405) },
+                            { AOM_CDF4(3768, 10660, 16261) },
+                            { AOM_CDF4(27352, 32500, 32706) },
+                            { AOM_CDF4(20850, 31468, 32469) },
+                            { AOM_CDF4(14021, 27707, 31133) },
+                            { AOM_CDF4(8964, 21748, 27838) },
+                            { AOM_CDF4(5437, 14665, 21187) },
+                            { AOM_CDF4(26304, 32492, 32698) },
+                            { AOM_CDF4(20409, 31380, 32385) },
+                            { AOM_CDF4(13682, 27222, 30632) },
+                            { AOM_CDF4(8974, 21236, 26685) },
+                            { AOM_CDF4(4234, 11665, 16934) },
+                            { AOM_CDF4(26273, 32357, 32711) },
+                            { AOM_CDF4(20672, 31242, 32441) },
+                            { AOM_CDF4(14172, 27254, 30902) },
+                            { AOM_CDF4(9870, 21898, 27275) },
+                            { AOM_CDF4(5164, 13506, 19270) },
+                            { AOM_CDF4(26725, 32459, 32728) },
+                            { AOM_CDF4(20991, 31442, 32527) },
+                            { AOM_CDF4(13071, 26434, 30811) },
+                            { AOM_CDF4(8184, 20090, 26742) },
+                            { AOM_CDF4(4803, 13255, 19895) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(7555, 14942, 18501) },
+                            { AOM_CDF4(24410, 31178, 32287) },
+                            { AOM_CDF4(14394, 26738, 30253) },
+                            { AOM_CDF4(8413, 19554, 25195) },
+                            { AOM_CDF4(4766, 12924, 18785) },
+                            { AOM_CDF4(2029, 5806, 9207) },
+                            { AOM_CDF4(26776, 32364, 32663) },
+                            { AOM_CDF4(18732, 29967, 31931) },
+                            { AOM_CDF4(11005, 23786, 28852) },
+                            { AOM_CDF4(6466, 16909, 23510) },
+                            { AOM_CDF4(3044, 8638, 13419) },
+                            { AOM_CDF4(29208, 32582, 32704) },
+                            { AOM_CDF4(20068, 30857, 32208) },
+                            { AOM_CDF4(12003, 25085, 29595) },
+                            { AOM_CDF4(6947, 17750, 24189) },
+                            { AOM_CDF4(3245, 9103, 14007) },
+                            { AOM_CDF4(27359, 32465, 32669) },
+                            { AOM_CDF4(19421, 30614, 32174) },
+                            { AOM_CDF4(11915, 25010, 29579) },
+                            { AOM_CDF4(6950, 17676, 24074) },
+                            { AOM_CDF4(3007, 8473, 13096) },
+                            { AOM_CDF4(29002, 32676, 32735) },
+                            { AOM_CDF4(22102, 31849, 32576) },
+                            { AOM_CDF4(14408, 28009, 31405) },
+                            { AOM_CDF4(9027, 21679, 27931) },
+                            { AOM_CDF4(4694, 12678, 18748) },
+                            { AOM_CDF4(28216, 32528, 32682) },
+                            { AOM_CDF4(20849, 31264, 32318) },
+                            { AOM_CDF4(12756, 25815, 29751) },
+                            { AOM_CDF4(7565, 18801, 24923) },
+                            { AOM_CDF4(3509, 9533, 14477) },
+                            { AOM_CDF4(30133, 32687, 32739) },
+                            { AOM_CDF4(23063, 31910, 32515) },
+                            { AOM_CDF4(14588, 28051, 31132) },
+                            { AOM_CDF4(9085, 21649, 27457) },
+                            { AOM_CDF4(4261, 11654, 17264) },
+                            { AOM_CDF4(29518, 32691, 32748) },
+                            { AOM_CDF4(22451, 31959, 32613) },
+                            { AOM_CDF4(14864, 28722, 31700) },
+                            { AOM_CDF4(9695, 22964, 28716) },
+                            { AOM_CDF4(4932, 13358, 19502) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(6465, 16958, 21688) },
+                            { AOM_CDF4(25199, 31514, 32360) },
+                            { AOM_CDF4(14774, 27149, 30607) },
+                            { AOM_CDF4(9257, 21438, 26972) },
+                            { AOM_CDF4(5723, 15183, 21882) },
+                            { AOM_CDF4(3150, 8879, 13731) },
+                            { AOM_CDF4(26989, 32262, 32682) },
+                            { AOM_CDF4(17396, 29937, 32085) },
+                            { AOM_CDF4(11387, 24901, 29784) },
+                            { AOM_CDF4(7289, 18821, 25548) },
+                            { AOM_CDF4(3734, 10577, 16086) },
+                            { AOM_CDF4(29728, 32501, 32695) },
+                            { AOM_CDF4(17431, 29701, 31903) },
+                            { AOM_CDF4(9921, 22826, 28300) },
+                            { AOM_CDF4(5896, 15434, 22068) },
+                            { AOM_CDF4(3430, 9646, 14757) },
+                            { AOM_CDF4(28614, 32511, 32705) },
+                            { AOM_CDF4(19364, 30638, 32263) },
+                            { AOM_CDF4(13129, 26254, 30402) },
+                            { AOM_CDF4(8754, 20484, 26440) },
+                            { AOM_CDF4(4378, 11607, 17110) },
+                            { AOM_CDF4(30292, 32671, 32744) },
+                            { AOM_CDF4(21780, 31603, 32501) },
+                            { AOM_CDF4(14314, 27829, 31291) },
+                            { AOM_CDF4(9611, 22327, 28263) },
+                            { AOM_CDF4(4890, 13087, 19065) },
+                            { AOM_CDF4(25862, 32567, 32733) },
+                            { AOM_CDF4(20794, 32050, 32567) },
+                            { AOM_CDF4(17243, 30625, 32254) },
+                            { AOM_CDF4(13283, 27628, 31474) },
+                            { AOM_CDF4(9669, 22532, 28918) },
+                            { AOM_CDF4(27435, 32697, 32748) },
+                            { AOM_CDF4(24922, 32390, 32714) },
+                            { AOM_CDF4(21449, 31504, 32536) },
+                            { AOM_CDF4(16392, 29729, 31832) },
+                            { AOM_CDF4(11692, 24884, 29076) },
+                            { AOM_CDF4(24193, 32290, 32735) },
+                            { AOM_CDF4(18909, 31104, 32563) },
+                            { AOM_CDF4(12236, 26841, 31403) },
+                            { AOM_CDF4(8171, 21840, 29082) },
+                            { AOM_CDF4(7224, 17280, 25275) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(3078, 6839, 9890) },
+                            { AOM_CDF4(13837, 20450, 24479) },
+                            { AOM_CDF4(5914, 14222, 19328) },
+                            { AOM_CDF4(3866, 10267, 14762) },
+                            { AOM_CDF4(2612, 7208, 11042) },
+                            { AOM_CDF4(1067, 2991, 4776) },
+                            { AOM_CDF4(25817, 31646, 32529) },
+                            { AOM_CDF4(13708, 26338, 30385) },
+                            { AOM_CDF4(7328, 18585, 24870) },
+                            { AOM_CDF4(4691, 13080, 19276) },
+                            { AOM_CDF4(1825, 5253, 8352) },
+                            { AOM_CDF4(29386, 32315, 32624) },
+                            { AOM_CDF4(17160, 29001, 31360) },
+                            { AOM_CDF4(9602, 21862, 27396) },
+                            { AOM_CDF4(5915, 15772, 22148) },
+                            { AOM_CDF4(2786, 7779, 12047) },
+                            { AOM_CDF4(29246, 32450, 32663) },
+                            { AOM_CDF4(18696, 29929, 31818) },
+                            { AOM_CDF4(10510, 23369, 28560) },
+                            { AOM_CDF4(6229, 16499, 23125) },
+                            { AOM_CDF4(2608, 7448, 11705) },
+                            { AOM_CDF4(30753, 32710, 32748) },
+                            { AOM_CDF4(21638, 31487, 32503) },
+                            { AOM_CDF4(12937, 26854, 30870) },
+                            { AOM_CDF4(8182, 20596, 26970) },
+                            { AOM_CDF4(3637, 10269, 15497) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(5244, 12150, 16906) },
+                            { AOM_CDF4(20486, 26858, 29701) },
+                            { AOM_CDF4(7756, 18317, 23735) },
+                            { AOM_CDF4(3452, 9256, 13146) },
+                            { AOM_CDF4(2020, 5206, 8229) },
+                            { AOM_CDF4(1801, 4993, 7903) },
+                            { AOM_CDF4(27051, 31858, 32531) },
+                            { AOM_CDF4(15988, 27531, 30619) },
+                            { AOM_CDF4(9188, 21484, 26719) },
+                            { AOM_CDF4(6273, 17186, 23800) },
+                            { AOM_CDF4(3108, 9355, 14764) },
+                            { AOM_CDF4(31076, 32520, 32680) },
+                            { AOM_CDF4(18119, 30037, 31850) },
+                            { AOM_CDF4(10244, 22969, 27472) },
+                            { AOM_CDF4(4692, 14077, 19273) },
+                            { AOM_CDF4(3694, 11677, 17556) },
+                            { AOM_CDF4(30060, 32581, 32720) },
+                            { AOM_CDF4(21011, 30775, 32120) },
+                            { AOM_CDF4(11931, 24820, 29289) },
+                            { AOM_CDF4(7119, 17662, 24356) },
+                            { AOM_CDF4(3833, 10706, 16304) },
+                            { AOM_CDF4(31954, 32731, 32748) },
+                            { AOM_CDF4(23913, 31724, 32489) },
+                            { AOM_CDF4(15520, 28060, 31286) },
+                            { AOM_CDF4(11517, 23008, 28571) },
+                            { AOM_CDF4(6193, 14508, 20629) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(1035, 2807, 4156) },
+                            { AOM_CDF4(13162, 18138, 20939) },
+                            { AOM_CDF4(2696, 6633, 8755) },
+                            { AOM_CDF4(1373, 4161, 6853) },
+                            { AOM_CDF4(1099, 2746, 4716) },
+                            { AOM_CDF4(340, 1021, 1599) },
+                            { AOM_CDF4(22826, 30419, 32135) },
+                            { AOM_CDF4(10395, 21762, 26942) },
+                            { AOM_CDF4(4726, 12407, 17361) },
+                            { AOM_CDF4(2447, 7080, 10593) },
+                            { AOM_CDF4(1227, 3717, 6011) },
+                            { AOM_CDF4(28156, 31424, 31934) },
+                            { AOM_CDF4(16915, 27754, 30373) },
+                            { AOM_CDF4(9148, 20990, 26431) },
+                            { AOM_CDF4(5950, 15515, 21148) },
+                            { AOM_CDF4(2492, 7327, 11526) },
+                            { AOM_CDF4(30602, 32477, 32670) },
+                            { AOM_CDF4(20026, 29955, 31568) },
+                            { AOM_CDF4(11220, 23628, 28105) },
+                            { AOM_CDF4(6652, 17019, 22973) },
+                            { AOM_CDF4(3064, 8536, 13043) },
+                            { AOM_CDF4(31769, 32724, 32748) },
+                            { AOM_CDF4(22230, 30887, 32373) },
+                            { AOM_CDF4(12234, 25079, 29731) },
+                            { AOM_CDF4(7326, 18816, 25353) },
+                            { AOM_CDF4(3933, 10907, 16616) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } } },
+                      { { { { AOM_CDF4(8896, 16227, 20630) },
+                            { AOM_CDF4(23629, 31782, 32527) },
+                            { AOM_CDF4(15173, 27755, 31321) },
+                            { AOM_CDF4(10158, 21233, 27382) },
+                            { AOM_CDF4(6420, 14857, 21558) },
+                            { AOM_CDF4(3269, 8155, 12646) },
+                            { AOM_CDF4(24835, 32009, 32496) },
+                            { AOM_CDF4(16509, 28421, 31579) },
+                            { AOM_CDF4(10957, 21514, 27418) },
+                            { AOM_CDF4(7881, 15930, 22096) },
+                            { AOM_CDF4(5388, 10960, 15918) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(20745, 30773, 32093) },
+                            { AOM_CDF4(15200, 27221, 30861) },
+                            { AOM_CDF4(13032, 20873, 25667) },
+                            { AOM_CDF4(12285, 18663, 23494) },
+                            { AOM_CDF4(11563, 17481, 21489) },
+                            { AOM_CDF4(26260, 31982, 32320) },
+                            { AOM_CDF4(15397, 28083, 31100) },
+                            { AOM_CDF4(9742, 19217, 24824) },
+                            { AOM_CDF4(3261, 9629, 15362) },
+                            { AOM_CDF4(1480, 4322, 7499) },
+                            { AOM_CDF4(27599, 32256, 32460) },
+                            { AOM_CDF4(16857, 27659, 30774) },
+                            { AOM_CDF4(9551, 18290, 23748) },
+                            { AOM_CDF4(3052, 8933, 14103) },
+                            { AOM_CDF4(2021, 5910, 9787) },
+                            { AOM_CDF4(29005, 32015, 32392) },
+                            { AOM_CDF4(17677, 27694, 30863) },
+                            { AOM_CDF4(9204, 17356, 23219) },
+                            { AOM_CDF4(2403, 7516, 12814) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(10808, 22056, 26896) },
+                            { AOM_CDF4(25739, 32313, 32676) },
+                            { AOM_CDF4(17288, 30203, 32221) },
+                            { AOM_CDF4(11359, 24878, 29896) },
+                            { AOM_CDF4(6949, 17767, 24893) },
+                            { AOM_CDF4(4287, 11796, 18071) },
+                            { AOM_CDF4(27880, 32521, 32705) },
+                            { AOM_CDF4(19038, 31004, 32414) },
+                            { AOM_CDF4(12564, 26345, 30768) },
+                            { AOM_CDF4(8269, 19947, 26779) },
+                            { AOM_CDF4(5674, 14657, 21674) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(25742, 32319, 32671) },
+                            { AOM_CDF4(19557, 31164, 32454) },
+                            { AOM_CDF4(13381, 26381, 30755) },
+                            { AOM_CDF4(10101, 21466, 26722) },
+                            { AOM_CDF4(9209, 19650, 26825) },
+                            { AOM_CDF4(27107, 31917, 32432) },
+                            { AOM_CDF4(18056, 28893, 31203) },
+                            { AOM_CDF4(10200, 21434, 26764) },
+                            { AOM_CDF4(4660, 12913, 19502) },
+                            { AOM_CDF4(2368, 6930, 12504) },
+                            { AOM_CDF4(26960, 32158, 32613) },
+                            { AOM_CDF4(18628, 30005, 32031) },
+                            { AOM_CDF4(10233, 22442, 28232) },
+                            { AOM_CDF4(5471, 14630, 21516) },
+                            { AOM_CDF4(3235, 10767, 17109) },
+                            { AOM_CDF4(27696, 32440, 32692) },
+                            { AOM_CDF4(20032, 31167, 32438) },
+                            { AOM_CDF4(8700, 21341, 28442) },
+                            { AOM_CDF4(5662, 14831, 21795) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(9704, 17294, 21132) },
+                            { AOM_CDF4(26762, 32278, 32633) },
+                            { AOM_CDF4(18382, 29620, 31819) },
+                            { AOM_CDF4(10891, 23475, 28723) },
+                            { AOM_CDF4(6358, 16583, 23309) },
+                            { AOM_CDF4(3248, 9118, 14141) },
+                            { AOM_CDF4(27204, 32573, 32699) },
+                            { AOM_CDF4(19818, 30824, 32329) },
+                            { AOM_CDF4(11772, 25120, 30041) },
+                            { AOM_CDF4(6995, 18033, 25039) },
+                            { AOM_CDF4(3752, 10442, 16098) },
+                            { AOM_CDF4(27222, 32256, 32559) },
+                            { AOM_CDF4(15356, 28399, 31475) },
+                            { AOM_CDF4(8821, 20635, 27057) },
+                            { AOM_CDF4(5511, 14404, 21239) },
+                            { AOM_CDF4(2935, 8222, 13051) },
+                            { AOM_CDF4(24875, 32120, 32529) },
+                            { AOM_CDF4(15233, 28265, 31445) },
+                            { AOM_CDF4(8605, 20570, 26932) },
+                            { AOM_CDF4(5431, 14413, 21196) },
+                            { AOM_CDF4(2994, 8341, 13223) },
+                            { AOM_CDF4(28201, 32604, 32700) },
+                            { AOM_CDF4(21041, 31446, 32456) },
+                            { AOM_CDF4(13221, 26213, 30475) },
+                            { AOM_CDF4(8255, 19385, 26037) },
+                            { AOM_CDF4(4930, 12585, 18830) },
+                            { AOM_CDF4(28768, 32448, 32627) },
+                            { AOM_CDF4(19705, 30561, 32021) },
+                            { AOM_CDF4(11572, 23589, 28220) },
+                            { AOM_CDF4(5532, 15034, 21446) },
+                            { AOM_CDF4(2460, 7150, 11456) },
+                            { AOM_CDF4(29874, 32619, 32699) },
+                            { AOM_CDF4(21621, 31071, 32201) },
+                            { AOM_CDF4(12511, 24747, 28992) },
+                            { AOM_CDF4(6281, 16395, 22748) },
+                            { AOM_CDF4(3246, 9278, 14497) },
+                            { AOM_CDF4(29715, 32625, 32712) },
+                            { AOM_CDF4(20958, 31011, 32283) },
+                            { AOM_CDF4(11233, 23671, 28806) },
+                            { AOM_CDF4(6012, 16128, 22868) },
+                            { AOM_CDF4(3427, 9851, 15414) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(11016, 22111, 26794) },
+                            { AOM_CDF4(25946, 32357, 32677) },
+                            { AOM_CDF4(17890, 30452, 32252) },
+                            { AOM_CDF4(11678, 25142, 29816) },
+                            { AOM_CDF4(6720, 17534, 24584) },
+                            { AOM_CDF4(4230, 11665, 17820) },
+                            { AOM_CDF4(28400, 32623, 32747) },
+                            { AOM_CDF4(21164, 31668, 32575) },
+                            { AOM_CDF4(13572, 27388, 31182) },
+                            { AOM_CDF4(8234, 20750, 27358) },
+                            { AOM_CDF4(5065, 14055, 20897) },
+                            { AOM_CDF4(28981, 32547, 32705) },
+                            { AOM_CDF4(18681, 30543, 32239) },
+                            { AOM_CDF4(10919, 24075, 29286) },
+                            { AOM_CDF4(6431, 17199, 24077) },
+                            { AOM_CDF4(3819, 10464, 16618) },
+                            { AOM_CDF4(26870, 32467, 32693) },
+                            { AOM_CDF4(19041, 30831, 32347) },
+                            { AOM_CDF4(11794, 25211, 30016) },
+                            { AOM_CDF4(6888, 18019, 24970) },
+                            { AOM_CDF4(4370, 12363, 18992) },
+                            { AOM_CDF4(29578, 32670, 32744) },
+                            { AOM_CDF4(23159, 32007, 32613) },
+                            { AOM_CDF4(15315, 28669, 31676) },
+                            { AOM_CDF4(9298, 22607, 28782) },
+                            { AOM_CDF4(6144, 15913, 22968) },
+                            { AOM_CDF4(28110, 32499, 32669) },
+                            { AOM_CDF4(21574, 30937, 32015) },
+                            { AOM_CDF4(12759, 24818, 28727) },
+                            { AOM_CDF4(6545, 16761, 23042) },
+                            { AOM_CDF4(3649, 10597, 16833) },
+                            { AOM_CDF4(28163, 32552, 32728) },
+                            { AOM_CDF4(22101, 31469, 32464) },
+                            { AOM_CDF4(13160, 25472, 30143) },
+                            { AOM_CDF4(7303, 18684, 25468) },
+                            { AOM_CDF4(5241, 13975, 20955) },
+                            { AOM_CDF4(28400, 32631, 32744) },
+                            { AOM_CDF4(22104, 31793, 32603) },
+                            { AOM_CDF4(13557, 26571, 30846) },
+                            { AOM_CDF4(7749, 19861, 26675) },
+                            { AOM_CDF4(4873, 14030, 21234) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(9800, 17635, 21073) },
+                            { AOM_CDF4(26153, 31885, 32527) },
+                            { AOM_CDF4(15038, 27852, 31006) },
+                            { AOM_CDF4(8718, 20564, 26486) },
+                            { AOM_CDF4(5128, 14076, 20514) },
+                            { AOM_CDF4(2636, 7566, 11925) },
+                            { AOM_CDF4(27551, 32504, 32701) },
+                            { AOM_CDF4(18310, 30054, 32100) },
+                            { AOM_CDF4(10211, 23420, 29082) },
+                            { AOM_CDF4(6222, 16876, 23916) },
+                            { AOM_CDF4(3462, 9954, 15498) },
+                            { AOM_CDF4(29991, 32633, 32721) },
+                            { AOM_CDF4(19883, 30751, 32201) },
+                            { AOM_CDF4(11141, 24184, 29285) },
+                            { AOM_CDF4(6420, 16940, 23774) },
+                            { AOM_CDF4(3392, 9753, 15118) },
+                            { AOM_CDF4(28465, 32616, 32712) },
+                            { AOM_CDF4(19850, 30702, 32244) },
+                            { AOM_CDF4(10983, 24024, 29223) },
+                            { AOM_CDF4(6294, 16770, 23582) },
+                            { AOM_CDF4(3244, 9283, 14509) },
+                            { AOM_CDF4(30023, 32717, 32748) },
+                            { AOM_CDF4(22940, 32032, 32626) },
+                            { AOM_CDF4(14282, 27928, 31473) },
+                            { AOM_CDF4(8562, 21327, 27914) },
+                            { AOM_CDF4(4846, 13393, 19919) },
+                            { AOM_CDF4(29981, 32590, 32695) },
+                            { AOM_CDF4(20465, 30963, 32166) },
+                            { AOM_CDF4(11479, 23579, 28195) },
+                            { AOM_CDF4(5916, 15648, 22073) },
+                            { AOM_CDF4(3031, 8605, 13398) },
+                            { AOM_CDF4(31146, 32691, 32739) },
+                            { AOM_CDF4(23106, 31724, 32444) },
+                            { AOM_CDF4(13783, 26738, 30439) },
+                            { AOM_CDF4(7852, 19468, 25807) },
+                            { AOM_CDF4(3860, 11124, 16853) },
+                            { AOM_CDF4(31014, 32724, 32748) },
+                            { AOM_CDF4(23629, 32109, 32628) },
+                            { AOM_CDF4(14747, 28115, 31403) },
+                            { AOM_CDF4(8545, 21242, 27478) },
+                            { AOM_CDF4(4574, 12781, 19067) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(9185, 19694, 24688) },
+                            { AOM_CDF4(26081, 31985, 32621) },
+                            { AOM_CDF4(16015, 29000, 31787) },
+                            { AOM_CDF4(10542, 23690, 29206) },
+                            { AOM_CDF4(6732, 17945, 24677) },
+                            { AOM_CDF4(3916, 11039, 16722) },
+                            { AOM_CDF4(28224, 32566, 32744) },
+                            { AOM_CDF4(19100, 31138, 32485) },
+                            { AOM_CDF4(12528, 26620, 30879) },
+                            { AOM_CDF4(7741, 20277, 26885) },
+                            { AOM_CDF4(4566, 12845, 18990) },
+                            { AOM_CDF4(29933, 32593, 32718) },
+                            { AOM_CDF4(17670, 30333, 32155) },
+                            { AOM_CDF4(10385, 23600, 28909) },
+                            { AOM_CDF4(6243, 16236, 22407) },
+                            { AOM_CDF4(3976, 10389, 16017) },
+                            { AOM_CDF4(28377, 32561, 32738) },
+                            { AOM_CDF4(19366, 31175, 32482) },
+                            { AOM_CDF4(13327, 27175, 31094) },
+                            { AOM_CDF4(8258, 20769, 27143) },
+                            { AOM_CDF4(4703, 13198, 19527) },
+                            { AOM_CDF4(31086, 32706, 32748) },
+                            { AOM_CDF4(22853, 31902, 32583) },
+                            { AOM_CDF4(14759, 28186, 31419) },
+                            { AOM_CDF4(9284, 22382, 28348) },
+                            { AOM_CDF4(5585, 15192, 21868) },
+                            { AOM_CDF4(28291, 32652, 32746) },
+                            { AOM_CDF4(19849, 32107, 32571) },
+                            { AOM_CDF4(14834, 26818, 29214) },
+                            { AOM_CDF4(10306, 22594, 28672) },
+                            { AOM_CDF4(6615, 17384, 23384) },
+                            { AOM_CDF4(28947, 32604, 32745) },
+                            { AOM_CDF4(25625, 32289, 32646) },
+                            { AOM_CDF4(18758, 28672, 31403) },
+                            { AOM_CDF4(10017, 23430, 28523) },
+                            { AOM_CDF4(6862, 15269, 22131) },
+                            { AOM_CDF4(23933, 32509, 32739) },
+                            { AOM_CDF4(19927, 31495, 32631) },
+                            { AOM_CDF4(11903, 26023, 30621) },
+                            { AOM_CDF4(7026, 20094, 27252) },
+                            { AOM_CDF4(5998, 18106, 24437) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(4456, 11274, 15533) },
+                            { AOM_CDF4(21219, 29079, 31616) },
+                            { AOM_CDF4(11173, 23774, 28567) },
+                            { AOM_CDF4(7282, 18293, 24263) },
+                            { AOM_CDF4(4890, 13286, 19115) },
+                            { AOM_CDF4(1890, 5508, 8659) },
+                            { AOM_CDF4(26651, 32136, 32647) },
+                            { AOM_CDF4(14630, 28254, 31455) },
+                            { AOM_CDF4(8716, 21287, 27395) },
+                            { AOM_CDF4(5615, 15331, 22008) },
+                            { AOM_CDF4(2675, 7700, 12150) },
+                            { AOM_CDF4(29954, 32526, 32690) },
+                            { AOM_CDF4(16126, 28982, 31633) },
+                            { AOM_CDF4(9030, 21361, 27352) },
+                            { AOM_CDF4(5411, 14793, 21271) },
+                            { AOM_CDF4(2943, 8422, 13163) },
+                            { AOM_CDF4(29539, 32601, 32730) },
+                            { AOM_CDF4(18125, 30385, 32201) },
+                            { AOM_CDF4(10422, 24090, 29468) },
+                            { AOM_CDF4(6468, 17487, 24438) },
+                            { AOM_CDF4(2970, 8653, 13531) },
+                            { AOM_CDF4(30912, 32715, 32748) },
+                            { AOM_CDF4(20666, 31373, 32497) },
+                            { AOM_CDF4(12509, 26640, 30917) },
+                            { AOM_CDF4(8058, 20629, 27290) },
+                            { AOM_CDF4(4231, 12006, 18052) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(10202, 20633, 25484) },
+                            { AOM_CDF4(27336, 31445, 32352) },
+                            { AOM_CDF4(12420, 24384, 28552) },
+                            { AOM_CDF4(7648, 18115, 23856) },
+                            { AOM_CDF4(5662, 14341, 19902) },
+                            { AOM_CDF4(3611, 10328, 15390) },
+                            { AOM_CDF4(30945, 32616, 32736) },
+                            { AOM_CDF4(18682, 30505, 32253) },
+                            { AOM_CDF4(11513, 25336, 30203) },
+                            { AOM_CDF4(7449, 19452, 26148) },
+                            { AOM_CDF4(4482, 13051, 18886) },
+                            { AOM_CDF4(32022, 32690, 32747) },
+                            { AOM_CDF4(18578, 30501, 32146) },
+                            { AOM_CDF4(11249, 23368, 28631) },
+                            { AOM_CDF4(5645, 16958, 22158) },
+                            { AOM_CDF4(5009, 11444, 16637) },
+                            { AOM_CDF4(31357, 32710, 32748) },
+                            { AOM_CDF4(21552, 31494, 32504) },
+                            { AOM_CDF4(13891, 27677, 31340) },
+                            { AOM_CDF4(9051, 22098, 28172) },
+                            { AOM_CDF4(5190, 13377, 19486) },
+                            { AOM_CDF4(32364, 32740, 32748) },
+                            { AOM_CDF4(24839, 31907, 32551) },
+                            { AOM_CDF4(17160, 28779, 31696) },
+                            { AOM_CDF4(12452, 24137, 29602) },
+                            { AOM_CDF4(6165, 15389, 22477) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(2575, 7281, 11077) },
+                            { AOM_CDF4(14002, 20866, 25402) },
+                            { AOM_CDF4(6343, 15056, 19658) },
+                            { AOM_CDF4(4474, 11858, 17041) },
+                            { AOM_CDF4(2865, 8299, 12534) },
+                            { AOM_CDF4(1344, 3949, 6391) },
+                            { AOM_CDF4(24720, 31239, 32459) },
+                            { AOM_CDF4(12585, 25356, 29968) },
+                            { AOM_CDF4(7181, 18246, 24444) },
+                            { AOM_CDF4(5025, 13667, 19885) },
+                            { AOM_CDF4(2521, 7304, 11605) },
+                            { AOM_CDF4(29908, 32252, 32584) },
+                            { AOM_CDF4(17421, 29156, 31575) },
+                            { AOM_CDF4(9889, 22188, 27782) },
+                            { AOM_CDF4(5878, 15647, 22123) },
+                            { AOM_CDF4(2814, 8665, 13323) },
+                            { AOM_CDF4(30183, 32568, 32713) },
+                            { AOM_CDF4(18528, 30195, 32049) },
+                            { AOM_CDF4(10982, 24606, 29657) },
+                            { AOM_CDF4(6957, 18165, 25231) },
+                            { AOM_CDF4(3508, 10118, 15468) },
+                            { AOM_CDF4(31761, 32736, 32748) },
+                            { AOM_CDF4(21041, 31328, 32546) },
+                            { AOM_CDF4(12568, 26732, 31166) },
+                            { AOM_CDF4(8052, 20720, 27733) },
+                            { AOM_CDF4(4336, 12192, 18396) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } } },
+                      { { { { AOM_CDF4(7062, 16472, 22319) },
+                            { AOM_CDF4(24538, 32261, 32674) },
+                            { AOM_CDF4(13675, 28041, 31779) },
+                            { AOM_CDF4(8590, 20674, 27631) },
+                            { AOM_CDF4(5685, 14675, 22013) },
+                            { AOM_CDF4(3655, 9898, 15731) },
+                            { AOM_CDF4(26493, 32418, 32658) },
+                            { AOM_CDF4(16376, 29342, 32090) },
+                            { AOM_CDF4(10594, 22649, 28970) },
+                            { AOM_CDF4(8176, 17170, 24303) },
+                            { AOM_CDF4(5605, 12694, 19139) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(23888, 31902, 32542) },
+                            { AOM_CDF4(18612, 29687, 31987) },
+                            { AOM_CDF4(16245, 24852, 29249) },
+                            { AOM_CDF4(15765, 22608, 27559) },
+                            { AOM_CDF4(19895, 24699, 27510) },
+                            { AOM_CDF4(28401, 32212, 32457) },
+                            { AOM_CDF4(15274, 27825, 30980) },
+                            { AOM_CDF4(9364, 18128, 24332) },
+                            { AOM_CDF4(2283, 8193, 15082) },
+                            { AOM_CDF4(1228, 3972, 7881) },
+                            { AOM_CDF4(29455, 32469, 32620) },
+                            { AOM_CDF4(17981, 28245, 31388) },
+                            { AOM_CDF4(10921, 20098, 26240) },
+                            { AOM_CDF4(3743, 11829, 18657) },
+                            { AOM_CDF4(2374, 9593, 15715) },
+                            { AOM_CDF4(31068, 32466, 32635) },
+                            { AOM_CDF4(20321, 29572, 31971) },
+                            { AOM_CDF4(10771, 20255, 27119) },
+                            { AOM_CDF4(2795, 10410, 17361) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(9320, 22102, 27840) },
+                            { AOM_CDF4(27057, 32464, 32724) },
+                            { AOM_CDF4(16331, 30268, 32309) },
+                            { AOM_CDF4(10319, 23935, 29720) },
+                            { AOM_CDF4(6189, 16448, 24106) },
+                            { AOM_CDF4(3589, 10884, 18808) },
+                            { AOM_CDF4(29026, 32624, 32748) },
+                            { AOM_CDF4(19226, 31507, 32587) },
+                            { AOM_CDF4(12692, 26921, 31203) },
+                            { AOM_CDF4(7049, 19532, 27635) },
+                            { AOM_CDF4(7727, 15669, 23252) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(28056, 32625, 32748) },
+                            { AOM_CDF4(22383, 32075, 32669) },
+                            { AOM_CDF4(15417, 27098, 31749) },
+                            { AOM_CDF4(18127, 26493, 27190) },
+                            { AOM_CDF4(5461, 16384, 21845) },
+                            { AOM_CDF4(27982, 32091, 32584) },
+                            { AOM_CDF4(19045, 29868, 31972) },
+                            { AOM_CDF4(10397, 22266, 27932) },
+                            { AOM_CDF4(5990, 13697, 21500) },
+                            { AOM_CDF4(1792, 6912, 15104) },
+                            { AOM_CDF4(28198, 32501, 32718) },
+                            { AOM_CDF4(21534, 31521, 32569) },
+                            { AOM_CDF4(11109, 25217, 30017) },
+                            { AOM_CDF4(5671, 15124, 26151) },
+                            { AOM_CDF4(4681, 14043, 18725) },
+                            { AOM_CDF4(28688, 32580, 32741) },
+                            { AOM_CDF4(22576, 32079, 32661) },
+                            { AOM_CDF4(10627, 22141, 28340) },
+                            { AOM_CDF4(9362, 14043, 28087) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(7754, 16948, 22142) },
+                            { AOM_CDF4(25670, 32330, 32691) },
+                            { AOM_CDF4(15663, 29225, 31994) },
+                            { AOM_CDF4(9878, 23288, 29158) },
+                            { AOM_CDF4(6419, 17088, 24336) },
+                            { AOM_CDF4(3859, 11003, 17039) },
+                            { AOM_CDF4(27562, 32595, 32725) },
+                            { AOM_CDF4(17575, 30588, 32399) },
+                            { AOM_CDF4(10819, 24838, 30309) },
+                            { AOM_CDF4(7124, 18686, 25916) },
+                            { AOM_CDF4(4479, 12688, 19340) },
+                            { AOM_CDF4(28385, 32476, 32673) },
+                            { AOM_CDF4(15306, 29005, 31938) },
+                            { AOM_CDF4(8937, 21615, 28322) },
+                            { AOM_CDF4(5982, 15603, 22786) },
+                            { AOM_CDF4(3620, 10267, 16136) },
+                            { AOM_CDF4(27280, 32464, 32667) },
+                            { AOM_CDF4(15607, 29160, 32004) },
+                            { AOM_CDF4(9091, 22135, 28740) },
+                            { AOM_CDF4(6232, 16632, 24020) },
+                            { AOM_CDF4(4047, 11377, 17672) },
+                            { AOM_CDF4(29220, 32630, 32718) },
+                            { AOM_CDF4(19650, 31220, 32462) },
+                            { AOM_CDF4(13050, 26312, 30827) },
+                            { AOM_CDF4(9228, 20870, 27468) },
+                            { AOM_CDF4(6146, 15149, 21971) },
+                            { AOM_CDF4(30169, 32481, 32623) },
+                            { AOM_CDF4(17212, 29311, 31554) },
+                            { AOM_CDF4(9911, 21311, 26882) },
+                            { AOM_CDF4(4487, 13314, 20372) },
+                            { AOM_CDF4(2570, 7772, 12889) },
+                            { AOM_CDF4(30924, 32613, 32708) },
+                            { AOM_CDF4(19490, 30206, 32107) },
+                            { AOM_CDF4(11232, 23998, 29276) },
+                            { AOM_CDF4(6769, 17955, 25035) },
+                            { AOM_CDF4(4398, 12623, 19214) },
+                            { AOM_CDF4(30609, 32627, 32722) },
+                            { AOM_CDF4(19370, 30582, 32287) },
+                            { AOM_CDF4(10457, 23619, 29409) },
+                            { AOM_CDF4(6443, 17637, 24834) },
+                            { AOM_CDF4(4645, 13236, 20106) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(8626, 20271, 26216) },
+                            { AOM_CDF4(26707, 32406, 32711) },
+                            { AOM_CDF4(16999, 30329, 32286) },
+                            { AOM_CDF4(11445, 25123, 30286) },
+                            { AOM_CDF4(6411, 18828, 25601) },
+                            { AOM_CDF4(6801, 12458, 20248) },
+                            { AOM_CDF4(29918, 32682, 32748) },
+                            { AOM_CDF4(20649, 31739, 32618) },
+                            { AOM_CDF4(12879, 27773, 31581) },
+                            { AOM_CDF4(7896, 21751, 28244) },
+                            { AOM_CDF4(5260, 14870, 23698) },
+                            { AOM_CDF4(29252, 32593, 32731) },
+                            { AOM_CDF4(17072, 30460, 32294) },
+                            { AOM_CDF4(10653, 24143, 29365) },
+                            { AOM_CDF4(6536, 17490, 23983) },
+                            { AOM_CDF4(4929, 13170, 20085) },
+                            { AOM_CDF4(28137, 32518, 32715) },
+                            { AOM_CDF4(18171, 30784, 32407) },
+                            { AOM_CDF4(11437, 25436, 30459) },
+                            { AOM_CDF4(7252, 18534, 26176) },
+                            { AOM_CDF4(4126, 13353, 20978) },
+                            { AOM_CDF4(31162, 32726, 32748) },
+                            { AOM_CDF4(23017, 32222, 32701) },
+                            { AOM_CDF4(15629, 29233, 32046) },
+                            { AOM_CDF4(9387, 22621, 29480) },
+                            { AOM_CDF4(6922, 17616, 25010) },
+                            { AOM_CDF4(28838, 32265, 32614) },
+                            { AOM_CDF4(19701, 30206, 31920) },
+                            { AOM_CDF4(11214, 22410, 27933) },
+                            { AOM_CDF4(5320, 14177, 23034) },
+                            { AOM_CDF4(5049, 12881, 17827) },
+                            { AOM_CDF4(27484, 32471, 32734) },
+                            { AOM_CDF4(21076, 31526, 32561) },
+                            { AOM_CDF4(12707, 26303, 31211) },
+                            { AOM_CDF4(8169, 21722, 28219) },
+                            { AOM_CDF4(6045, 19406, 27042) },
+                            { AOM_CDF4(27753, 32572, 32745) },
+                            { AOM_CDF4(20832, 31878, 32653) },
+                            { AOM_CDF4(13250, 27356, 31674) },
+                            { AOM_CDF4(7718, 21508, 29858) },
+                            { AOM_CDF4(7209, 18350, 25559) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(7876, 16901, 21741) },
+                            { AOM_CDF4(24001, 31898, 32625) },
+                            { AOM_CDF4(14529, 27959, 31451) },
+                            { AOM_CDF4(8273, 20818, 27258) },
+                            { AOM_CDF4(5278, 14673, 21510) },
+                            { AOM_CDF4(2983, 8843, 14039) },
+                            { AOM_CDF4(28016, 32574, 32732) },
+                            { AOM_CDF4(17471, 30306, 32301) },
+                            { AOM_CDF4(10224, 24063, 29728) },
+                            { AOM_CDF4(6602, 17954, 25052) },
+                            { AOM_CDF4(4002, 11585, 17759) },
+                            { AOM_CDF4(30190, 32634, 32739) },
+                            { AOM_CDF4(17497, 30282, 32270) },
+                            { AOM_CDF4(10229, 23729, 29538) },
+                            { AOM_CDF4(6344, 17211, 24440) },
+                            { AOM_CDF4(3849, 11189, 17108) },
+                            { AOM_CDF4(28570, 32583, 32726) },
+                            { AOM_CDF4(17521, 30161, 32238) },
+                            { AOM_CDF4(10153, 23565, 29378) },
+                            { AOM_CDF4(6455, 17341, 24443) },
+                            { AOM_CDF4(3907, 11042, 17024) },
+                            { AOM_CDF4(30689, 32715, 32748) },
+                            { AOM_CDF4(21546, 31840, 32610) },
+                            { AOM_CDF4(13547, 27581, 31459) },
+                            { AOM_CDF4(8912, 21757, 28309) },
+                            { AOM_CDF4(5548, 15080, 22046) },
+                            { AOM_CDF4(30783, 32540, 32685) },
+                            { AOM_CDF4(17540, 29528, 31668) },
+                            { AOM_CDF4(10160, 21468, 26783) },
+                            { AOM_CDF4(4724, 13393, 20054) },
+                            { AOM_CDF4(2702, 8174, 13102) },
+                            { AOM_CDF4(31648, 32686, 32742) },
+                            { AOM_CDF4(20954, 31094, 32337) },
+                            { AOM_CDF4(12420, 25698, 30179) },
+                            { AOM_CDF4(7304, 19320, 26248) },
+                            { AOM_CDF4(4366, 12261, 18864) },
+                            { AOM_CDF4(31581, 32723, 32748) },
+                            { AOM_CDF4(21373, 31586, 32525) },
+                            { AOM_CDF4(12744, 26625, 30885) },
+                            { AOM_CDF4(7431, 20322, 26950) },
+                            { AOM_CDF4(4692, 13323, 20111) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(7833, 18369, 24095) },
+                            { AOM_CDF4(26650, 32273, 32702) },
+                            { AOM_CDF4(16371, 29961, 32191) },
+                            { AOM_CDF4(11055, 24082, 29629) },
+                            { AOM_CDF4(6892, 18644, 25400) },
+                            { AOM_CDF4(5006, 13057, 19240) },
+                            { AOM_CDF4(29834, 32666, 32748) },
+                            { AOM_CDF4(19577, 31335, 32570) },
+                            { AOM_CDF4(12253, 26509, 31122) },
+                            { AOM_CDF4(7991, 20772, 27711) },
+                            { AOM_CDF4(5677, 15910, 23059) },
+                            { AOM_CDF4(30109, 32532, 32720) },
+                            { AOM_CDF4(16747, 30166, 32252) },
+                            { AOM_CDF4(10134, 23542, 29184) },
+                            { AOM_CDF4(5791, 16176, 23556) },
+                            { AOM_CDF4(4362, 10414, 17284) },
+                            { AOM_CDF4(29492, 32626, 32748) },
+                            { AOM_CDF4(19894, 31402, 32525) },
+                            { AOM_CDF4(12942, 27071, 30869) },
+                            { AOM_CDF4(8346, 21216, 27405) },
+                            { AOM_CDF4(6572, 17087, 23859) },
+                            { AOM_CDF4(32035, 32735, 32748) },
+                            { AOM_CDF4(22957, 31838, 32618) },
+                            { AOM_CDF4(14724, 28572, 31772) },
+                            { AOM_CDF4(10364, 23999, 29553) },
+                            { AOM_CDF4(7004, 18433, 25655) },
+                            { AOM_CDF4(27528, 32277, 32681) },
+                            { AOM_CDF4(16959, 31171, 32096) },
+                            { AOM_CDF4(10486, 23593, 27962) },
+                            { AOM_CDF4(8192, 16384, 23211) },
+                            { AOM_CDF4(8937, 17873, 20852) },
+                            { AOM_CDF4(27715, 32002, 32615) },
+                            { AOM_CDF4(15073, 29491, 31676) },
+                            { AOM_CDF4(11264, 24576, 28672) },
+                            { AOM_CDF4(2341, 18725, 23406) },
+                            { AOM_CDF4(7282, 18204, 25486) },
+                            { AOM_CDF4(28547, 32213, 32657) },
+                            { AOM_CDF4(20788, 29773, 32239) },
+                            { AOM_CDF4(6780, 21469, 30508) },
+                            { AOM_CDF4(5958, 14895, 23831) },
+                            { AOM_CDF4(16384, 21845, 27307) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(5992, 14304, 19765) },
+                            { AOM_CDF4(22612, 31238, 32456) },
+                            { AOM_CDF4(13456, 27162, 31087) },
+                            { AOM_CDF4(8001, 20062, 26504) },
+                            { AOM_CDF4(5168, 14105, 20764) },
+                            { AOM_CDF4(2632, 7771, 12385) },
+                            { AOM_CDF4(27034, 32344, 32709) },
+                            { AOM_CDF4(15850, 29415, 31997) },
+                            { AOM_CDF4(9494, 22776, 28841) },
+                            { AOM_CDF4(6151, 16830, 23969) },
+                            { AOM_CDF4(3461, 10039, 15722) },
+                            { AOM_CDF4(30134, 32569, 32731) },
+                            { AOM_CDF4(15638, 29422, 31945) },
+                            { AOM_CDF4(9150, 21865, 28218) },
+                            { AOM_CDF4(5647, 15719, 22676) },
+                            { AOM_CDF4(3402, 9772, 15477) },
+                            { AOM_CDF4(28530, 32586, 32735) },
+                            { AOM_CDF4(17139, 30298, 32292) },
+                            { AOM_CDF4(10200, 24039, 29685) },
+                            { AOM_CDF4(6419, 17674, 24786) },
+                            { AOM_CDF4(3544, 10225, 15824) },
+                            { AOM_CDF4(31333, 32726, 32748) },
+                            { AOM_CDF4(20618, 31487, 32544) },
+                            { AOM_CDF4(12901, 27217, 31232) },
+                            { AOM_CDF4(8624, 21734, 28171) },
+                            { AOM_CDF4(5104, 14191, 20748) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(11206, 21090, 26561) },
+                            { AOM_CDF4(28759, 32279, 32671) },
+                            { AOM_CDF4(14171, 27952, 31569) },
+                            { AOM_CDF4(9743, 22907, 29141) },
+                            { AOM_CDF4(6871, 17886, 24868) },
+                            { AOM_CDF4(4960, 13152, 19315) },
+                            { AOM_CDF4(31077, 32661, 32748) },
+                            { AOM_CDF4(19400, 31195, 32515) },
+                            { AOM_CDF4(12752, 26858, 31040) },
+                            { AOM_CDF4(8370, 22098, 28591) },
+                            { AOM_CDF4(5457, 15373, 22298) },
+                            { AOM_CDF4(31697, 32706, 32748) },
+                            { AOM_CDF4(17860, 30657, 32333) },
+                            { AOM_CDF4(12510, 24812, 29261) },
+                            { AOM_CDF4(6180, 19124, 24722) },
+                            { AOM_CDF4(5041, 13548, 17959) },
+                            { AOM_CDF4(31552, 32716, 32748) },
+                            { AOM_CDF4(21908, 31769, 32623) },
+                            { AOM_CDF4(14470, 28201, 31565) },
+                            { AOM_CDF4(9493, 22982, 28608) },
+                            { AOM_CDF4(6858, 17240, 24137) },
+                            { AOM_CDF4(32543, 32752, 32756) },
+                            { AOM_CDF4(24286, 32097, 32666) },
+                            { AOM_CDF4(15958, 29217, 32024) },
+                            { AOM_CDF4(10207, 24234, 29958) },
+                            { AOM_CDF4(6929, 18305, 25652) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } },
+                        { { { AOM_CDF4(4137, 10847, 15682) },
+                            { AOM_CDF4(17824, 27001, 30058) },
+                            { AOM_CDF4(10204, 22796, 28291) },
+                            { AOM_CDF4(6076, 15935, 22125) },
+                            { AOM_CDF4(3852, 10937, 16816) },
+                            { AOM_CDF4(2252, 6324, 10131) },
+                            { AOM_CDF4(25840, 32016, 32662) },
+                            { AOM_CDF4(15109, 28268, 31531) },
+                            { AOM_CDF4(9385, 22231, 28340) },
+                            { AOM_CDF4(6082, 16672, 23479) },
+                            { AOM_CDF4(3318, 9427, 14681) },
+                            { AOM_CDF4(30594, 32574, 32718) },
+                            { AOM_CDF4(16836, 29552, 31859) },
+                            { AOM_CDF4(9556, 22542, 28356) },
+                            { AOM_CDF4(6305, 16725, 23540) },
+                            { AOM_CDF4(3376, 9895, 15184) },
+                            { AOM_CDF4(29383, 32617, 32745) },
+                            { AOM_CDF4(18891, 30809, 32401) },
+                            { AOM_CDF4(11688, 25942, 30687) },
+                            { AOM_CDF4(7468, 19469, 26651) },
+                            { AOM_CDF4(3909, 11358, 17012) },
+                            { AOM_CDF4(31564, 32736, 32748) },
+                            { AOM_CDF4(20906, 31611, 32600) },
+                            { AOM_CDF4(13191, 27621, 31537) },
+                            { AOM_CDF4(8768, 22029, 28676) },
+                            { AOM_CDF4(5079, 14109, 20906) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } },
+                          { { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) },
+                            { AOM_CDF4(8192, 16384, 24576) } } } } };
 
 static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
     [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE(
diff --git a/libaom/av1/common/txb_common.c b/libaom/av1/common/txb_common.c
index cb92bd8..4eef319 100644
--- a/libaom/av1/common/txb_common.c
+++ b/libaom/av1/common/txb_common.c
@@ -9,7 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include "aom/aom_integer.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/txb_common.h"
 
 const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
@@ -453,6 +453,6 @@
   av1_nz_map_ctx_offset_64x32,  // TX_64x16
 };
 
-const int16_t k_eob_group_start[12] = { 0,  1,  2,  3,   5,   9,
-                                        17, 33, 65, 129, 257, 513 };
-const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+const int16_t av1_eob_group_start[12] = { 0,  1,  2,  3,   5,   9,
+                                          17, 33, 65, 129, 257, 513 };
+const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
diff --git a/libaom/av1/common/txb_common.h b/libaom/av1/common/txb_common.h
index 8a3932d..5a62fa8 100644
--- a/libaom/av1/common/txb_common.h
+++ b/libaom/av1/common/txb_common.h
@@ -12,10 +12,10 @@
 #ifndef AOM_AV1_COMMON_TXB_COMMON_H_
 #define AOM_AV1_COMMON_TXB_COMMON_H_
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 
-extern const int16_t k_eob_group_start[12];
-extern const int16_t k_eob_offset_bits[12];
+extern const int16_t av1_eob_group_start[12];
+extern const int16_t av1_eob_offset_bits[12];
 
 extern const int8_t av1_coeff_band_4x4[16];
 
@@ -386,7 +386,9 @@
     if (plane_bsize == txsize_to_bsize[tx_size]) {
       txb_ctx->txb_skip_ctx = 0;
     } else {
-      // This is the algorithm to generate table skip_contexts[min][max].
+      // This is the algorithm to generate table skip_contexts[top][left].
+      //    const int max = AOMMIN(top | left, 4);
+      //    const int min = AOMMIN(AOMMIN(top, left), 4);
       //    if (!max)
       //      txb_skip_ctx = 1;
       //    else if (!min)
@@ -398,10 +400,15 @@
       //    else
       //      txb_skip_ctx = 6;
       static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },
-                                                   { 1, 4, 4, 4, 5 },
-                                                   { 1, 4, 4, 4, 5 },
-                                                   { 1, 4, 4, 4, 5 },
-                                                   { 1, 4, 4, 4, 6 } };
+                                                   { 2, 4, 4, 4, 5 },
+                                                   { 2, 4, 4, 4, 5 },
+                                                   { 2, 4, 4, 4, 5 },
+                                                   { 3, 5, 5, 5, 6 } };
+      // For top and left, we only care about which of the following three
+      // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The
+      // spec calculates top and left with the Max() function. We can calculate
+      // an approximate max with bitwise OR because the real max and the
+      // approximate max belong to the same category.
       int top = 0;
       int left = 0;
 
@@ -410,16 +417,16 @@
         top |= a[k];
       } while (++k < txb_w_unit);
       top &= COEFF_CONTEXT_MASK;
+      top = AOMMIN(top, 4);
 
       k = 0;
       do {
         left |= l[k];
       } while (++k < txb_h_unit);
       left &= COEFF_CONTEXT_MASK;
-      const int max = AOMMIN(top | left, 4);
-      const int min = AOMMIN(AOMMIN(top, left), 4);
+      left = AOMMIN(left, 4);
 
-      txb_ctx->txb_skip_ctx = skip_contexts[min][max];
+      txb_ctx->txb_skip_ctx = skip_contexts[top][left];
     }
   } else {
     const int ctx_base = get_entropy_context(tx_size, a, l);
diff --git a/libaom/av1/common/warped_motion.c b/libaom/av1/common/warped_motion.c
index e232e10..4e9fab9 100644
--- a/libaom/av1/common/warped_motion.c
+++ b/libaom/av1/common/warped_motion.c
@@ -20,85 +20,13 @@
 #include "av1/common/warped_motion.h"
 #include "av1/common/scale.h"
 
-#define WARP_ERROR_BLOCK 32
-
-/* clang-format off */
-static const int error_measure_lut[512] = {
-  // pow 0.7
-  16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
-  16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
-  15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
-  15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
-  14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
-  14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
-  14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
-  13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
-  13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
-  12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
-  12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
-  12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
-  11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
-  11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
-  10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
-  10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
-  10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
-  9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
-  9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
-  8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
-  8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
-  7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
-  7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
-  6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
-  6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
-  5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
-  5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
-  4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
-  3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
-  3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
-  2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
-  1323, 1187, 1045,  894,  731,  550,  339,    0,
-  339,  550,  731,  894, 1045, 1187, 1323, 1452,
-  1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
-  2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
-  3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
-  3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
-  4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
-  5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
-  5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
-  6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
-  6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
-  7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
-  7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
-  8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
-  8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
-  9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
-  9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
-  10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
-  10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
-  11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
-  11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
-  11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
-  12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
-  12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
-  13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
-  13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
-  13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
-  14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
-  14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
-  15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
-  15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
-  15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
-  16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
-};
-/* clang-format on */
-
 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
 // at a time. The zoom/rotation/shear in the model are applied to the
 // "fractional" position of each pixel, which therefore varies within
 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
 // We need an extra 2 taps to fit this in, for a total of 8 taps.
 /* clang-format off */
-const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
 #if WARPEDPIXEL_PREC_BITS == 6
   // [-1, 0)
   { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
@@ -345,7 +273,7 @@
 }
 
 // Returns 1 on success or 0 on an invalid affine set
-int get_shear_params(WarpedMotionParams *wm) {
+int av1_get_shear_params(WarpedMotionParams *wm) {
   const int32_t *mat = wm->wmmat;
   if (!is_affine_valid(wm)) return 0;
   wm->alpha =
@@ -376,6 +304,7 @@
   return 1;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE int highbd_error_measure(int err, int bd) {
   const int b = bd - 8;
   const int bmask = (1 << b) - 1;
@@ -447,7 +376,7 @@
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = warped_filter[offs];
+          const int16_t *coeffs = av1_warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_horiz;
           for (int m = 0; m < 8; ++m) {
@@ -468,7 +397,7 @@
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = warped_filter[offs];
+          const int16_t *coeffs = av1_warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_vert;
           for (int m = 0; m < 8; ++m) {
@@ -514,12 +443,11 @@
   }
 }
 
-static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
-                              int width, int height, int stride,
-                              const uint8_t *const pred8, int p_col, int p_row,
-                              int p_width, int p_height, int p_stride,
-                              int subsampling_x, int subsampling_y, int bd,
-                              ConvolveParams *conv_params) {
+void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
+                       int width, int height, int stride, uint16_t *const pred,
+                       int p_col, int p_row, int p_width, int p_height,
+                       int p_stride, int subsampling_x, int subsampling_y,
+                       int bd, ConvolveParams *conv_params) {
   assert(wm->wmtype <= AFFINE);
   if (wm->wmtype == ROTZOOM) {
     wm->wmmat[5] = wm->wmmat[2];
@@ -531,17 +459,15 @@
   const int16_t gamma = wm->gamma;
   const int16_t delta = wm->delta;
 
-  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                          p_width, p_height, p_stride, subsampling_x,
                          subsampling_y, bd, conv_params, alpha, beta, gamma,
                          delta);
 }
 
-static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
-                                  const uint16_t *const dst, int p_width,
-                                  int p_height, int p_stride, int bd) {
+int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
+                                    const uint16_t *const dst, int p_width,
+                                    int p_height, int p_stride, int bd) {
   int64_t sum_error = 0;
   for (int i = 0; i < p_height; ++i) {
     for (int j = 0; j < p_width; ++j) {
@@ -552,41 +478,33 @@
   return sum_error;
 }
 
-static int64_t highbd_warp_error(
-    WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
-    int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
-    int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
-    int64_t best_error) {
-  int64_t gm_sumerr = 0;
+static int64_t highbd_segmented_frame_error(
+    const uint16_t *const ref, int stride, const uint16_t *const dst,
+    int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
+    int segment_map_stride) {
+  int patch_w, patch_h;
   const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
-  uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+  int64_t sum_error = 0;
+  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+      // Only compute the error if this block contains inliers from the motion
+      // model
+      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
 
-  ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  conv_params.use_dist_wtd_comp_avg = 0;
-  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
-    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
-      // avoid warping extra 8x8 blocks in the padded region of the frame
-      // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
-      const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
-      const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
-      highbd_warp_plane(wm, ref8, width, height, stride,
-                        CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
-                        WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
-                        &conv_params);
-
-      gm_sumerr += highbd_frame_error(
-          tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
-          warp_w, warp_h, p_stride, bd);
-      if (gm_sumerr > best_error) return gm_sumerr;
+      // avoid computing error into the frame padding
+      patch_w = AOMMIN(error_bsize_w, p_width - j);
+      patch_h = AOMMIN(error_bsize_h, p_height - i);
+      sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
+                                               dst + j + i * p_stride, patch_w,
+                                               patch_h, p_stride, bd);
     }
   }
-  return gm_sumerr;
+  return sum_error;
 }
-
-static INLINE int error_measure(int err) {
-  return error_measure_lut[255 + err];
-}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 /* The warp filter for ROTZOOM and AFFINE models works as follows:
    * Split the input into 8x8 blocks
@@ -732,7 +650,7 @@
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = warped_filter[offs];
+          const int16_t *coeffs = av1_warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_horiz;
           for (int m = 0; m < 8; ++m) {
@@ -756,7 +674,7 @@
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = warped_filter[offs];
+          const int16_t *coeffs = av1_warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_vert;
           for (int m = 0; m < 8; ++m) {
@@ -801,11 +719,10 @@
   }
 }
 
-static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
-                       int width, int height, int stride, uint8_t *pred,
-                       int p_col, int p_row, int p_width, int p_height,
-                       int p_stride, int subsampling_x, int subsampling_y,
-                       ConvolveParams *conv_params) {
+void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
+                int height, int stride, uint8_t *pred, int p_col, int p_row,
+                int p_width, int p_height, int p_stride, int subsampling_x,
+                int subsampling_y, ConvolveParams *conv_params) {
   assert(wm->wmtype <= AFFINE);
   if (wm->wmtype == ROTZOOM) {
     wm->wmmat[5] = wm->wmmat[2];
@@ -821,9 +738,9 @@
                   alpha, beta, gamma, delta);
 }
 
-static int64_t frame_error(const uint8_t *const ref, int stride,
-                           const uint8_t *const dst, int p_width, int p_height,
-                           int p_stride) {
+int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
+                               const uint8_t *const dst, int p_width,
+                               int p_height, int p_stride) {
   int64_t sum_error = 0;
   for (int i = 0; i < p_height; ++i) {
     for (int j = 0; j < p_width; ++j) {
@@ -834,61 +751,64 @@
   return sum_error;
 }
 
-static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
-                          int width, int height, int stride,
-                          const uint8_t *const dst, int p_col, int p_row,
-                          int p_width, int p_height, int p_stride,
-                          int subsampling_x, int subsampling_y,
-                          int64_t best_error) {
-  int64_t gm_sumerr = 0;
-  int warp_w, warp_h;
-  int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
-  int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
-  uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
-  ConvolveParams conv_params = get_conv_params(0, 0, 8);
-  conv_params.use_dist_wtd_comp_avg = 0;
+static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
+                                     const uint8_t *const dst, int p_width,
+                                     int p_height, int p_stride,
+                                     uint8_t *segment_map,
+                                     int segment_map_stride) {
+  int patch_w, patch_h;
+  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  int64_t sum_error = 0;
+  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+      // Only compute the error if this block contains inliers from the motion
+      // model
+      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
 
-  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
-    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
-      // avoid warping extra 8x8 blocks in the padded region of the frame
-      // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
-      warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
-      warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
-      warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
-                 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
-
-      gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
-                               warp_w, warp_h, p_stride);
-      if (gm_sumerr > best_error) return gm_sumerr;
+      // avoid computing error into the frame padding
+      patch_w = AOMMIN(error_bsize_w, p_width - j);
+      patch_h = AOMMIN(error_bsize_h, p_height - i);
+      sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
+                                        dst + j + i * p_stride, patch_w,
+                                        patch_h, p_stride);
     }
   }
-  return gm_sumerr;
+  return sum_error;
 }
 
 int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
                         uint8_t *dst, int p_width, int p_height, int p_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
   if (use_hbd) {
-    return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
-                              CONVERT_TO_SHORTPTR(dst), p_width, p_height,
-                              p_stride, bd);
+    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
+                                       CONVERT_TO_SHORTPTR(dst), p_width,
+                                       p_height, p_stride, bd);
   }
-  return frame_error(ref, stride, dst, p_width, p_height, p_stride);
+#endif
+  (void)use_hbd;
+  (void)bd;
+  return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
 }
 
-int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
-                       const uint8_t *ref, int width, int height, int stride,
-                       uint8_t *dst, int p_col, int p_row, int p_width,
-                       int p_height, int p_stride, int subsampling_x,
-                       int subsampling_y, int64_t best_error) {
-  if (wm->wmtype <= AFFINE)
-    if (!get_shear_params(wm)) return 1;
-  if (use_hbd)
-    return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
-                             p_width, p_height, p_stride, subsampling_x,
-                             subsampling_y, bd, best_error);
-  return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
-                    p_height, p_stride, subsampling_x, subsampling_y,
-                    best_error);
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+                                  int stride, uint8_t *dst, int p_width,
+                                  int p_height, int p_stride,
+                                  uint8_t *segment_map,
+                                  int segment_map_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    return highbd_segmented_frame_error(
+        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
+        p_height, p_stride, bd, segment_map, segment_map_stride);
+  }
+#endif
+  (void)use_hbd;
+  (void)bd;
+  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
+                               segment_map, segment_map_stride);
 }
 
 void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
@@ -896,13 +816,21 @@
                     uint8_t *pred, int p_col, int p_row, int p_width,
                     int p_height, int p_stride, int subsampling_x,
                     int subsampling_y, ConvolveParams *conv_params) {
+#if CONFIG_AV1_HIGHBITDEPTH
   if (use_hbd)
-    highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
-                      p_width, p_height, p_stride, subsampling_x, subsampling_y,
-                      bd, conv_params);
+    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
+                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
+                      p_height, p_stride, subsampling_x, subsampling_y, bd,
+                      conv_params);
   else
     warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
                p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+#else
+  (void)use_hbd;
+  (void)bd;
+  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+#endif
 }
 
 #define LS_MV_MAX 256  // max mv in 1/8-pel
@@ -1023,18 +951,15 @@
   int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
   int32_t Bx[2] = { 0, 0 };
   int32_t By[2] = { 0, 0 };
-  int i;
 
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
-  const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
-  const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
+  const int rsuy = bh / 2 - 1;
+  const int rsux = bw / 2 - 1;
   const int suy = rsuy * 8;
   const int sux = rsux * 8;
   const int duy = suy + mvy;
   const int dux = sux + mvx;
-  const int isuy = (mi_row * MI_SIZE + rsuy);
-  const int isux = (mi_col * MI_SIZE + rsux);
 
   // Assume the center pixel of the block has exactly the same motion vector
   // as transmitted for the block. First shift the origin of the source
@@ -1059,7 +984,7 @@
   // The loop below computes: A = P'P, Bx = P'q, By = P'r
   // We need to just compute inv(A).Bx and inv(A).By for the solutions.
   // Contribution from neighbor block
-  for (i = 0; i < np; i++) {
+  for (int i = 0; i < np; i++) {
     const int dx = pts2[i * 2] - dux;
     const int dy = pts2[i * 2 + 1] - duy;
     const int sx = pts1[i * 2] - sux;
@@ -1087,13 +1012,12 @@
   assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
   assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
 
-  int64_t Det;
-  int16_t iDet, shift;
-
   // Compute Determinant of A
-  Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
   if (Det == 0) return 1;
-  iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+
+  int16_t shift;
+  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
   shift -= WARPEDMODEL_PREC_BITS;
   if (shift < 0) {
     iDet <<= (-shift);
@@ -1101,7 +1025,6 @@
   }
 
   int64_t Px[2], Py[2];
-
   // These divided by the Det, are the least squares solutions
   Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
   Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
@@ -1113,16 +1036,18 @@
   wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
   wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
 
+  const int isuy = (mi_row * MI_SIZE + rsuy);
+  const int isux = (mi_col * MI_SIZE + rsux);
   // Note: In the vx, vy expressions below, the max value of each of the
   // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
   // for the first term so that the overall sum in the worst case fits
   // within 32 bits overall.
-  int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
-               (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
-                isuy * wm->wmmat[3]);
-  int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
-               (isux * wm->wmmat[4] +
-                isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+                      isuy * wm->wmmat[3]);
+  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+                     (isux * wm->wmmat[4] +
+                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
   wm->wmmat[0] =
       clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
   wm->wmmat[1] =
@@ -1132,9 +1057,9 @@
   return 0;
 }
 
-int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
-                    int mvx, WarpedMotionParams *wm_params, int mi_row,
-                    int mi_col) {
+int av1_find_projection(int np, const int *pts1, const int *pts2,
+                        BLOCK_SIZE bsize, int mvy, int mvx,
+                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
   assert(wm_params->wmtype == AFFINE);
 
   if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
@@ -1142,7 +1067,7 @@
     return 1;
 
   // check compatibility with the fast warp filter
-  if (!get_shear_params(wm_params)) return 1;
+  if (!av1_get_shear_params(wm_params)) return 1;
 
   return 0;
 }
diff --git a/libaom/av1/common/warped_motion.h b/libaom/av1/common/warped_motion.h
index a1a4f06..14dc0fe 100644
--- a/libaom/av1/common/warped_motion.h
+++ b/libaom/av1/common/warped_motion.h
@@ -31,8 +31,83 @@
 #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
 #define WARPED_MOTION_DEBUG 0
 #define DEFAULT_WMTYPE AFFINE
+#define WARP_ERROR_BLOCK_LOG 5
+#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)
 
-extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+
+DECLARE_ALIGNED(8, extern const int8_t,
+                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
+
+/* clang-format off */
+static const int error_measure_lut[512] = {
+    // pow 0.7
+    16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+    16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+    15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+    15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+    14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+    14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+    14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+    13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+    13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+    12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+    12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+    12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+    11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+    11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+    10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+    10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+    10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
+    9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
+    9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
+    8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
+    8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
+    7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
+    7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
+    6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
+    6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
+    5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
+    5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
+    4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
+    3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
+    3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
+    2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
+    1323, 1187, 1045,  894,  731,  550,  339,    0,
+    339,  550,  731,  894, 1045, 1187, 1323, 1452,
+    1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
+    2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
+    3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
+    3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
+    4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
+    5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
+    5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
+    6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
+    6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
+    7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
+    7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
+    8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
+    8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
+    9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
+    9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
+    10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
+    10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
+    11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
+    11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
+    11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
+    12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
+    12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
+    13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
+    13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
+    13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
+    14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
+    14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
+    15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
+    15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
+    15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
+    16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
+};
+/* clang-format on */
 
 static const uint8_t warp_pad_left[14][16] = {
   { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -68,28 +143,44 @@
   { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
 };
 
-// Returns the error between the result of applying motion 'wm' to the frame
-// described by 'ref' and the frame described by 'dst'.
-int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
-                       const uint8_t *ref, int width, int height, int stride,
-                       uint8_t *dst, int p_col, int p_row, int p_width,
-                       int p_height, int p_stride, int subsampling_x,
-                       int subsampling_y, int64_t best_error);
+static INLINE int error_measure(int err) {
+  return error_measure_lut[255 + err];
+}
 
 // Returns the error between the frame described by 'ref' and the frame
 // described by 'dst'.
 int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
                         uint8_t *dst, int p_width, int p_height, int p_stride);
 
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+                                  int stride, uint8_t *dst, int p_width,
+                                  int p_height, int p_stride,
+                                  uint8_t *segment_map, int segment_map_stride);
+
+int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
+                                    const uint16_t *const dst, int p_width,
+                                    int p_height, int p_stride, int bd);
+
+void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
+                       int width, int height, int stride, uint16_t *const pred,
+                       int p_col, int p_row, int p_width, int p_height,
+                       int p_stride, int subsampling_x, int subsampling_y,
+                       int bd, ConvolveParams *conv_params);
+
+void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
+                int height, int stride, uint8_t *pred, int p_col, int p_row,
+                int p_width, int p_height, int p_stride, int subsampling_x,
+                int subsampling_y, ConvolveParams *conv_params);
+
 void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                     const uint8_t *ref, int width, int height, int stride,
                     uint8_t *pred, int p_col, int p_row, int p_width,
                     int p_height, int p_stride, int subsampling_x,
                     int subsampling_y, ConvolveParams *conv_params);
 
-int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
-                    int mvx, WarpedMotionParams *wm_params, int mi_row,
-                    int mi_col);
+int av1_find_projection(int np, const int *pts1, const int *pts2,
+                        BLOCK_SIZE bsize, int mvy, int mvx,
+                        WarpedMotionParams *wm_params, int mi_row, int mi_col);
 
-int get_shear_params(WarpedMotionParams *wm);
+int av1_get_shear_params(WarpedMotionParams *wm);
 #endif  // AOM_AV1_COMMON_WARPED_MOTION_H_
diff --git a/libaom/av1/common/x86/av1_convolve_scale_sse4.c b/libaom/av1/common/x86/av1_convolve_scale_sse4.c
index 8f44238..1966181 100644
--- a/libaom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/libaom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -129,8 +129,8 @@
 
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16(w0);
-  const __m128i wt1 = _mm_set1_epi16(w1);
+  const __m128i wt0 = _mm_set1_epi16((short)w0);
+  const __m128i wt1 = _mm_set1_epi16((short)w1);
   const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
 
   int y_qn = subpel_y_qn;
@@ -236,8 +236,7 @@
                                   const int subpel_x_qn, const int x_step_qn,
                                   const int subpel_y_qn, const int y_step_qn,
                                   ConvolveParams *conv_params) {
-  // TODO(yaowu): remove unnecessary initializations
-  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 };
+  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
   int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
              filter_params_y->taps;
 
diff --git a/libaom/av1/common/x86/av1_highbd_convolve_sse4.c b/libaom/av1/common/x86/av1_highbd_convolve_sse4.c
deleted file mode 100644
index 212d3bd..0000000
--- a/libaom/av1/common/x86/av1_highbd_convolve_sse4.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/av1_rtcd.h"
-
-#include "av1/common/filter.h"
-
-typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
-                              int src_stride, uint16_t *dst, int dst_stride,
-                              int bd);
-
-// pixelsNum 0: write all 4 pixels
-//           1/2/3: residual pixels 1/2/3
-static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
-                       int dst_stride) {
-  if (2 == width) {
-    if (0 == pixelsNum) {
-      *(int *)dst = _mm_cvtsi128_si32(u[0]);
-      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
-      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
-      *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
-    } else if (1 == pixelsNum) {
-      *(int *)dst = _mm_cvtsi128_si32(u[0]);
-    } else if (2 == pixelsNum) {
-      *(int *)dst = _mm_cvtsi128_si32(u[0]);
-      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
-    } else if (3 == pixelsNum) {
-      *(int *)dst = _mm_cvtsi128_si32(u[0]);
-      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
-      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
-    }
-  } else {
-    if (0 == pixelsNum) {
-      _mm_storel_epi64((__m128i *)dst, u[0]);
-      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
-      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
-      _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
-    } else if (1 == pixelsNum) {
-      _mm_storel_epi64((__m128i *)dst, u[0]);
-    } else if (2 == pixelsNum) {
-      _mm_storel_epi64((__m128i *)dst, u[0]);
-      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
-    } else if (3 == pixelsNum) {
-      _mm_storel_epi64((__m128i *)dst, u[0]);
-      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
-      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
-    }
-  }
-}
-
-// 16-bit pixels clip with bd (10/12)
-static void highbd_clip(__m128i *p, int numVecs, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  __m128i clamped, mask;
-  int i;
-
-  for (i = 0; i < numVecs; i++) {
-    mask = _mm_cmpgt_epi16(p[i], max);
-    clamped = _mm_andnot_si128(mask, p[i]);
-    mask = _mm_and_si128(mask, max);
-    clamped = _mm_or_si128(mask, clamped);
-    mask = _mm_cmpgt_epi16(clamped, zero);
-    p[i] = _mm_and_si128(clamped, mask);
-  }
-}
-
-static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
-  __m128i v0, v1;
-  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
-
-  u[0] = _mm_loadu_si128((__m128i const *)src);
-  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
-  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
-  u[0] = _mm_add_epi32(u[0], rnd);
-  u[1] = _mm_add_epi32(u[1], rnd);
-  u[2] = _mm_add_epi32(u[2], rnd);
-  u[3] = _mm_add_epi32(u[3], rnd);
-
-  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
-  u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
-  u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
-  u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
-
-  u[0] = _mm_packus_epi32(u[0], u[1]);
-  u[1] = _mm_packus_epi32(u[2], u[3]);
-
-  highbd_clip(u, 2, bd);
-
-  v0 = _mm_unpacklo_epi16(u[0], u[1]);
-  v1 = _mm_unpackhi_epi16(u[0], u[1]);
-
-  u[0] = _mm_unpacklo_epi16(v0, v1);
-  u[2] = _mm_unpackhi_epi16(v0, v1);
-
-  u[1] = _mm_srli_si128(u[0], 8);
-  u[3] = _mm_srli_si128(u[2], 8);
-}
-
-// pixelsNum = 0     : all 4 rows of pixels will be saved.
-// pixelsNum = 1/2/3 : residual 1/2/4 rows of pixels will be saved.
-void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride,
-                    uint16_t *dst, int dst_stride, int bd) {
-  __m128i u[4];
-  transClipPixel(src, src_stride, u, bd);
-  writePixel(u, width, pixelsNum, dst, dst_stride);
-}
-
-void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
-                          int src_stride, uint16_t *dst, int dst_stride,
-                          int bd) {
-  __m128i u[4], v[4];
-  const __m128i ones = _mm_set1_epi16(1);
-
-  transClipPixel(src, src_stride, u, bd);
-
-  v[0] = _mm_loadl_epi64((__m128i const *)dst);
-  v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
-  v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
-  v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
-
-  u[0] = _mm_add_epi16(u[0], v[0]);
-  u[1] = _mm_add_epi16(u[1], v[1]);
-  u[2] = _mm_add_epi16(u[2], v[2]);
-  u[3] = _mm_add_epi16(u[3], v[3]);
-
-  u[0] = _mm_add_epi16(u[0], ones);
-  u[1] = _mm_add_epi16(u[1], ones);
-  u[2] = _mm_add_epi16(u[2], ones);
-  u[3] = _mm_add_epi16(u[3], ones);
-
-  u[0] = _mm_srai_epi16(u[0], 1);
-  u[1] = _mm_srai_epi16(u[1], 1);
-  u[2] = _mm_srai_epi16(u[2], 1);
-  u[3] = _mm_srai_epi16(u[3], 1);
-
-  writePixel(u, width, pixelsNum, dst, dst_stride);
-}
-
-// Vertical convolutional filter
-
-typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
-
-static void highbdRndingPacks(__m128i *u) {
-  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
-  u[0] = _mm_add_epi32(u[0], rnd);
-  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
-  u[0] = _mm_packus_epi32(u[0], u[0]);
-}
-
-static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
-  highbdRndingPacks(u);
-  highbd_clip(u, 1, bd);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
-}
-
-static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
-  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
-  const __m128i ones = _mm_set1_epi16(1);
-
-  highbdRndingPacks(u);
-  highbd_clip(u, 1, bd);
-
-  v = _mm_add_epi16(v, u[0]);
-  v = _mm_add_epi16(v, ones);
-  v = _mm_srai_epi16(v, 1);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
-}
-
-WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum };
-
-static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
-  highbdRndingPacks(u);
-  highbd_clip(u, 1, bd);
-  _mm_storel_epi64((__m128i *)dst, u[0]);
-}
-
-static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
-  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
-  const __m128i ones = _mm_set1_epi16(1);
-
-  highbdRndingPacks(u);
-  highbd_clip(u, 1, bd);
-
-  v = _mm_add_epi16(v, u[0]);
-  v = _mm_add_epi16(v, ones);
-  v = _mm_srai_epi16(v, 1);
-  _mm_storel_epi64((__m128i *)dst, v);
-}
-
-WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
diff --git a/libaom/av1/common/x86/av1_inv_txfm_avx2.c b/libaom/av1/common/x86/av1_inv_txfm_avx2.c
index cf1f947..0fbd5ea 100644
--- a/libaom/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/libaom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -61,8 +61,7 @@
   btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
 }
 
-static void idct16_new_avx2(const __m256i *input, __m256i *output,
-                            int8_t cos_bit) {
+static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -133,8 +132,8 @@
   idct16_stage7_avx2(output, x1);
 }
 
-static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
-                                 int8_t cos_bit) {
+static void idct16_low8_avx2(const __m256i *input, __m256i *output,
+                             int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -181,8 +180,8 @@
   idct16_stage7_avx2(output, x1);
 }
 
-static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
-                                 int8_t cos_bit) {
+static void idct16_low1_avx2(const __m256i *input, __m256i *output,
+                             int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
@@ -303,8 +302,8 @@
   output[15] = _mm256_subs_epi16(__zero, x1[1]);
 }
 
-static void iadst16_new_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
+static void iadst16_avx2(const __m256i *input, __m256i *output,
+                         int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
@@ -365,8 +364,8 @@
   iadst16_stage9_avx2(output, x1);
 }
 
-static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit) {
+static void iadst16_low8_avx2(const __m256i *input, __m256i *output,
+                              int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -401,8 +400,8 @@
   iadst16_stage9_avx2(output, x1);
 }
 
-static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit) {
+static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
+                              int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -568,8 +567,8 @@
   btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
 }
 
-static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
-                                 int8_t cos_bit) {
+static void idct32_low1_avx2(const __m256i *input, __m256i *output,
+                             int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
@@ -621,8 +620,8 @@
   output[16] = x[0];
 }
 
-static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
-                                 int8_t cos_bit) {
+static void idct32_low8_avx2(const __m256i *input, __m256i *output,
+                             int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -679,8 +678,8 @@
   idct32_stage9_avx2(output, x);
 }
 
-static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit) {
+static void idct32_low16_avx2(const __m256i *input, __m256i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -746,8 +745,7 @@
   idct32_stage9_avx2(output, x);
 }
 
-static void idct32_new_avx2(const __m256i *input, __m256i *output,
-                            int8_t cos_bit) {
+static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1104,8 +1102,8 @@
   btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
 }
 
-static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
-                                 int8_t cos_bit) {
+static void idct64_low1_avx2(const __m256i *input, __m256i *output,
+                             int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
@@ -1191,8 +1189,8 @@
   output[32] = x[0];
 }
 
-static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
-                                 int8_t cos_bit) {
+static void idct64_low8_avx2(const __m256i *input, __m256i *output,
+                             int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1302,7 +1300,6 @@
   x[6] = x[1];
   x[5] = x[2];
   x[4] = x[3];
-  x[9] = x[9];
   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
   idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
@@ -1312,8 +1309,8 @@
   idct64_stage11_avx2(output, x);
 }
 
-static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit) {
+static void idct64_low16_avx2(const __m256i *input, __m256i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1428,8 +1425,8 @@
   idct64_stage11_avx2(output, x);
 }
 
-static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit) {
+static void idct64_low32_avx2(const __m256i *input, __m256i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1592,17 +1589,15 @@
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } },
       {
-          { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL },
-          { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2,
-            NULL },
+          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
+          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
           { NULL, NULL, NULL, NULL },
       },
-      { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2,
-          idct32_new_avx2 },
+      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } },
-      { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2,
-          idct64_low32_new_avx2 },
+      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
+          idct64_low32_avx2 },
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } }
     };
@@ -1614,11 +1609,11 @@
   __m256i buf1[64 * 16];
   int eobx, eoby;
   get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div16 = txfm_size_col >> 4;
@@ -1638,6 +1633,7 @@
   assert(row_txfm != NULL);
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
   for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
     __m256i buf0[64];
     const int32_t *input_row = input + (i << 4) * input_stride;
@@ -1652,7 +1648,9 @@
       round_shift_avx2(buf0, buf0, input_stride);  // rect special code
     }
     row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+    for (int j = 0; j < txfm_size_col; ++j) {
+      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
+    }
 
     __m256i *buf1_cur = buf1 + (i << 4);
     if (lr_flip) {
@@ -1668,10 +1666,13 @@
       }
     }
   }
+  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
   for (int i = 0; i < buf_size_w_div16; i++) {
     __m256i *buf1_cur = buf1 + i * txfm_size_row;
     col_txfm(buf1_cur, buf1_cur, cos_bit_col);
-    round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]);
+    for (int j = 0; j < txfm_size_row; ++j) {
+      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
+    }
   }
   for (int i = 0; i < buf_size_w_div16; i++) {
     lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
@@ -1748,7 +1749,7 @@
                                                   TX_SIZE tx_size,
                                                   int32_t eob) {
   (void)eob;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -1770,10 +1771,10 @@
     TX_SIZE tx_size, int eob) {
   int eobx, eoby;
   get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
@@ -1810,10 +1811,10 @@
   __m256i buf1[64];
   int eobx, eoby;
   get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div16 = txfm_size_col >> 4;
diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
index de0a561..46c051f 100644
--- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -24,8 +24,7 @@
 
 // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
 
-static void idct4_new_sse2(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
+static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -51,7 +50,8 @@
   btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
 }
 
-void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct4_w4_sse2(const __m128i *input, __m128i *output,
+                          int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -77,8 +77,8 @@
   btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
 }
 
-void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
-                          int8_t cos_bit) {
+static void idct8_low1_ssse3(const __m128i *input, __m128i *output,
+                             int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
@@ -102,7 +102,7 @@
   output[4] = x[0];
 }
 
-void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -150,7 +150,8 @@
   btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
 }
 
-void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct8_w4_sse2(const __m128i *input, __m128i *output,
+                          int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -236,8 +237,8 @@
   btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
 }
 
-static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
-                                  int8_t cos_bit) {
+static void idct16_low1_ssse3(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
@@ -271,8 +272,8 @@
   output[8] = x[0];
 }
 
-static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
-                                  int8_t cos_bit) {
+static void idct16_low8_ssse3(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -318,7 +319,7 @@
   idct16_stage7_sse2(output, x);
 }
 
-void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -390,7 +391,8 @@
   idct16_stage7_sse2(output, x);
 }
 
-void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct16_w4_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -600,8 +602,8 @@
   btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
 }
 
-static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
-                                  int8_t cos_bit) {
+static void idct32_low1_ssse3(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
@@ -653,8 +655,8 @@
   output[16] = x[0];
 }
 
-static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
-                                  int8_t cos_bit) {
+static void idct32_low8_ssse3(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -711,8 +713,8 @@
   idct32_stage9_sse2(output, x);
 }
 
-static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
-                                   int8_t cos_bit) {
+static void idct32_low16_ssse3(const __m128i *input, __m128i *output,
+                               int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -778,8 +780,7 @@
   idct32_stage9_sse2(output, x);
 }
 
-static void idct32_new_sse2(const __m128i *input, __m128i *output,
-                            int8_t cos_bit) {
+static void idct32_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1138,8 +1139,8 @@
   btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
 }
 
-static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
-                                  int8_t cos_bit) {
+static void idct64_low1_ssse3(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
@@ -1225,8 +1226,8 @@
   output[32] = x[0];
 }
 
-static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
-                                  int8_t cos_bit) {
+static void idct64_low8_ssse3(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1336,7 +1337,6 @@
   x[6] = x[1];
   x[5] = x[2];
   x[4] = x[3];
-  x[9] = x[9];
   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
   idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
@@ -1346,8 +1346,8 @@
   idct64_stage11_sse2(output, x);
 }
 
-static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
-                                   int8_t cos_bit) {
+static void idct64_low16_ssse3(const __m128i *input, __m128i *output,
+                               int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1462,8 +1462,8 @@
   idct64_stage11_sse2(output, x);
 }
 
-static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
-                                   int8_t cos_bit) {
+static void idct64_low32_ssse3(const __m128i *input, __m128i *output,
+                               int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1611,7 +1611,7 @@
   idct64_stage11_sse2(output, x);
 }
 
-void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
   const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
@@ -1672,10 +1672,8 @@
   }
 }
 
-// TODO(binpengsmail@gmail.com):
-// To explore the reuse of VP9 versions of corresponding SSE2 functions and
-// evaluate whether there is a possibility for further speedup.
-void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst4_w4_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
   const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
@@ -1720,8 +1718,8 @@
   }
 }
 
-static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
-                                  int8_t cos_bit) {
+static void iadst8_low1_ssse3(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __zero = _mm_setzero_si128();
@@ -1767,7 +1765,7 @@
   output[7] = _mm_subs_epi16(__zero, x[1]);
 }
 
-void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __zero = _mm_setzero_si128();
@@ -1835,7 +1833,8 @@
   output[7] = _mm_subs_epi16(__zero, x[1]);
 }
 
-void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst8_w4_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __zero = _mm_setzero_si128();
@@ -1994,8 +1993,8 @@
   output[15] = _mm_subs_epi16(__zero, x[1]);
 }
 
-static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
-                                   int8_t cos_bit) {
+static void iadst16_low1_ssse3(const __m128i *input, __m128i *output,
+                               int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -2043,8 +2042,8 @@
   iadst16_stage9_ssse3(output, x);
 }
 
-static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
-                                   int8_t cos_bit) {
+static void iadst16_low8_ssse3(const __m128i *input, __m128i *output,
+                               int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -2079,7 +2078,8 @@
   iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
   iadst16_stage9_ssse3(output, x);
 }
-void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst16_sse2(const __m128i *input, __m128i *output,
+                         int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -2139,8 +2139,8 @@
   iadst16_stage9_ssse3(output, x);
 }
 
-void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
-                         int8_t cos_bit) {
+static void iadst16_w4_sse2(const __m128i *input, __m128i *output,
+                            int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -2233,8 +2233,8 @@
   iadst16_stage9_ssse3(output, x);
 }
 
-static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
-                                 int8_t cos_bit) {
+static void iidentity4_ssse3(const __m128i *input, __m128i *output,
+                             int8_t cos_bit) {
   (void)cos_bit;
   const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
   const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
@@ -2244,16 +2244,16 @@
   }
 }
 
-static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
-                                int8_t cos_bit) {
+static void iidentity8_sse2(const __m128i *input, __m128i *output,
+                            int8_t cos_bit) {
   (void)cos_bit;
   for (int i = 0; i < 8; ++i) {
     output[i] = _mm_adds_epi16(input[i], input[i]);
   }
 }
 
-static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
-                                  int8_t cos_bit) {
+static void iidentity16_ssse3(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
   (void)cos_bit;
   const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
   const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
@@ -2300,11 +2300,11 @@
 // 1D functions process process 8 pixels at one time.
 static const transform_1d_ssse3
     lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
-      { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
-      { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
-      { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
-      { idct32_new_sse2, NULL, NULL },
-      { idct64_low32_new_ssse3, NULL, NULL },
+      { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
+      { idct8_sse2, iadst8_sse2, iidentity8_sse2 },
+      { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
+      { idct32_sse2, NULL, NULL },
+      { idct64_low32_ssse3, NULL, NULL },
     };
 
 // functions for blocks with eob at DC and within
@@ -2312,26 +2312,24 @@
 static const transform_1d_ssse3
     lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
       {
-          { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
-          { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
-          { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
+          { idct4_sse2, idct4_sse2, NULL, NULL },
+          { iadst4_sse2, iadst4_sse2, NULL, NULL },
+          { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
       },
-      { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
-        { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
-        { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
+      { { idct8_low1_ssse3, idct8_sse2, NULL, NULL },
+        { iadst8_low1_ssse3, iadst8_sse2, NULL, NULL },
+        { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
       {
-          { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
-            NULL },
-          { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
-            NULL },
+          { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
+          { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
           { NULL, NULL, NULL, NULL },
       },
-      { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
-          idct32_new_sse2 },
+      { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
+          idct32_sse2 },
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } },
-      { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
-          idct64_low32_new_ssse3 },
+      { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
+          idct64_low32_ssse3 },
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } }
     };
@@ -2340,9 +2338,9 @@
 // used in 4x4, 4x8, 4x16, 8x4, 16x4
 static const transform_1d_ssse3
     lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
-      { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
-      { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
-      { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
+      { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
+      { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
+      { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
       { NULL, NULL, NULL },
       { NULL, NULL, NULL },
     };
@@ -2419,7 +2417,7 @@
 static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
                                                    uint8_t *output, int stride,
                                                    TX_SIZE tx_size) {
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -2445,11 +2443,11 @@
   (void)eob;
   __m128i buf[4];
   const TX_SIZE tx_size = TX_4X4;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
 
@@ -2511,11 +2509,11 @@
   __m128i buf1[64 * 8];
   int eobx, eoby;
   get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2581,12 +2579,12 @@
 static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   int eobx, eoby;
   get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div8 = (eobx + 8) >> 3;
@@ -2627,10 +2625,10 @@
   __m128i buf1[64];
   int eobx, eoby;
   get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2717,11 +2715,11 @@
   (void)eob;
   __m128i buf[8];
   const TX_SIZE tx_size = TX_4X8;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
 
@@ -2757,11 +2755,11 @@
   (void)eob;
   __m128i buf[8];
   const TX_SIZE tx_size = TX_8X4;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
 
@@ -2797,11 +2795,11 @@
   (void)eob;
   __m128i buf[16];
   const TX_SIZE tx_size = TX_4X16;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
 
@@ -2820,8 +2818,22 @@
     load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
                                   row_one_loop);
     transpose_16bit_4x8(buf_cur, buf_cur);
-    row_txfm(buf_cur, buf_cur, cos_bit_row);
-    round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+    if (row_txfm == iidentity4_ssse3) {
+      const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
+      const __m128i ones = _mm_set1_epi16(1);
+      for (int j = 0; j < 4; ++j) {
+        const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
+        const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
+        const __m128i buf_32_lo =
+            _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
+        const __m128i buf_32_hi =
+            _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
+        buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
+      }
+    } else {
+      row_txfm(buf_cur, buf_cur, cos_bit_row);
+      round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+    }
     if (lr_flip) {
       __m128i temp[8];
       flip_buf_sse2(buf_cur, temp, txfm_size_col);
@@ -2843,11 +2855,11 @@
   (void)eob;
   __m128i buf[16];
   const TX_SIZE tx_size = TX_16X4;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2867,8 +2879,22 @@
                                txfm_size_row);
     transpose_16bit_8x4(buf_cur, buf_cur);
   }
-  row_txfm(buf, buf, cos_bit_row);
-  round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+  if (row_txfm == iidentity16_ssse3) {
+    const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
+    const __m128i ones = _mm_set1_epi16(1);
+    for (int j = 0; j < 16; ++j) {
+      const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
+      const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
+      const __m128i buf_32_lo =
+          _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
+      const __m128i buf_32_hi =
+          _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
+      buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
+    }
+  } else {
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+  }
   if (lr_flip) {
     __m128i temp[16];
     flip_buf_sse2(buf, temp, 16);
@@ -2916,22 +2942,14 @@
       break;
   }
 }
+
 void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                             const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
   if (!txfm_param->lossless) {
-    switch (txfm_param->tx_size) {
-      case TX_4X16:
-      case TX_16X4:
-        // TODO(http://crbug.com/aomedia/2350): the ssse3 versions cause test
-        // vector mismatches.
-        av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
-        break;
-      default:
-        av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
-                                       txfm_param->tx_size, txfm_param->eob);
-        break;
-    }
+    const TX_TYPE tx_type = txfm_param->tx_type;
+    av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+                                   txfm_param->tx_size, txfm_param->eob);
+
   } else {
     av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
   }
diff --git a/libaom/av1/common/x86/cfl_avx2.c b/libaom/av1/common/x86/cfl_avx2.c
index d9bdf60..d9c6f99 100644
--- a/libaom/av1/common/x86/cfl_avx2.c
+++ b/libaom/av1/common/x86/cfl_avx2.c
@@ -16,34 +16,34 @@
 
 #include "av1/common/x86/cfl_simd.h"
 
-#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd)                           \
-  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32)                                     \
-  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16)                                     \
-  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8)                                      \
-  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2(    \
-      TX_SIZE tx_size) {                                                   \
-    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {     \
-      subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
-      subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
-      subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
-      subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
-      NULL,                                 /* 64x64 (invalid CFL size) */ \
-      subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
-      subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
-      subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
-      subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
-      subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
-      subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
-      NULL,                                 /* 32x64 (invalid CFL size) */ \
-      NULL,                                 /* 64x32 (invalid CFL size) */ \
-      subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
-      subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
-      subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
-      subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
-      NULL,                                 /* 16x64 (invalid CFL size) */ \
-      NULL,                                 /* 64x16 (invalid CFL size) */ \
-    };                                                                     \
-    return subfn_##sub[tx_size];                                           \
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd)                               \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32)                                         \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16)                                         \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8)                                          \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2(        \
+      TX_SIZE tx_size) {                                                       \
+    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {         \
+      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
+      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
+      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
+      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
+      NULL,                                     /* 64x64 (invalid CFL size) */ \
+      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
+      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
+      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
+      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
+      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
+      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
+      NULL,                                     /* 32x64 (invalid CFL size) */ \
+      NULL,                                     /* 64x32 (invalid CFL size) */ \
+      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
+      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
+      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
+      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
+      NULL,                                     /* 16x64 (invalid CFL size) */ \
+      NULL,                                     /* 64x16 (invalid CFL size) */ \
+    };                                                                         \
+    return subfn_##sub[tx_size];                                               \
   }
 
 /**
@@ -147,6 +147,7 @@
 
 CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
 
+#if CONFIG_AV1_HIGHBITDEPTH
 /**
  * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
  * precise version of a box filter 4:2:0 pixel subsampling in Q3.
@@ -238,6 +239,7 @@
 }
 
 CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
                                         __m256i alpha_sign, __m256i dc_q0) {
@@ -273,33 +275,34 @@
 CFL_PREDICT_X(avx2, 32, 16, lbd);
 CFL_PREDICT_X(avx2, 32, 32, lbd);
 
-cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
   static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
-    predict_lbd_4x4_ssse3,   /* 4x4 */
-    predict_lbd_8x8_ssse3,   /* 8x8 */
-    predict_lbd_16x16_ssse3, /* 16x16 */
-    predict_lbd_32x32_avx2,  /* 32x32 */
-    NULL,                    /* 64x64 (invalid CFL size) */
-    predict_lbd_4x8_ssse3,   /* 4x8 */
-    predict_lbd_8x4_ssse3,   /* 8x4 */
-    predict_lbd_8x16_ssse3,  /* 8x16 */
-    predict_lbd_16x8_ssse3,  /* 16x8 */
-    predict_lbd_16x32_ssse3, /* 16x32 */
-    predict_lbd_32x16_avx2,  /* 32x16 */
-    NULL,                    /* 32x64 (invalid CFL size) */
-    NULL,                    /* 64x32 (invalid CFL size) */
-    predict_lbd_4x16_ssse3,  /* 4x16  */
-    predict_lbd_16x4_ssse3,  /* 16x4  */
-    predict_lbd_8x32_ssse3,  /* 8x32  */
-    predict_lbd_32x8_avx2,   /* 32x8  */
-    NULL,                    /* 16x64 (invalid CFL size) */
-    NULL,                    /* 64x16 (invalid CFL size) */
+    cfl_predict_lbd_4x4_ssse3,   /* 4x4 */
+    cfl_predict_lbd_8x8_ssse3,   /* 8x8 */
+    cfl_predict_lbd_16x16_ssse3, /* 16x16 */
+    cfl_predict_lbd_32x32_avx2,  /* 32x32 */
+    NULL,                        /* 64x64 (invalid CFL size) */
+    cfl_predict_lbd_4x8_ssse3,   /* 4x8 */
+    cfl_predict_lbd_8x4_ssse3,   /* 8x4 */
+    cfl_predict_lbd_8x16_ssse3,  /* 8x16 */
+    cfl_predict_lbd_16x8_ssse3,  /* 16x8 */
+    cfl_predict_lbd_16x32_ssse3, /* 16x32 */
+    cfl_predict_lbd_32x16_avx2,  /* 32x16 */
+    NULL,                        /* 32x64 (invalid CFL size) */
+    NULL,                        /* 64x32 (invalid CFL size) */
+    cfl_predict_lbd_4x16_ssse3,  /* 4x16  */
+    cfl_predict_lbd_16x4_ssse3,  /* 16x4  */
+    cfl_predict_lbd_8x32_ssse3,  /* 8x32  */
+    cfl_predict_lbd_32x8_avx2,   /* 32x8  */
+    NULL,                        /* 16x64 (invalid CFL size) */
+    NULL,                        /* 64x16 (invalid CFL size) */
   };
   // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
   // function pointer array out of bounds.
   return pred[tx_size % TX_SIZES_ALL];
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static __m256i highbd_max_epi16(int bd) {
   const __m256i neg_one = _mm256_set1_epi16(-1);
   // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
@@ -346,32 +349,33 @@
 CFL_PREDICT_X(avx2, 32, 16, hbd)
 CFL_PREDICT_X(avx2, 32, 32, hbd)
 
-cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
   static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
-    predict_hbd_4x4_ssse3,  /* 4x4 */
-    predict_hbd_8x8_ssse3,  /* 8x8 */
-    predict_hbd_16x16_avx2, /* 16x16 */
-    predict_hbd_32x32_avx2, /* 32x32 */
-    NULL,                   /* 64x64 (invalid CFL size) */
-    predict_hbd_4x8_ssse3,  /* 4x8 */
-    predict_hbd_8x4_ssse3,  /* 8x4 */
-    predict_hbd_8x16_ssse3, /* 8x16 */
-    predict_hbd_16x8_avx2,  /* 16x8 */
-    predict_hbd_16x32_avx2, /* 16x32 */
-    predict_hbd_32x16_avx2, /* 32x16 */
-    NULL,                   /* 32x64 (invalid CFL size) */
-    NULL,                   /* 64x32 (invalid CFL size) */
-    predict_hbd_4x16_ssse3, /* 4x16  */
-    predict_hbd_16x4_avx2,  /* 16x4  */
-    predict_hbd_8x32_ssse3, /* 8x32  */
-    predict_hbd_32x8_avx2,  /* 32x8  */
-    NULL,                   /* 16x64 (invalid CFL size) */
-    NULL,                   /* 64x16 (invalid CFL size) */
+    cfl_predict_hbd_4x4_ssse3,  /* 4x4 */
+    cfl_predict_hbd_8x8_ssse3,  /* 8x8 */
+    cfl_predict_hbd_16x16_avx2, /* 16x16 */
+    cfl_predict_hbd_32x32_avx2, /* 32x32 */
+    NULL,                       /* 64x64 (invalid CFL size) */
+    cfl_predict_hbd_4x8_ssse3,  /* 4x8 */
+    cfl_predict_hbd_8x4_ssse3,  /* 8x4 */
+    cfl_predict_hbd_8x16_ssse3, /* 8x16 */
+    cfl_predict_hbd_16x8_avx2,  /* 16x8 */
+    cfl_predict_hbd_16x32_avx2, /* 16x32 */
+    cfl_predict_hbd_32x16_avx2, /* 32x16 */
+    NULL,                       /* 32x64 (invalid CFL size) */
+    NULL,                       /* 64x32 (invalid CFL size) */
+    cfl_predict_hbd_4x16_ssse3, /* 4x16  */
+    cfl_predict_hbd_16x4_avx2,  /* 16x4  */
+    cfl_predict_hbd_8x32_ssse3, /* 8x32  */
+    cfl_predict_hbd_32x8_avx2,  /* 32x8  */
+    NULL,                       /* 16x64 (invalid CFL size) */
+    NULL,                       /* 64x16 (invalid CFL size) */
   };
   // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
   // function pointer array out of bounds.
   return pred[tx_size % TX_SIZES_ALL];
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 // Returns a vector where all the (32-bits) elements are the sum of all the
 // lanes in a.
@@ -463,27 +467,27 @@
 
 // Based on the observation that for small blocks AVX2 does not outperform
 // SSE2, we call the SSE2 code for block widths 4 and 8.
-cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) {
+cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) {
   static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
-    subtract_average_4x4_sse2,   /* 4x4 */
-    subtract_average_8x8_sse2,   /* 8x8 */
-    subtract_average_16x16_avx2, /* 16x16 */
-    subtract_average_32x32_avx2, /* 32x32 */
-    NULL,                        /* 64x64 (invalid CFL size) */
-    subtract_average_4x8_sse2,   /* 4x8 */
-    subtract_average_8x4_sse2,   /* 8x4 */
-    subtract_average_8x16_sse2,  /* 8x16 */
-    subtract_average_16x8_avx2,  /* 16x8 */
-    subtract_average_16x32_avx2, /* 16x32 */
-    subtract_average_32x16_avx2, /* 32x16 */
-    NULL,                        /* 32x64 (invalid CFL size) */
-    NULL,                        /* 64x32 (invalid CFL size) */
-    subtract_average_4x16_sse2,  /* 4x16 */
-    subtract_average_16x4_avx2,  /* 16x4 */
-    subtract_average_8x32_sse2,  /* 8x32 */
-    subtract_average_32x8_avx2,  /* 32x8 */
-    NULL,                        /* 16x64 (invalid CFL size) */
-    NULL,                        /* 64x16 (invalid CFL size) */
+    cfl_subtract_average_4x4_sse2,   /* 4x4 */
+    cfl_subtract_average_8x8_sse2,   /* 8x8 */
+    cfl_subtract_average_16x16_avx2, /* 16x16 */
+    cfl_subtract_average_32x32_avx2, /* 32x32 */
+    NULL,                            /* 64x64 (invalid CFL size) */
+    cfl_subtract_average_4x8_sse2,   /* 4x8 */
+    cfl_subtract_average_8x4_sse2,   /* 8x4 */
+    cfl_subtract_average_8x16_sse2,  /* 8x16 */
+    cfl_subtract_average_16x8_avx2,  /* 16x8 */
+    cfl_subtract_average_16x32_avx2, /* 16x32 */
+    cfl_subtract_average_32x16_avx2, /* 32x16 */
+    NULL,                            /* 32x64 (invalid CFL size) */
+    NULL,                            /* 64x32 (invalid CFL size) */
+    cfl_subtract_average_4x16_sse2,  /* 4x16 */
+    cfl_subtract_average_16x4_avx2,  /* 16x4 */
+    cfl_subtract_average_8x32_sse2,  /* 8x32 */
+    cfl_subtract_average_32x8_avx2,  /* 32x8 */
+    NULL,                            /* 16x64 (invalid CFL size) */
+    NULL,                            /* 64x16 (invalid CFL size) */
   };
   // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
   // index the function pointer array out of bounds.
diff --git a/libaom/av1/common/x86/cfl_simd.h b/libaom/av1/common/x86/cfl_simd.h
index 3b342cd..03ae02a 100644
--- a/libaom/av1/common/x86/cfl_simd.h
+++ b/libaom/av1/common/x86/cfl_simd.h
@@ -15,229 +15,232 @@
 #include "av1/common/blockd.h"
 
 // SSSE3 version is optimal for with == 4, we reuse them in AVX2
-void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 16, we reuse it in AVX2
-void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
-                                   uint16_t *output_q3);
-void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
-                                   uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 4, we reuse them in AVX2
-void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 16, we reuse it in AVX2
-void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
-                                   uint16_t *output_q3);
-void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
-                                   uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 4, we reuse them in AVX2
-void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 16, we reuse it in AVX2
-void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
-                                   uint16_t *output_q3);
-void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
-                                   uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+                                      uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
 
-void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
 
 // SSSE3 version is faster for with == 16, we reuse it in AVX2
-void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
-                                   uint16_t *output_q3);
-void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
-                                   uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
 
-void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
 
 // SSSE3 version is faster for with == 16, we reuse it in AVX2
-void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
-                                   uint16_t *output_q3);
-void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
-                                   uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
 
-void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
 
 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
-                                 uint16_t *output_q3);
-void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+                                     uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
 
 // SSSE3 version is faster for with == 16, we reuse it in AVX2
-void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
-                                  uint16_t *output_q3);
-void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
-                                   uint16_t *output_q3);
-void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
-                                   uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type,
+                                      int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type,
+                                       int input_stride, uint16_t *output_q3);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 // SSE2 version is optimal for with == 4, we reuse them in AVX2
-void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
 
 // SSE2 version is optimal for with == 8, we reuse them in AVX2
-void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
 
-void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                           int dst_stride, int alpha_q3);
-void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                           int dst_stride, int alpha_q3);
-void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                            int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                               int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                               int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                                int dst_stride, int alpha_q3);
 
-void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                           int dst_stride, int alpha_q3);
-void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                           int dst_stride, int alpha_q3);
-void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                            int dst_stride, int alpha_q3);
-void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                            int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                               int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                               int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                                int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                                int dst_stride, int alpha_q3);
 
-void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                            int dst_stride, int alpha_q3);
-void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                            int dst_stride, int alpha_q3);
-void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                             int dst_stride, int alpha_q3);
-void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                             int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                                int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                                int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                                 int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                                 int dst_stride, int alpha_q3);
 
-void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                           int dst_stride, int alpha_q3, int bd);
-void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                           int dst_stride, int alpha_q3, int bd);
-void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                            int dst_stride, int alpha_q3, int bd);
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                               int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                               int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                                int dst_stride, int alpha_q3, int bd);
 
-void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                           int dst_stride, int alpha_q3, int bd);
-void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                           int dst_stride, int alpha_q3, int bd);
-void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                            int dst_stride, int alpha_q3, int bd);
-void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                            int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                               int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                               int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                                int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                                int dst_stride, int alpha_q3, int bd);
 
-void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                            int dst_stride, int alpha_q3, int bd);
-void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                            int dst_stride, int alpha_q3, int bd);
-void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                             int dst_stride, int alpha_q3, int bd);
-void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
-                             int dst_stride, int alpha_q3, int bd);
-
+void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                                int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                                int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                                 int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                                 int dst_stride, int alpha_q3, int bd);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/libaom/av1/common/x86/cfl_ssse3.c b/libaom/av1/common/x86/cfl_ssse3.c
index bbf0072..476b660 100644
--- a/libaom/av1/common/x86/cfl_ssse3.c
+++ b/libaom/av1/common/x86/cfl_ssse3.c
@@ -168,6 +168,7 @@
   } while (pred_buf_m128i < end);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 /**
  * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
  * precise version of a box filter 4:2:0 pixel subsampling in Q3.
@@ -296,6 +297,7 @@
     pred_buf_q3 += CFL_BUF_LINE;
   } while (pred_buf_q3 < end);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
 
@@ -341,6 +343,7 @@
 
 CFL_PREDICT_FN(ssse3, lbd)
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE __m128i highbd_max_epi16(int bd) {
   const __m128i neg_one = _mm_set1_epi16(-1);
   // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
@@ -391,3 +394,4 @@
 }
 
 CFL_PREDICT_FN(ssse3, hbd)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/av1/common/x86/convolve_2d_avx2.c b/libaom/av1/common/x86/convolve_2d_avx2.c
index ae12a60..e19575d 100644
--- a/libaom/av1/common/x86/convolve_2d_avx2.c
+++ b/libaom/av1/common/x86/convolve_2d_avx2.c
@@ -24,7 +24,7 @@
                              int dst_stride, int w, int h,
                              const InterpFilterParams *filter_params_x,
                              const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
+                             const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
   const int bd = 8;
   int im_stride = 8;
@@ -54,8 +54,8 @@
   filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
   filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
 
   // Condition for checking valid horz_filt taps
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0)))
@@ -214,12 +214,12 @@
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
   (void)conv_params;
 
   if (w >= 16) {
diff --git a/libaom/av1/common/x86/convolve_2d_sse2.c b/libaom/av1/common/x86/convolve_2d_sse2.c
index 369922b..5376ea7 100644
--- a/libaom/av1/common/x86/convolve_2d_sse2.c
+++ b/libaom/av1/common/x86/convolve_2d_sse2.c
@@ -22,7 +22,7 @@
                              int dst_stride, int w, int h,
                              const InterpFilterParams *filter_params_x,
                              const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
+                             const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
   const int bd = 8;
 
@@ -45,7 +45,7 @@
   /* Horizontal filter */
   {
     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
     const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -111,7 +111,7 @@
   /* Vertical filter */
   {
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
     const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -205,7 +205,7 @@
         __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
 
         if (w == 2) {
-          *(uint16_t *)p = _mm_cvtsi128_si32(res);
+          *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
         } else if (w == 4) {
           *(uint32_t *)p = _mm_cvtsi128_si32(res);
         } else {
@@ -240,12 +240,12 @@
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
   (void)conv_params;
 
   if (w >= 16) {
@@ -357,15 +357,15 @@
 void av1_dist_wtd_convolve_2d_copy_sse2(
     const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
diff --git a/libaom/av1/common/x86/convolve_avx2.c b/libaom/av1/common/x86/convolve_avx2.c
index 21b9fe4..1d5bc6f 100644
--- a/libaom/av1/common/x86/convolve_avx2.c
+++ b/libaom/av1/common/x86/convolve_avx2.c
@@ -21,7 +21,7 @@
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_q4, const int subpel_y_q4,
+                            const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
   int i, j, is_vert_4tap = 0;
   // right shift is F-1 because we are already dividing
@@ -36,12 +36,12 @@
          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
   (void)conv_params;
   __m256i coeffs[4], s[8];
   __m128i d[6];
 
-  prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+  prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
 
   // Condition for checking valid vert_filt taps
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
@@ -131,8 +131,8 @@
             __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
             __m128i *const p_1 =
                 (__m128i *)&dst[i * dst_stride + j + dst_stride];
-            *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-            *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
           }
         }
         s[0] = s[1];
@@ -244,8 +244,8 @@
             __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
             __m128i *const p_1 =
                 (__m128i *)&dst[i * dst_stride + j + dst_stride];
-            *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-            *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
           }
         }
         s[0] = s[1];
@@ -264,7 +264,7 @@
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_q4, const int subpel_y_q4,
+                            const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
   const int bits = FILTER_BITS - conv_params->round_0;
 
@@ -275,7 +275,7 @@
   const __m128i round_shift = _mm_cvtsi32_si128(bits);
   int i, is_horiz_4tap = 0;
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -286,7 +286,7 @@
   filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
   filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
 
   // Condition for checking valid horz_filt taps
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
@@ -329,8 +329,8 @@
         } else {
           __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
           __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
-          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
         }
       }
     } else {
diff --git a/libaom/av1/common/x86/convolve_sse2.c b/libaom/av1/common/x86/convolve_sse2.c
index 5016642..4323ac4 100644
--- a/libaom/av1/common/x86/convolve_sse2.c
+++ b/libaom/av1/common/x86/convolve_sse2.c
@@ -79,7 +79,7 @@
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_q4, const int subpel_y_q4,
+                            const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint8_t *src_ptr = src - fo_vert * src_stride;
@@ -88,14 +88,14 @@
   __m128i coeffs[4];
 
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
   (void)conv_params;
 
   assert(conv_params->round_0 <= FILTER_BITS);
   assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
 
   if (w <= 4) {
     __m128i s[8], src6, res, res_round, res16;
@@ -132,7 +132,7 @@
       res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
 
       if (w == 2)
-        *(uint16_t *)dst = res_int;
+        *(uint16_t *)dst = (uint16_t)res_int;
       else
         *(uint32_t *)dst = res_int;
 
@@ -145,7 +145,7 @@
       res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
 
       if (w == 2)
-        *(uint16_t *)dst = res_int;
+        *(uint16_t *)dst = (uint16_t)res_int;
       else
         *(uint32_t *)dst = res_int;
 
@@ -240,7 +240,7 @@
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_q4, const int subpel_y_q4,
+                            const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint8_t *src_ptr = src - fo_horiz;
@@ -253,13 +253,13 @@
   __m128i coeffs[4];
 
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
 
   if (w <= 4) {
     do {
@@ -284,7 +284,7 @@
 
       uint32_t r = _mm_cvtsi128_si32(res);
       if (w == 2)
-        *(uint16_t *)dst = r;
+        *(uint16_t *)dst = (uint16_t)r;
       else
         *(uint32_t *)dst = r;
 
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
index 357df12..396aed0 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -24,8 +24,8 @@
                                     uint16_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
+                                    const int subpel_x_qn,
+                                    const int subpel_y_qn,
                                     ConvolveParams *conv_params, int bd) {
   DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   int im_h = h + filter_params_y->taps - 1;
@@ -58,8 +58,8 @@
       _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m256i zero = _mm256_setzero_si256();
 
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
 
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
@@ -222,12 +222,12 @@
 void av1_highbd_convolve_2d_copy_sr_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
   (void)conv_params;
   (void)bd;
 
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_sse2.c b/libaom/av1/common/x86/highbd_convolve_2d_sse2.c
index 15f8872..f758775 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_sse2.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -74,12 +74,12 @@
 void av1_highbd_convolve_2d_copy_sr_sse2(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
   (void)conv_params;
   (void)bd;
   if (w >= 16) {
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
index 3c1d5d1..d2ff47c 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -24,14 +24,14 @@
 void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
@@ -171,8 +171,8 @@
 void av1_highbd_dist_wtd_convolve_2d_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   DECLARE_ALIGNED(16, int16_t,
                   im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -208,7 +208,7 @@
   /* Horizontal filter */
   {
     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
     const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -275,7 +275,7 @@
   /* Vertical filter */
   {
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
     const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_ssse3.c b/libaom/av1/common/x86/highbd_convolve_2d_ssse3.c
index 1d029db..5318fca 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -22,8 +22,8 @@
 void av1_highbd_convolve_2d_sr_ssse3(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = 8;
@@ -54,8 +54,8 @@
       _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m128i zero = _mm_setzero_si128();
 
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
 
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
diff --git a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
index fe22465..93e98e4 100644
--- a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -47,6 +47,47 @@
   return clamped;
 }
 
+static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) {
+  if (shift != 0) {
+    __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
+    in[0] = _mm256_add_epi32(in[0], rnding);
+    in[1] = _mm256_add_epi32(in[1], rnding);
+    in[2] = _mm256_add_epi32(in[2], rnding);
+    in[3] = _mm256_add_epi32(in[3], rnding);
+
+    in[0] = _mm256_srai_epi32(in[0], shift);
+    in[1] = _mm256_srai_epi32(in[1], shift);
+    in[2] = _mm256_srai_epi32(in[2], shift);
+    in[3] = _mm256_srai_epi32(in[3], shift);
+  }
+}
+
+static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) {
+  round_shift_4x4_avx2(in, shift);
+  round_shift_4x4_avx2(in + 4, shift);
+  round_shift_4x4_avx2(in + 8, shift);
+  round_shift_4x4_avx2(in + 12, shift);
+}
+
+static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
+                                    const __m256i *clamp_lo,
+                                    const __m256i *clamp_hi, int size) {
+  __m256i a0, a1;
+  for (int i = 0; i < size; i += 4) {
+    a0 = _mm256_max_epi32(in[i], *clamp_lo);
+    out[i] = _mm256_min_epi32(a0, *clamp_hi);
+
+    a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
+    out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
+
+    a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
+    out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
+
+    a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
+    out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
+  }
+}
+
 static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
                                                  __m256i res0, __m256i res1,
                                                  const int bd) {
@@ -235,36 +276,6 @@
   *out1 = a1;
 }
 
-static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,
-                                 __m256i *out0, __m256i *out1) {
-  __m256i a0 = _mm256_add_epi32(in0, in1);
-  __m256i a1 = _mm256_sub_epi32(in0, in1);
-
-  *out0 = a0;
-  *out1 = a1;
-}
-
-static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
-                              __m256i *out0, __m256i *out1,
-                              const __m256i *clamp_lo, const __m256i *clamp_hi,
-                              int shift) {
-  __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
-  __m256i in0_w_offset = _mm256_add_epi32(in0, offset);
-  __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
-  __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
-
-  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
-  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
-  a0 = _mm256_max_epi32(a0, *clamp_lo);
-  a0 = _mm256_min_epi32(a0, *clamp_hi);
-  a1 = _mm256_max_epi32(a1, *clamp_lo);
-  a1 = _mm256_min_epi32(a1, *clamp_hi);
-
-  *out0 = a0;
-  *out1 = a1;
-}
-
 static INLINE void idct32_stage4_avx2(
     __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
     const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
@@ -400,63 +411,32 @@
 static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
                                       const int do_cols, const int bd,
                                       const int out_shift,
-                                      const int log_range) {
-  if (do_cols) {
-    addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
-    addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
-    addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
-    addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
-    addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
-    addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
-    addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
-    addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
-    addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
-    addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
-    addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
-    addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
-    addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
-    addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
-    addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
-    addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
-  } else {
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi) {
+  addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
+  if (!do_cols) {
     const int log_range_out = AOMMAX(16, bd + 6);
-    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
-    addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
-    addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
-                      &clamp_hi_out, out_shift);
+    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m256i clamp_hi_out =
+        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8_avx2(out, out_shift);
+    round_shift_8x8_avx2(out + 16, out_shift);
+    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
   }
 }
 
@@ -466,8 +446,8 @@
   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
   const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
-  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
   __m256i x;
   // stage 0
   // stage 1
@@ -483,22 +463,16 @@
   // stage 7
   // stage 8
   // stage 9
-  if (do_cols) {
-    x = _mm256_max_epi32(x, clamp_lo);
-    x = _mm256_min_epi32(x, clamp_hi);
-  } else {
+  if (!do_cols) {
     const int log_range_out = AOMMAX(16, bd + 6);
-    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
     __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
     x = _mm256_add_epi32(offset, x);
     x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
-    x = _mm256_max_epi32(x, clamp_lo_out);
-    x = _mm256_min_epi32(x, clamp_hi_out);
   }
-
+  x = _mm256_max_epi32(x, clamp_lo);
+  x = _mm256_min_epi32(x, clamp_hi);
   out[0] = x;
   out[1] = x;
   out[2] = x;
@@ -642,7 +616,7 @@
                        &rounding, bit);
 
     // stage 9
-    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
   }
 }
 
@@ -792,7 +766,7 @@
                        &rounding, bit);
 
     // stage 9
-    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
   }
 }
 
@@ -1150,62 +1124,31 @@
     bf0[31] = bf1[31];
 
     // stage 9
-    if (do_cols) {
-      addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
-      addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
-      addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
-      addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
-      addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
-      addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
-      addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
-      addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
-      addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
-      addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
-      addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
-      addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
-      addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
-      addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
-      addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
-      addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
-    } else {
+    addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+    if (!do_cols) {
       const int log_range_out = AOMMAX(16, bd + 6);
-      const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
-      addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      round_shift_8x8_avx2(out + 16, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
     }
   }
 }
@@ -1215,8 +1158,8 @@
   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
-  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 
   {
     // stage 0
@@ -1231,22 +1174,16 @@
     // stage 5
     // stage 6
     // stage 7
-    if (do_cols) {
-      in[0] = _mm256_max_epi32(in[0], clamp_lo);
-      in[0] = _mm256_min_epi32(in[0], clamp_hi);
-    } else {
+    if (!do_cols) {
       const int log_range_out = AOMMAX(16, bd + 6);
-      const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
       __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
       in[0] = _mm256_add_epi32(in[0], offset);
       in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
-      in[0] = _mm256_max_epi32(in[0], clamp_lo_out);
-      in[0] = _mm256_min_epi32(in[0], clamp_hi_out);
     }
-
+    in[0] = _mm256_max_epi32(in[0], clamp_lo);
+    in[0] = _mm256_min_epi32(in[0], clamp_hi);
     out[0] = in[0];
     out[1] = in[0];
     out[2] = in[0];
@@ -1392,38 +1329,23 @@
     u[12] = _mm256_add_epi32(u[12], rnding);
     u[12] = _mm256_srai_epi32(u[12], bit);
     // stage 7
-    if (do_cols) {
-      addsub_no_clamp_avx2(u[0], u[15], out + 0, out + 15);
-      addsub_no_clamp_avx2(u[1], u[14], out + 1, out + 14);
-      addsub_no_clamp_avx2(u[2], u[13], out + 2, out + 13);
-      addsub_no_clamp_avx2(u[3], u[12], out + 3, out + 12);
-      addsub_no_clamp_avx2(u[4], u[11], out + 4, out + 11);
-      addsub_no_clamp_avx2(u[5], u[10], out + 5, out + 10);
-      addsub_no_clamp_avx2(u[6], u[9], out + 6, out + 9);
-      addsub_no_clamp_avx2(u[7], u[8], out + 7, out + 8);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 
-      addsub_shift_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
     }
   }
 }
@@ -1590,38 +1512,23 @@
     v[15] = u[15];
 
     // stage 7
-    if (do_cols) {
-      addsub_no_clamp_avx2(v[0], v[15], out + 0, out + 15);
-      addsub_no_clamp_avx2(v[1], v[14], out + 1, out + 14);
-      addsub_no_clamp_avx2(v[2], v[13], out + 2, out + 13);
-      addsub_no_clamp_avx2(v[3], v[12], out + 3, out + 12);
-      addsub_no_clamp_avx2(v[4], v[11], out + 4, out + 11);
-      addsub_no_clamp_avx2(v[5], v[10], out + 5, out + 10);
-      addsub_no_clamp_avx2(v[6], v[9], out + 6, out + 9);
-      addsub_no_clamp_avx2(v[7], v[8], out + 7, out + 8);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 
-      addsub_shift_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-      addsub_shift_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
     }
   }
 }
@@ -2510,6 +2417,8 @@
   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
   __m256i x;
 
   // stage 0
@@ -2524,18 +2433,14 @@
   // stage 5
   if (!do_cols) {
     const int log_range_out = AOMMAX(16, bd + 6);
-    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
     __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
     x = _mm256_add_epi32(x, offset);
     x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
-    x = _mm256_max_epi32(x, clamp_lo_out);
-    x = _mm256_min_epi32(x, clamp_hi_out);
   }
-
+  x = _mm256_max_epi32(x, clamp_lo);
+  x = _mm256_min_epi32(x, clamp_hi);
   out[0] = x;
   out[1] = x;
   out[2] = x;
@@ -2640,26 +2545,20 @@
   u5 = _mm256_add_epi32(u5, rnding);
   u5 = _mm256_srai_epi32(u5, bit);
 
+  addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+  addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+  addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+  addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
   // stage 5
-  if (do_cols) {
-    addsub_no_clamp_avx2(u0, u7, out + 0, out + 7);
-    addsub_no_clamp_avx2(u1, u6, out + 1, out + 6);
-    addsub_no_clamp_avx2(u2, u5, out + 2, out + 5);
-    addsub_no_clamp_avx2(u3, u4, out + 3, out + 4);
-  } else {
+  if (!do_cols) {
     const int log_range_out = AOMMAX(16, bd + 6);
-    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-    addsub_shift_avx2(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
-                      out_shift);
-    addsub_shift_avx2(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
-                      out_shift);
-    addsub_shift_avx2(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
-                      out_shift);
-    addsub_shift_avx2(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
-                      out_shift);
+    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m256i clamp_hi_out =
+        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    round_shift_4x4_avx2(out, out_shift);
+    round_shift_4x4_avx2(out + 4, out_shift);
+    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
   }
 }
 static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
@@ -3037,22 +2936,23 @@
 
 static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
                                        int bd, int out_shift,
-                                       const int log_range) {
-  if (do_cols) {
-    for (int i = 0; i < 32; i++) {
-      addsub_no_clamp_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
-    }
-  } else {
-    const int log_range_out = AOMMAX(16, bd + 6);
-    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+                                       const __m256i *clamp_lo,
+                                       const __m256i *clamp_hi) {
+  for (int i = 0; i < 32; i++) {
+    addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
+  }
 
-    for (int i = 0; i < 32; i++) {
-      addsub_shift_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
-                        &clamp_lo_out, &clamp_hi_out, out_shift);
-    }
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m256i clamp_hi_out =
+        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    round_shift_8x8_avx2(out, out_shift);
+    round_shift_8x8_avx2(out + 16, out_shift);
+    round_shift_8x8_avx2(out + 32, out_shift);
+    round_shift_8x8_avx2(out + 48, out_shift);
+    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
   }
 }
 
@@ -3061,8 +2961,8 @@
   const int32_t *cospi = cospi_arr(bit);
   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
-  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 
   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 
@@ -3081,24 +2981,18 @@
     // stage 9
     // stage 10
     // stage 11
-    if (do_cols) {
-      x = _mm256_max_epi32(x, clamp_lo);
-      x = _mm256_min_epi32(x, clamp_hi);
-    } else {
+    if (!do_cols) {
       const int log_range_out = AOMMAX(16, bd + 6);
-      const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
-      __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
-      x = _mm256_add_epi32(x, offset);
-      x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
-
-      x = _mm256_max_epi32(x, clamp_lo_out);
-      x = _mm256_min_epi32(x, clamp_hi_out);
+      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      if (out_shift != 0) {
+        __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+        x = _mm256_add_epi32(x, offset);
+        x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+      }
     }
-
+    x = _mm256_max_epi32(x, clamp_lo);
+    x = _mm256_min_epi32(x, clamp_hi);
     out[0] = x;
     out[1] = x;
     out[2] = x;
@@ -3379,7 +3273,6 @@
     u[6] = u[1];
     u[5] = u[2];
     u[4] = u[3];
-    u[9] = u[9];
 
     idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                        &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
@@ -3393,7 +3286,7 @@
                         bit);
 
     // stage 11
-    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, log_range);
+    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
   }
 }
 static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
@@ -3702,7 +3595,7 @@
                         bit);
 
     // stage 11
-    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, log_range);
+    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
   }
 }
 static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
@@ -4164,21 +4057,22 @@
     for (i = 56; i < 64; i++) v[i] = u[i];
 
     // stage 11
-    if (do_cols) {
-      for (i = 0; i < 32; i++) {
-        addsub_no_clamp_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
-      }
-    } else {
+    for (i = 0; i < 32; i++) {
+      addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+                  &clamp_hi);
+    }
+    if (!do_cols) {
       const int log_range_out = AOMMAX(16, bd + 6);
-      const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 
-      for (i = 0; i < 32; i++) {
-        addsub_shift_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
-                          &clamp_lo_out, &clamp_hi_out, out_shift);
-      }
+      round_shift_8x8_avx2(out, out_shift);
+      round_shift_8x8_avx2(out + 16, out_shift);
+      round_shift_8x8_avx2(out + 32, out_shift);
+      round_shift_8x8_avx2(out + 48, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
     }
   }
 }
@@ -4219,7 +4113,7 @@
   __m256i buf1[64 * 8];
   int eobx, eoby;
   get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -4255,7 +4149,8 @@
       av1_round_shift_rect_array_32_avx2(
           buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
 
     __m256i *_buf1 = buf1 + i * 8;
     if (lr_flip) {
@@ -4272,7 +4167,7 @@
   // 2nd stage: column transform
   for (int i = 0; i < buf_size_w_div8; i++) {
     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
 
     av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
                                   buf1 + i * txfm_size_row, txfm_size_row,
diff --git a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
index 8a8641d..03eaef8 100644
--- a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -37,16 +37,61 @@
   return clamped;
 }
 
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+  if (shift != 0) {
+    __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+    in[0] = _mm_add_epi32(in[0], rnding);
+    in[1] = _mm_add_epi32(in[1], rnding);
+    in[2] = _mm_add_epi32(in[2], rnding);
+    in[3] = _mm_add_epi32(in[3], rnding);
+
+    in[0] = _mm_srai_epi32(in[0], shift);
+    in[1] = _mm_srai_epi32(in[1], shift);
+    in[2] = _mm_srai_epi32(in[2], shift);
+    in[3] = _mm_srai_epi32(in[3], shift);
+  }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+  round_shift_4x4(&in[0], shift);
+  round_shift_4x4(&in[4], shift);
+  round_shift_4x4(&in[8], shift);
+  round_shift_4x4(&in[12], shift);
+}
+
+static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out,
+                                      const __m128i *clamp_lo,
+                                      const __m128i *clamp_hi, int size) {
+  __m128i a0, a1;
+  for (int i = 0; i < size; i += 4) {
+    a0 = _mm_max_epi32(in[i], *clamp_lo);
+    out[i] = _mm_min_epi32(a0, *clamp_hi);
+
+    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
+    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
+    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+
+    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
+    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+  }
+}
+
 static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
                                                   __m128i res0, __m128i res1,
                                                   const int bd) {
   __m128i x0 = _mm_cvtepi16_epi32(pred);
   __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
-
+  __m128i min_clip_val = _mm_setzero_si128();
+  __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
   x0 = _mm_add_epi32(res0, x0);
   x1 = _mm_add_epi32(res1, x1);
+  x0 = _mm_max_epi32(x0, min_clip_val);
+  x0 = _mm_min_epi32(x0, max_clip_val);
+  x1 = _mm_max_epi32(x1, min_clip_val);
+  x1 = _mm_min_epi32(x1, max_clip_val);
   x0 = _mm_packus_epi32(x0, x1);
-  x0 = highbd_clamp_epi16(x0, bd);
   return x0;
 }
 
@@ -115,34 +160,23 @@
   *out1 = a1;
 }
 
-static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
-                                   __m128i *out0, __m128i *out1) {
-  __m128i a0 = _mm_add_epi32(in0, in1);
-  __m128i a1 = _mm_sub_epi32(in0, in1);
-
-  *out0 = a0;
-  *out1 = a1;
-}
-
-static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
-                                __m128i *out0, __m128i *out1,
-                                const __m128i *clamp_lo,
-                                const __m128i *clamp_hi, int shift) {
+static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1,
+                                   const __m128i *clamp_lo,
+                                   const __m128i *clamp_hi, int shift) {
   __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
-  __m128i in0_w_offset = _mm_add_epi32(in0, offset);
-  __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
-  __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
+  __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
+  __m128i in1_w_offset = _mm_add_epi32(*in1, offset);
 
-  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
-  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+  in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
+  in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));
 
-  a0 = _mm_max_epi32(a0, *clamp_lo);
-  a0 = _mm_min_epi32(a0, *clamp_hi);
-  a1 = _mm_max_epi32(a1, *clamp_lo);
-  a1 = _mm_min_epi32(a1, *clamp_hi);
+  in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
+  in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
+  in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
+  in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);
 
-  *out0 = a0;
-  *out1 = a1;
+  *in0 = in0_w_offset;
+  *in1 = in1_w_offset;
 }
 
 static INLINE void idct32_stage4_sse4_1(
@@ -298,63 +332,34 @@
 static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
                                         const int do_cols, const int bd,
                                         const int out_shift,
-                                        const int log_range) {
-  if (do_cols) {
-    addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
-    addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
-    addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
-    addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
-    addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
-    addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
-    addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
-    addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
-    addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
-    addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
-    addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
-    addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
-    addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
-    addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
-    addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
-    addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
-  } else {
-    const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi) {
+  addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
 
-    addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+    for (int i = 0; i < 32; i += 8) {
+      round_shift_4x4(out + i, out_shift);
+      round_shift_4x4(out + i + 4, out_shift);
+    }
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
   }
 }
 
@@ -380,17 +385,21 @@
 
 static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
-  (void)out_shift;
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-
+  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3, x, y;
 
+  // Stage 0
+  // Stage 1
+  // Stage 2
   v0 = _mm_unpacklo_epi32(in[0], in[1]);
   v1 = _mm_unpackhi_epi32(in[0], in[1]);
   v2 = _mm_unpacklo_epi32(in[2], in[3]);
@@ -423,23 +432,27 @@
   v3 = _mm_add_epi32(v3, rnding);
   v3 = _mm_srai_epi32(v3, bit);
 
-  if (do_cols) {
-    addsub_no_clamp_sse4_1(v0, v3, out + 0, out + 3);
-    addsub_no_clamp_sse4_1(v1, v2, out + 1, out + 2);
-  } else {
-    const int log_range = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-    addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
+  // Stage 3
+  addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    log_range = AOMMAX(16, bd + 6);
+    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+    shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
+    shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
   }
 }
 
 static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
-  (void)out_shift;
   const int32_t *sinpi = sinpi_arr(bit);
-  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_set1_epi32(0);
+  __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
+  rnding = _mm_unpacklo_epi32(rnding, zero);
+  const __m128i mul = _mm_set1_epi32(1 << 4);
   const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
   const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
   const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
@@ -449,6 +462,8 @@
   __m128i x0, x1, x2, x3;
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3;
+  __m128i u0_low, u1_low, u2_low, u3_low;
+  __m128i u0_high, u1_high, u2_high, u3_high;
 
   v0 = _mm_unpacklo_epi32(in[0], in[1]);
   v1 = _mm_unpackhi_epi32(in[0], in[1]);
@@ -483,51 +498,78 @@
   t = _mm_add_epi32(s0, s1);
   u3 = _mm_sub_epi32(t, s3);
 
-  u0 = _mm_add_epi32(u0, rnding);
-  u0 = _mm_srai_epi32(u0, bit);
+  // u0
+  u0_low = _mm_mul_epi32(u0, mul);
+  u0_low = _mm_add_epi64(u0_low, rnding);
 
-  u1 = _mm_add_epi32(u1, rnding);
-  u1 = _mm_srai_epi32(u1, bit);
+  u0 = _mm_srli_si128(u0, 4);
+  u0_high = _mm_mul_epi32(u0, mul);
+  u0_high = _mm_add_epi64(u0_high, rnding);
 
-  u2 = _mm_add_epi32(u2, rnding);
-  u2 = _mm_srai_epi32(u2, bit);
+  u0_low = _mm_srli_si128(u0_low, 2);
+  u0_high = _mm_srli_si128(u0_high, 2);
 
-  u3 = _mm_add_epi32(u3, rnding);
-  u3 = _mm_srai_epi32(u3, bit);
+  u0 = _mm_unpacklo_epi32(u0_low, u0_high);
+  u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
+  u0 = _mm_unpacklo_epi64(u0, u0_high);
 
-  if (!do_cols) {
-    const int log_range = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  // u1
+  u1_low = _mm_mul_epi32(u1, mul);
+  u1_low = _mm_add_epi64(u1_low, rnding);
 
-    u0 = _mm_max_epi32(u0, clamp_lo);
-    u0 = _mm_min_epi32(u0, clamp_hi);
-    u1 = _mm_max_epi32(u1, clamp_lo);
-    u1 = _mm_min_epi32(u1, clamp_hi);
-    u2 = _mm_max_epi32(u2, clamp_lo);
-    u2 = _mm_min_epi32(u2, clamp_hi);
-    u3 = _mm_max_epi32(u3, clamp_lo);
-    u3 = _mm_min_epi32(u3, clamp_hi);
-  }
+  u1 = _mm_srli_si128(u1, 4);
+  u1_high = _mm_mul_epi32(u1, mul);
+  u1_high = _mm_add_epi64(u1_high, rnding);
+
+  u1_low = _mm_srli_si128(u1_low, 2);
+  u1_high = _mm_srli_si128(u1_high, 2);
+
+  u1 = _mm_unpacklo_epi32(u1_low, u1_high);
+  u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
+  u1 = _mm_unpacklo_epi64(u1, u1_high);
+
+  // u2
+  u2_low = _mm_mul_epi32(u2, mul);
+  u2_low = _mm_add_epi64(u2_low, rnding);
+
+  u2 = _mm_srli_si128(u2, 4);
+  u2_high = _mm_mul_epi32(u2, mul);
+  u2_high = _mm_add_epi64(u2_high, rnding);
+
+  u2_low = _mm_srli_si128(u2_low, 2);
+  u2_high = _mm_srli_si128(u2_high, 2);
+
+  u2 = _mm_unpacklo_epi32(u2_low, u2_high);
+  u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
+  u2 = _mm_unpacklo_epi64(u2, u2_high);
+
+  // u3
+  u3_low = _mm_mul_epi32(u3, mul);
+  u3_low = _mm_add_epi64(u3_low, rnding);
+
+  u3 = _mm_srli_si128(u3, 4);
+  u3_high = _mm_mul_epi32(u3, mul);
+  u3_high = _mm_add_epi64(u3_high, rnding);
+
+  u3_low = _mm_srli_si128(u3_low, 2);
+  u3_high = _mm_srli_si128(u3_high, 2);
+
+  u3 = _mm_unpacklo_epi32(u3_low, u3_high);
+  u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
+  u3 = _mm_unpacklo_epi64(u3, u3_high);
 
   out[0] = u0;
   out[1] = u1;
   out[2] = u2;
   out[3] = u3;
-}
 
-static INLINE void round_shift_4x4(__m128i *in, int shift) {
-  __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
-
-  in[0] = _mm_add_epi32(in[0], rnding);
-  in[1] = _mm_add_epi32(in[1], rnding);
-  in[2] = _mm_add_epi32(in[2], rnding);
-  in[3] = _mm_add_epi32(in[3], rnding);
-
-  in[0] = _mm_srai_epi32(in[0], shift);
-  in[1] = _mm_srai_epi32(in[1], shift);
-  in[2] = _mm_srai_epi32(in[2], shift);
-  in[3] = _mm_srai_epi32(in[3], shift);
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+    round_shift_4x4(out, out_shift);
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+  }
 }
 
 static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
@@ -583,52 +625,39 @@
   _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
   _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
 }
-static void highbd_clamp_epi32_sse4_1(const __m128i *in, __m128i *out,
-                                      const __m128i *clamp_lo,
-                                      const __m128i *clamp_hi, int size) {
-  __m128i a0, a1;
-  for (int i = 0; i < size; i += 4) {
-    a0 = _mm_max_epi32(in[i], *clamp_lo);
-    out[i] = _mm_min_epi32(a0, *clamp_hi);
 
-    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
-    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
-
-    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
-    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
-
-    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
-    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
-  }
-}
 static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
   (void)bit;
-  (void)out_shift;
   __m128i v[4];
+  __m128i zero = _mm_set1_epi32(0);
   __m128i fact = _mm_set1_epi32(NewSqrt2);
   __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
-  __m128i a0, a1;
+  __m128i a0_low, a1_low;
+  __m128i a0_high, a1_high;
 
-  a0 = _mm_mullo_epi32(in[0], fact);
-  a1 = _mm_mullo_epi32(in[1], fact);
-  a0 = _mm_add_epi32(a0, offset);
-  a1 = _mm_add_epi32(a1, offset);
-  out[0] = _mm_srai_epi32(a0, NewSqrt2Bits);
-  out[1] = _mm_srai_epi32(a1, NewSqrt2Bits);
+  offset = _mm_unpacklo_epi32(offset, zero);
 
-  a0 = _mm_mullo_epi32(in[2], fact);
-  a1 = _mm_mullo_epi32(in[3], fact);
-  a0 = _mm_add_epi32(a0, offset);
-  a1 = _mm_add_epi32(a1, offset);
-  out[2] = _mm_srai_epi32(a0, NewSqrt2Bits);
-  out[3] = _mm_srai_epi32(a1, NewSqrt2Bits);
+  for (int i = 0; i < 4; i++) {
+    a0_low = _mm_mul_epi32(in[i], fact);
+    a0_low = _mm_add_epi32(a0_low, offset);
+    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
+
+    a0_high = _mm_srli_si128(in[i], 4);
+    a0_high = _mm_mul_epi32(a0_high, fact);
+    a0_high = _mm_add_epi32(a0_high, offset);
+    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
+
+    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
+    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
+    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
+  }
 
   if (!do_cols) {
     const int log_range = AOMMAX(16, bd + 6);
     const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
     const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-
+    round_shift_4x4(out, out_shift);
     highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
   }
 
@@ -643,108 +672,116 @@
   out[2] = _mm_unpacklo_epi64(v[1], v[3]);
   out[3] = _mm_unpackhi_epi64(v[1], v[3]);
 }
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
   __m128i in[4];
-  const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
   const int txw_idx = get_txw_idx(TX_4X4);
   const int txh_idx = get_txh_idx(TX_4X4);
 
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_4x4(coeff, in);
-      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_DCT:
-      load_buffer_4x4(coeff, in);
-      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
-      load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
-      load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case FLIPADST_DCT:
-      load_buffer_4x4(coeff, in);
-      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
-      load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
-      load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case ADST_FLIPADST:
-      load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_ADST:
-      load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case IDTX:
-      load_buffer_4x4(coeff, in);
-      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        0);
+      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
+                        0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case V_DCT:
-      load_buffer_4x4(coeff, in);
-      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        0);
+      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case H_DCT:
-      load_buffer_4x4(coeff, in);
-      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
+                        0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case V_ADST:
-      load_buffer_4x4(coeff, in);
-      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        0);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case H_ADST:
-      load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
+                        0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case V_FLIPADST:
-      load_buffer_4x4(coeff, in);
-      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        0);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case H_FLIPADST:
-      load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      load_buffer_4x4(input, in);
+      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
+                        0);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     default: assert(0);
@@ -874,26 +911,22 @@
     u5 = _mm_srai_epi32(u5, bit);
 
     // stage 5
-    if (do_cols) {
-      addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
-      addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
-      addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
-      addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-      addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
-                          &clamp_lo_out, &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
-                          &clamp_lo_out, &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
-                          &clamp_lo_out, &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
-                          &clamp_lo_out, &clamp_hi_out, out_shift);
-    }
+    addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
+                  &clamp_hi);
+    addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
+                  &clamp_hi);
+    addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
+                  &clamp_hi);
+    addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
+                  &clamp_hi);
+  }
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift);
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
   }
 }
 
@@ -1217,69 +1250,29 @@
                      &clamp_hi_out, out_shift);
   }
 }
-static void shift_sse4_1(const __m128i *in, __m128i *out,
-                         const __m128i *clamp_lo, const __m128i *clamp_hi,
-                         int shift, int size) {
-  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
-  __m128i shift_vec = _mm_cvtsi32_si128(shift);
-  __m128i a0, a1;
-  for (int i = 0; i < size; i += 4) {
-    a0 = _mm_add_epi32(in[i], offset);
-    a1 = _mm_add_epi32(in[i + 1], offset);
-    a0 = _mm_sra_epi32(a0, shift_vec);
-    a1 = _mm_sra_epi32(a1, shift_vec);
-    a0 = _mm_max_epi32(a0, *clamp_lo);
-    a1 = _mm_max_epi32(a1, *clamp_lo);
-    out[i] = _mm_min_epi32(a0, *clamp_hi);
-    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
-
-    a0 = _mm_add_epi32(in[i + 2], offset);
-    a1 = _mm_add_epi32(in[i + 3], offset);
-    a0 = _mm_sra_epi32(a0, shift_vec);
-    a1 = _mm_sra_epi32(a1, shift_vec);
-    a0 = _mm_max_epi32(a0, *clamp_lo);
-    a1 = _mm_max_epi32(a1, *clamp_lo);
-    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
-    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
-  }
-}
 
 static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
   (void)bit;
-  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  __m128i v[8];
-  v[0] = _mm_add_epi32(in[0], in[0]);
-  v[1] = _mm_add_epi32(in[1], in[1]);
-  v[2] = _mm_add_epi32(in[2], in[2]);
-  v[3] = _mm_add_epi32(in[3], in[3]);
-  v[4] = _mm_add_epi32(in[4], in[4]);
-  v[5] = _mm_add_epi32(in[5], in[5]);
-  v[6] = _mm_add_epi32(in[6], in[6]);
-  v[7] = _mm_add_epi32(in[7], in[7]);
+  out[0] = _mm_add_epi32(in[0], in[0]);
+  out[1] = _mm_add_epi32(in[1], in[1]);
+  out[2] = _mm_add_epi32(in[2], in[2]);
+  out[3] = _mm_add_epi32(in[3], in[3]);
+  out[4] = _mm_add_epi32(in[4], in[4]);
+  out[5] = _mm_add_epi32(in[5], in[5]);
+  out[6] = _mm_add_epi32(in[6], in[6]);
+  out[7] = _mm_add_epi32(in[7], in[7]);
 
   if (!do_cols) {
-    const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
-    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
-  } else {
-    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 8);
+    const int log_range = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+    round_shift_4x4(out, out_shift);
+    round_shift_4x4(out + 4, out_shift);
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
   }
 }
 
-static void round_shift_8x8(__m128i *in, int shift) {
-  round_shift_4x4(&in[0], shift);
-  round_shift_4x4(&in[4], shift);
-  round_shift_4x4(&in[8], shift);
-  round_shift_4x4(&in[12], shift);
-}
-
 static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
                              int fliplr, int bd) {
   __m128i x0, x1;
@@ -1349,93 +1342,93 @@
   _mm_store_si128((__m128i *)(output + 7 * stride), u7);
 }
 
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
   __m128i in[16], out[16];
-  const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
   const int txw_idx = get_txw_idx(TX_8X8);
   const int txh_idx = get_txh_idx(TX_8X8);
 
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                       -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_DCT:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                       -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case FLIPADST_DCT:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                       -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case ADST_FLIPADST:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                       -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                       -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case FLIPADST_ADST:
-      load_buffer_8x8(coeff, in);
+      load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                       -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
       break;
     default: assert(0);
@@ -1448,6 +1441,8 @@
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
   __m128i x;
 
   // stage 0
@@ -1462,18 +1457,16 @@
   // stage 5
   if (!do_cols) {
     const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 
     __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
     x = _mm_add_epi32(x, offset);
     x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
-    x = _mm_max_epi32(x, clamp_lo_out);
-    x = _mm_min_epi32(x, clamp_hi_out);
   }
 
+  x = _mm_max_epi32(x, clamp_lo);
+  x = _mm_min_epi32(x, clamp_hi);
   out[0] = x;
   out[1] = x;
   out[2] = x;
@@ -1580,25 +1573,19 @@
   u5 = _mm_srai_epi32(u5, bit);
 
   // stage 5
-  if (do_cols) {
-    addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
-    addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
-    addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
-    addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
-  } else {
+  addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
     const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-    addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
-                        out_shift);
-    addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
-                        out_shift);
-    addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
-                        out_shift);
-    addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
-                        out_shift);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    round_shift_4x4(out, out_shift);
+    round_shift_4x4(out + 4, out_shift);
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
   }
 }
 
@@ -1867,56 +1854,50 @@
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  // stage 0
+  // stage 1
+  // stage 2
+  // stage 3
+  // stage 4
+  in[0] = _mm_mullo_epi32(in[0], cospi32);
+  in[0] = _mm_add_epi32(in[0], rnding);
+  in[0] = _mm_srai_epi32(in[0], bit);
 
-  {
-    // stage 0
-    // stage 1
-    // stage 2
-    // stage 3
-    // stage 4
-    in[0] = _mm_mullo_epi32(in[0], cospi32);
-    in[0] = _mm_add_epi32(in[0], rnding);
-    in[0] = _mm_srai_epi32(in[0], bit);
-
-    // stage 5
-    // stage 6
-    // stage 7
-    if (do_cols) {
-      in[0] = _mm_max_epi32(in[0], clamp_lo);
-      in[0] = _mm_min_epi32(in[0], clamp_hi);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+  // stage 5
+  // stage 6
+  // stage 7
+  if (!do_cols) {
+    log_range = AOMMAX(16, bd + 6);
+    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+    if (out_shift != 0) {
       __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
       in[0] = _mm_add_epi32(in[0], offset);
       in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
-      in[0] = _mm_max_epi32(in[0], clamp_lo_out);
-      in[0] = _mm_min_epi32(in[0], clamp_hi_out);
     }
-
-    out[0] = in[0];
-    out[1] = in[0];
-    out[2] = in[0];
-    out[3] = in[0];
-    out[4] = in[0];
-    out[5] = in[0];
-    out[6] = in[0];
-    out[7] = in[0];
-    out[8] = in[0];
-    out[9] = in[0];
-    out[10] = in[0];
-    out[11] = in[0];
-    out[12] = in[0];
-    out[13] = in[0];
-    out[14] = in[0];
-    out[15] = in[0];
   }
+
+  in[0] = _mm_max_epi32(in[0], clamp_lo);
+  in[0] = _mm_min_epi32(in[0], clamp_hi);
+  out[0] = in[0];
+  out[1] = in[0];
+  out[2] = in[0];
+  out[3] = in[0];
+  out[4] = in[0];
+  out[5] = in[0];
+  out[6] = in[0];
+  out[7] = in[0];
+  out[8] = in[0];
+  out[9] = in[0];
+  out[10] = in[0];
+  out[11] = in[0];
+  out[12] = in[0];
+  out[13] = in[0];
+  out[14] = in[0];
+  out[15] = in[0];
 }
 
 static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
@@ -1944,140 +1925,120 @@
   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
   __m128i u[16], x, y;
+  // stage 0
+  // stage 1
+  u[0] = in[0];
+  u[2] = in[4];
+  u[4] = in[2];
+  u[6] = in[6];
+  u[8] = in[1];
+  u[10] = in[5];
+  u[12] = in[3];
+  u[14] = in[7];
 
-  {
-    // stage 0
-    // stage 1
-    u[0] = in[0];
-    u[2] = in[4];
-    u[4] = in[2];
-    u[6] = in[6];
-    u[8] = in[1];
-    u[10] = in[5];
-    u[12] = in[3];
-    u[14] = in[7];
+  // stage 2
+  u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+  u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
 
-    // stage 2
-    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
-    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+  u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+  u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
 
-    u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
-    u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+  u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+  u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
 
-    u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
-    u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+  u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+  u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
 
-    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
-    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+  // stage 3
+  u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+  u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+  u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+  u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
 
-    // stage 3
-    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
-    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
-    u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
-    u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+  addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 
-    addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+  // stage 4
+  x = _mm_mullo_epi32(u[0], cospi32);
+  u[0] = _mm_add_epi32(x, rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
+  u[1] = u[0];
 
-    // stage 4
-    x = _mm_mullo_epi32(u[0], cospi32);
-    u[0] = _mm_add_epi32(x, rnding);
-    u[0] = _mm_srai_epi32(u[0], bit);
-    u[1] = u[0];
+  u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+  u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
 
-    u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
-    u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+  addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
 
-    addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+  x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+  u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+  u[9] = x;
+  y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+  u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+  u[10] = y;
 
-    x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
-    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
-    u[9] = x;
-    y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
-    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
-    u[10] = y;
+  // stage 5
+  addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 
-    // stage 5
-    addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+  x = _mm_mullo_epi32(u[5], cospi32);
+  y = _mm_mullo_epi32(u[6], cospi32);
+  u[5] = _mm_sub_epi32(y, x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    x = _mm_mullo_epi32(u[5], cospi32);
-    y = _mm_mullo_epi32(u[6], cospi32);
-    u[5] = _mm_sub_epi32(y, x);
-    u[5] = _mm_add_epi32(u[5], rnding);
-    u[5] = _mm_srai_epi32(u[5], bit);
+  u[6] = _mm_add_epi32(y, x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    u[6] = _mm_add_epi32(y, x);
-    u[6] = _mm_add_epi32(u[6], rnding);
-    u[6] = _mm_srai_epi32(u[6], bit);
+  addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 
-    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+  // stage 6
+  addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
 
-    // stage 6
-    addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+  x = _mm_mullo_epi32(u[10], cospi32);
+  y = _mm_mullo_epi32(u[13], cospi32);
+  u[10] = _mm_sub_epi32(y, x);
+  u[10] = _mm_add_epi32(u[10], rnding);
+  u[10] = _mm_srai_epi32(u[10], bit);
 
-    x = _mm_mullo_epi32(u[10], cospi32);
-    y = _mm_mullo_epi32(u[13], cospi32);
-    u[10] = _mm_sub_epi32(y, x);
-    u[10] = _mm_add_epi32(u[10], rnding);
-    u[10] = _mm_srai_epi32(u[10], bit);
+  u[13] = _mm_add_epi32(x, y);
+  u[13] = _mm_add_epi32(u[13], rnding);
+  u[13] = _mm_srai_epi32(u[13], bit);
 
-    u[13] = _mm_add_epi32(x, y);
-    u[13] = _mm_add_epi32(u[13], rnding);
-    u[13] = _mm_srai_epi32(u[13], bit);
+  x = _mm_mullo_epi32(u[11], cospi32);
+  y = _mm_mullo_epi32(u[12], cospi32);
+  u[11] = _mm_sub_epi32(y, x);
+  u[11] = _mm_add_epi32(u[11], rnding);
+  u[11] = _mm_srai_epi32(u[11], bit);
 
-    x = _mm_mullo_epi32(u[11], cospi32);
-    y = _mm_mullo_epi32(u[12], cospi32);
-    u[11] = _mm_sub_epi32(y, x);
-    u[11] = _mm_add_epi32(u[11], rnding);
-    u[11] = _mm_srai_epi32(u[11], bit);
+  u[12] = _mm_add_epi32(x, y);
+  u[12] = _mm_add_epi32(u[12], rnding);
+  u[12] = _mm_srai_epi32(u[12], bit);
+  // stage 7
+  addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 
-    u[12] = _mm_add_epi32(x, y);
-    u[12] = _mm_add_epi32(u[12], rnding);
-    u[12] = _mm_srai_epi32(u[12], bit);
-    // stage 7
-    if (do_cols) {
-      addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
-      addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
-      addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
-      addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
-      addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
-      addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
-      addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
-      addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
-      addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-    }
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift);
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
   }
 }
 
@@ -2094,167 +2055,162 @@
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
   const __m128i zero = _mm_setzero_si128();
   __m128i v[16], x, y, temp1, temp2;
+  // stage 0
+  // stage 1
+  // stage 2
+  x = _mm_mullo_epi32(in[0], cospi62);
+  v[0] = _mm_add_epi32(x, rnding);
+  v[0] = _mm_srai_epi32(v[0], bit);
 
-  // Calculate the column 0, 1, 2, 3
-  {
-    // stage 0
-    // stage 1
-    // stage 2
-    x = _mm_mullo_epi32(in[0], cospi62);
-    v[0] = _mm_add_epi32(x, rnding);
-    v[0] = _mm_srai_epi32(v[0], bit);
+  x = _mm_mullo_epi32(in[0], cospi2);
+  v[1] = _mm_sub_epi32(zero, x);
+  v[1] = _mm_add_epi32(v[1], rnding);
+  v[1] = _mm_srai_epi32(v[1], bit);
 
-    x = _mm_mullo_epi32(in[0], cospi2);
-    v[1] = _mm_sub_epi32(zero, x);
-    v[1] = _mm_add_epi32(v[1], rnding);
-    v[1] = _mm_srai_epi32(v[1], bit);
+  // stage 3
+  v[8] = v[0];
+  v[9] = v[1];
 
-    // stage 3
-    v[8] = v[0];
-    v[9] = v[1];
+  // stage 4
+  temp1 = _mm_mullo_epi32(v[8], cospi8);
+  x = _mm_mullo_epi32(v[9], cospi56);
+  temp1 = _mm_add_epi32(temp1, x);
+  temp1 = _mm_add_epi32(temp1, rnding);
+  temp1 = _mm_srai_epi32(temp1, bit);
 
-    // stage 4
-    temp1 = _mm_mullo_epi32(v[8], cospi8);
-    x = _mm_mullo_epi32(v[9], cospi56);
-    temp1 = _mm_add_epi32(temp1, x);
-    temp1 = _mm_add_epi32(temp1, rnding);
-    temp1 = _mm_srai_epi32(temp1, bit);
+  temp2 = _mm_mullo_epi32(v[8], cospi56);
+  x = _mm_mullo_epi32(v[9], cospi8);
+  temp2 = _mm_sub_epi32(temp2, x);
+  temp2 = _mm_add_epi32(temp2, rnding);
+  temp2 = _mm_srai_epi32(temp2, bit);
+  v[8] = temp1;
+  v[9] = temp2;
 
-    temp2 = _mm_mullo_epi32(v[8], cospi56);
-    x = _mm_mullo_epi32(v[9], cospi8);
-    temp2 = _mm_sub_epi32(temp2, x);
-    temp2 = _mm_add_epi32(temp2, rnding);
-    temp2 = _mm_srai_epi32(temp2, bit);
-    v[8] = temp1;
-    v[9] = temp2;
+  // stage 5
+  v[4] = v[0];
+  v[5] = v[1];
+  v[12] = v[8];
+  v[13] = v[9];
 
-    // stage 5
-    v[4] = v[0];
-    v[5] = v[1];
-    v[12] = v[8];
-    v[13] = v[9];
+  // stage 6
+  temp1 = _mm_mullo_epi32(v[4], cospi16);
+  x = _mm_mullo_epi32(v[5], cospi48);
+  temp1 = _mm_add_epi32(temp1, x);
+  temp1 = _mm_add_epi32(temp1, rnding);
+  temp1 = _mm_srai_epi32(temp1, bit);
 
-    // stage 6
-    temp1 = _mm_mullo_epi32(v[4], cospi16);
-    x = _mm_mullo_epi32(v[5], cospi48);
-    temp1 = _mm_add_epi32(temp1, x);
-    temp1 = _mm_add_epi32(temp1, rnding);
-    temp1 = _mm_srai_epi32(temp1, bit);
+  temp2 = _mm_mullo_epi32(v[4], cospi48);
+  x = _mm_mullo_epi32(v[5], cospi16);
+  temp2 = _mm_sub_epi32(temp2, x);
+  temp2 = _mm_add_epi32(temp2, rnding);
+  temp2 = _mm_srai_epi32(temp2, bit);
+  v[4] = temp1;
+  v[5] = temp2;
 
-    temp2 = _mm_mullo_epi32(v[4], cospi48);
-    x = _mm_mullo_epi32(v[5], cospi16);
-    temp2 = _mm_sub_epi32(temp2, x);
-    temp2 = _mm_add_epi32(temp2, rnding);
-    temp2 = _mm_srai_epi32(temp2, bit);
-    v[4] = temp1;
-    v[5] = temp2;
+  temp1 = _mm_mullo_epi32(v[12], cospi16);
+  x = _mm_mullo_epi32(v[13], cospi48);
+  temp1 = _mm_add_epi32(temp1, x);
+  temp1 = _mm_add_epi32(temp1, rnding);
+  temp1 = _mm_srai_epi32(temp1, bit);
 
-    temp1 = _mm_mullo_epi32(v[12], cospi16);
-    x = _mm_mullo_epi32(v[13], cospi48);
-    temp1 = _mm_add_epi32(temp1, x);
-    temp1 = _mm_add_epi32(temp1, rnding);
-    temp1 = _mm_srai_epi32(temp1, bit);
+  temp2 = _mm_mullo_epi32(v[12], cospi48);
+  x = _mm_mullo_epi32(v[13], cospi16);
+  temp2 = _mm_sub_epi32(temp2, x);
+  temp2 = _mm_add_epi32(temp2, rnding);
+  temp2 = _mm_srai_epi32(temp2, bit);
+  v[12] = temp1;
+  v[13] = temp2;
 
-    temp2 = _mm_mullo_epi32(v[12], cospi48);
-    x = _mm_mullo_epi32(v[13], cospi16);
-    temp2 = _mm_sub_epi32(temp2, x);
-    temp2 = _mm_add_epi32(temp2, rnding);
-    temp2 = _mm_srai_epi32(temp2, bit);
-    v[12] = temp1;
-    v[13] = temp2;
+  // stage 7
+  v[2] = v[0];
+  v[3] = v[1];
+  v[6] = v[4];
+  v[7] = v[5];
+  v[10] = v[8];
+  v[11] = v[9];
+  v[14] = v[12];
+  v[15] = v[13];
 
-    // stage 7
-    v[2] = v[0];
-    v[3] = v[1];
-    v[6] = v[4];
-    v[7] = v[5];
-    v[10] = v[8];
-    v[11] = v[9];
-    v[14] = v[12];
-    v[15] = v[13];
+  // stage 8
+  y = _mm_mullo_epi32(v[2], cospi32);
+  x = _mm_mullo_epi32(v[3], cospi32);
+  v[2] = _mm_add_epi32(y, x);
+  v[2] = _mm_add_epi32(v[2], rnding);
+  v[2] = _mm_srai_epi32(v[2], bit);
 
-    // stage 8
-    y = _mm_mullo_epi32(v[2], cospi32);
-    x = _mm_mullo_epi32(v[3], cospi32);
-    v[2] = _mm_add_epi32(y, x);
-    v[2] = _mm_add_epi32(v[2], rnding);
-    v[2] = _mm_srai_epi32(v[2], bit);
+  v[3] = _mm_sub_epi32(y, x);
+  v[3] = _mm_add_epi32(v[3], rnding);
+  v[3] = _mm_srai_epi32(v[3], bit);
 
-    v[3] = _mm_sub_epi32(y, x);
-    v[3] = _mm_add_epi32(v[3], rnding);
-    v[3] = _mm_srai_epi32(v[3], bit);
+  y = _mm_mullo_epi32(v[6], cospi32);
+  x = _mm_mullo_epi32(v[7], cospi32);
+  v[6] = _mm_add_epi32(y, x);
+  v[6] = _mm_add_epi32(v[6], rnding);
+  v[6] = _mm_srai_epi32(v[6], bit);
 
-    y = _mm_mullo_epi32(v[6], cospi32);
-    x = _mm_mullo_epi32(v[7], cospi32);
-    v[6] = _mm_add_epi32(y, x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
+  v[7] = _mm_sub_epi32(y, x);
+  v[7] = _mm_add_epi32(v[7], rnding);
+  v[7] = _mm_srai_epi32(v[7], bit);
 
-    v[7] = _mm_sub_epi32(y, x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
+  y = _mm_mullo_epi32(v[10], cospi32);
+  x = _mm_mullo_epi32(v[11], cospi32);
+  v[10] = _mm_add_epi32(y, x);
+  v[10] = _mm_add_epi32(v[10], rnding);
+  v[10] = _mm_srai_epi32(v[10], bit);
 
-    y = _mm_mullo_epi32(v[10], cospi32);
-    x = _mm_mullo_epi32(v[11], cospi32);
-    v[10] = _mm_add_epi32(y, x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  v[11] = _mm_sub_epi32(y, x);
+  v[11] = _mm_add_epi32(v[11], rnding);
+  v[11] = _mm_srai_epi32(v[11], bit);
 
-    v[11] = _mm_sub_epi32(y, x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  y = _mm_mullo_epi32(v[14], cospi32);
+  x = _mm_mullo_epi32(v[15], cospi32);
+  v[14] = _mm_add_epi32(y, x);
+  v[14] = _mm_add_epi32(v[14], rnding);
+  v[14] = _mm_srai_epi32(v[14], bit);
 
-    y = _mm_mullo_epi32(v[14], cospi32);
-    x = _mm_mullo_epi32(v[15], cospi32);
-    v[14] = _mm_add_epi32(y, x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  v[15] = _mm_sub_epi32(y, x);
+  v[15] = _mm_add_epi32(v[15], rnding);
+  v[15] = _mm_srai_epi32(v[15], bit);
 
-    v[15] = _mm_sub_epi32(y, x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+  // stage 9
+  if (do_cols) {
+    out[0] = v[0];
+    out[1] = _mm_sub_epi32(zero, v[8]);
+    out[2] = v[12];
+    out[3] = _mm_sub_epi32(zero, v[4]);
+    out[4] = v[6];
+    out[5] = _mm_sub_epi32(zero, v[14]);
+    out[6] = v[10];
+    out[7] = _mm_sub_epi32(zero, v[2]);
+    out[8] = v[3];
+    out[9] = _mm_sub_epi32(zero, v[11]);
+    out[10] = v[15];
+    out[11] = _mm_sub_epi32(zero, v[7]);
+    out[12] = v[5];
+    out[13] = _mm_sub_epi32(zero, v[13]);
+    out[14] = v[9];
+    out[15] = _mm_sub_epi32(zero, v[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 
-    // stage 9
-    if (do_cols) {
-      out[0] = v[0];
-      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
-      out[2] = v[12];
-      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
-      out[4] = v[6];
-      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
-      out[6] = v[10];
-      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
-      out[8] = v[3];
-      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
-      out[10] = v[15];
-      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
-      out[12] = v[5];
-      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
-      out[14] = v[9];
-      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
-      const __m128i clamp_hi_out =
-          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
-
-      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-    }
+    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
   }
 }
 
@@ -2291,291 +2247,287 @@
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i zero = _mm_setzero_si128();
   __m128i u[16], x, y;
 
-  // Calculate the column 0, 1, 2, 3
-  {
-    // stage 0
-    // stage 1
-    // stage 2
-    __m128i zero = _mm_setzero_si128();
-    x = _mm_mullo_epi32(in[0], cospi62);
-    u[0] = _mm_add_epi32(x, rnding);
-    u[0] = _mm_srai_epi32(u[0], bit);
+  // stage 0
+  // stage 1
+  // stage 2
+  x = _mm_mullo_epi32(in[0], cospi62);
+  u[0] = _mm_add_epi32(x, rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
 
-    x = _mm_mullo_epi32(in[0], cospi2);
-    u[1] = _mm_sub_epi32(zero, x);
-    u[1] = _mm_add_epi32(u[1], rnding);
-    u[1] = _mm_srai_epi32(u[1], bit);
+  x = _mm_mullo_epi32(in[0], cospi2);
+  u[1] = _mm_sub_epi32(zero, x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
 
-    x = _mm_mullo_epi32(in[2], cospi54);
-    u[2] = _mm_add_epi32(x, rnding);
-    u[2] = _mm_srai_epi32(u[2], bit);
+  x = _mm_mullo_epi32(in[2], cospi54);
+  u[2] = _mm_add_epi32(x, rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
 
-    x = _mm_mullo_epi32(in[2], cospi10);
-    u[3] = _mm_sub_epi32(zero, x);
-    u[3] = _mm_add_epi32(u[3], rnding);
-    u[3] = _mm_srai_epi32(u[3], bit);
+  x = _mm_mullo_epi32(in[2], cospi10);
+  u[3] = _mm_sub_epi32(zero, x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
 
-    x = _mm_mullo_epi32(in[4], cospi46);
-    u[4] = _mm_add_epi32(x, rnding);
-    u[4] = _mm_srai_epi32(u[4], bit);
+  x = _mm_mullo_epi32(in[4], cospi46);
+  u[4] = _mm_add_epi32(x, rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
 
-    x = _mm_mullo_epi32(in[4], cospi18);
-    u[5] = _mm_sub_epi32(zero, x);
-    u[5] = _mm_add_epi32(u[5], rnding);
-    u[5] = _mm_srai_epi32(u[5], bit);
+  x = _mm_mullo_epi32(in[4], cospi18);
+  u[5] = _mm_sub_epi32(zero, x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    x = _mm_mullo_epi32(in[6], cospi38);
-    u[6] = _mm_add_epi32(x, rnding);
-    u[6] = _mm_srai_epi32(u[6], bit);
+  x = _mm_mullo_epi32(in[6], cospi38);
+  u[6] = _mm_add_epi32(x, rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    x = _mm_mullo_epi32(in[6], cospi26);
-    u[7] = _mm_sub_epi32(zero, x);
-    u[7] = _mm_add_epi32(u[7], rnding);
-    u[7] = _mm_srai_epi32(u[7], bit);
+  x = _mm_mullo_epi32(in[6], cospi26);
+  u[7] = _mm_sub_epi32(zero, x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    u[8] = _mm_mullo_epi32(in[7], cospi34);
-    u[8] = _mm_add_epi32(u[8], rnding);
-    u[8] = _mm_srai_epi32(u[8], bit);
+  u[8] = _mm_mullo_epi32(in[7], cospi34);
+  u[8] = _mm_add_epi32(u[8], rnding);
+  u[8] = _mm_srai_epi32(u[8], bit);
 
-    u[9] = _mm_mullo_epi32(in[7], cospi30);
-    u[9] = _mm_add_epi32(u[9], rnding);
-    u[9] = _mm_srai_epi32(u[9], bit);
+  u[9] = _mm_mullo_epi32(in[7], cospi30);
+  u[9] = _mm_add_epi32(u[9], rnding);
+  u[9] = _mm_srai_epi32(u[9], bit);
 
-    u[10] = _mm_mullo_epi32(in[5], cospi42);
-    u[10] = _mm_add_epi32(u[10], rnding);
-    u[10] = _mm_srai_epi32(u[10], bit);
+  u[10] = _mm_mullo_epi32(in[5], cospi42);
+  u[10] = _mm_add_epi32(u[10], rnding);
+  u[10] = _mm_srai_epi32(u[10], bit);
 
-    u[11] = _mm_mullo_epi32(in[5], cospi22);
-    u[11] = _mm_add_epi32(u[11], rnding);
-    u[11] = _mm_srai_epi32(u[11], bit);
+  u[11] = _mm_mullo_epi32(in[5], cospi22);
+  u[11] = _mm_add_epi32(u[11], rnding);
+  u[11] = _mm_srai_epi32(u[11], bit);
 
-    u[12] = _mm_mullo_epi32(in[3], cospi50);
-    u[12] = _mm_add_epi32(u[12], rnding);
-    u[12] = _mm_srai_epi32(u[12], bit);
+  u[12] = _mm_mullo_epi32(in[3], cospi50);
+  u[12] = _mm_add_epi32(u[12], rnding);
+  u[12] = _mm_srai_epi32(u[12], bit);
 
-    u[13] = _mm_mullo_epi32(in[3], cospi14);
-    u[13] = _mm_add_epi32(u[13], rnding);
-    u[13] = _mm_srai_epi32(u[13], bit);
+  u[13] = _mm_mullo_epi32(in[3], cospi14);
+  u[13] = _mm_add_epi32(u[13], rnding);
+  u[13] = _mm_srai_epi32(u[13], bit);
 
-    u[14] = _mm_mullo_epi32(in[1], cospi58);
-    u[14] = _mm_add_epi32(u[14], rnding);
-    u[14] = _mm_srai_epi32(u[14], bit);
+  u[14] = _mm_mullo_epi32(in[1], cospi58);
+  u[14] = _mm_add_epi32(u[14], rnding);
+  u[14] = _mm_srai_epi32(u[14], bit);
 
-    u[15] = _mm_mullo_epi32(in[1], cospi6);
-    u[15] = _mm_add_epi32(u[15], rnding);
-    u[15] = _mm_srai_epi32(u[15], bit);
+  u[15] = _mm_mullo_epi32(in[1], cospi6);
+  u[15] = _mm_add_epi32(u[15], rnding);
+  u[15] = _mm_srai_epi32(u[15], bit);
 
-    // stage 3
-    addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+  // stage 3
+  addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 
-    // stage 4
-    y = _mm_mullo_epi32(u[8], cospi56);
-    x = _mm_mullo_epi32(u[9], cospi56);
-    u[8] = _mm_mullo_epi32(u[8], cospi8);
-    u[8] = _mm_add_epi32(u[8], x);
-    u[8] = _mm_add_epi32(u[8], rnding);
-    u[8] = _mm_srai_epi32(u[8], bit);
+  // stage 4
+  y = _mm_mullo_epi32(u[8], cospi56);
+  x = _mm_mullo_epi32(u[9], cospi56);
+  u[8] = _mm_mullo_epi32(u[8], cospi8);
+  u[8] = _mm_add_epi32(u[8], x);
+  u[8] = _mm_add_epi32(u[8], rnding);
+  u[8] = _mm_srai_epi32(u[8], bit);
 
-    x = _mm_mullo_epi32(u[9], cospi8);
-    u[9] = _mm_sub_epi32(y, x);
-    u[9] = _mm_add_epi32(u[9], rnding);
-    u[9] = _mm_srai_epi32(u[9], bit);
+  x = _mm_mullo_epi32(u[9], cospi8);
+  u[9] = _mm_sub_epi32(y, x);
+  u[9] = _mm_add_epi32(u[9], rnding);
+  u[9] = _mm_srai_epi32(u[9], bit);
 
-    x = _mm_mullo_epi32(u[11], cospi24);
-    y = _mm_mullo_epi32(u[10], cospi24);
-    u[10] = _mm_mullo_epi32(u[10], cospi40);
-    u[10] = _mm_add_epi32(u[10], x);
-    u[10] = _mm_add_epi32(u[10], rnding);
-    u[10] = _mm_srai_epi32(u[10], bit);
+  x = _mm_mullo_epi32(u[11], cospi24);
+  y = _mm_mullo_epi32(u[10], cospi24);
+  u[10] = _mm_mullo_epi32(u[10], cospi40);
+  u[10] = _mm_add_epi32(u[10], x);
+  u[10] = _mm_add_epi32(u[10], rnding);
+  u[10] = _mm_srai_epi32(u[10], bit);
 
-    x = _mm_mullo_epi32(u[11], cospi40);
-    u[11] = _mm_sub_epi32(y, x);
-    u[11] = _mm_add_epi32(u[11], rnding);
-    u[11] = _mm_srai_epi32(u[11], bit);
+  x = _mm_mullo_epi32(u[11], cospi40);
+  u[11] = _mm_sub_epi32(y, x);
+  u[11] = _mm_add_epi32(u[11], rnding);
+  u[11] = _mm_srai_epi32(u[11], bit);
 
-    x = _mm_mullo_epi32(u[13], cospi8);
-    y = _mm_mullo_epi32(u[12], cospi8);
-    u[12] = _mm_mullo_epi32(u[12], cospim56);
-    u[12] = _mm_add_epi32(u[12], x);
-    u[12] = _mm_add_epi32(u[12], rnding);
-    u[12] = _mm_srai_epi32(u[12], bit);
+  x = _mm_mullo_epi32(u[13], cospi8);
+  y = _mm_mullo_epi32(u[12], cospi8);
+  u[12] = _mm_mullo_epi32(u[12], cospim56);
+  u[12] = _mm_add_epi32(u[12], x);
+  u[12] = _mm_add_epi32(u[12], rnding);
+  u[12] = _mm_srai_epi32(u[12], bit);
 
-    x = _mm_mullo_epi32(u[13], cospim56);
-    u[13] = _mm_sub_epi32(y, x);
-    u[13] = _mm_add_epi32(u[13], rnding);
-    u[13] = _mm_srai_epi32(u[13], bit);
+  x = _mm_mullo_epi32(u[13], cospim56);
+  u[13] = _mm_sub_epi32(y, x);
+  u[13] = _mm_add_epi32(u[13], rnding);
+  u[13] = _mm_srai_epi32(u[13], bit);
 
-    x = _mm_mullo_epi32(u[15], cospi40);
-    y = _mm_mullo_epi32(u[14], cospi40);
-    u[14] = _mm_mullo_epi32(u[14], cospim24);
-    u[14] = _mm_add_epi32(u[14], x);
-    u[14] = _mm_add_epi32(u[14], rnding);
-    u[14] = _mm_srai_epi32(u[14], bit);
+  x = _mm_mullo_epi32(u[15], cospi40);
+  y = _mm_mullo_epi32(u[14], cospi40);
+  u[14] = _mm_mullo_epi32(u[14], cospim24);
+  u[14] = _mm_add_epi32(u[14], x);
+  u[14] = _mm_add_epi32(u[14], rnding);
+  u[14] = _mm_srai_epi32(u[14], bit);
 
-    x = _mm_mullo_epi32(u[15], cospim24);
-    u[15] = _mm_sub_epi32(y, x);
-    u[15] = _mm_add_epi32(u[15], rnding);
-    u[15] = _mm_srai_epi32(u[15], bit);
+  x = _mm_mullo_epi32(u[15], cospim24);
+  u[15] = _mm_sub_epi32(y, x);
+  u[15] = _mm_add_epi32(u[15], rnding);
+  u[15] = _mm_srai_epi32(u[15], bit);
 
-    // stage 5
-    addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+  // stage 5
+  addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 
-    // stage 6
-    x = _mm_mullo_epi32(u[5], cospi48);
-    y = _mm_mullo_epi32(u[4], cospi48);
-    u[4] = _mm_mullo_epi32(u[4], cospi16);
-    u[4] = _mm_add_epi32(u[4], x);
-    u[4] = _mm_add_epi32(u[4], rnding);
-    u[4] = _mm_srai_epi32(u[4], bit);
+  // stage 6
+  x = _mm_mullo_epi32(u[5], cospi48);
+  y = _mm_mullo_epi32(u[4], cospi48);
+  u[4] = _mm_mullo_epi32(u[4], cospi16);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
 
-    x = _mm_mullo_epi32(u[5], cospi16);
-    u[5] = _mm_sub_epi32(y, x);
-    u[5] = _mm_add_epi32(u[5], rnding);
-    u[5] = _mm_srai_epi32(u[5], bit);
+  x = _mm_mullo_epi32(u[5], cospi16);
+  u[5] = _mm_sub_epi32(y, x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    x = _mm_mullo_epi32(u[7], cospi16);
-    y = _mm_mullo_epi32(u[6], cospi16);
-    u[6] = _mm_mullo_epi32(u[6], cospim48);
-    u[6] = _mm_add_epi32(u[6], x);
-    u[6] = _mm_add_epi32(u[6], rnding);
-    u[6] = _mm_srai_epi32(u[6], bit);
+  x = _mm_mullo_epi32(u[7], cospi16);
+  y = _mm_mullo_epi32(u[6], cospi16);
+  u[6] = _mm_mullo_epi32(u[6], cospim48);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    x = _mm_mullo_epi32(u[7], cospim48);
-    u[7] = _mm_sub_epi32(y, x);
-    u[7] = _mm_add_epi32(u[7], rnding);
-    u[7] = _mm_srai_epi32(u[7], bit);
+  x = _mm_mullo_epi32(u[7], cospim48);
+  u[7] = _mm_sub_epi32(y, x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    x = _mm_mullo_epi32(u[13], cospi48);
-    y = _mm_mullo_epi32(u[12], cospi48);
-    u[12] = _mm_mullo_epi32(u[12], cospi16);
-    u[12] = _mm_add_epi32(u[12], x);
-    u[12] = _mm_add_epi32(u[12], rnding);
-    u[12] = _mm_srai_epi32(u[12], bit);
+  x = _mm_mullo_epi32(u[13], cospi48);
+  y = _mm_mullo_epi32(u[12], cospi48);
+  u[12] = _mm_mullo_epi32(u[12], cospi16);
+  u[12] = _mm_add_epi32(u[12], x);
+  u[12] = _mm_add_epi32(u[12], rnding);
+  u[12] = _mm_srai_epi32(u[12], bit);
 
-    x = _mm_mullo_epi32(u[13], cospi16);
-    u[13] = _mm_sub_epi32(y, x);
-    u[13] = _mm_add_epi32(u[13], rnding);
-    u[13] = _mm_srai_epi32(u[13], bit);
+  x = _mm_mullo_epi32(u[13], cospi16);
+  u[13] = _mm_sub_epi32(y, x);
+  u[13] = _mm_add_epi32(u[13], rnding);
+  u[13] = _mm_srai_epi32(u[13], bit);
 
-    x = _mm_mullo_epi32(u[15], cospi16);
-    y = _mm_mullo_epi32(u[14], cospi16);
-    u[14] = _mm_mullo_epi32(u[14], cospim48);
-    u[14] = _mm_add_epi32(u[14], x);
-    u[14] = _mm_add_epi32(u[14], rnding);
-    u[14] = _mm_srai_epi32(u[14], bit);
+  x = _mm_mullo_epi32(u[15], cospi16);
+  y = _mm_mullo_epi32(u[14], cospi16);
+  u[14] = _mm_mullo_epi32(u[14], cospim48);
+  u[14] = _mm_add_epi32(u[14], x);
+  u[14] = _mm_add_epi32(u[14], rnding);
+  u[14] = _mm_srai_epi32(u[14], bit);
 
-    x = _mm_mullo_epi32(u[15], cospim48);
-    u[15] = _mm_sub_epi32(y, x);
-    u[15] = _mm_add_epi32(u[15], rnding);
-    u[15] = _mm_srai_epi32(u[15], bit);
+  x = _mm_mullo_epi32(u[15], cospim48);
+  u[15] = _mm_sub_epi32(y, x);
+  u[15] = _mm_add_epi32(u[15], rnding);
+  u[15] = _mm_srai_epi32(u[15], bit);
 
-    // stage 7
-    addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+  // stage 7
+  addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 
-    // stage 8
-    y = _mm_mullo_epi32(u[2], cospi32);
-    x = _mm_mullo_epi32(u[3], cospi32);
-    u[2] = _mm_add_epi32(y, x);
-    u[2] = _mm_add_epi32(u[2], rnding);
-    u[2] = _mm_srai_epi32(u[2], bit);
+  // stage 8
+  y = _mm_mullo_epi32(u[2], cospi32);
+  x = _mm_mullo_epi32(u[3], cospi32);
+  u[2] = _mm_add_epi32(y, x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
 
-    u[3] = _mm_sub_epi32(y, x);
-    u[3] = _mm_add_epi32(u[3], rnding);
-    u[3] = _mm_srai_epi32(u[3], bit);
-    y = _mm_mullo_epi32(u[6], cospi32);
-    x = _mm_mullo_epi32(u[7], cospi32);
-    u[6] = _mm_add_epi32(y, x);
-    u[6] = _mm_add_epi32(u[6], rnding);
-    u[6] = _mm_srai_epi32(u[6], bit);
+  u[3] = _mm_sub_epi32(y, x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+  y = _mm_mullo_epi32(u[6], cospi32);
+  x = _mm_mullo_epi32(u[7], cospi32);
+  u[6] = _mm_add_epi32(y, x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    u[7] = _mm_sub_epi32(y, x);
-    u[7] = _mm_add_epi32(u[7], rnding);
-    u[7] = _mm_srai_epi32(u[7], bit);
+  u[7] = _mm_sub_epi32(y, x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    y = _mm_mullo_epi32(u[10], cospi32);
-    x = _mm_mullo_epi32(u[11], cospi32);
-    u[10] = _mm_add_epi32(y, x);
-    u[10] = _mm_add_epi32(u[10], rnding);
-    u[10] = _mm_srai_epi32(u[10], bit);
+  y = _mm_mullo_epi32(u[10], cospi32);
+  x = _mm_mullo_epi32(u[11], cospi32);
+  u[10] = _mm_add_epi32(y, x);
+  u[10] = _mm_add_epi32(u[10], rnding);
+  u[10] = _mm_srai_epi32(u[10], bit);
 
-    u[11] = _mm_sub_epi32(y, x);
-    u[11] = _mm_add_epi32(u[11], rnding);
-    u[11] = _mm_srai_epi32(u[11], bit);
+  u[11] = _mm_sub_epi32(y, x);
+  u[11] = _mm_add_epi32(u[11], rnding);
+  u[11] = _mm_srai_epi32(u[11], bit);
 
-    y = _mm_mullo_epi32(u[14], cospi32);
-    x = _mm_mullo_epi32(u[15], cospi32);
-    u[14] = _mm_add_epi32(y, x);
-    u[14] = _mm_add_epi32(u[14], rnding);
-    u[14] = _mm_srai_epi32(u[14], bit);
+  y = _mm_mullo_epi32(u[14], cospi32);
+  x = _mm_mullo_epi32(u[15], cospi32);
+  u[14] = _mm_add_epi32(y, x);
+  u[14] = _mm_add_epi32(u[14], rnding);
+  u[14] = _mm_srai_epi32(u[14], bit);
 
-    u[15] = _mm_sub_epi32(y, x);
-    u[15] = _mm_add_epi32(u[15], rnding);
-    u[15] = _mm_srai_epi32(u[15], bit);
+  u[15] = _mm_sub_epi32(y, x);
+  u[15] = _mm_add_epi32(u[15], rnding);
+  u[15] = _mm_srai_epi32(u[15], bit);
 
-    // stage 9
-    if (do_cols) {
-      out[0] = u[0];
-      out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
-      out[2] = u[12];
-      out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
-      out[4] = u[6];
-      out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
-      out[6] = u[10];
-      out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
-      out[8] = u[3];
-      out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
-      out[10] = u[15];
-      out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
-      out[12] = u[5];
-      out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
-      out[14] = u[9];
-      out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
-      const __m128i clamp_hi_out =
-          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+  // stage 9
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = _mm_sub_epi32(zero, u[8]);
+    out[2] = u[12];
+    out[3] = _mm_sub_epi32(zero, u[4]);
+    out[4] = u[6];
+    out[5] = _mm_sub_epi32(zero, u[14]);
+    out[6] = u[10];
+    out[7] = _mm_sub_epi32(zero, u[2]);
+    out[8] = u[3];
+    out[9] = _mm_sub_epi32(zero, u[11]);
+    out[10] = u[15];
+    out[11] = _mm_sub_epi32(zero, u[7]);
+    out[12] = u[5];
+    out[13] = _mm_sub_epi32(zero, u[13]);
+    out[14] = u[9];
+    out[15] = _mm_sub_epi32(zero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 
-      neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-    }
+    neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
   }
 }
 
@@ -2741,38 +2693,22 @@
     v[15] = u[15];
 
     // stage 7
-    if (do_cols) {
-      addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
-      addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
-      addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
-      addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
-      addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
-      addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
-      addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
-      addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 
-      addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
-      addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
-                          &clamp_hi_out, out_shift);
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8(out, out_shift);
+      highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
     }
   }
 }
@@ -2810,403 +2746,379 @@
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  const __m128i zero = _mm_setzero_si128();
   __m128i u[16], v[16], x, y;
-
   // Calculate the column 0, 1, 2, 3
-  {
-    // stage 0
-    // stage 1
-    // stage 2
-    v[0] = _mm_mullo_epi32(in[15], cospi2);
-    x = _mm_mullo_epi32(in[0], cospi62);
-    v[0] = _mm_add_epi32(v[0], x);
-    v[0] = _mm_add_epi32(v[0], rnding);
-    v[0] = _mm_srai_epi32(v[0], bit);
+  // stage 0
+  // stage 1
+  // stage 2
+  v[0] = _mm_mullo_epi32(in[15], cospi2);
+  x = _mm_mullo_epi32(in[0], cospi62);
+  v[0] = _mm_add_epi32(v[0], x);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  v[0] = _mm_srai_epi32(v[0], bit);
 
-    v[1] = _mm_mullo_epi32(in[15], cospi62);
-    x = _mm_mullo_epi32(in[0], cospi2);
-    v[1] = _mm_sub_epi32(v[1], x);
-    v[1] = _mm_add_epi32(v[1], rnding);
-    v[1] = _mm_srai_epi32(v[1], bit);
+  v[1] = _mm_mullo_epi32(in[15], cospi62);
+  x = _mm_mullo_epi32(in[0], cospi2);
+  v[1] = _mm_sub_epi32(v[1], x);
+  v[1] = _mm_add_epi32(v[1], rnding);
+  v[1] = _mm_srai_epi32(v[1], bit);
 
-    v[2] = _mm_mullo_epi32(in[13], cospi10);
-    x = _mm_mullo_epi32(in[2], cospi54);
-    v[2] = _mm_add_epi32(v[2], x);
-    v[2] = _mm_add_epi32(v[2], rnding);
-    v[2] = _mm_srai_epi32(v[2], bit);
+  v[2] = _mm_mullo_epi32(in[13], cospi10);
+  x = _mm_mullo_epi32(in[2], cospi54);
+  v[2] = _mm_add_epi32(v[2], x);
+  v[2] = _mm_add_epi32(v[2], rnding);
+  v[2] = _mm_srai_epi32(v[2], bit);
 
-    v[3] = _mm_mullo_epi32(in[13], cospi54);
-    x = _mm_mullo_epi32(in[2], cospi10);
-    v[3] = _mm_sub_epi32(v[3], x);
-    v[3] = _mm_add_epi32(v[3], rnding);
-    v[3] = _mm_srai_epi32(v[3], bit);
+  v[3] = _mm_mullo_epi32(in[13], cospi54);
+  x = _mm_mullo_epi32(in[2], cospi10);
+  v[3] = _mm_sub_epi32(v[3], x);
+  v[3] = _mm_add_epi32(v[3], rnding);
+  v[3] = _mm_srai_epi32(v[3], bit);
 
-    v[4] = _mm_mullo_epi32(in[11], cospi18);
-    x = _mm_mullo_epi32(in[4], cospi46);
-    v[4] = _mm_add_epi32(v[4], x);
-    v[4] = _mm_add_epi32(v[4], rnding);
-    v[4] = _mm_srai_epi32(v[4], bit);
+  v[4] = _mm_mullo_epi32(in[11], cospi18);
+  x = _mm_mullo_epi32(in[4], cospi46);
+  v[4] = _mm_add_epi32(v[4], x);
+  v[4] = _mm_add_epi32(v[4], rnding);
+  v[4] = _mm_srai_epi32(v[4], bit);
 
-    v[5] = _mm_mullo_epi32(in[11], cospi46);
-    x = _mm_mullo_epi32(in[4], cospi18);
-    v[5] = _mm_sub_epi32(v[5], x);
-    v[5] = _mm_add_epi32(v[5], rnding);
-    v[5] = _mm_srai_epi32(v[5], bit);
+  v[5] = _mm_mullo_epi32(in[11], cospi46);
+  x = _mm_mullo_epi32(in[4], cospi18);
+  v[5] = _mm_sub_epi32(v[5], x);
+  v[5] = _mm_add_epi32(v[5], rnding);
+  v[5] = _mm_srai_epi32(v[5], bit);
 
-    v[6] = _mm_mullo_epi32(in[9], cospi26);
-    x = _mm_mullo_epi32(in[6], cospi38);
-    v[6] = _mm_add_epi32(v[6], x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
+  v[6] = _mm_mullo_epi32(in[9], cospi26);
+  x = _mm_mullo_epi32(in[6], cospi38);
+  v[6] = _mm_add_epi32(v[6], x);
+  v[6] = _mm_add_epi32(v[6], rnding);
+  v[6] = _mm_srai_epi32(v[6], bit);
 
-    v[7] = _mm_mullo_epi32(in[9], cospi38);
-    x = _mm_mullo_epi32(in[6], cospi26);
-    v[7] = _mm_sub_epi32(v[7], x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
+  v[7] = _mm_mullo_epi32(in[9], cospi38);
+  x = _mm_mullo_epi32(in[6], cospi26);
+  v[7] = _mm_sub_epi32(v[7], x);
+  v[7] = _mm_add_epi32(v[7], rnding);
+  v[7] = _mm_srai_epi32(v[7], bit);
 
-    v[8] = _mm_mullo_epi32(in[7], cospi34);
-    x = _mm_mullo_epi32(in[8], cospi30);
-    v[8] = _mm_add_epi32(v[8], x);
-    v[8] = _mm_add_epi32(v[8], rnding);
-    v[8] = _mm_srai_epi32(v[8], bit);
+  v[8] = _mm_mullo_epi32(in[7], cospi34);
+  x = _mm_mullo_epi32(in[8], cospi30);
+  v[8] = _mm_add_epi32(v[8], x);
+  v[8] = _mm_add_epi32(v[8], rnding);
+  v[8] = _mm_srai_epi32(v[8], bit);
 
-    v[9] = _mm_mullo_epi32(in[7], cospi30);
-    x = _mm_mullo_epi32(in[8], cospi34);
-    v[9] = _mm_sub_epi32(v[9], x);
-    v[9] = _mm_add_epi32(v[9], rnding);
-    v[9] = _mm_srai_epi32(v[9], bit);
+  v[9] = _mm_mullo_epi32(in[7], cospi30);
+  x = _mm_mullo_epi32(in[8], cospi34);
+  v[9] = _mm_sub_epi32(v[9], x);
+  v[9] = _mm_add_epi32(v[9], rnding);
+  v[9] = _mm_srai_epi32(v[9], bit);
 
-    v[10] = _mm_mullo_epi32(in[5], cospi42);
-    x = _mm_mullo_epi32(in[10], cospi22);
-    v[10] = _mm_add_epi32(v[10], x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  v[10] = _mm_mullo_epi32(in[5], cospi42);
+  x = _mm_mullo_epi32(in[10], cospi22);
+  v[10] = _mm_add_epi32(v[10], x);
+  v[10] = _mm_add_epi32(v[10], rnding);
+  v[10] = _mm_srai_epi32(v[10], bit);
 
-    v[11] = _mm_mullo_epi32(in[5], cospi22);
-    x = _mm_mullo_epi32(in[10], cospi42);
-    v[11] = _mm_sub_epi32(v[11], x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  v[11] = _mm_mullo_epi32(in[5], cospi22);
+  x = _mm_mullo_epi32(in[10], cospi42);
+  v[11] = _mm_sub_epi32(v[11], x);
+  v[11] = _mm_add_epi32(v[11], rnding);
+  v[11] = _mm_srai_epi32(v[11], bit);
 
-    v[12] = _mm_mullo_epi32(in[3], cospi50);
-    x = _mm_mullo_epi32(in[12], cospi14);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  v[12] = _mm_mullo_epi32(in[3], cospi50);
+  x = _mm_mullo_epi32(in[12], cospi14);
+  v[12] = _mm_add_epi32(v[12], x);
+  v[12] = _mm_add_epi32(v[12], rnding);
+  v[12] = _mm_srai_epi32(v[12], bit);
 
-    v[13] = _mm_mullo_epi32(in[3], cospi14);
-    x = _mm_mullo_epi32(in[12], cospi50);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+  v[13] = _mm_mullo_epi32(in[3], cospi14);
+  x = _mm_mullo_epi32(in[12], cospi50);
+  v[13] = _mm_sub_epi32(v[13], x);
+  v[13] = _mm_add_epi32(v[13], rnding);
+  v[13] = _mm_srai_epi32(v[13], bit);
 
-    v[14] = _mm_mullo_epi32(in[1], cospi58);
-    x = _mm_mullo_epi32(in[14], cospi6);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  v[14] = _mm_mullo_epi32(in[1], cospi58);
+  x = _mm_mullo_epi32(in[14], cospi6);
+  v[14] = _mm_add_epi32(v[14], x);
+  v[14] = _mm_add_epi32(v[14], rnding);
+  v[14] = _mm_srai_epi32(v[14], bit);
 
-    v[15] = _mm_mullo_epi32(in[1], cospi6);
-    x = _mm_mullo_epi32(in[14], cospi58);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+  v[15] = _mm_mullo_epi32(in[1], cospi6);
+  x = _mm_mullo_epi32(in[14], cospi58);
+  v[15] = _mm_sub_epi32(v[15], x);
+  v[15] = _mm_add_epi32(v[15], rnding);
+  v[15] = _mm_srai_epi32(v[15], bit);
 
-    // stage 3
-    addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+  // stage 3
+  addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 
-    // stage 4
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
+  // stage 4
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
+  v[4] = u[4];
+  v[5] = u[5];
+  v[6] = u[6];
+  v[7] = u[7];
 
-    v[8] = _mm_mullo_epi32(u[8], cospi8);
-    x = _mm_mullo_epi32(u[9], cospi56);
-    v[8] = _mm_add_epi32(v[8], x);
-    v[8] = _mm_add_epi32(v[8], rnding);
-    v[8] = _mm_srai_epi32(v[8], bit);
+  v[8] = _mm_mullo_epi32(u[8], cospi8);
+  x = _mm_mullo_epi32(u[9], cospi56);
+  v[8] = _mm_add_epi32(v[8], x);
+  v[8] = _mm_add_epi32(v[8], rnding);
+  v[8] = _mm_srai_epi32(v[8], bit);
 
-    v[9] = _mm_mullo_epi32(u[8], cospi56);
-    x = _mm_mullo_epi32(u[9], cospi8);
-    v[9] = _mm_sub_epi32(v[9], x);
-    v[9] = _mm_add_epi32(v[9], rnding);
-    v[9] = _mm_srai_epi32(v[9], bit);
+  v[9] = _mm_mullo_epi32(u[8], cospi56);
+  x = _mm_mullo_epi32(u[9], cospi8);
+  v[9] = _mm_sub_epi32(v[9], x);
+  v[9] = _mm_add_epi32(v[9], rnding);
+  v[9] = _mm_srai_epi32(v[9], bit);
 
-    v[10] = _mm_mullo_epi32(u[10], cospi40);
-    x = _mm_mullo_epi32(u[11], cospi24);
-    v[10] = _mm_add_epi32(v[10], x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  v[10] = _mm_mullo_epi32(u[10], cospi40);
+  x = _mm_mullo_epi32(u[11], cospi24);
+  v[10] = _mm_add_epi32(v[10], x);
+  v[10] = _mm_add_epi32(v[10], rnding);
+  v[10] = _mm_srai_epi32(v[10], bit);
 
-    v[11] = _mm_mullo_epi32(u[10], cospi24);
-    x = _mm_mullo_epi32(u[11], cospi40);
-    v[11] = _mm_sub_epi32(v[11], x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  v[11] = _mm_mullo_epi32(u[10], cospi24);
+  x = _mm_mullo_epi32(u[11], cospi40);
+  v[11] = _mm_sub_epi32(v[11], x);
+  v[11] = _mm_add_epi32(v[11], rnding);
+  v[11] = _mm_srai_epi32(v[11], bit);
 
-    v[12] = _mm_mullo_epi32(u[12], cospim56);
-    x = _mm_mullo_epi32(u[13], cospi8);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  v[12] = _mm_mullo_epi32(u[12], cospim56);
+  x = _mm_mullo_epi32(u[13], cospi8);
+  v[12] = _mm_add_epi32(v[12], x);
+  v[12] = _mm_add_epi32(v[12], rnding);
+  v[12] = _mm_srai_epi32(v[12], bit);
 
-    v[13] = _mm_mullo_epi32(u[12], cospi8);
-    x = _mm_mullo_epi32(u[13], cospim56);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+  v[13] = _mm_mullo_epi32(u[12], cospi8);
+  x = _mm_mullo_epi32(u[13], cospim56);
+  v[13] = _mm_sub_epi32(v[13], x);
+  v[13] = _mm_add_epi32(v[13], rnding);
+  v[13] = _mm_srai_epi32(v[13], bit);
 
-    v[14] = _mm_mullo_epi32(u[14], cospim24);
-    x = _mm_mullo_epi32(u[15], cospi40);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  v[14] = _mm_mullo_epi32(u[14], cospim24);
+  x = _mm_mullo_epi32(u[15], cospi40);
+  v[14] = _mm_add_epi32(v[14], x);
+  v[14] = _mm_add_epi32(v[14], rnding);
+  v[14] = _mm_srai_epi32(v[14], bit);
 
-    v[15] = _mm_mullo_epi32(u[14], cospi40);
-    x = _mm_mullo_epi32(u[15], cospim24);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+  v[15] = _mm_mullo_epi32(u[14], cospi40);
+  x = _mm_mullo_epi32(u[15], cospim24);
+  v[15] = _mm_sub_epi32(v[15], x);
+  v[15] = _mm_add_epi32(v[15], rnding);
+  v[15] = _mm_srai_epi32(v[15], bit);
 
-    // stage 5
-    addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+  // stage 5
+  addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 
-    // stage 6
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
+  // stage 6
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
 
-    v[4] = _mm_mullo_epi32(u[4], cospi16);
-    x = _mm_mullo_epi32(u[5], cospi48);
-    v[4] = _mm_add_epi32(v[4], x);
-    v[4] = _mm_add_epi32(v[4], rnding);
-    v[4] = _mm_srai_epi32(v[4], bit);
+  v[4] = _mm_mullo_epi32(u[4], cospi16);
+  x = _mm_mullo_epi32(u[5], cospi48);
+  v[4] = _mm_add_epi32(v[4], x);
+  v[4] = _mm_add_epi32(v[4], rnding);
+  v[4] = _mm_srai_epi32(v[4], bit);
 
-    v[5] = _mm_mullo_epi32(u[4], cospi48);
-    x = _mm_mullo_epi32(u[5], cospi16);
-    v[5] = _mm_sub_epi32(v[5], x);
-    v[5] = _mm_add_epi32(v[5], rnding);
-    v[5] = _mm_srai_epi32(v[5], bit);
+  v[5] = _mm_mullo_epi32(u[4], cospi48);
+  x = _mm_mullo_epi32(u[5], cospi16);
+  v[5] = _mm_sub_epi32(v[5], x);
+  v[5] = _mm_add_epi32(v[5], rnding);
+  v[5] = _mm_srai_epi32(v[5], bit);
 
-    v[6] = _mm_mullo_epi32(u[6], cospim48);
-    x = _mm_mullo_epi32(u[7], cospi16);
-    v[6] = _mm_add_epi32(v[6], x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
+  v[6] = _mm_mullo_epi32(u[6], cospim48);
+  x = _mm_mullo_epi32(u[7], cospi16);
+  v[6] = _mm_add_epi32(v[6], x);
+  v[6] = _mm_add_epi32(v[6], rnding);
+  v[6] = _mm_srai_epi32(v[6], bit);
 
-    v[7] = _mm_mullo_epi32(u[6], cospi16);
-    x = _mm_mullo_epi32(u[7], cospim48);
-    v[7] = _mm_sub_epi32(v[7], x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
+  v[7] = _mm_mullo_epi32(u[6], cospi16);
+  x = _mm_mullo_epi32(u[7], cospim48);
+  v[7] = _mm_sub_epi32(v[7], x);
+  v[7] = _mm_add_epi32(v[7], rnding);
+  v[7] = _mm_srai_epi32(v[7], bit);
 
-    v[8] = u[8];
-    v[9] = u[9];
-    v[10] = u[10];
-    v[11] = u[11];
+  v[8] = u[8];
+  v[9] = u[9];
+  v[10] = u[10];
+  v[11] = u[11];
 
-    v[12] = _mm_mullo_epi32(u[12], cospi16);
-    x = _mm_mullo_epi32(u[13], cospi48);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  v[12] = _mm_mullo_epi32(u[12], cospi16);
+  x = _mm_mullo_epi32(u[13], cospi48);
+  v[12] = _mm_add_epi32(v[12], x);
+  v[12] = _mm_add_epi32(v[12], rnding);
+  v[12] = _mm_srai_epi32(v[12], bit);
 
-    v[13] = _mm_mullo_epi32(u[12], cospi48);
-    x = _mm_mullo_epi32(u[13], cospi16);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+  v[13] = _mm_mullo_epi32(u[12], cospi48);
+  x = _mm_mullo_epi32(u[13], cospi16);
+  v[13] = _mm_sub_epi32(v[13], x);
+  v[13] = _mm_add_epi32(v[13], rnding);
+  v[13] = _mm_srai_epi32(v[13], bit);
 
-    v[14] = _mm_mullo_epi32(u[14], cospim48);
-    x = _mm_mullo_epi32(u[15], cospi16);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  v[14] = _mm_mullo_epi32(u[14], cospim48);
+  x = _mm_mullo_epi32(u[15], cospi16);
+  v[14] = _mm_add_epi32(v[14], x);
+  v[14] = _mm_add_epi32(v[14], rnding);
+  v[14] = _mm_srai_epi32(v[14], bit);
 
-    v[15] = _mm_mullo_epi32(u[14], cospi16);
-    x = _mm_mullo_epi32(u[15], cospim48);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+  v[15] = _mm_mullo_epi32(u[14], cospi16);
+  x = _mm_mullo_epi32(u[15], cospim48);
+  v[15] = _mm_sub_epi32(v[15], x);
+  v[15] = _mm_add_epi32(v[15], rnding);
+  v[15] = _mm_srai_epi32(v[15], bit);
 
-    // stage 7
-    addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+  // stage 7
+  addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 
-    // stage 8
-    v[0] = u[0];
-    v[1] = u[1];
+  // stage 8
+  v[0] = u[0];
+  v[1] = u[1];
 
-    y = _mm_mullo_epi32(u[2], cospi32);
-    x = _mm_mullo_epi32(u[3], cospi32);
-    v[2] = _mm_add_epi32(y, x);
-    v[2] = _mm_add_epi32(v[2], rnding);
-    v[2] = _mm_srai_epi32(v[2], bit);
+  y = _mm_mullo_epi32(u[2], cospi32);
+  x = _mm_mullo_epi32(u[3], cospi32);
+  v[2] = _mm_add_epi32(y, x);
+  v[2] = _mm_add_epi32(v[2], rnding);
+  v[2] = _mm_srai_epi32(v[2], bit);
 
-    v[3] = _mm_sub_epi32(y, x);
-    v[3] = _mm_add_epi32(v[3], rnding);
-    v[3] = _mm_srai_epi32(v[3], bit);
+  v[3] = _mm_sub_epi32(y, x);
+  v[3] = _mm_add_epi32(v[3], rnding);
+  v[3] = _mm_srai_epi32(v[3], bit);
 
-    v[4] = u[4];
-    v[5] = u[5];
+  v[4] = u[4];
+  v[5] = u[5];
 
-    y = _mm_mullo_epi32(u[6], cospi32);
-    x = _mm_mullo_epi32(u[7], cospi32);
-    v[6] = _mm_add_epi32(y, x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
+  y = _mm_mullo_epi32(u[6], cospi32);
+  x = _mm_mullo_epi32(u[7], cospi32);
+  v[6] = _mm_add_epi32(y, x);
+  v[6] = _mm_add_epi32(v[6], rnding);
+  v[6] = _mm_srai_epi32(v[6], bit);
 
-    v[7] = _mm_sub_epi32(y, x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
+  v[7] = _mm_sub_epi32(y, x);
+  v[7] = _mm_add_epi32(v[7], rnding);
+  v[7] = _mm_srai_epi32(v[7], bit);
 
-    v[8] = u[8];
-    v[9] = u[9];
+  v[8] = u[8];
+  v[9] = u[9];
 
-    y = _mm_mullo_epi32(u[10], cospi32);
-    x = _mm_mullo_epi32(u[11], cospi32);
-    v[10] = _mm_add_epi32(y, x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  y = _mm_mullo_epi32(u[10], cospi32);
+  x = _mm_mullo_epi32(u[11], cospi32);
+  v[10] = _mm_add_epi32(y, x);
+  v[10] = _mm_add_epi32(v[10], rnding);
+  v[10] = _mm_srai_epi32(v[10], bit);
 
-    v[11] = _mm_sub_epi32(y, x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  v[11] = _mm_sub_epi32(y, x);
+  v[11] = _mm_add_epi32(v[11], rnding);
+  v[11] = _mm_srai_epi32(v[11], bit);
 
-    v[12] = u[12];
-    v[13] = u[13];
+  v[12] = u[12];
+  v[13] = u[13];
 
-    y = _mm_mullo_epi32(u[14], cospi32);
-    x = _mm_mullo_epi32(u[15], cospi32);
-    v[14] = _mm_add_epi32(y, x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  y = _mm_mullo_epi32(u[14], cospi32);
+  x = _mm_mullo_epi32(u[15], cospi32);
+  v[14] = _mm_add_epi32(y, x);
+  v[14] = _mm_add_epi32(v[14], rnding);
+  v[14] = _mm_srai_epi32(v[14], bit);
 
-    v[15] = _mm_sub_epi32(y, x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+  v[15] = _mm_sub_epi32(y, x);
+  v[15] = _mm_add_epi32(v[15], rnding);
+  v[15] = _mm_srai_epi32(v[15], bit);
 
-    // stage 9
-    if (do_cols) {
-      out[0] = v[0];
-      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
-      out[2] = v[12];
-      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
-      out[4] = v[6];
-      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
-      out[6] = v[10];
-      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
-      out[8] = v[3];
-      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
-      out[10] = v[15];
-      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
-      out[12] = v[5];
-      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
-      out[14] = v[9];
-      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
-      const __m128i clamp_hi_out =
-          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+  // stage 9
+  if (do_cols) {
+    out[0] = v[0];
+    out[1] = _mm_sub_epi32(zero, v[8]);
+    out[2] = v[12];
+    out[3] = _mm_sub_epi32(zero, v[4]);
+    out[4] = v[6];
+    out[5] = _mm_sub_epi32(zero, v[14]);
+    out[6] = v[10];
+    out[7] = _mm_sub_epi32(zero, v[2]);
+    out[8] = v[3];
+    out[9] = _mm_sub_epi32(zero, v[11]);
+    out[10] = v[15];
+    out[11] = _mm_sub_epi32(zero, v[7]);
+    out[12] = v[5];
+    out[13] = _mm_sub_epi32(zero, v[13]);
+    out[14] = v[9];
+    out[15] = _mm_sub_epi32(zero, v[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 
-      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
-                       &clamp_hi_out, out_shift);
-    }
+    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
   }
 }
 static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
   (void)bit;
-  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  __m128i v[16];
   __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
   __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
-  __m128i a0, a1, a2, a3;
+  __m128i a0_low, a0_high, a1_low, a1_high;
+  __m128i zero = _mm_set1_epi32(0);
+  offset = _mm_unpacklo_epi32(offset, zero);
 
-  for (int i = 0; i < 16; i += 8) {
-    a0 = _mm_mullo_epi32(in[i], fact);
-    a1 = _mm_mullo_epi32(in[i + 1], fact);
-    a0 = _mm_add_epi32(a0, offset);
-    a1 = _mm_add_epi32(a1, offset);
-    v[i] = _mm_srai_epi32(a0, NewSqrt2Bits);
-    v[i + 1] = _mm_srai_epi32(a1, NewSqrt2Bits);
+  for (int i = 0; i < 16; i++) {
+    a0_low = _mm_mul_epi32(in[i], fact);
+    a0_low = _mm_add_epi32(a0_low, offset);
+    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
 
-    a2 = _mm_mullo_epi32(in[i + 2], fact);
-    a3 = _mm_mullo_epi32(in[i + 3], fact);
-    a2 = _mm_add_epi32(a2, offset);
-    a3 = _mm_add_epi32(a3, offset);
-    v[i + 2] = _mm_srai_epi32(a2, NewSqrt2Bits);
-    v[i + 3] = _mm_srai_epi32(a3, NewSqrt2Bits);
+    a0_high = _mm_srli_si128(in[i], 4);
+    a0_high = _mm_mul_epi32(a0_high, fact);
+    a0_high = _mm_add_epi32(a0_high, offset);
+    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
 
-    a0 = _mm_mullo_epi32(in[i + 4], fact);
-    a1 = _mm_mullo_epi32(in[i + 5], fact);
-    a0 = _mm_add_epi32(a0, offset);
-    a1 = _mm_add_epi32(a1, offset);
-    v[i + 4] = _mm_srai_epi32(a0, NewSqrt2Bits);
-    v[i + 5] = _mm_srai_epi32(a1, NewSqrt2Bits);
-
-    a2 = _mm_mullo_epi32(in[i + 6], fact);
-    a3 = _mm_mullo_epi32(in[i + 7], fact);
-    a2 = _mm_add_epi32(a2, offset);
-    a3 = _mm_add_epi32(a3, offset);
-    v[i + 6] = _mm_srai_epi32(a2, NewSqrt2Bits);
-    v[i + 7] = _mm_srai_epi32(a3, NewSqrt2Bits);
+    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
+    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
+    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
   }
 
   if (!do_cols) {
-    const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
-    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
-  } else {
-    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 16);
+    const int log_range = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+    round_shift_8x8(out, out_shift);
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
   }
 }
 static INLINE void idct64_stage8_sse4_1(
@@ -3327,21 +3239,21 @@
 
 static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
                                          int bd, int out_shift,
-                                         const int log_range) {
-  if (do_cols) {
-    for (int i = 0; i < 32; i++) {
-      addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
-    }
-  } else {
-    const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+                                         const __m128i *clamp_lo,
+                                         const __m128i *clamp_hi) {
+  for (int i = 0; i < 32; i++) {
+    addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
+  }
 
-    for (int i = 0; i < 32; i++) {
-      addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
-                          &clamp_lo_out, &clamp_hi_out, out_shift);
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    for (int i = 0; i < 64; i += 4) {
+      round_shift_4x4(out + i, out_shift);
+      highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
+                                4);
     }
   }
 }
@@ -3351,8 +3263,8 @@
   const int32_t *cospi = cospi_arr(bit);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 
@@ -3371,24 +3283,18 @@
     // stage 9
     // stage 10
     // stage 11
-    if (do_cols) {
-      x = _mm_max_epi32(x, clamp_lo);
-      x = _mm_min_epi32(x, clamp_hi);
-    } else {
+    if (!do_cols) {
       const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
-      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
-      x = _mm_add_epi32(x, offset);
-      x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
-
-      x = _mm_max_epi32(x, clamp_lo_out);
-      x = _mm_min_epi32(x, clamp_hi_out);
+      clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+      if (out_shift != 0) {
+        __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+        x = _mm_add_epi32(x, offset);
+        x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+      }
     }
-
+    x = _mm_max_epi32(x, clamp_lo);
+    x = _mm_min_epi32(x, clamp_hi);
     out[0] = x;
     out[1] = x;
     out[2] = x;
@@ -3670,7 +3576,6 @@
     u[6] = u[1];
     u[5] = u[2];
     u[4] = u[3];
-    u[9] = u[9];
 
     idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                          &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
@@ -3684,7 +3589,7 @@
                           bit);
 
     // stage 11
-    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
   }
 }
 
@@ -3994,7 +3899,7 @@
                           bit);
 
     // stage 11
-    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
   }
 }
 
@@ -4457,20 +4362,20 @@
     for (i = 56; i < 64; i++) v[i] = u[i];
 
     // stage 11
-    if (do_cols) {
-      for (i = 0; i < 32; i++) {
-        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
-      }
-    } else {
-      const int log_range_out = AOMMAX(16, bd + 6);
-      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    for (i = 0; i < 32; i++) {
+      addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+                    &clamp_hi);
+    }
 
-      for (i = 0; i < 32; i++) {
-        addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
-                            &clamp_lo_out, &clamp_hi_out, out_shift);
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+      for (i = 0; i < 64; i += 4) {
+        round_shift_4x4(out + i, out_shift);
+        highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
+                                  &clamp_hi_out, 4);
       }
     }
   }
@@ -4482,8 +4387,8 @@
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
   __m128i bf1;
 
   // stage 0
@@ -4505,17 +4410,17 @@
     bf1 = _mm_min_epi32(bf1, clamp_hi);
   } else {
     const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
-    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
-    bf1 = _mm_add_epi32(bf1, offset);
-    bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
-    bf1 = _mm_max_epi32(bf1, clamp_lo_out);
-    bf1 = _mm_min_epi32(bf1, clamp_hi_out);
+    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+    if (out_shift != 0) {
+      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+      bf1 = _mm_add_epi32(bf1, offset);
+      bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+    }
   }
+
+  bf1 = _mm_max_epi32(bf1, clamp_lo);
+  bf1 = _mm_min_epi32(bf1, clamp_hi);
   out[0] = bf1;
   out[1] = bf1;
   out[2] = bf1;
@@ -4658,7 +4563,7 @@
                        &rounding, bit);
 
   // stage 9
-  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
 }
 
 static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
@@ -4804,9 +4709,8 @@
   // stage 8
   idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                        &rounding, bit);
-
   // stage 9
-  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
 }
 
 static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
@@ -5162,62 +5066,30 @@
   bf0[31] = bf1[31];
 
   // stage 9
-  if (do_cols) {
-    addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
-    addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
-    addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
-    addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
-    addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
-    addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
-    addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
-    addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
-    addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
-    addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
-    addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
-    addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
-    addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
-    addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
-    addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
-    addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
-  } else {
-    const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+  addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
 
-    addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
-    addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
-                        &clamp_hi_out, out_shift);
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift);
+    round_shift_8x8(out + 16, out_shift);
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
   }
 }
 
@@ -5265,38 +5137,32 @@
 static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
   (void)bit;
-  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  __m128i v[32];
   for (int i = 0; i < 32; i += 16) {
-    v[i] = _mm_slli_epi32(in[i], 2);
-    v[i + 1] = _mm_slli_epi32(in[i + 1], 2);
-    v[i + 2] = _mm_slli_epi32(in[i + 2], 2);
-    v[i + 3] = _mm_slli_epi32(in[i + 3], 2);
-    v[i + 4] = _mm_slli_epi32(in[i + 4], 2);
-    v[i + 5] = _mm_slli_epi32(in[i + 5], 2);
-    v[i + 6] = _mm_slli_epi32(in[i + 6], 2);
-    v[i + 7] = _mm_slli_epi32(in[i + 7], 2);
-    v[i + 8] = _mm_slli_epi32(in[i + 8], 2);
-    v[i + 9] = _mm_slli_epi32(in[i + 9], 2);
-    v[i + 10] = _mm_slli_epi32(in[i + 10], 2);
-    v[i + 11] = _mm_slli_epi32(in[i + 11], 2);
-    v[i + 12] = _mm_slli_epi32(in[i + 12], 2);
-    v[i + 13] = _mm_slli_epi32(in[i + 13], 2);
-    v[i + 14] = _mm_slli_epi32(in[i + 14], 2);
-    v[i + 15] = _mm_slli_epi32(in[i + 15], 2);
+    out[i] = _mm_slli_epi32(in[i], 2);
+    out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
+    out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
+    out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
+    out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
+    out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
+    out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
+    out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
+    out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
+    out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
+    out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
+    out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
+    out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
+    out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
+    out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
+    out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
   }
 
   if (!do_cols) {
     const int log_range_out = AOMMAX(16, bd + 6);
-    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
-        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
-    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
-        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32);
-  } else {
-    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 32);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift);
+    round_shift_8x8(out + 16, out_shift);
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
   }
 }
 static const transform_1d_sse4_1
@@ -5333,7 +5199,7 @@
   __m128i buf1[64];
   int eobx, eoby;
   get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -5361,7 +5227,8 @@
       av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
                                            NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
 
     __m128i *_buf1 = buf1 + i * 4;
 
@@ -5374,7 +5241,7 @@
   }
   for (int i = 0; i < buf_size_w_div4; i++) {
     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
 
     av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                     buf1 + i * txfm_size_row, txfm_size_row,
@@ -5395,7 +5262,7 @@
   __m128i buf1[64];
   int eobx, eoby;
   get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -5427,7 +5294,8 @@
       av1_round_shift_rect_array_32_sse4_1(
           buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
 
     __m128i *_buf1 = buf1 + i * 4;
     if (lr_flip) {
@@ -5450,7 +5318,7 @@
   }
   for (int i = 0; i < buf_size_w_div8; i++) {
     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
 
     av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                     buf1 + i * txfm_size_row, txfm_size_row,
@@ -5472,7 +5340,7 @@
                                               int eob, const int bd) {
   (void)eob;
   __m128i buf1[64 * 4];
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -5496,7 +5364,8 @@
       av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
                                            NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
 
     __m128i *_buf1 = buf1 + i * 4;
     for (int j = 0; j < (input_stride >> 2); ++j) {
@@ -5508,7 +5377,7 @@
   }
   for (int i = 0; i < (input_stride >> 2); i++) {
     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
 
     av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                     buf1 + i * txfm_size_row, txfm_size_row,
@@ -5532,7 +5401,7 @@
   __m128i buf1[64 * 16];
   int eobx, eoby;
   get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -5570,7 +5439,8 @@
       av1_round_shift_rect_array_32_sse4_1(
           buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
 
     __m128i *_buf1 = buf1 + i * 4;
     if (lr_flip) {
@@ -5594,7 +5464,7 @@
   // 2nd stage: column transform
   for (int i = 0; i < buf_size_w_div8; i++) {
     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
 
     av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                     buf1 + i * txfm_size_row, txfm_size_row,
@@ -5617,7 +5487,7 @@
                                             int eob, const int bd) {
   (void)eob;
   __m128i buf1[8];
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -5640,8 +5510,8 @@
   load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
   av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
                                        NewInvSqrt2);
-  row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
-  row_txfm(buf0 + 4, buf0 + 4, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+  row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+  row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
            -shift[0]);
 
   if (lr_flip) {
@@ -5659,7 +5529,7 @@
   }
 
   // 2nd stage: column transform
-  col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
 
   av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
 
@@ -5674,7 +5544,7 @@
                                             int eob, const int bd) {
   (void)eob;
   __m128i buf1[8];
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -5701,7 +5571,7 @@
 
   av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0,
                                        NewInvSqrt2);
-  row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+  row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
 
   __m128i *buf1_ptr;
   if (lr_flip) {
@@ -5714,7 +5584,7 @@
   // 2nd stage: column transform
   for (int i = 0; i < 2; i++) {
     col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
-             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
   }
   av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
   // write to buffer
@@ -5728,7 +5598,7 @@
                                               int eob, const int bd) {
   (void)eob;
   __m128i buf1[16];
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -5752,11 +5622,9 @@
   load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
   for (int i = 0; i < (txfm_size_row >> 2); i++) {
     row_txfm(buf0 + (i << 2), buf0 + (i << 2),
-             inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+             av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
   }
 
-  av1_round_shift_array_32_sse4_1(buf0, buf0, txfm_size_row, -shift[0]);
-
   if (lr_flip) {
     for (int j = 0; j < buf_size_h_div8; ++j) {
       TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
@@ -5772,7 +5640,7 @@
   }
 
   // 2nd stage: column transform
-  col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
 
   av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
 
@@ -5787,7 +5655,7 @@
                                               int eob, const int bd) {
   (void)eob;
   __m128i buf1[16];
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int txfm_size_col = tx_size_wide[tx_size];
@@ -5812,7 +5680,7 @@
     TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
                   buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
   }
-  row_txfm(buf1, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+  row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
 
   __m128i *buf1_ptr;
   if (lr_flip) {
@@ -5825,7 +5693,7 @@
   // 2nd stage: column transform
   for (int i = 0; i < buf_size_w_div8; i++) {
     col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
-             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
   }
   av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
 
@@ -5926,16 +5794,28 @@
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+      break;
     case TX_4X8:
       av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
       break;
     case TX_8X4:
       av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
       break;
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
+      break;
     default:
-      // TODO(http://crbug.com/aomedia/2350): the remaining sse4_1 versions
-      // cause test vector mismatches.
-      av1_highbd_inv_txfm_add_c(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(
+          input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
+          txfm_param->bd);
       break;
   }
 }
diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
index c5040c4..70f1ec7 100644
--- a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -25,14 +25,14 @@
 void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
@@ -231,8 +231,8 @@
 void av1_highbd_dist_wtd_convolve_2d_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -277,8 +277,8 @@
   const __m256i clip_pixel_to_bd =
       _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
 
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
 
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
@@ -467,15 +467,15 @@
 void av1_highbd_dist_wtd_convolve_x_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_1;
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
   int i, j;
   __m256i s[4], coeffs_x[4];
@@ -504,7 +504,7 @@
       _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
 
   assert(bits >= 0);
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
 
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
@@ -636,15 +636,15 @@
 void av1_highbd_dist_wtd_convolve_y_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
   const int bits = FILTER_BITS - conv_params->round_0;
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
 
   assert(bits >= 0);
   int i, j;
@@ -672,7 +672,7 @@
       _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m256i zero = _mm256_setzero_si256();
 
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
 
   for (j = 0; j < w; j += 8) {
     const uint16_t *data = &src_ptr[j];
diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
index 7fea36a..f033a6f 100644
--- a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -20,15 +20,15 @@
 void av1_highbd_dist_wtd_convolve_y_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
   const int bits = FILTER_BITS - conv_params->round_0;
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
 
   assert(bits >= 0);
   int i, j;
@@ -56,7 +56,7 @@
   const __m128i zero = _mm_setzero_si128();
   __m128i s[16], coeffs_y[4];
 
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
 
   for (j = 0; j < w; j += 8) {
     const uint16_t *data = &src_ptr[j];
@@ -262,15 +262,15 @@
 void av1_highbd_dist_wtd_convolve_x_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_1;
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
   int i, j;
   __m128i s[4], coeffs_x[4];
@@ -299,7 +299,7 @@
       _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
 
   assert(bits >= 0);
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
 
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
diff --git a/libaom/av1/common/x86/highbd_warp_plane_sse4.c b/libaom/av1/common/x86/highbd_warp_plane_sse4.c
index 3765c5e..60a8193 100644
--- a/libaom/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/libaom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -15,9 +15,9 @@
 
 #include "av1/common/warped_motion.h"
 
-static const uint8_t warp_highbd_arrange_bytes[16] = {
-  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
-};
+static const uint8_t warp_highbd_arrange_bytes[16] = { 0,  2,  4,  6, 8, 10,
+                                                       12, 14, 1,  3, 5, 7,
+                                                       9,  11, 13, 15 };
 
 static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
   0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
@@ -25,24 +25,28 @@
 static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
   4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
 };
-static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
-  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
-};
-static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
-  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
-};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8,  9,  10, 11, 8,  9,
+                                                         10, 11, 8,  9,  10, 11,
+                                                         8,  9,  10, 11 };
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
+                                                         14, 15, 12, 13, 14, 15,
+                                                         12, 13, 14, 15 };
 
 static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                           __m128i *coeff) {
   // Filter even-index pixels
-  const __m128i tmp_0 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_2 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_4 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_6 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_0 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_2 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_4 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_6 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
 
   // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
   const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
@@ -63,14 +67,18 @@
   coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
   // Filter odd-index pixels
-  const __m128i tmp_1 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_3 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_5 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_7 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_1 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_3 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_5 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_7 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
 
   const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
   const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
@@ -87,7 +95,7 @@
     int sx, __m128i *coeff) {
   // Filter coeff
   const __m128i tmp_0 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
 
   coeff[0] = _mm_shuffle_epi8(
       tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
@@ -454,16 +462,16 @@
 
         // Filter even-index pixels
         const __m128i tmp_0 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
+            (__m128i *)(av1_warped_filter +
                         ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
         const __m128i tmp_2 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
+            (__m128i *)(av1_warped_filter +
                         ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
         const __m128i tmp_4 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
+            (__m128i *)(av1_warped_filter +
                         ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
         const __m128i tmp_6 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
+            (__m128i *)(av1_warped_filter +
                         ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
 
         const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
@@ -491,16 +499,16 @@
         const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
 
         const __m128i tmp_1 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
+            (__m128i *)(av1_warped_filter +
                         ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
         const __m128i tmp_3 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
+            (__m128i *)(av1_warped_filter +
                         ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
         const __m128i tmp_5 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
+            (__m128i *)(av1_warped_filter +
                         ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
         const __m128i tmp_7 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
+            (__m128i *)(av1_warped_filter +
                         ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
 
         const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
diff --git a/libaom/av1/common/x86/intra_edge_sse4.c b/libaom/av1/common/x86/intra_edge_sse4.c
index 0c857b5..fc69f41 100644
--- a/libaom/av1/common/x86/intra_edge_sse4.c
+++ b/libaom/av1/common/x86/intra_edge_sse4.c
@@ -212,10 +212,10 @@
     { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
   };
 
-  DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = {
-    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
-    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
-  };
+  DECLARE_ALIGNED(
+      16, static const int8_t,
+      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
 
   // Extend first/last samples (upper-left p[-1], last p[sz-1])
   // to support 4-tap filter
diff --git a/libaom/av1/common/x86/jnt_convolve_avx2.c b/libaom/av1/common/x86/jnt_convolve_avx2.c
index 23cd6ab..6de6157 100644
--- a/libaom/av1/common/x86/jnt_convolve_avx2.c
+++ b/libaom/av1/common/x86/jnt_convolve_avx2.c
@@ -23,8 +23,8 @@
 static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
+  const __m256i wt0 = _mm256_set1_epi16((int16_t)w0);
+  const __m256i wt1 = _mm256_set1_epi16((int16_t)w1);
   const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
   return wt;
 }
@@ -39,7 +39,7 @@
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -65,14 +65,14 @@
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
 
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
   __m256i filt[4], coeffs[4];
 
   filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
   filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
 
   // Condition for checking valid horz_filt taps
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
@@ -191,7 +191,7 @@
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -220,11 +220,11 @@
 
   assert((FILTER_BITS - conv_params->round_0) >= 0);
 
-  prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+  prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
 
   (void)conv_params;
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
 
   // Condition for checking valid vert_filt taps
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
@@ -596,7 +596,7 @@
                                    uint8_t *dst0, int dst_stride0, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   const int subpel_x_qn, const int subpel_y_qn,
                                    ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -633,8 +633,8 @@
   filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
   filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
 
   // Condition for checking valid horz_filt taps
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
@@ -805,15 +805,15 @@
 void av1_dist_wtd_convolve_2d_copy_avx2(
     const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   (void)filter_params_x;
   (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
diff --git a/libaom/av1/common/x86/jnt_convolve_sse2.c b/libaom/av1/common/x86/jnt_convolve_sse2.c
index 641cd02..f8f640a 100644
--- a/libaom/av1/common/x86/jnt_convolve_sse2.c
+++ b/libaom/av1/common/x86/jnt_convolve_sse2.c
@@ -20,7 +20,7 @@
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -48,9 +48,9 @@
   __m128i coeffs[4];
 
   (void)filter_params_y;
-  (void)subpel_y_q4;
+  (void)subpel_y_qn;
 
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
 
   if (w == 4) {
     do {
@@ -154,7 +154,7 @@
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -180,9 +180,9 @@
   __m128i coeffs[4];
 
   (void)filter_params_x;
-  (void)subpel_x_q4;
+  (void)subpel_x_qn;
 
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
 
   if (w == 4) {
     __m128i s[8], src6, res, res_shift;
@@ -388,7 +388,7 @@
                                    uint8_t *dst0, int dst_stride0, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   const int subpel_x_qn, const int subpel_y_qn,
                                    ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -424,7 +424,7 @@
   /* Horizontal filter */
   {
     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
     const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -507,7 +507,7 @@
   /* Vertical filter */
   {
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
     const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
diff --git a/libaom/av1/common/x86/jnt_convolve_ssse3.c b/libaom/av1/common/x86/jnt_convolve_ssse3.c
index 9aeab29..f45e3b2 100644
--- a/libaom/av1/common/x86/jnt_convolve_ssse3.c
+++ b/libaom/av1/common/x86/jnt_convolve_ssse3.c
@@ -19,8 +19,8 @@
 void av1_dist_wtd_convolve_2d_ssse3(
     const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params) {
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int bd = 8;
@@ -55,7 +55,7 @@
   /* Horizontal filter */
   {
     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
     const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -123,7 +123,7 @@
   /* Vertical filter */
   {
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
     const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
diff --git a/libaom/av1/common/x86/reconinter_avx2.c b/libaom/av1/common/x86/reconinter_avx2.c
index f645e04..a38bd83 100644
--- a/libaom/av1/common/x86/reconinter_avx2.c
+++ b/libaom/av1/common/x86/reconinter_avx2.c
@@ -28,8 +28,8 @@
 }
 void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
                                           DIFFWTD_MASK_TYPE mask_type,
-                                          const uint8_t *src0, int stride0,
-                                          const uint8_t *src1, int stride1,
+                                          const uint8_t *src0, int src0_stride,
+                                          const uint8_t *src1, int src1_stride,
                                           int h, int w) {
   const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
   const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
@@ -37,18 +37,18 @@
   if (4 == w) {
     do {
       const __m128i s0A = xx_loadl_32(src0);
-      const __m128i s0B = xx_loadl_32(src0 + stride0);
-      const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
-      const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+      const __m128i s0B = xx_loadl_32(src0 + src0_stride);
+      const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2);
+      const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3);
       const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
       const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
       const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
       const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
 
       const __m128i s1A = xx_loadl_32(src1);
-      const __m128i s1B = xx_loadl_32(src1 + stride1);
-      const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
-      const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+      const __m128i s1B = xx_loadl_32(src1 + src1_stride);
+      const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2);
+      const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3);
       const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
       const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
       const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
@@ -58,40 +58,40 @@
       const __m128i x_m8 =
           _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
       xx_storeu_128(mask, x_m8);
-      src0 += (stride0 << 2);
-      src1 += (stride1 << 2);
+      src0 += (src0_stride << 2);
+      src1 += (src1_stride << 2);
       mask += 16;
       i += 4;
     } while (i < h);
   } else if (8 == w) {
     do {
       const __m128i s0A = xx_loadl_64(src0);
-      const __m128i s0B = xx_loadl_64(src0 + stride0);
-      const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
-      const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
       const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
       const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
       const __m128i s1A = xx_loadl_64(src1);
-      const __m128i s1B = xx_loadl_64(src1 + stride1);
-      const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
-      const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
       const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
       const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
       const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
       const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
       const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
       yy_storeu_256(mask, m8);
-      src0 += stride0 << 2;
-      src1 += stride1 << 2;
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
       mask += 32;
       i += 4;
     } while (i < h);
   } else if (16 == w) {
     do {
       const __m128i s0A = xx_load_128(src0);
-      const __m128i s0B = xx_load_128(src0 + stride0);
+      const __m128i s0B = xx_load_128(src0 + src0_stride);
       const __m128i s1A = xx_load_128(src1);
-      const __m128i s1B = xx_load_128(src1 + stride1);
+      const __m128i s1B = xx_load_128(src1 + src1_stride);
       const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
       const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
       const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
@@ -103,8 +103,8 @@
       const __m256i m8 =
           _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
       yy_storeu_256(mask, m8);
-      src0 += stride0 << 1;
-      src1 += stride1 << 1;
+      src0 += src0_stride << 1;
+      src1 += src1_stride << 1;
       mask += 32;
       i += 2;
     } while (i < h);
@@ -127,8 +127,8 @@
         yy_storeu_256(mask + j, m8);
         j += 32;
       } while (j < w);
-      src0 += stride0;
-      src1 += stride1;
+      src0 += src0_stride;
+      src1 += src1_stride;
       mask += w;
       i += 1;
     } while (i < h);
diff --git a/libaom/av1/common/x86/selfguided_avx2.c b/libaom/av1/common/x86/selfguided_avx2.c
index 0aaf1f4..3c5558d 100644
--- a/libaom/av1/common/x86/selfguided_avx2.c
+++ b/libaom/av1/common/x86/selfguided_avx2.c
@@ -219,12 +219,12 @@
 static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
                     int width, int height, int buf_stride, int bit_depth,
                     int sgr_params_idx, int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int n = (2 * r + 1) * (2 * r + 1);
   const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
   // one_over_n[n-1] is 2^12/n, so easily fits in an int16
-  const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+  const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
 
   const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
   const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
@@ -263,7 +263,7 @@
                             SGRPROJ_MTABLE_BITS),
           _mm256_set1_epi32(255));
 
-      const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+      const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
 
       yy_storeu_256(A + i * buf_stride + j, a_res);
 
@@ -356,12 +356,12 @@
                          const int32_t *D, int width, int height,
                          int buf_stride, int bit_depth, int sgr_params_idx,
                          int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int n = (2 * r + 1) * (2 * r + 1);
   const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
   // one_over_n[n-1] is 2^12/n, so easily fits in an int16
-  const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+  const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
 
   const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
   const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
@@ -400,7 +400,7 @@
                             SGRPROJ_MTABLE_BITS),
           _mm256_set1_epi32(255));
 
-      const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+      const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
 
       yy_storeu_256(A + i * buf_stride + j, a_res);
 
@@ -604,7 +604,7 @@
     integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
                     buf_stride);
 
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   // Write to flt0 and flt1
   // If params->r == 0 we skip the corresponding filter. We only allow one of
   // the radii to be 0, as having both equal to 0 would be equivalent to
@@ -630,11 +630,11 @@
   return 0;
 }
 
-void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
-                                       int height, int stride, int eps,
-                                       const int *xqd, uint8_t *dst8,
-                                       int dst_stride, int32_t *tmpbuf,
-                                       int bit_depth, int highbd) {
+void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+                                           int height, int stride, int eps,
+                                           const int *xqd, uint8_t *dst8,
+                                           int dst_stride, int32_t *tmpbuf,
+                                           int bit_depth, int highbd) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -642,9 +642,9 @@
       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
   (void)ret;
   assert(!ret);
-  const sgr_params_type *const params = &sgr_params[eps];
+  const sgr_params_type *const params = &av1_sgr_params[eps];
   int xq[2];
-  decode_xq(xqd, xq, params);
+  av1_decode_xq(xqd, xq, params);
 
   __m256i xq0 = _mm256_set1_epi32(xq[0]);
   __m256i xq1 = _mm256_set1_epi32(xq[1]);
diff --git a/libaom/av1/common/x86/selfguided_sse4.c b/libaom/av1/common/x86/selfguided_sse4.c
index ea3f6d9..72c7708 100644
--- a/libaom/av1/common/x86/selfguided_sse4.c
+++ b/libaom/av1/common/x86/selfguided_sse4.c
@@ -170,12 +170,12 @@
 static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
                     int width, int height, int buf_stride, int bit_depth,
                     int sgr_params_idx, int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int n = (2 * r + 1) * (2 * r + 1);
   const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
   // one_over_n[n-1] is 2^12/n, so easily fits in an int16
-  const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+  const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
 
   const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
   const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
@@ -216,10 +216,11 @@
 
       // 'Gather' type instructions are not available pre-AVX2, so synthesize a
       // gather using scalar loads.
-      const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
-                                          x_by_xplus1[_mm_extract_epi32(z, 2)],
-                                          x_by_xplus1[_mm_extract_epi32(z, 1)],
-                                          x_by_xplus1[_mm_extract_epi32(z, 0)]);
+      const __m128i a_res =
+          _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
 
       xx_storeu_128(A + i * buf_stride + j, a_res);
 
@@ -310,12 +311,12 @@
                          const int32_t *D, int width, int height,
                          int buf_stride, int bit_depth, int sgr_params_idx,
                          int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int n = (2 * r + 1) * (2 * r + 1);
   const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
   // one_over_n[n-1] is 2^12/n, so easily fits in an int16
-  const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+  const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
 
   const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
   const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
@@ -356,10 +357,11 @@
 
       // 'Gather' type instructions are not available pre-AVX2, so synthesize a
       // gather using scalar loads.
-      const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
-                                          x_by_xplus1[_mm_extract_epi32(z, 2)],
-                                          x_by_xplus1[_mm_extract_epi32(z, 1)],
-                                          x_by_xplus1[_mm_extract_epi32(z, 0)]);
+      const __m128i a_res =
+          _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
 
       xx_storeu_128(A + i * buf_stride + j, a_res);
 
@@ -554,7 +556,7 @@
     integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
                     buf_stride);
 
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   // Write to flt0 and flt1
   // If params->r == 0 we skip the corresponding filter. We only allow one of
   // the radii to be 0, as having both equal to 0 would be equivalent to
@@ -580,11 +582,11 @@
   return 0;
 }
 
-void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
-                                         int height, int stride, int eps,
-                                         const int *xqd, uint8_t *dst8,
-                                         int dst_stride, int32_t *tmpbuf,
-                                         int bit_depth, int highbd) {
+void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+                                             int height, int stride, int eps,
+                                             const int *xqd, uint8_t *dst8,
+                                             int dst_stride, int32_t *tmpbuf,
+                                             int bit_depth, int highbd) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -592,9 +594,9 @@
       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
   (void)ret;
   assert(!ret);
-  const sgr_params_type *const params = &sgr_params[eps];
+  const sgr_params_type *const params = &av1_sgr_params[eps];
   int xq[2];
-  decode_xq(xqd, xq, params);
+  av1_decode_xq(xqd, xq, params);
 
   __m128i xq0 = _mm_set1_epi32(xq[0]);
   __m128i xq1 = _mm_set1_epi32(xq[1]);
diff --git a/libaom/av1/common/x86/warp_plane_avx2.c b/libaom/av1/common/x86/warp_plane_avx2.c
new file mode 100644
index 0000000..53a928d
--- /dev/null
+++ b/libaom/av1/common/x86/warp_plane_avx2.c
@@ -0,0 +1,1318 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "aom_dsp/x86/synonyms.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
+  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
+  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
+  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
+  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
+                                      5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
+                                      6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src1[32]) = { 4,  6,  6,  8,  8,  10, 10, 12, 5,  7, 7,
+                                      9,  9,  11, 11, 13, 4,  6,  6,  8,  8, 10,
+                                      10, 12, 5,  7,  7,  9,  9,  11, 11, 13 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src2[32]) = { 1, 3, 3, 5, 5,  7, 7, 9, 2, 4, 4,
+                                      6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
+                                      7, 9, 2, 4, 4,  6, 6, 8, 8, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src3[32]) = { 5,  7,  7,  9,  9,  11, 11, 13, 6,  8, 8,
+                                      10, 10, 12, 12, 14, 5,  7,  7,  9,  9, 11,
+                                      11, 13, 6,  8,  8,  10, 10, 12, 12, 14 };
+
+static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
+                                          __m256i *coeff,
+                                          const __m256i *shuffle_src,
+                                          const __m256i *round_const,
+                                          const __m128i *shift, int row) {
+  const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
+  const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
+  const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
+  const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
+
+  const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
+  const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
+  const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
+  const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
+
+  const __m256i res_even = _mm256_add_epi16(res_02, res_46);
+  const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
+  const __m256i res =
+      _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
+  horz_out[row] = _mm256_srl_epi16(res, *shift);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
+                                                        int sx,
+                                                        __m256i *coeff) {
+  __m128i tmp_0 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+
+  __m128i tmp_4 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_5 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+
+  __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
+  __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
+  __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
+  __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
+
+  __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
+  __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
+  __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
+  __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
+
+  __m128i tmp_8 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
+
+  __m128i tmp_9 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
+
+  __m128i tmp_10 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
+
+  __m128i tmp_11 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
+
+  tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
+
+  tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
+
+  tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
+
+  tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
+
+  const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
+  const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
+  const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
+  const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
+
+  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
+  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
+  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
+  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
+
+  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
+  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
+  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
+  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
+                                                              __m256i *coeff) {
+  __m128i tmp_0 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_4 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_5 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+  tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+  tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+  tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+  tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+  const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
+  const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
+  const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
+  const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
+
+  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
+  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
+  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
+  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
+
+  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
+  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
+  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
+  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
+                                                               __m256i *coeff) {
+  const __m128i tmp_0 =
+      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
+
+  const __m256i res_0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
+
+  coeff[0] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
+  coeff[1] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
+  coeff[2] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
+  coeff[3] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
+}
+
+static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
+                                          int sx, int alpha, int beta, int row,
+                                          const __m256i *shuffle_src,
+                                          const __m256i *round_const,
+                                          const __m128i *shift) {
+  __m256i coeff[4];
+  prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
+  filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
+                         row);
+}
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                   __m256i *coeff) {
+  const __m128i tmp_0 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_4 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_5 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
+  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
+  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
+  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
+}
+
+static INLINE void warp_horizontal_filter_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  int k, iy, sx, row = 0;
+  __m256i coeff[4];
+  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+    iy = iy4 + k;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_0 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    iy = iy4 + k + 1;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_1 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m256i src_01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+    sx = sx4 + beta * (k + 4);
+    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
+                           round_const, shift);
+    row += 1;
+  }
+  iy = iy4 + k;
+  iy = clamp(iy, 0, height - 1);
+  const __m256i src_01 = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+  sx = sx4 + beta * (k + 4);
+  prepare_horizontal_filter_coeff(alpha, sx, coeff);
+  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                         shift, row);
+}
+
+static INLINE void warp_horizontal_filter_alpha0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)alpha;
+  int k, iy, sx, row = 0;
+  __m256i coeff[4];
+  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+    iy = iy4 + k;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_0 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    iy = iy4 + k + 1;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_1 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m256i src_01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+    sx = sx4 + beta * (k + 4);
+    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
+    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                           shift, row);
+    row += 1;
+  }
+  iy = iy4 + k;
+  iy = clamp(iy, 0, height - 1);
+  const __m256i src_01 = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+  sx = sx4 + beta * (k + 4);
+  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
+  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                         shift, row);
+}
+
+static INLINE void warp_horizontal_filter_beta0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)beta;
+  int k, iy, row = 0;
+  __m256i coeff[4];
+  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
+  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+    iy = iy4 + k;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_0 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    iy = iy4 + k + 1;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_1 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m256i src_01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                           shift, row);
+    row += 1;
+  }
+  iy = iy4 + k;
+  iy = clamp(iy, 0, height - 1);
+  const __m256i src_01 = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                         shift, row);
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)alpha;
+  int k, iy, row = 0;
+  __m256i coeff[4];
+  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
+  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+    iy = iy4 + k;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src0 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    iy = iy4 + k + 1;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src1 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m256i src_01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
+    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                           shift, row);
+    row += 1;
+  }
+  iy = iy4 + k;
+  iy = clamp(iy, 0, height - 1);
+  const __m256i src_01 = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                         shift, row);
+}
+
+static INLINE void unpack_weights_and_set_round_const_avx2(
+    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+    __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
+  *res_sub_const =
+      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+                        (1 << (offset_bits - conv_params->round_1 - 1)));
+  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi16((short)w0);
+  const __m256i wt1 = _mm256_set1_epi16((short)w1);
+  *wt = _mm256_unpacklo_epi16(wt0, wt1);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
+                                                       int sy,
+                                                       __m256i *coeffs) {
+  __m128i filt_00 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_01 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_02 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_03 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  __m128i filt_10 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_11 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_12 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_13 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  __m256i filt_0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
+  __m256i filt_1 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
+  __m256i filt_2 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
+  __m256i filt_3 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
+
+  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
+  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
+  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
+  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
+
+  filt_00 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_01 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_02 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_03 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  filt_10 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_11 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_12 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_13 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  filt_0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
+  filt_1 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
+  filt_2 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
+  filt_3 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
+
+  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
+  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
+  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
+  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
+}
+
+// Loads and transposes the vertical warp filter coefficients for one 8x2
+// output tile in the special case delta == 0: both output rows then use the
+// same filter phase, so each 128-bit filter row is simply broadcast into both
+// lanes instead of loading a second (sy + delta) set.
+// On return coeffs[0..3] hold the interleaved taps for even output columns
+// (0, 2, 4, 6) and coeffs[4..7] those for odd columns (1, 3, 5, 7), in the
+// layout consumed by filter_src_pixels_vertical_avx2().
+static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
+                                                              __m256i *coeffs) {
+  // 8-tap filters for even columns: phase advances by gamma per column.
+  __m128i filt_00 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_01 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_02 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_03 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  // delta == 0: duplicate each filter into both 128-bit lanes (row k and
+  // row k + 1 share the same phase).
+  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
+  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
+  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
+  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
+
+  // 32/64-bit unpack pairs transpose the four filters into tap-pair order.
+  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
+  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
+  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
+  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
+
+  // Same again for the odd columns (phases 1, 3, 5, 7).
+  filt_00 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_01 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_02 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_03 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  filt_0 = _mm256_broadcastsi128_si256(filt_00);
+  filt_1 = _mm256_broadcastsi128_si256(filt_01);
+  filt_2 = _mm256_broadcastsi128_si256(filt_02);
+  filt_3 = _mm256_broadcastsi128_si256(filt_03);
+
+  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
+  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
+  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
+  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
+}
+
+// Loads the vertical warp filter coefficients when gamma == 0: every column
+// of a row then shares one filter phase, so only one 8-tap filter per row is
+// needed (row k at sy, row k + 1 at sy + delta).  The shuffle masks replicate
+// the tap pairs into the interleaved layout that
+// filter_src_pixels_vertical_avx2() expects.
+static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
+                                                              __m256i *coeffs) {
+  const __m128i filt_0 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  const __m128i filt_1 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
+
+  // Lane 0 = filter for row k, lane 1 = filter for row k + 1.
+  __m256i res_0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
+
+  coeffs[0] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
+  coeffs[1] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
+  coeffs[2] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
+  coeffs[3] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
+
+  // Odd columns use the same filter as even columns when gamma == 0.
+  coeffs[4] = coeffs[0];
+  coeffs[5] = coeffs[1];
+  coeffs[6] = coeffs[2];
+  coeffs[7] = coeffs[3];
+}
+
+// Applies the 8-tap vertical filter to one pair of output rows.
+// src[0..5] already hold vertically interleaved 16-bit horizontal-filter
+// outputs; this completes the window by forming src[6]/src[7] from
+// horz_out[row + 3] and horz_out[row + 4] (each horz_out register packs two
+// source rows, one per 128-bit lane; the 0x21 permute pairs each lane with
+// the following row).  Results are 32-bit sums for the low (res_lo) and high
+// (res_hi) four output columns of both rows.
+static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
+                                                   __m256i *src,
+                                                   __m256i *coeffs,
+                                                   __m256i *res_lo,
+                                                   __m256i *res_hi, int row) {
+  const __m256i src_6 = horz_out[row + 3];
+  const __m256i src_7 =
+      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
+
+  src[6] = _mm256_unpacklo_epi16(src_6, src_7);
+
+  // Multiply-accumulate the even tap pairs (low-half interleave).
+  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
+  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
+  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
+  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
+
+  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
+                                            _mm256_add_epi32(res_4, res_6));
+
+  src[7] = _mm256_unpackhi_epi16(src_6, src_7);
+
+  // Multiply-accumulate the odd tap pairs (high-half interleave).
+  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
+  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
+  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
+  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
+
+  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
+                                           _mm256_add_epi32(res_5, res_7));
+
+  // Rearrange pixels back into the order 0 ... 7
+  *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+  *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+}
+
+// Rounds and stores the vertical-filter output for two rows (k and k + 1;
+// lane 0 carries row k, lane 1 row k + 1).  In compound mode the 16-bit
+// intermediate is written to conv_params->dst, optionally averaged with the
+// value already there (plain or distance-weighted via *wt), then offset,
+// rounded and packed to 8 bits into 'pred'.  In non-compound mode the result
+// is rounded and packed straight to 8-bit 'pred'.  res_lo covers output
+// columns 0..3, res_hi columns 4..7 (only used when p_width > 4).
+static INLINE void store_vertical_filter_output_avx2(
+    const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
+    const __m256i *wt, const __m256i *res_sub_const,
+    const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
+    int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
+    const int round_bits) {
+  __m256i res_lo_1 = *res_lo;
+  __m256i res_hi_1 = *res_hi;
+
+  if (conv_params->is_compound) {
+    // Destination rows in the 16-bit compound buffer for rows k and k + 1.
+    __m128i *const p_0 =
+        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+    __m128i *const p_1 =
+        (__m128i *)&conv_params
+            ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
+
+    // Round down to the compound intermediate precision.
+    res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
+                                 reduce_bits_vert);
+
+    const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
+    __m256i res_lo_16;
+    if (conv_params->do_average) {
+      __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+      __m128i *const dst8_1 =
+          (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
+      const __m128i p_16_0 = _mm_loadl_epi64(p_0);
+      const __m128i p_16_1 = _mm_loadl_epi64(p_1);
+      const __m256i p_16 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
+      if (conv_params->use_dist_wtd_comp_avg) {
+        // Distance-weighted average: madd against the packed weight pair.
+        const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
+        const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
+        const __m256i shifted_32 =
+            _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+        res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
+      } else {
+        // Plain average of the stored and current intermediates.
+        res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
+      }
+      // Remove the compound offset, apply the final rounding, pack to 8-bit.
+      res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
+      res_lo_16 = _mm256_srai_epi16(
+          _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
+      const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
+      const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
+      const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
+      // 4 bytes per row (columns j..j+3).
+      *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
+      *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
+    } else {
+      // First pass: just record the 16-bit intermediates.
+      const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
+      const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
+      _mm_storel_epi64(p_0, temp_lo_16_0);
+      _mm_storel_epi64(p_1, temp_lo_16_1);
+    }
+    if (p_width > 4) {
+      // Repeat for output columns j+4..j+7.
+      __m128i *const p4_0 =
+          (__m128i *)&conv_params
+              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+      __m128i *const p4_1 =
+          (__m128i *)&conv_params
+              ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
+      res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
+                                   reduce_bits_vert);
+      const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
+      __m256i res_hi_16;
+      if (conv_params->do_average) {
+        __m128i *const dst8_4_0 =
+            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+        __m128i *const dst8_4_1 =
+            (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
+        const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
+        const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
+        const __m256i p4_16 = _mm256_inserti128_si256(
+            _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
+        if (conv_params->use_dist_wtd_comp_avg) {
+          const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
+          const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
+          const __m256i shifted_32 =
+              _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+          res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
+        } else {
+          res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
+        }
+        res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
+        res_hi_16 = _mm256_srai_epi16(
+            _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
+        __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
+        const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
+        const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
+        *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
+        *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
+      } else {
+        const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
+        const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
+        _mm_storel_epi64(p4_0, temp_hi_16_0);
+        _mm_storel_epi64(p4_1, temp_hi_16_1);
+      }
+    }
+  } else {
+    // Non-compound: round both halves, pack 32->16->8 bits.
+    const __m256i res_lo_round = _mm256_srai_epi32(
+        _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+    const __m256i res_hi_round = _mm256_srai_epi32(
+        _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+    const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
+    const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
+    const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
+    const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
+
+    // Store, blending with 'pred' if needed
+    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+    __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
+
+    if (p_width == 4) {
+      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0);
+      *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1);
+    } else {
+      _mm_storel_epi64(p, res_8bit0);
+      _mm_storel_epi64(p1, res_8bit1);
+    }
+  }
+}
+
+// General vertical warp filter (gamma != 0 and delta != 0): processes an 8xN
+// tile two output rows per iteration.  src[] is a sliding window of
+// vertically interleaved 16-bit horizontal-filter rows; because delta != 0
+// the filter phase changes per row pair, so coefficients are recomputed
+// inside the loop.
+static INLINE void warp_vertical_filter_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  int k, row = 0;
+  __m256i src[8];
+  // Each horz_out register holds two rows; the 0x21 permutes pair each lane
+  // with the next row so unpacklo/hi interleave row r with row r + 1.
+  const __m256i src_0 = horz_out[0];
+  const __m256i src_1 =
+      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+  const __m256i src_2 = horz_out[1];
+  const __m256i src_3 =
+      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+  const __m256i src_4 = horz_out[2];
+  const __m256i src_5 =
+      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+    int sy = sy4 + delta * (k + 4);
+    __m256i coeffs[8];
+    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
+    __m256i res_lo, res_hi;
+    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+                                    row);
+    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+                                      res_sub_const, round_bits_const, pred,
+                                      conv_params, i, j, k, reduce_bits_vert,
+                                      p_stride, p_width, round_bits);
+    // Slide the interleaved-row window down two rows for the next pair.
+    src[0] = src[2];
+    src[2] = src[4];
+    src[4] = src[6];
+    src[1] = src[3];
+    src[3] = src[5];
+    src[5] = src[7];
+
+    row += 1;
+  }
+}
+
+// Vertical warp filter specialized for gamma == 0: all columns in a row share
+// one filter phase, so the cheaper gamma0 coefficient loader is used.  Since
+// delta != 0 the coefficients still change per row pair and are recomputed
+// inside the loop.
+static INLINE void warp_vertical_filter_gamma0_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  (void)gamma;
+  int k, row = 0;
+  __m256i src[8];
+  // Build the initial interleaved window of the first six source rows.
+  const __m256i src_0 = horz_out[0];
+  const __m256i src_1 =
+      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+  const __m256i src_2 = horz_out[1];
+  const __m256i src_3 =
+      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+  const __m256i src_4 = horz_out[2];
+  const __m256i src_5 =
+      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+    int sy = sy4 + delta * (k + 4);
+    __m256i coeffs[8];
+    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
+    __m256i res_lo, res_hi;
+    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+                                    row);
+    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+                                      res_sub_const, round_bits_const, pred,
+                                      conv_params, i, j, k, reduce_bits_vert,
+                                      p_stride, p_width, round_bits);
+    // Slide the interleaved-row window down two rows for the next pair.
+    src[0] = src[2];
+    src[2] = src[4];
+    src[4] = src[6];
+    src[1] = src[3];
+    src[3] = src[5];
+    src[5] = src[7];
+    row += 1;
+  }
+}
+
+// Vertical warp filter specialized for delta == 0: the filter phase does not
+// change between rows, so the full coefficient set is computed once before
+// the loop instead of per row pair.
+static INLINE void warp_vertical_filter_delta0_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  (void)delta;
+  int k, row = 0;
+  __m256i src[8], coeffs[8];
+  // Build the initial interleaved window of the first six source rows.
+  const __m256i src_0 = horz_out[0];
+  const __m256i src_1 =
+      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+  const __m256i src_2 = horz_out[1];
+  const __m256i src_3 =
+      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+  const __m256i src_4 = horz_out[2];
+  const __m256i src_5 =
+      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+  // delta == 0: one coefficient computation serves every row pair.
+  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
+
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+    __m256i res_lo, res_hi;
+    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+                                    row);
+    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+                                      res_sub_const, round_bits_const, pred,
+                                      conv_params, i, j, k, reduce_bits_vert,
+                                      p_stride, p_width, round_bits);
+    // Slide the interleaved-row window down two rows for the next pair.
+    src[0] = src[2];
+    src[2] = src[4];
+    src[4] = src[6];
+    src[1] = src[3];
+    src[3] = src[5];
+    src[5] = src[7];
+    row += 1;
+  }
+}
+
+// Vertical warp filter for the fully degenerate case gamma == 0 and
+// delta == 0: a single filter phase applies to every column and every row,
+// so the gamma0 coefficient loader runs once before the loop.
+static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  (void)gamma;
+  int k, row = 0;
+  __m256i src[8], coeffs[8];
+  // Build the initial interleaved window of the first six source rows.
+  const __m256i src_0 = horz_out[0];
+  const __m256i src_1 =
+      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+  const __m256i src_2 = horz_out[1];
+  const __m256i src_3 =
+      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+  const __m256i src_4 = horz_out[2];
+  const __m256i src_5 =
+      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+  // Both step parameters are zero: compute the coefficients once.
+  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
+
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+    __m256i res_lo, res_hi;
+    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+                                    row);
+    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+                                      res_sub_const, round_bits_const, pred,
+                                      conv_params, i, j, k, reduce_bits_vert,
+                                      p_stride, p_width, round_bits);
+    // Slide the interleaved-row window down two rows for the next pair.
+    src[0] = src[2];
+    src[2] = src[4];
+    src[4] = src[6];
+    src[1] = src[3];
+    src[3] = src[5];
+    src[5] = src[7];
+    row += 1;
+  }
+}
+
+// Dispatches to the vertical warp filter variant specialized for whichever
+// of gamma / delta is zero; every variant produces identical output for its
+// parameter class, the specializations only skip redundant coefficient work.
+static INLINE void prepare_warp_vertical_filter_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  if (gamma == 0) {
+    if (delta == 0) {
+      warp_vertical_filter_gamma0_delta0_avx2(
+          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
+          p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits,
+          res_sub_const, round_bits_const, wt);
+    } else {
+      warp_vertical_filter_gamma0_avx2(
+          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
+          p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits,
+          res_sub_const, round_bits_const, wt);
+    }
+  } else if (delta == 0) {
+    warp_vertical_filter_delta0_avx2(
+        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
+        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
+        round_bits_const, wt);
+  } else {
+    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
+                              p_height, p_stride, p_width, i, j, sy4,
+                              reduce_bits_vert, res_add_const, round_bits,
+                              res_sub_const, round_bits_const, wt);
+  }
+}
+
+// Dispatches to the horizontal warp filter variant specialized for whichever
+// of alpha / beta is zero; all variants compute the same horz_out contents.
+static INLINE void prepare_warp_horizontal_filter_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  if (alpha == 0) {
+    if (beta == 0) {
+      warp_horizontal_filter_alpha0_beta0_avx2(
+          ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
+          i, round_const, shift, shuffle_src);
+    } else {
+      warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
+                                         alpha, beta, p_height, height, i,
+                                         round_const, shift, shuffle_src);
+    }
+  } else if (beta == 0) {
+    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
+                                      alpha, beta, p_height, height, i,
+                                      round_const, shift, shuffle_src);
+  } else {
+    warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
+                                beta, p_height, height, i, round_const, shift,
+                                shuffle_src);
+  }
+}
+
+// Computes the accumulated frame error between 'dst' and 'ref' using the
+// error_measure_lut lookup table.  Pixels are processed in 4-row by 16-column
+// tiles via AVX2 gathers; the LUT is indexed by (dst - ref) + 255, i.e. the
+// difference biased into [0, 510].  Columns beyond a multiple of 16 and rows
+// beyond a multiple of 4 fall back to the scalar error_measure() helper.
+// NOTE(review): the 16-byte _mm_load_si128 loads assume every row start
+// (ref + row * ref_stride, dst + row * dst_stride) is 16-byte aligned —
+// confirm callers allocate/align these buffers accordingly.
+int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
+                                  const uint8_t *const dst, int p_width,
+                                  int p_height, int dst_stride) {
+  int64_t sum_error = 0;
+  int i, j;
+  __m256i row_error, col_error;
+  __m256i zero = _mm256_set1_epi16(0);
+  __m256i dup_255 = _mm256_set1_epi16(255);
+  col_error = zero;
+
+  for (i = 0; i < (p_height / 4); i++) {
+    // 32-bit per-lane accumulator, reset for each group of 4 rows.
+    row_error = _mm256_set1_epi16(0);
+    for (j = 0; j < (p_width / 16); j++) {
+      // Load 16 pixels from each of 4 consecutive rows, widened to 16-bit.
+      __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
+      __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
+      __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
+      __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
+      __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
+      __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
+      __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
+      __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
+
+      // Bias differences by +255 so they index error_measure_lut directly.
+      __m256i diff_1 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
+      __m256i diff_2 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
+      __m256i diff_3 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
+      __m256i diff_4 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
+
+      // Widen 16-bit indices to 32-bit for the gathers.
+      __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
+      __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
+      __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
+      __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
+      __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
+      __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
+      __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
+      __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
+
+      // Gather the per-pixel error values from the lookup table.
+      __m256i error_1_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
+      __m256i error_1_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
+      __m256i error_2_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
+      __m256i error_2_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
+      __m256i error_3_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
+      __m256i error_3_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
+      __m256i error_4_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
+      __m256i error_4_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
+
+      // Reduce the 8 gathered vectors into the 32-bit row accumulator.
+      __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
+      __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
+      __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
+      __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
+
+      __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
+      __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
+
+      __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
+      row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
+    }
+    // Widen the 32-bit row sums to 64 bits and fold into the column total.
+    __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
+    __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
+    __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
+    col_error = _mm256_add_epi64(col_error, col_error_temp);
+    // Error summation for remaining width, which is not multiple of 16
+    if (p_width & 0xf) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = j * 16; l < p_width; ++l) {
+          sum_error +=
+              (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
+                                     ref[l + ((i * 4) + k) * ref_stride]);
+        }
+      }
+    }
+  }
+  // Horizontal reduction of the four 64-bit partial sums.
+  __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
+  __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
+  sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
+  int64_t sum_error_d_0, sum_error_d_1;
+  xx_storel_64(&sum_error_d_0, sum_error_q_0);
+  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
+  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
+  // Error summation for remaining height, which is not multiple of 4
+  if (p_height & 0x3) {
+    for (int k = i * 4; k < p_height; ++k) {
+      for (int l = 0; l < p_width; ++l) {
+        sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
+                                            ref[l + k * ref_stride]);
+      }
+    }
+  }
+  return sum_error;
+}
+
+void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
+                          int height, int stride, uint8_t *pred, int p_col,
+                          int p_row, int p_width, int p_height, int p_stride,
+                          int subsampling_x, int subsampling_y,
+                          ConvolveParams *conv_params, int16_t alpha,
+                          int16_t beta, int16_t gamma, int16_t delta) {
+  __m256i horz_out[8];
+  int i, j, k;
+  const int bd = 8;
+  const int reduce_bits_horiz = conv_params->round_0;
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const __m256i reduce_bits_vert_const =
+      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
+  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+  const __m256i round_const = _mm256_set1_epi16(
+      (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
+  const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
+
+  __m256i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
+                                          &res_sub_const, &round_bits_const,
+                                          &wt);
+
+  __m256i res_add_const_1;
+  if (conv_params->is_compound == 1) {
+    res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
+  } else {
+    res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                                        ((1 << reduce_bits_vert) >> 1));
+  }
+  const int32_t const1 = alpha * (-4) + beta * (-4) +
+                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+  const int32_t const2 = gamma * (-4) + delta * (-4) +
+                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+  const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
+  const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
+  const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
+
+  __m256i shuffle_src[4];
+  shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
+  shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
+  shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
+  shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
+
+  for (i = 0; i < p_height; i += 8) {
+    for (j = 0; j < p_width; j += 8) {
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+      const int32_t x4 = dst_x >> subsampling_x;
+      const int32_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      // Add in all the constant terms, including rounding and offset
+      sx4 += const1;
+      sy4 += const2;
+
+      sx4 &= ~const3;
+      sy4 &= ~const3;
+
+      // Horizontal filter
+      // If the block is aligned such that, after clamping, every sample
+      // would be taken from the leftmost/rightmost column, then we can
+      // skip the expensive horizontal filter.
+
+      if (ix4 <= -7) {
+        int iy, row = 0;
+        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+          iy = iy4 + k;
+          iy = clamp(iy, 0, height - 1);
+          const __m256i temp_0 =
+              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+          iy = iy4 + k + 1;
+          iy = clamp(iy, 0, height - 1);
+          const __m256i temp_1 =
+              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
+          row += 1;
+        }
+        iy = iy4 + k;
+        iy = clamp(iy, 0, height - 1);
+        horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+      } else if (ix4 >= width + 6) {
+        int iy, row = 0;
+        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+          iy = iy4 + k;
+          iy = clamp(iy, 0, height - 1);
+          const __m256i temp_0 = _mm256_set1_epi16(
+              const4 + ref[iy * stride + (width - 1)] * const5);
+          iy = iy4 + k + 1;
+          iy = clamp(iy, 0, height - 1);
+          const __m256i temp_1 = _mm256_set1_epi16(
+              const4 + ref[iy * stride + (width - 1)] * const5);
+          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
+          row += 1;
+        }
+        iy = iy4 + k;
+        iy = clamp(iy, 0, height - 1);
+        horz_out[row] =
+            _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
+      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+        const int out_of_boundary_left = -(ix4 - 6);
+        const int out_of_boundary_right = (ix4 + 8) - width;
+        int iy, sx, row = 0;
+        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+          iy = iy4 + k;
+          iy = clamp(iy, 0, height - 1);
+          __m128i src0 =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+          iy = iy4 + k + 1;
+          iy = clamp(iy, 0, height - 1);
+          __m128i src1 =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+          if (out_of_boundary_left >= 0) {
+            const __m128i shuffle_reg_left =
+                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+            src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
+            src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
+          }
+          if (out_of_boundary_right >= 0) {
+            const __m128i shuffle_reg_right = _mm_loadu_si128(
+                (__m128i *)warp_pad_right[out_of_boundary_right]);
+            src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
+            src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
+          }
+          sx = sx4 + beta * (k + 4);
+          const __m256i src_01 =
+              _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
+          horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
+                                 shuffle_src, &round_const, &shift);
+          row += 1;
+        }
+        iy = iy4 + k;
+        iy = clamp(iy, 0, height - 1);
+        __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+        if (out_of_boundary_left >= 0) {
+          const __m128i shuffle_reg_left =
+              _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+          src = _mm_shuffle_epi8(src, shuffle_reg_left);
+        }
+        if (out_of_boundary_right >= 0) {
+          const __m128i shuffle_reg_right =
+              _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
+          src = _mm_shuffle_epi8(src, shuffle_reg_right);
+        }
+        sx = sx4 + beta * (k + 4);
+        const __m256i src_01 = _mm256_castsi128_si256(src);
+        __m256i coeff[4];
+        prepare_horizontal_filter_coeff(alpha, sx, coeff);
+        filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
+                               &round_const, &shift, row);
+      } else {
+        prepare_warp_horizontal_filter_avx2(
+            ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
+            i, &round_const, &shift, shuffle_src);
+      }
+
+      // Vertical filter
+      prepare_warp_vertical_filter_avx2(
+          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
+          p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
+          &res_sub_const, &round_bits_const, &wt);
+    }
+  }
+}
diff --git a/libaom/av1/common/x86/warp_plane_sse2.c b/libaom/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 0000000..6ff6665
--- /dev/null
+++ b/libaom/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
+                                  const uint8_t *const dst, int p_width,
+                                  int p_height, int dst_stride) {
+  int64_t sum_error = 0;
+  int i, j;
+  __m128i row_error, col_error;
+  __m128i zero = _mm_set1_epi16(0);
+  __m128i dup_255 = _mm_set1_epi16(255);
+  col_error = zero;
+  for (i = 0; i < (p_height); i++) {
+    row_error = zero;
+    for (j = 0; j < (p_width / 16); j++) {
+      __m128i ref_8 =
+          _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride)));
+      __m128i dst_8 =
+          _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride)));
+      __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero);
+      __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero);
+      __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero);
+      __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero);
+
+      __m128i diff_1 =
+          _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255);
+      __m128i diff_2 =
+          _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255);
+
+      __m128i error_1_lo =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 2)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 1)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 0)]);
+      __m128i error_1_hi =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 6)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 5)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 4)]);
+      __m128i error_2_lo =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 2)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 1)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 0)]);
+      __m128i error_2_hi =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 6)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 5)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 4)]);
+
+      __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi);
+      __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi);
+      __m128i error_1_2 = _mm_add_epi32(error_1, error_2);
+
+      row_error = _mm_add_epi32(row_error, error_1_2);
+    }
+    __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
+    __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
+    __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
+    col_error = _mm_add_epi64(col_error, col_error_temp);
+    // Error summation for remaining width, which is not multiple of 16
+    if (p_width & 0xf) {
+      for (int l = j * 16; l < p_width; ++l) {
+        sum_error += (int64_t)error_measure(dst[l + i * dst_stride] -
+                                            ref[l + i * ref_stride]);
+      }
+    }
+  }
+  int64_t sum_error_d_0, sum_error_d_1;
+  xx_storel_64(&sum_error_d_0, col_error);
+  xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
+  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
+  return sum_error;
+}
diff --git a/libaom/av1/common/x86/warp_plane_sse4.c b/libaom/av1/common/x86/warp_plane_sse4.c
index 4532d17..10ddf92 100644
--- a/libaom/av1/common/x86/warp_plane_sse4.c
+++ b/libaom/av1/common/x86/warp_plane_sse4.c
@@ -16,7 +16,7 @@
 
 #include "av1/common/warped_motion.h"
 
-/* This is a modified version of 'warped_filter' from warped_motion.c:
+/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
    * Each coefficient is stored in 8 bits instead of 16 bits
    * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
 
@@ -31,8 +31,8 @@
      coefficients into the correct order more quickly.
 */
 /* clang-format off */
-DECLARE_ALIGNED(8, static const int8_t,
-                filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+DECLARE_ALIGNED(8, const int8_t,
+                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
 #if WARPEDPIXEL_PREC_BITS == 6
   // [-1, 0)
   { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
@@ -198,40 +198,53 @@
 // in an SSE register into two sequences:
 // 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
 // 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
-static const uint8_t even_mask[16] = { 0, 2,  2,  4,  4,  6,  6,  8,
-                                       8, 10, 10, 12, 12, 14, 14, 0 };
-static const uint8_t odd_mask[16] = { 1, 3,  3,  5,  5,  7,  7,  9,
-                                      9, 11, 11, 13, 13, 15, 15, 0 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                even_mask[16]) = { 0, 2,  2,  4,  4,  6,  6,  8,
+                                   8, 10, 10, 12, 12, 14, 14, 0 };
 
-static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
-                                                   0, 1, 0, 1, 0, 1, 0, 1 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                odd_mask[16]) = { 1, 3,  3,  5,  5,  7,  7,  9,
+                                  9, 11, 11, 13, 13, 15, 15, 0 };
 
-static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
-                                                   2, 3, 2, 3, 2, 3, 2, 3 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
+                                               0, 1, 0, 1, 0, 1, 0, 1 };
 
-static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
-                                                   4, 5, 4, 5, 4, 5, 4, 5 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
+                                               2, 3, 2, 3, 2, 3, 2, 3 };
 
-static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
-                                                   6, 7, 6, 7, 6, 7, 6, 7 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
+                                               4, 5, 4, 5, 4, 5, 4, 5 };
 
-static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
-                                                  0, 1, 2, 3, 0, 1, 2, 3 };
-static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
-                                                  4, 5, 6, 7, 4, 5, 6, 7 };
-static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
-                                                  8, 9, 10, 11, 8, 9, 10, 11 };
-static const uint8_t shuffle_gamma0_mask3[16] = {
-  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
-};
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
+                                               6, 7, 6, 7, 6, 7, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
+                                              0, 1, 2, 3, 0, 1, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
+                                              4, 5, 6, 7, 4, 5, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
+                                              8, 9, 10, 11, 8, 9, 10, 11 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
+                                              12, 13, 14, 15, 12, 13, 14, 15 };
 
 static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                      const int offset_bits_horiz,
                                      const int reduce_bits_horiz, int k) {
   const __m128i src_even =
-      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
   const __m128i src_odd =
-      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
   // The pixel order we need for 'src' is:
   // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
   const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
@@ -271,21 +284,21 @@
                                                    __m128i *coeff) {
   // Filter even-index pixels
   const __m128i tmp_0 = _mm_loadl_epi64(
-      (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
   const __m128i tmp_1 = _mm_loadl_epi64(
-      (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
   const __m128i tmp_2 = _mm_loadl_epi64(
-      (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
   const __m128i tmp_3 = _mm_loadl_epi64(
-      (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
   const __m128i tmp_4 = _mm_loadl_epi64(
-      (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
   const __m128i tmp_5 = _mm_loadl_epi64(
-      (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
   const __m128i tmp_6 = _mm_loadl_epi64(
-      (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
   const __m128i tmp_7 = _mm_loadl_epi64(
-      (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
 
   // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
   const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
@@ -319,20 +332,20 @@
                                                           __m128i *coeff) {
   // Filter even-index pixels
   const __m128i tmp_0 =
-      _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
 
   // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
-  coeff[0] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+  coeff[0] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
   // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
-  coeff[1] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+  coeff[1] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
   // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
-  coeff[2] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+  coeff[2] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
   // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
-  coeff[3] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+  coeff[3] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
 }
 
 static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
@@ -449,21 +462,25 @@
 
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16(w0);
-  const __m128i wt1 = _mm_set1_epi16(w1);
+  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
+  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
   *wt = _mm_unpacklo_epi16(wt0, wt1);
 }
 
 static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                   __m128i *coeffs) {
-  const __m128i tmp_0 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_2 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_4 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_6 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_0 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_2 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_4 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_6 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
 
   const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
   const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
@@ -476,14 +493,18 @@
   coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
   coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
-  const __m128i tmp_1 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_3 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_5 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_7 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_1 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_3 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_5 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_7 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
 
   const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
   const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
@@ -500,17 +521,17 @@
 static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                          __m128i *coeffs) {
   const __m128i tmp_0 = _mm_loadu_si128(
-      (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
 
   // even coeffs
   coeffs[0] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
   coeffs[1] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
   coeffs[2] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
   coeffs[3] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));
 
   // odd coeffs
   coeffs[4] = coeffs[0];
diff --git a/libaom/av1/common/x86/wiener_convolve_avx2.c b/libaom/av1/common/x86/wiener_convolve_avx2.c
index 87a6e12..b7ac683 100644
--- a/libaom/av1/common/x86/wiener_convolve_avx2.c
+++ b/libaom/av1/common/x86/wiener_convolve_avx2.c
@@ -17,6 +17,7 @@
 #include "av1/common/convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 
@@ -25,6 +26,20 @@
 // on the left.
 // A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
 // loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
+
+// Exploiting the range of wiener filter coefficients,
+// horizontal filtering can be done in 16 bit intermediate precision.
+// The details are as follows :
+// Consider the horizontal wiener filter coefficients of the following form :
+//      [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0]
+// Subtracting  2^(FILTER_BITS) from the centre tap we get the following  :
+//      [C0, C1, C2,     -2 * (C0 + C1 + C2),             C2, C1, C0]
+// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3
+// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit
+// precision. Finally, after rounding the above result by round_0, we multiply
+// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the
+// horizontal filter output.
+
 void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
@@ -37,224 +52,190 @@
   (void)x_step_q4;
   (void)y_step_q4;
 
-  DECLARE_ALIGNED(32, uint16_t,
-                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
-  int intermediate_height = h + SUBPEL_TAPS - 2;
-  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
+  int im_h = h + SUBPEL_TAPS - 2;
+  int im_stride = 8;
+  memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
+  int i, j;
+  const int center_tap = (SUBPEL_TAPS - 1) / 2;
   const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
 
-  const __m128i zero_128 = _mm_setzero_si128();
-  const __m256i zero_256 = _mm256_setzero_si256();
+  __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
 
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+  assert(conv_params->round_0 > 0);
 
-  const __m256i clamp_low = zero_256;
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
+
+  const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
+  const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs_h[0] =
+      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs_h[1] =
+      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs_h[2] =
+      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs_h[3] =
+      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
+
+  const __m256i round_const_h =
+      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
+  const __m256i round_const_horz =
+      _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
+  const __m256i clamp_low = _mm256_setzero_si256();
   const __m256i clamp_high =
       _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
 
-  /* Horizontal filter */
-  {
-    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
-    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+  // Add an offset to account for the "add_src" part of the convolve function.
+  const __m128i zero_128 = _mm_setzero_si128();
+  const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+  const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
 
-    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
-    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
-    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+  const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
 
-    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
-    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
-    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
-    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
-    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
-    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
 
-    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
-    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
-    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
-    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
-    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
-    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
-    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
-    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+  const __m256i round_const_v =
+      _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                        (1 << (bd + conv_params->round_1 - 1)));
+  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
 
-    const __m256i round_const = _mm256_set1_epi32(
-        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+  for (j = 0; j < w; j += 8) {
+    for (i = 0; i < im_h; i += 2) {
+      __m256i data = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
 
-    for (int i = 0; i < intermediate_height; ++i) {
-      for (int j = 0; j < w; j += 16) {
-        const uint8_t *data_ij = src_ptr + i * src_stride + j;
+      // Load the next line
+      if (i + 1 < im_h)
+        data = _mm256_inserti128_si256(
+            data,
+            _mm_loadu_si128(
+                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+            1);
 
-        // Load 8-bit src data
-        const __m128i data_0 = xx_loadu_128(data_ij + 0);
-        const __m128i data_1 = xx_loadu_128(data_ij + 1);
-        const __m128i data_2 = xx_loadu_128(data_ij + 2);
-        const __m128i data_3 = xx_loadu_128(data_ij + 3);
-        const __m128i data_4 = xx_loadu_128(data_ij + 4);
-        const __m128i data_5 = xx_loadu_128(data_ij + 5);
-        const __m128i data_6 = xx_loadu_128(data_ij + 6);
-        const __m128i data_7 = xx_loadu_128(data_ij + 7);
+      __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
 
-        // (Zero-)Extend 8-bit data to 16-bit data
-        const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
-        const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
-        const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
-        const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
-        const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
-        const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
-        const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
-        const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
+      res =
+          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
 
-        // Multiply src data by filter coeffs and sum pairs
-        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
-        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
-        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
-        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
-        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
-        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
-        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
-        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+      __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
 
-        // Calculate scalar product for even- and odd-indices separately,
-        // increasing to 32-bit precision
-        const __m256i res_even_sum = _mm256_add_epi32(
-            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
-        const __m256i res_odd_sum = _mm256_add_epi32(
-            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
-
-        const __m256i res_even = _mm256_srai_epi32(
-            _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
-        const __m256i res_odd = _mm256_srai_epi32(
-            _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
-
-        // Reduce to 16-bit precision and pack even- and odd-index results
-        // back into one register. The _mm256_packs_epi32 intrinsic returns
-        // a register with the pixels ordered as follows:
-        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
-        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
-        const __m256i res_clamped =
-            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
-
-        // Store in a temporary array
-        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
-      }
+      // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
+      // the result
+      data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
+      res = _mm256_add_epi16(res, data_0);
+      res = _mm256_add_epi16(res, round_const_horz);
+      const __m256i res_clamped =
+          _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
     }
-  }
 
-  /* Vertical filter */
-  {
-    // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
-    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+    /* Vertical filter */
+    {
+      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+      __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+      __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
 
-    // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
-    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
-    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+      __m256i s[8];
+      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
+      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
+      s[2] = _mm256_unpacklo_epi16(src_4, src_5);
 
-    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
-    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
-    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
-    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
-    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
-    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+      s[4] = _mm256_unpackhi_epi16(src_0, src_1);
+      s[5] = _mm256_unpackhi_epi16(src_2, src_3);
+      s[6] = _mm256_unpackhi_epi16(src_4, src_5);
 
-    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
-    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
-    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
-    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
-    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
-    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
-    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
-    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+      for (i = 0; i < h - 1; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
 
-    const __m256i round_const =
-        _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
-                          (1 << (bd + conv_params->round_1 - 1)));
+        const __m256i s6 =
+            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+        const __m256i s7 =
+            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
 
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; j += 16) {
-        const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
+        s[3] = _mm256_unpacklo_epi16(s6, s7);
+        s[7] = _mm256_unpackhi_epi16(s6, s7);
 
-        // Load 16-bit data from the output of the horizontal filter in
-        // which the pixels are ordered as follows:
-        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
-        const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
-        const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
-        const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
-        const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
-        const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
-        const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
-        const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
-        const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
+        __m256i res_a = convolve(s, coeffs_v);
+        __m256i res_b = convolve(s + 4, coeffs_v);
 
-        // Filter the even-indices, increasing to 32-bit precision
-        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
-        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
-        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
-        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+        const __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+        const __m256i res_b_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
 
-        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
-        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
-        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
-        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+        /* rounding code */
+        // 16 bit conversion
+        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+        // 8 bit conversion and saturation to uint8
+        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
 
-        const __m256i res_even = _mm256_add_epi32(
-            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
 
-        // Filter the odd-indices, increasing to 32-bit precision
-        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
-        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
-        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
-        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+        // Store values into the destination buffer
+        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
 
-        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
-        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
-        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
-        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+        _mm_storel_epi64(p_0, res_0);
+        _mm_storel_epi64(p_1, res_1);
 
-        const __m256i res_odd = _mm256_add_epi32(
-            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
 
-        // Pixels are currently in the following order:
-        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
-        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
-        //
-        // Rearrange the pixels into the following order:
-        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
-        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
-        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
-        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+      if (h - i) {
+        s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
+        s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
+        s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
 
-        const __m256i res_lo_round = _mm256_srai_epi32(
-            _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
-        const __m256i res_hi_round = _mm256_srai_epi32(
-            _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+        const int16_t *data = &im_block[i * im_stride];
+        const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
+        const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
 
-        // Reduce to 16-bit precision and pack into the correct order:
-        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
-        const __m256i res_16bit =
-            _mm256_packs_epi32(res_lo_round, res_hi_round);
+        __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
+        __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
 
-        // Reduce to 8-bit precision. This messes up the order:
-        // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
-        // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
-        const __m256i res_8bit =
-            _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
+        s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
+        __m256i convolveres = convolve(s, coeffs_v);
 
-        // Swap the two central 32-bit values to get the order:
-        // [ - - - - - - - - - - - - - - - - ]
-        // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
-        const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
+        const __m256i res_round = _mm256_sra_epi32(
+            _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
 
-        // Store the lower 128-bit lane in the dst array
-        xx_storeu_128(dst + i * dst_stride + j,
-                      _mm256_castsi256_si128(res_8bit2));
+        /* rounding code */
+        // 16 bit conversion
+        __m128i reslo = _mm256_castsi256_si128(res_round);
+        __m128i reshi = _mm256_extracti128_si256(res_round, 1);
+        const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
+
+        // 8 bit conversion and saturation to uint8
+        const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
+        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+        _mm_storel_epi64(p_0, res_8b);
       }
     }
   }
diff --git a/libaom/av1/decoder/accounting.c b/libaom/av1/decoder/accounting.c
index 8d8f3df..2e58d09 100644
--- a/libaom/av1/decoder/accounting.c
+++ b/libaom/av1/decoder/accounting.c
@@ -17,7 +17,7 @@
 #include "aom/aom_integer.h"
 #include "av1/decoder/accounting.h"
 
-static int aom_accounting_hash(const char *str) {
+static int accounting_hash(const char *str) {
   uint32_t val;
   const unsigned char *ustr;
   val = 0;
@@ -34,7 +34,7 @@
   size_t len;
   AccountingDictionary *dictionary;
   dictionary = &accounting->syms.dictionary;
-  hash = aom_accounting_hash(str);
+  hash = accounting_hash(str);
   while (accounting->hash_dictionary[hash] != -1) {
     if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) {
       return accounting->hash_dictionary[hash];
diff --git a/libaom/av1/decoder/accounting.h b/libaom/av1/decoder/accounting.h
index 288e5e6..ad2e8b6 100644
--- a/libaom/av1/decoder/accounting.h
+++ b/libaom/av1/decoder/accounting.h
@@ -42,7 +42,7 @@
 
 /** Dictionary for translating strings into id. */
 typedef struct {
-  char *(strs[MAX_SYMBOL_TYPES]);
+  char *strs[MAX_SYMBOL_TYPES];
   int num_strs;
 } AccountingDictionary;
 
diff --git a/libaom/av1/decoder/decodeframe.c b/libaom/av1/decoder/decodeframe.c
index b7fc370..7abfac4 100644
--- a/libaom/av1/decoder/decodeframe.c
+++ b/libaom/av1/decoder/decodeframe.c
@@ -88,9 +88,9 @@
 }
 
 // Use only_chroma = 1 to only set the chroma planes
-static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params,
-                                       const YV12_BUFFER_CONFIG *const buf,
-                                       int only_chroma) {
+static AOM_INLINE void set_planes_to_neutral_grey(
+    const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf,
+    int only_chroma) {
   if (seq_params->use_highbitdepth) {
     const int val = 1 << (seq_params->bit_depth - 1);
     for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
@@ -117,28 +117,17 @@
   }
 }
 
-static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
-                                            MACROBLOCKD *xd,
-                                            aom_reader *const r, int plane,
-                                            int runit_idx);
-
-static void setup_compound_reference_mode(AV1_COMMON *cm) {
-  cm->comp_fwd_ref[0] = LAST_FRAME;
-  cm->comp_fwd_ref[1] = LAST2_FRAME;
-  cm->comp_fwd_ref[2] = LAST3_FRAME;
-  cm->comp_fwd_ref[3] = GOLDEN_FRAME;
-
-  cm->comp_bwd_ref[0] = BWDREF_FRAME;
-  cm->comp_bwd_ref[1] = ALTREF2_FRAME;
-  cm->comp_bwd_ref[2] = ALTREF_FRAME;
-}
+static AOM_INLINE void loop_restoration_read_sb_coeffs(
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane,
+    int runit_idx);
 
 static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
   return len != 0 && len <= (size_t)(end - start);
 }
 
-static TX_MODE read_tx_mode(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-  if (cm->coded_lossless) return ONLY_4X4;
+static TX_MODE read_tx_mode(struct aom_read_bit_buffer *rb,
+                            int coded_lossless) {
+  if (coded_lossless) return ONLY_4X4;
   return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST;
 }
 
@@ -151,10 +140,11 @@
   }
 }
 
-static void inverse_transform_block(MACROBLOCKD *xd, int plane,
-                                    const TX_TYPE tx_type,
-                                    const TX_SIZE tx_size, uint8_t *dst,
-                                    int stride, int reduced_tx_set) {
+static AOM_INLINE void inverse_transform_block(MACROBLOCKD *xd, int plane,
+                                               const TX_TYPE tx_type,
+                                               const TX_SIZE tx_size,
+                                               uint8_t *dst, int stride,
+                                               int reduced_tx_set) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = pd->dqcoeff_block + xd->cb_offset[plane];
   eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
@@ -165,11 +155,9 @@
   memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
 }
 
-static void read_coeffs_tx_intra_block(const AV1_COMMON *const cm,
-                                       MACROBLOCKD *const xd,
-                                       aom_reader *const r, const int plane,
-                                       const int row, const int col,
-                                       const TX_SIZE tx_size) {
+static AOM_INLINE void read_coeffs_tx_intra_block(
+    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const int plane, const int row, const int col, const TX_SIZE tx_size) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   if (!mbmi->skip) {
 #if TXCOEFF_TIMER
@@ -186,10 +174,11 @@
   }
 }
 
-static void decode_block_void(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                              aom_reader *const r, const int plane,
-                              const int row, const int col,
-                              const TX_SIZE tx_size) {
+static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm,
+                                         MACROBLOCKD *const xd,
+                                         aom_reader *const r, const int plane,
+                                         const int row, const int col,
+                                         const TX_SIZE tx_size) {
   (void)cm;
   (void)xd;
   (void)r;
@@ -199,23 +188,21 @@
   (void)tx_size;
 }
 
-static void predict_inter_block_void(AV1_COMMON *const cm,
-                                     MACROBLOCKD *const xd, int mi_row,
-                                     int mi_col, BLOCK_SIZE bsize) {
+static AOM_INLINE void predict_inter_block_void(AV1_COMMON *const cm,
+                                                MACROBLOCKD *const xd,
+                                                BLOCK_SIZE bsize) {
   (void)cm;
   (void)xd;
-  (void)mi_row;
-  (void)mi_col;
   (void)bsize;
 }
 
-static void cfl_store_inter_block_void(AV1_COMMON *const cm,
-                                       MACROBLOCKD *const xd) {
+static AOM_INLINE void cfl_store_inter_block_void(AV1_COMMON *const cm,
+                                                  MACROBLOCKD *const xd) {
   (void)cm;
   (void)xd;
 }
 
-static void predict_and_reconstruct_intra_block(
+static AOM_INLINE void predict_and_reconstruct_intra_block(
     const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
     const int plane, const int row, const int col, const TX_SIZE tx_size) {
   (void)r;
@@ -226,16 +213,15 @@
 
   if (!mbmi->skip) {
     struct macroblockd_plane *const pd = &xd->plane[plane];
-
-    // tx_type will be read out in av1_read_coeffs_txb_facade
-    const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size,
-                                            cm->reduced_tx_set_used);
     eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
     if (eob_data->eob) {
-      uint8_t *dst =
-          &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+      const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
+      // tx_type was read out in av1_read_coeffs_txb.
+      const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
+                                              reduced_tx_set_used);
+      uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
       inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
-                              cm->reduced_tx_set_used);
+                              reduced_tx_set_used);
     }
   }
   if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
@@ -243,29 +229,29 @@
   }
 }
 
-static void inverse_transform_inter_block(const AV1_COMMON *const cm,
-                                          MACROBLOCKD *const xd,
-                                          aom_reader *const r, const int plane,
-                                          const int blk_row, const int blk_col,
-                                          const TX_SIZE tx_size) {
+static AOM_INLINE void inverse_transform_inter_block(
+    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const int plane, const int blk_row, const int blk_col,
+    const TX_SIZE tx_size) {
   (void)r;
   PLANE_TYPE plane_type = get_plane_type(plane);
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-
-  // tx_type will be read out in av1_read_coeffs_txb_facade
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
+  const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
+  // tx_type was read out in av1_read_coeffs_txb.
+  const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
+                                          tx_size, reduced_tx_set_used);
 
   uint8_t *dst =
-      &pd->dst
-           .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+      &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
   inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
-                          cm->reduced_tx_set_used);
+                          reduced_tx_set_used);
 #if CONFIG_MISMATCH_DEBUG
   int pixel_c, pixel_r;
   BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
   int blk_w = block_size_wide[bsize];
   int blk_h = block_size_high[bsize];
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
   mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
                   pd->subsampling_x, pd->subsampling_y);
   mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
@@ -274,18 +260,17 @@
 #endif
 }
 
-static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size,
-                                  int plane) {
+static AOM_INLINE void set_cb_buffer_offsets(MACROBLOCKD *const xd,
+                                             TX_SIZE tx_size, int plane) {
   xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
   xd->txb_offset[plane] =
       xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
 }
 
-static void decode_reconstruct_tx(AV1_COMMON *cm, ThreadData *const td,
-                                  aom_reader *r, MB_MODE_INFO *const mbmi,
-                                  int plane, BLOCK_SIZE plane_bsize,
-                                  int blk_row, int blk_col, int block,
-                                  TX_SIZE tx_size, int *eob_total) {
+static AOM_INLINE void decode_reconstruct_tx(
+    AV1_COMMON *cm, ThreadData *const td, aom_reader *r,
+    MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row,
+    int blk_col, int block, TX_SIZE tx_size, int *eob_total) {
   MACROBLOCKD *const xd = &td->xd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const TX_SIZE plane_tx_size =
@@ -333,61 +318,58 @@
   }
 }
 
-static void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                        BLOCK_SIZE bsize, int mi_row, int mi_col, int bw,
-                        int bh, int x_mis, int y_mis) {
+static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                   int bw, int bh, int x_mis, int y_mis) {
   const int num_planes = av1_num_planes(cm);
-
-  const int offset = mi_row * cm->mi_stride + mi_col;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const TileInfo *const tile = &xd->tile;
 
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->mi[0] = &cm->mi[offset];
-  // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
-  // passing bsize from decode_partition().
+  set_mi_offsets(mi_params, xd, mi_row, mi_col);
   xd->mi[0]->sb_type = bsize;
 #if CONFIG_RD_DEBUG
   xd->mi[0]->mi_row = mi_row;
   xd->mi[0]->mi_col = mi_col;
 #endif
-  xd->cfl.mi_row = mi_row;
-  xd->cfl.mi_col = mi_col;
 
   assert(x_mis && y_mis);
   for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0];
-  int idx = cm->mi_stride;
+  int idx = mi_params->mi_stride;
   for (int y = 1; y < y_mis; ++y) {
     memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0]));
-    idx += cm->mi_stride;
+    idx += mi_params->mi_stride;
   }
 
   set_plane_n4(xd, bw, bh, num_planes);
-  set_skip_context(xd, mi_row, mi_col, num_planes);
+  set_entropy_context(xd, mi_row, mi_col, num_planes);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+                 mi_params->mi_cols);
 
   av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
                        num_planes);
 }
 
-static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-                              int mi_row, int mi_col, aom_reader *r,
-                              PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi,
+                                         MACROBLOCKD *const xd, int mi_row,
+                                         int mi_col, aom_reader *r,
+                                         PARTITION_TYPE partition,
+                                         BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
   const SequenceHeader *const seq_params = &cm->seq_params;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
-  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+  const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row);
 
 #if CONFIG_ACCOUNTING
   aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
 #endif
   set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
   xd->mi[0]->partition = partition;
-  av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+  av1_read_mode_info(pbi, xd, r, x_mis, y_mis);
   if (bsize >= BLOCK_8X8 &&
       (seq_params->subsampling_x || seq_params->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -406,9 +388,11 @@
   int y1;
 } PadBlock;
 
-static void highbd_build_mc_border(const uint8_t *src8, int src_stride,
-                                   uint8_t *dst8, int dst_stride, int x, int y,
-                                   int b_w, int b_h, int w, int h) {
+#if CONFIG_AV1_HIGHBITDEPTH
+static AOM_INLINE void highbd_build_mc_border(const uint8_t *src8,
+                                              int src_stride, uint8_t *dst8,
+                                              int dst_stride, int x, int y,
+                                              int b_w, int b_h, int w, int h) {
   // Get a pointer to the start of the real data for this row.
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -443,10 +427,11 @@
     if (y > 0 && y < h) ref_row += src_stride;
   } while (--b_h);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
-static void build_mc_border(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int x, int y, int b_w, int b_h,
-                            int w, int h) {
+static AOM_INLINE void build_mc_border(const uint8_t *src, int src_stride,
+                                       uint8_t *dst, int dst_stride, int x,
+                                       int y, int b_w, int b_h, int w, int h) {
   // Get a pointer to the start of the real data for this row.
   const uint8_t *ref_row = src - x - y * src_stride;
 
@@ -533,6 +518,7 @@
     const int b_w = block.x1 - block.x0;
     const int b_h = block.y1 - block.y0;
 
+#if CONFIG_AV1_HIGHBITDEPTH
     // Extend the border.
     if (highbd) {
       highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0,
@@ -542,27 +528,36 @@
       build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
                       b_h, pre_buf->width, pre_buf->height);
     }
+#else
+    (void)highbd;
+    build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
+                    b_h, pre_buf->width, pre_buf->height);
+#endif
     *src_stride = b_w;
     *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w +
            x_pad * (AOM_INTERP_EXTEND - 1);
   }
 }
 
-static INLINE void dec_calc_subpel_params(
-    MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
-    int plane, const int pre_x, const int pre_y, int x, int y,
-    struct buf_2d *const pre_buf, SubpelParams *subpel_params, int bw, int bh,
-    PadBlock *block, int mi_x, int mi_y, MV32 *scaled_mv, int *subpel_x_mv,
-    int *subpel_y_mv) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
+static void dec_calc_subpel_params(const MV *const src_mv,
+                                   InterPredParams *const inter_pred_params,
+                                   const MACROBLOCKD *const xd, int mi_x,
+                                   int mi_y, uint8_t **pre,
+                                   SubpelParams *subpel_params, int *src_stride,
+                                   PadBlock *block, MV32 *scaled_mv,
+                                   int *subpel_x_mv, int *subpel_y_mv) {
+  const struct scale_factors *sf = inter_pred_params->scale_factors;
+  struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+  const int bw = inter_pred_params->block_width;
+  const int bh = inter_pred_params->block_height;
   const int is_scaled = av1_is_scaled(sf);
   if (is_scaled) {
-    int ssx = pd->subsampling_x;
-    int ssy = pd->subsampling_y;
-    int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
-    orig_pos_y += mv.row * (1 << (1 - ssy));
-    int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
-    orig_pos_x += mv.col * (1 << (1 - ssx));
+    int ssx = inter_pred_params->subsampling_x;
+    int ssy = inter_pred_params->subsampling_y;
+    int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+    orig_pos_y += src_mv->row * (1 << (1 - ssy));
+    int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+    orig_pos_x += src_mv->col * (1 << (1 - ssx));
     int pos_y = sf->scale_value_y(orig_pos_y, sf);
     int pos_x = sf->scale_value_x(orig_pos_x, sf);
     pos_x += SCALE_EXTRA_OFF;
@@ -592,9 +587,10 @@
         ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1;
 
     MV temp_mv;
-    temp_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x,
-                                        pd->subsampling_y);
-    *scaled_mv = av1_scale_mv(&temp_mv, (mi_x + x), (mi_y + y), sf);
+    temp_mv = clamp_mv_to_umv_border_sb(xd, src_mv, bw, bh,
+                                        inter_pred_params->subsampling_x,
+                                        inter_pred_params->subsampling_y);
+    *scaled_mv = av1_scale_mv(&temp_mv, mi_x, mi_y, sf);
     scaled_mv->row += SCALE_EXTRA_OFF;
     scaled_mv->col += SCALE_EXTRA_OFF;
 
@@ -602,11 +598,12 @@
     *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK;
   } else {
     // Get block position in current frame.
-    int pos_x = (pre_x + x) << SUBPEL_BITS;
-    int pos_y = (pre_y + y) << SUBPEL_BITS;
+    int pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+    int pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
 
     const MV mv_q4 = clamp_mv_to_umv_border_sb(
-        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+        xd, src_mv, bw, bh, inter_pred_params->subsampling_x,
+        inter_pred_params->subsampling_y);
     subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
     subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
     subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
@@ -626,295 +623,79 @@
     *subpel_x_mv = scaled_mv->col & SUBPEL_MASK;
     *subpel_y_mv = scaled_mv->row & SUBPEL_MASK;
   }
+  *pre = pre_buf->buf0 + block->y0 * pre_buf->stride + block->x0;
+  *src_stride = pre_buf->stride;
 }
 
-static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
-                                              MACROBLOCKD *xd, int plane,
-                                              const MB_MODE_INFO *mi,
-                                              int build_for_obmc, int bw,
-                                              int bh, int mi_x, int mi_y) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  int is_compound = has_second_ref(mi);
-  int ref;
-  const int is_intrabc = is_intrabc_block(mi);
-  assert(IMPLIES(is_intrabc, !is_compound));
-  int is_global[2] = { 0, 0 };
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
-  }
-
-  const BLOCK_SIZE bsize = mi->sb_type;
-  const int ss_x = pd->subsampling_x;
-  const int ss_y = pd->subsampling_y;
-  int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
-                     (block_size_high[bsize] < 8 && ss_y);
-
-  if (is_intrabc) sub8x8_inter = 0;
-
-  // For sub8x8 chroma blocks, we may be covering more than one luma block's
-  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
-  // the top-left corner of the prediction source - the correct top-left corner
-  // is at (pre_x, pre_y).
-  const int row_start =
-      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
-  const int col_start =
-      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
-  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
-  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
-  sub8x8_inter = sub8x8_inter && !build_for_obmc;
-  if (sub8x8_inter) {
-    for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
-      for (int col = col_start; col <= 0; ++col) {
-        const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-        if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
-        if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
-      }
-    }
-  }
-
-  if (sub8x8_inter) {
-    // block size
-    const int b4_w = block_size_wide[bsize] >> ss_x;
-    const int b4_h = block_size_high[bsize] >> ss_y;
-    const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
-    const int b8_w = block_size_wide[plane_bsize] >> ss_x;
-    const int b8_h = block_size_high[plane_bsize] >> ss_y;
-    assert(!is_compound);
-
-    const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
-
-    int row = row_start;
-    int src_stride;
-    for (int y = 0; y < b8_h; y += b4_h) {
-      int col = col_start;
-      for (int x = 0; x < b8_w; x += b4_w) {
-        MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-        is_compound = has_second_ref(this_mbmi);
-        int tmp_dst_stride = 8;
-        assert(bw < 8 || bh < 8);
-        ConvolveParams conv_params = get_conv_params_no_round(
-            0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
-        conv_params.use_dist_wtd_comp_avg = 0;
-        struct buf_2d *const dst_buf = &pd->dst;
-        uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
-
-        ref = 0;
-        const RefCntBuffer *ref_buf =
-            get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
-        const struct scale_factors *ref_scale_factors =
-            get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
-
-        pd->pre[ref].buf0 =
-            (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer;
-        pd->pre[ref].buf =
-            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
-                                                     ref_buf->buf.uv_stride,
-                                                     ref_scale_factors);
-        pd->pre[ref].width = ref_buf->buf.uv_crop_width;
-        pd->pre[ref].height = ref_buf->buf.uv_crop_height;
-        pd->pre[ref].stride = ref_buf->buf.uv_stride;
-
-        const struct scale_factors *const sf =
-            is_intrabc ? &cm->sf_identity : ref_scale_factors;
-        struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-
-        const MV mv = this_mbmi->mv[ref].as_mv;
-
-        uint8_t *pre;
-        SubpelParams subpel_params;
-        PadBlock block;
-        MV32 scaled_mv;
-        int subpel_x_mv, subpel_y_mv;
-        int highbd;
-        WarpTypesAllowed warp_types;
-        warp_types.global_warp_allowed = is_global[ref];
-        warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
-
-        dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf,
-                               &subpel_params, bw, bh, &block, mi_x, mi_y,
-                               &scaled_mv, &subpel_x_mv, &subpel_y_mv);
-        pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
-        src_stride = pre_buf->stride;
-        highbd = is_cur_buf_hbd(xd);
-        extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
-                         subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
-                         &pre, &src_stride);
-        conv_params.do_average = ref;
-        if (is_masked_compound_type(mi->interinter_comp.type)) {
-          // masked compound type has its own average mechanism
-          conv_params.do_average = 0;
-        }
-
-        av1_make_inter_predictor(
-            pre, src_stride, dst, dst_buf->stride, &subpel_params, sf, b4_w,
-            b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
-            (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
-            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
-
-        ++col;
-      }
-      ++row;
-    }
-
-    for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
-    return;
-  }
-
-  {
-    struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf;
-    uint8_t *pre[2];
-    SubpelParams subpel_params[2];
-    int src_stride[2];
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf =
-          is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
-      struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-      const MV mv = mi->mv[ref].as_mv;
-      PadBlock block;
-      MV32 scaled_mv;
-      int subpel_x_mv, subpel_y_mv;
-      int highbd;
-
-      dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf,
-                             &subpel_params[ref], bw, bh, &block, mi_x, mi_y,
-                             &scaled_mv, &subpel_x_mv, &subpel_y_mv);
-      pre[ref] = pre_buf->buf0 + (int64_t)block.y0 * pre_buf->stride + block.x0;
-      src_stride[ref] = pre_buf->stride;
-      highbd = is_cur_buf_hbd(xd);
-
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global[ref];
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-      int do_warp = (bw >= 8 && bh >= 8 &&
-                     av1_allow_warp(mi, &warp_types,
-                                    &xd->global_motion[mi->ref_frame[ref]],
-                                    build_for_obmc, sf, NULL));
-      do_warp = (do_warp && xd->cur_frame_force_integer_mv == 0);
-
-      extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv, subpel_y_mv,
-                       do_warp, is_intrabc, highbd, xd->mc_buf[ref], &pre[ref],
-                       &src_stride[ref]);
-    }
-
-    ConvolveParams conv_params = get_conv_params_no_round(
-        0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
-    av1_dist_wtd_comp_weight_assign(
-        cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset,
-        &conv_params.use_dist_wtd_comp_avg, is_compound);
-
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf =
-          is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global[ref];
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-      conv_params.do_average = ref;
-      if (is_masked_compound_type(mi->interinter_comp.type)) {
-        // masked compound type has its own average mechanism
-        conv_params.do_average = 0;
-      }
-
-      if (ref && is_masked_compound_type(mi->interinter_comp.type))
-        av1_make_masked_inter_predictor(
-            pre[ref], src_stride[ref], dst, dst_buf->stride,
-            &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters,
-            plane, &warp_types, mi_x >> pd->subsampling_x,
-            mi_y >> pd->subsampling_y, ref, xd, cm->allow_warped_motion);
-      else
-        av1_make_inter_predictor(
-            pre[ref], src_stride[ref], dst, dst_buf->stride,
-            &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters,
-            &warp_types, mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y,
-            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
-    }
-  }
+static void dec_calc_subpel_params_and_extend(
+    const MV *const src_mv, InterPredParams *const inter_pred_params,
+    MACROBLOCKD *xd, int mi_x, int mi_y, int ref, uint8_t **pre,
+    SubpelParams *subpel_params, int *src_stride) {
+  PadBlock block;
+  MV32 scaled_mv;
+  int subpel_x_mv, subpel_y_mv;
+  dec_calc_subpel_params(src_mv, inter_pred_params, xd, mi_x, mi_y, pre,
+                         subpel_params, src_stride, &block, &scaled_mv,
+                         &subpel_x_mv, &subpel_y_mv);
+  extend_mc_border(
+      inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf,
+      scaled_mv, block, subpel_x_mv, subpel_y_mv,
+      inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc,
+      inter_pred_params->use_hbd_buf, xd->mc_buf[ref], pre, src_stride);
 }
 
-static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm,
-                                                  MACROBLOCKD *xd,
-                                                  BLOCK_SIZE bsize, int mi_row,
-                                                  int mi_col, int plane_from,
-                                                  int plane_to) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const struct macroblockd_plane *pd = &xd->plane[plane];
-    const int bw = pd->width;
-    const int bh = pd->height;
-
-    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                             pd->subsampling_y))
-      continue;
-
-    dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
-  }
+static void dec_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                       int plane, const MB_MODE_INFO *mi,
+                                       int build_for_obmc, int bw, int bh,
+                                       int mi_x, int mi_y) {
+  av1_build_inter_predictors(cm, xd, plane, mi, build_for_obmc, bw, bh, mi_x,
+                             mi_y, dec_calc_subpel_params_and_extend);
 }
 
-static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
-                                           MACROBLOCKD *xd, int mi_row,
-                                           int mi_col, const BUFFER_SET *ctx,
-                                           BLOCK_SIZE bsize) {
-  dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
-
-  if (is_interintra_pred(xd->mi[0])) {
-    BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
-                               { xd->plane[0].dst.stride, 0, 0 } };
-    if (!ctx) ctx = &default_ctx;
-    av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
-                                        xd->plane[0].dst.stride, ctx, 0, bsize);
-  }
-}
-
-static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm,
-                                            MACROBLOCKD *xd, int mi_row,
-                                            int mi_col, const BUFFER_SET *ctx,
-                                            BLOCK_SIZE bsize) {
-  dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
-                                        MAX_MB_PLANE - 1);
-
-  if (is_interintra_pred(xd->mi[0])) {
-    BUFFER_SET default_ctx = {
-      { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
-      { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
-    };
-    if (!ctx) ctx = &default_ctx;
-    av1_build_interintra_predictors_sbuv(
-        cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
-        xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize);
-  }
-}
-
-static void dec_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          int mi_row, int mi_col,
-                                          BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm,
+                                                 MACROBLOCKD *xd, int mi_row,
+                                                 int mi_col, BLOCK_SIZE bsize) {
   const int num_planes = av1_num_planes(cm);
-  dec_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
-  if (num_planes > 1)
-    dec_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    const int mi_x = mi_col * MI_SIZE;
+    const int mi_y = mi_row * MI_SIZE;
+    dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0,
+                               xd->plane[plane].width, xd->plane[plane].height,
+                               mi_x, mi_y);
+    if (is_interintra_pred(xd->mi[0])) {
+      BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf,
+                           xd->plane[2].dst.buf },
+                         { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+                           xd->plane[2].dst.stride } };
+      av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf,
+                                     xd->plane[plane].dst.stride, &ctx, plane,
+                                     bsize);
+    }
+  }
 }
 
 static INLINE void dec_build_prediction_by_above_pred(
-    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
-    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  const int above_mi_col = ctxt->mi_col + rel_mi_col;
+  const int above_mi_col = xd->mi_col + rel_mi_col;
   int mi_x, mi_y;
   MB_MODE_INFO backup_mbmi = *above_mbmi;
 
-  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+  (void)rel_mi_row;
+  (void)dir;
+
+  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, op_mi_size,
                                            &backup_mbmi, ctxt, num_planes);
   mi_x = above_mi_col << MI_SIZE_LOG2;
-  mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+  mi_y = xd->mi_row << MI_SIZE_LOG2;
 
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
 
   for (int j = 0; j < num_planes; ++j) {
     const struct macroblockd_plane *pd = &xd->plane[j];
-    int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
     int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
                    block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
 
@@ -924,44 +705,45 @@
   }
 }
 
-static void dec_build_prediction_by_above_preds(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
-    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
-    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+static AOM_INLINE void dec_build_prediction_by_above_preds(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE],
+    int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE],
+    int tmp_stride[MAX_MB_PLANE]) {
   if (!xd->up_available) return;
 
   // Adjust mb_to_bottom_edge to have the correct value for the OBMC
   // prediction block. This is half the height of the original block,
   // except for 128-wide blocks, where we only use a height of 32.
-  int this_height = xd->n4_h * MI_SIZE;
-  int pred_height = AOMMIN(this_height / 2, 32);
-  xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
-
-  struct build_prediction_ctxt ctxt = { cm,         mi_row,
-                                        mi_col,     tmp_buf,
+  const int this_height = xd->height * MI_SIZE;
+  const int pred_height = AOMMIN(this_height / 2, 32);
+  xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height);
+  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
                                         tmp_width,  tmp_height,
                                         tmp_stride, xd->mb_to_right_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  foreach_overlappable_nb_above(cm, xd, mi_col,
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  foreach_overlappable_nb_above(cm, xd,
                                 max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                 dec_build_prediction_by_above_pred, &ctxt);
 
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_left_edge = -GET_MV_SUBPEL(xd->mi_col * MI_SIZE);
   xd->mb_to_right_edge = ctxt.mb_to_far_edge;
-  xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+  xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height);
 }
 
 static INLINE void dec_build_prediction_by_left_pred(
-    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
-    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  const int left_mi_row = ctxt->mi_row + rel_mi_row;
+  const int left_mi_row = xd->mi_row + rel_mi_row;
   int mi_x, mi_y;
   MB_MODE_INFO backup_mbmi = *left_mbmi;
 
-  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+  (void)rel_mi_col;
+  (void)dir;
+
+  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, op_mi_size,
                                           &backup_mbmi, ctxt, num_planes);
-  mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+  mi_x = xd->mi_col << MI_SIZE_LOG2;
   mi_y = left_mi_row << MI_SIZE_LOG2;
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
 
@@ -969,7 +751,7 @@
     const struct macroblockd_plane *pd = &xd->plane[j];
     int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
                    block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
-    int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+    int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
 
     if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
     dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
@@ -977,36 +759,59 @@
   }
 }
 
-static void dec_build_prediction_by_left_preds(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
-    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
-    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+static AOM_INLINE void dec_build_prediction_by_left_preds(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE],
+    int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE],
+    int tmp_stride[MAX_MB_PLANE]) {
   if (!xd->left_available) return;
 
   // Adjust mb_to_right_edge to have the correct value for the OBMC
   // prediction block. This is half the width of the original block,
   // except for 128-wide blocks, where we only use a width of 32.
-  int this_width = xd->n4_w * MI_SIZE;
-  int pred_width = AOMMIN(this_width / 2, 32);
-  xd->mb_to_right_edge += (this_width - pred_width) * 8;
+  const int this_width = xd->width * MI_SIZE;
+  const int pred_width = AOMMIN(this_width / 2, 32);
+  xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width);
 
-  struct build_prediction_ctxt ctxt = { cm,         mi_row,
-                                        mi_col,     tmp_buf,
+  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
                                         tmp_width,  tmp_height,
                                         tmp_stride, xd->mb_to_bottom_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  foreach_overlappable_nb_left(cm, xd, mi_row,
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  foreach_overlappable_nb_left(cm, xd,
                                max_neighbor_obmc[mi_size_high_log2[bsize]],
                                dec_build_prediction_by_left_pred, &ctxt);
 
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_right_edge -= (this_width - pred_width) * 8;
+  xd->mb_to_top_edge = -GET_MV_SUBPEL(xd->mi_row * MI_SIZE);
+  xd->mb_to_right_edge -= GET_MV_SUBPEL(this_width - pred_width);
   xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
 }
 
-static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
-                                               MACROBLOCKD *xd, int mi_row,
-                                               int mi_col) {
+static void set_dst_buf(MACROBLOCKD *xd, uint8_t **dst_buf1,
+                        uint8_t **dst_buf2) {
+  dst_buf1[0] = xd->tmp_obmc_bufs[0];
+  dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+  dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+  dst_buf2[0] = xd->tmp_obmc_bufs[1];
+  dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+  dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void set_dst_buf_highbd(MACROBLOCKD *xd, uint8_t **dst_buf1,
+                               uint8_t **dst_buf2) {
+  int len = sizeof(uint16_t);
+  dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+  dst_buf1[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+  dst_buf1[2] =
+      CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+  dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+  dst_buf2[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+  dst_buf2[2] =
+      CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+}
+#endif
+
+static AOM_INLINE void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
+                                                          MACROBLOCKD *xd) {
   const int num_planes = av1_num_planes(cm);
   uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
   int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -1016,47 +821,43 @@
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
+#if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
-    dst_buf1[1] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
-    dst_buf1[2] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
-    dst_buf2[1] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
-    dst_buf2[2] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+    set_dst_buf_highbd(xd, dst_buf1, dst_buf2);
   } else {
-    dst_buf1[0] = xd->tmp_obmc_bufs[0];
-    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
-    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
-    dst_buf2[0] = xd->tmp_obmc_bufs[1];
-    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
-    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
+    set_dst_buf(xd, dst_buf1, dst_buf2);
   }
-  dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
-                                      dst_width1, dst_height1, dst_stride1);
-  dec_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
-                                     dst_width2, dst_height2, dst_stride2);
+#else
+  set_dst_buf(xd, dst_buf1, dst_buf2);
+#endif
+
+  dec_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1,
+                                      dst_stride1);
+  dec_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
+                                     dst_stride2);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
   av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf,
                        mi_row, mi_col, 0, num_planes);
-  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
-                                  dst_buf2, dst_stride2);
+  av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
+                                  dst_stride2);
 }
 
-static void cfl_store_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd) {
+static AOM_INLINE void cfl_store_inter_block(AV1_COMMON *const cm,
+                                             MACROBLOCKD *const xd) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   if (store_cfl_required(cm, xd)) {
     cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
   }
 }
 
-static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                int mi_row, int mi_col, BLOCK_SIZE bsize) {
+static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm,
+                                           MACROBLOCKD *const xd,
+                                           BLOCK_SIZE bsize) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   const int num_planes = av1_num_planes(cm);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
   for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
     if (frame < LAST_FRAME) {
@@ -1074,9 +875,9 @@
     }
   }
 
-  dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+  dec_build_inter_predictor(cm, xd, mi_row, mi_col, bsize);
   if (mbmi->motion_mode == OBMC_CAUSAL) {
-    dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+    dec_build_obmc_inter_predictors_sb(cm, xd);
   }
 #if CONFIG_MISMATCH_DEBUG
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -1095,8 +896,8 @@
 #endif
 }
 
-static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
-                                       aom_reader *r) {
+static AOM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd,
+                                                  int plane, aom_reader *r) {
   (void)r;
   Av1ColorMapParam params;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -1105,18 +906,14 @@
   xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
 }
 
-static void decode_token_recon_block(AV1Decoder *const pbi,
-                                     ThreadData *const td, int mi_row,
-                                     int mi_col, aom_reader *r,
-                                     BLOCK_SIZE bsize) {
+static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi,
+                                                ThreadData *const td,
+                                                aom_reader *r,
+                                                BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &td->xd;
   const int num_planes = av1_num_planes(cm);
-
   MB_MODE_INFO *mbmi = xd->mi[0];
-  CFL_CTX *const cfl = &xd->cfl;
-  cfl->is_chroma_reference = is_chroma_reference(
-      mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y);
 
   if (!is_inter_block(mbmi)) {
     int row, col;
@@ -1125,21 +922,16 @@
     const int max_blocks_wide = max_block_wide(xd, bsize, 0);
     const int max_blocks_high = max_block_high(xd, bsize, 0);
     const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
-    int mu_blocks_wide =
-        block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-    int mu_blocks_high =
-        block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+    int mu_blocks_high = mi_size_high[max_unit_bsize];
     mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
     mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
 
     for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
       for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
         for (int plane = 0; plane < num_planes; ++plane) {
+          if (plane && !xd->is_chroma_ref) break;
           const struct macroblockd_plane *const pd = &xd->plane[plane];
-          if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                                   pd->subsampling_y))
-            continue;
-
           const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
           const int stepr = tx_size_high_unit[tx_size];
           const int stepc = tx_size_wide_unit[tx_size];
@@ -1164,7 +956,7 @@
       }
     }
   } else {
-    td->predict_inter_block_visit(cm, xd, mi_row, mi_col, bsize);
+    td->predict_inter_block_visit(cm, xd, bsize);
     // Reconstruction
     if (!mbmi->skip) {
       int eobtotal = 0;
@@ -1177,10 +969,8 @@
       assert(max_unit_bsize ==
              get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
                                   xd->plane[0].subsampling_y));
-      int mu_blocks_wide =
-          block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-      int mu_blocks_high =
-          block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+      int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+      int mu_blocks_high = mi_size_high[max_unit_bsize];
 
       mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
       mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
@@ -1188,15 +978,12 @@
       for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
         for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
           for (int plane = 0; plane < num_planes; ++plane) {
+            if (plane && !xd->is_chroma_ref) break;
             const struct macroblockd_plane *const pd = &xd->plane[plane];
-            if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                                     pd->subsampling_y))
-              continue;
-            const BLOCK_SIZE bsizec =
-                scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
-            const BLOCK_SIZE plane_bsize = get_plane_block_size(
-                bsizec, pd->subsampling_x, pd->subsampling_y);
-
+            const int ss_x = pd->subsampling_x;
+            const int ss_y = pd->subsampling_y;
+            const BLOCK_SIZE plane_bsize =
+                get_plane_block_size(bsize, ss_x, ss_y);
             const TX_SIZE max_tx_size =
                 get_vartx_max_txsize(xd, plane_bsize, plane);
             const int bh_var_tx = tx_size_high_unit[max_tx_size];
@@ -1206,15 +993,13 @@
                 tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
             int blk_row, blk_col;
             const int unit_height = ROUND_POWER_OF_TWO(
-                AOMMIN(mu_blocks_high + row, max_blocks_high),
-                pd->subsampling_y);
+                AOMMIN(mu_blocks_high + row, max_blocks_high), ss_y);
             const int unit_width = ROUND_POWER_OF_TWO(
-                AOMMIN(mu_blocks_wide + col, max_blocks_wide),
-                pd->subsampling_x);
+                AOMMIN(mu_blocks_wide + col, max_blocks_wide), ss_x);
 
-            for (blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+            for (blk_row = row >> ss_y; blk_row < unit_height;
                  blk_row += bh_var_tx) {
-              for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+              for (blk_col = col >> ss_x; blk_col < unit_width;
                    blk_col += bw_var_tx) {
                 decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize,
                                       blk_row, blk_col, block, max_tx_size,
@@ -1229,20 +1014,13 @@
     td->cfl_store_inter_block_visit(cm, xd);
   }
 
-  av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
-                    set_color_index_map_offset);
+  av1_visit_palette(pbi, xd, r, set_color_index_map_offset);
 }
 
-#if LOOP_FILTER_BITMASK
-static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
-                                BLOCK_SIZE bsize, TX_SIZE tx_size,
-                                MB_MODE_INFO *mbmi);
-#endif
-
-static void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2,
-                              int tx_w_log2, int tx_h_log2, int min_txs,
-                              int split_size, int txs, int blk_row,
-                              int blk_col) {
+static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2,
+                                         int tx_w_log2, int tx_h_log2,
+                                         int min_txs, int split_size, int txs,
+                                         int blk_row, int blk_col) {
   for (int idy = 0; idy < tx_size_high_unit[split_size];
        idy += tx_size_high_unit[min_txs]) {
     for (int idx = 0; idx < tx_size_wide_unit[split_size];
@@ -1254,13 +1032,14 @@
   }
 }
 
-static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
-                               TX_SIZE tx_size, int depth,
-#if LOOP_FILTER_BITMASK
-                               AV1_COMMON *cm, int mi_row, int mi_col,
-                               int store_bitmask,
+static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+                                          TX_SIZE tx_size, int depth,
+#if CONFIG_LPF_MASK
+                                          AV1_COMMON *cm, int mi_row,
+                                          int mi_col, int store_bitmask,
 #endif
-                               int blk_row, int blk_col, aom_reader *r) {
+                                          int blk_row, int blk_col,
+                                          aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   int is_split = 0;
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -1301,18 +1080,18 @@
       mbmi->tx_size = sub_txs;
       txfm_partition_update(xd->above_txfm_context + blk_col,
                             xd->left_txfm_context + blk_row, sub_txs, tx_size);
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
       if (store_bitmask) {
-        store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                            txsize_to_bsize[tx_size], TX_4X4, mbmi);
+        av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+                                txsize_to_bsize[tx_size], TX_4X4, mbmi);
       }
 #endif
       return;
     }
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
     if (depth + 1 == MAX_VARTX_DEPTH && store_bitmask) {
-      store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                          txsize_to_bsize[tx_size], sub_txs, mbmi);
+      av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+                              txsize_to_bsize[tx_size], sub_txs, mbmi);
       store_bitmask = 0;
     }
 #endif
@@ -1323,7 +1102,7 @@
         int offsetr = blk_row + row;
         int offsetc = blk_col + col;
         read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                            cm, mi_row, mi_col, store_bitmask,
 #endif
                            offsetr, offsetc, r);
@@ -1335,16 +1114,17 @@
     mbmi->tx_size = tx_size;
     txfm_partition_update(xd->above_txfm_context + blk_col,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
     if (store_bitmask) {
-      store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                          txsize_to_bsize[tx_size], tx_size, mbmi);
+      av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+                              txsize_to_bsize[tx_size], tx_size, mbmi);
     }
 #endif
   }
 }
 
-static TX_SIZE read_selected_tx_size(MACROBLOCKD *xd, aom_reader *r) {
+static TX_SIZE read_selected_tx_size(const MACROBLOCKD *const xd,
+                                     aom_reader *r) {
   // TODO(debargha): Clean up the logic here. This function should only
   // be called for intra.
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
@@ -1359,9 +1139,9 @@
   return tx_size;
 }
 
-static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
-                            int allow_select_inter, aom_reader *r) {
-  const TX_MODE tx_mode = cm->tx_mode;
+static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode,
+                            int is_inter, int allow_select_inter,
+                            aom_reader *r) {
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
 
@@ -1378,268 +1158,67 @@
   }
 }
 
-#if LOOP_FILTER_BITMASK
-static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
-                                BLOCK_SIZE bsize, TX_SIZE tx_size,
-                                MB_MODE_INFO *mbmi) {
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
-  const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
-  const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
-      mbmi->sb_type, cm->seq_params.subsampling_x,
-      cm->seq_params.subsampling_y)];
-  const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
-      mbmi->sb_type, cm->seq_params.subsampling_x,
-      cm->seq_params.subsampling_y)];
-  const int is_square_transform_size = tx_size <= TX_64X64;
-  int mask_id = 0;
-  int offset = 0;
-  const int half_ratio_tx_size_max32 =
-      (tx_size > TX_64X64) & (tx_size <= TX_32X16);
-  if (is_square_transform_size) {
-    switch (tx_size) {
-      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
-      case TX_8X8:
-        mask_id = mask_id_table_tx_8x8[bsize];
-        offset = 19;
-        break;
-      case TX_16X16:
-        mask_id = mask_id_table_tx_16x16[bsize];
-        offset = 33;
-        break;
-      case TX_32X32:
-        mask_id = mask_id_table_tx_32x32[bsize];
-        offset = 42;
-        break;
-      case TX_64X64: mask_id = 46; break;
-      default: assert(!is_square_transform_size); return;
-    }
-    mask_id += offset;
-  } else if (half_ratio_tx_size_max32) {
-    int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
-    mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
-  } else if (tx_size == TX_32X64) {
-    mask_id = 59;
-  } else if (tx_size == TX_64X32) {
-    mask_id = 60;
-  } else {  // quarter ratio tx size
-    mask_id = 61 + (tx_size - TX_4X16);
-  }
-  int index = 0;
-  const int row = mi_row % MI_SIZE_64X64;
-  const int col = mi_col % MI_SIZE_64X64;
-  const int shift = get_index_shift(col, row, &index);
-  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
-  for (int i = 0; i + index < 4; ++i) {
-    // y vertical.
-    lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // y horizontal.
-    lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-    // u/v vertical.
-    lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // u/v horizontal.
-    lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-  }
-}
-
-static void store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
-                                        BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
-  // Use a lookup table that provides one bitmask for a given block size and
-  // a univariant transform size.
-  int index;
-  int shift;
-  int row;
-  int col;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
-  const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
-  const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
-      mbmi->sb_type, cm->seq_params.subsampling_x,
-      cm->seq_params.subsampling_y)];
-  const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
-      mbmi->sb_type, cm->seq_params.subsampling_x,
-      cm->seq_params.subsampling_y)];
-  const int is_square_transform_size = mbmi->tx_size <= TX_64X64;
-  int mask_id = 0;
-  int offset = 0;
-  const int half_ratio_tx_size_max32 =
-      (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16);
-  if (is_square_transform_size) {
-    switch (mbmi->tx_size) {
-      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
-      case TX_8X8:
-        mask_id = mask_id_table_tx_8x8[bsize];
-        offset = 19;
-        break;
-      case TX_16X16:
-        mask_id = mask_id_table_tx_16x16[bsize];
-        offset = 33;
-        break;
-      case TX_32X32:
-        mask_id = mask_id_table_tx_32x32[bsize];
-        offset = 42;
-        break;
-      case TX_64X64: mask_id = 46; break;
-      default: assert(!is_square_transform_size); return;
-    }
-    mask_id += offset;
-  } else if (half_ratio_tx_size_max32) {
-    int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size];
-    mask_id =
-        47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
-  } else if (mbmi->tx_size == TX_32X64) {
-    mask_id = 59;
-  } else if (mbmi->tx_size == TX_64X32) {
-    mask_id = 60;
-  } else {  // quarter ratio tx size
-    mask_id = 61 + (mbmi->tx_size - TX_4X16);
-  }
-  row = mi_row % MI_SIZE_64X64;
-  col = mi_col % MI_SIZE_64X64;
-  shift = get_index_shift(col, row, &index);
-  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
-  for (int i = 0; i + index < 4; ++i) {
-    // y vertical.
-    lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // y horizontal.
-    lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-    // u/v vertical.
-    lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // u/v horizontal.
-    lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-  }
-}
-
-static void store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
-                                     int is_horz_coding_block_border,
-                                     int is_vert_coding_block_border) {
-  int index;
-  int shift;
-  int row;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const int row_start = mi_row % MI_SIZE_64X64;
-  const int col_start = mi_col % MI_SIZE_64X64;
-  shift = get_index_shift(col_start, row_start, &index);
-  if (is_horz_coding_block_border) {
-    const int block_shift = shift + mi_size_wide[bsize];
-    assert(block_shift <= 64);
-    const uint64_t right_edge_shift =
-        (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift);
-    const uint64_t left_edge_shift = (block_shift == 64)
-                                         ? (((uint64_t)1 << shift) - 1)
-                                         : ((uint64_t)1 << shift);
-    assert(right_edge_shift > left_edge_shift);
-    const uint64_t top_edge_mask = right_edge_shift - left_edge_shift;
-    lfm->is_horz_border.bits[index] |= top_edge_mask;
-  }
-  if (is_vert_coding_block_border) {
-    const int is_vert_border = mask_id_table_vert_border[bsize];
-    const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
-    for (int i = 0; i + index < 4; ++i) {
-      lfm->is_vert_border.bits[i + index] |=
-          (left_mask_univariant_reordered[is_vert_border].bits[i]
-           << vert_shift);
-    }
-  }
-  const int is_skip = mbmi->skip && is_inter_block(mbmi);
-  if (is_skip) {
-    const int is_skip_mask = mask_id_table_tx_4x4[bsize];
-    for (int i = 0; i + index < 4; ++i) {
-      lfm->skip.bits[i + index] |=
-          (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
-    }
-  }
-  const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
-  const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
-  const uint8_t level_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
-  const uint8_t level_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
-  for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
-    index = 0;
-    row = r % MI_SIZE_64X64;
-    memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_u_ver[row][col_start], level_u,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_u_hor[row][col_start], level_u,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_v_ver[row][col_start], level_v,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_v_hor[row][col_start], level_v,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-  }
-}
-#endif
-
-static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
-                               int mi_row, int mi_col, aom_reader *r,
-                               PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi,
+                                          ThreadData *const td, int mi_row,
+                                          int mi_col, aom_reader *r,
+                                          PARTITION_TYPE partition,
+                                          BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &td->xd;
   decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize);
 
-  av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
-                    av1_decode_palette_tokens);
+  av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens);
 
   AV1_COMMON *cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *mbmi = xd->mi[0];
   int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
-  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+  if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
       !mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
     const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
     const int bh = tx_size_high_unit[max_tx_size];
     const int bw = tx_size_wide_unit[max_tx_size];
-    const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
-    const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+    const int width = mi_size_wide[bsize];
+    const int height = mi_size_high[bsize];
 
     for (int idy = 0; idy < height; idy += bh)
       for (int idx = 0; idx < width; idx += bw)
         read_tx_size_vartx(xd, mbmi, max_tx_size, 0,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                            cm, mi_row, mi_col, 1,
 #endif
                            idy, idx, r);
   } else {
-    mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r);
+    mbmi->tx_size =
+        read_tx_size(xd, cm->features.tx_mode, inter_block_tx, !mbmi->skip, r);
     if (inter_block_tx)
       memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-    set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
+    set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
                   mbmi->skip && is_inter_block(mbmi), xd);
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
     const int w = mi_size_wide[bsize];
     const int h = mi_size_high[bsize];
     if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
-      store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
+      av1_store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
     } else {
       for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
         for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
-          store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
-                                      BLOCK_64X64, mbmi);
+          av1_store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
+                                          BLOCK_64X64, mbmi);
         }
       }
     }
 #endif
   }
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
   const int w = mi_size_wide[bsize];
   const int h = mi_size_high[bsize];
   if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
-    store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi, 1, 1);
+    av1_store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi, 1, 1);
   } else {
     for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
       for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
-        store_bitmask_other_info(cm, mi_row + row, mi_col + col, BLOCK_64X64,
-                                 mbmi, row == 0, col == 0);
+        av1_store_bitmask_other_info(cm, mi_row + row, mi_col + col,
+                                     BLOCK_64X64, mbmi, row == 0, col == 0);
       }
     }
   }
@@ -1649,12 +1228,14 @@
     for (int i = 0; i < MAX_SEGMENTS; i++) {
       const int current_qindex =
           av1_get_qindex(&cm->seg, i, xd->current_qindex);
+      const CommonQuantParams *const quant_params = &cm->quant_params;
       for (int j = 0; j < num_planes; ++j) {
-        const int dc_delta_q =
-            j == 0 ? cm->y_dc_delta_q
-                   : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
-        const int ac_delta_q =
-            j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);
+        const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q
+                                      : (j == 1 ? quant_params->u_dc_delta_q
+                                                : quant_params->v_dc_delta_q);
+        const int ac_delta_q = j == 0 ? 0
+                                      : (j == 1 ? quant_params->u_ac_delta_q
+                                                : quant_params->v_ac_delta_q);
         xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(
             current_qindex, dc_delta_q, cm->seq_params.bit_depth);
         xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(
@@ -1662,43 +1243,48 @@
       }
     }
   }
-  if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+  if (mbmi->skip) av1_reset_entropy_context(xd, bsize, num_planes);
 
-  decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
+  decode_token_recon_block(pbi, td, r, bsize);
 }
 
-static void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
-                                           ThreadData *const td, int mi_row,
-                                           int mi_col, BLOCK_SIZE bsize) {
+static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
+                                                      ThreadData *const td,
+                                                      int mi_row, int mi_col,
+                                                      BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCKD *const xd = &td->xd;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int num_planes = av1_num_planes(cm);
 
-  const int offset = mi_row * cm->mi_stride + mi_col;
+  const int offset = mi_row * mi_params->mi_stride + mi_col;
   const TileInfo *const tile = &xd->tile;
 
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->cfl.mi_row = mi_row;
-  xd->cfl.mi_col = mi_col;
+  xd->mi = mi_params->mi_grid_base + offset;
+  xd->tx_type_map =
+      &mi_params->tx_type_map[mi_row * mi_params->mi_stride + mi_col];
+  xd->tx_type_map_stride = mi_params->mi_stride;
 
   set_plane_n4(xd, bw, bh, num_planes);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+                 mi_params->mi_cols);
 
   av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
                        num_planes);
 }
 
-static void decode_block(AV1Decoder *const pbi, ThreadData *const td,
-                         int mi_row, int mi_col, aom_reader *r,
-                         PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+static AOM_INLINE void decode_block(AV1Decoder *const pbi, ThreadData *const td,
+                                    int mi_row, int mi_col, aom_reader *r,
+                                    PARTITION_TYPE partition,
+                                    BLOCK_SIZE bsize) {
   (void)partition;
   set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
-  decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
+  decode_token_recon_block(pbi, td, r, bsize);
 }
 
 static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
@@ -1731,9 +1317,12 @@
 }
 
 // TODO(slavarnway): eliminate bsize and subsize in future commits
-static void decode_partition(AV1Decoder *const pbi, ThreadData *const td,
-                             int mi_row, int mi_col, aom_reader *reader,
-                             BLOCK_SIZE bsize, int parse_decode_flag) {
+static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
+                                        ThreadData *const td, int mi_row,
+                                        int mi_col, aom_reader *reader,
+                                        BLOCK_SIZE bsize,
+                                        int parse_decode_flag) {
+  assert(bsize < BLOCK_SIZES_ALL);
   AV1_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &td->xd;
   const int bw = mi_size_wide[bsize];
@@ -1742,18 +1331,19 @@
   BLOCK_SIZE subsize;
   const int quarter_step = bw / 4;
   BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
-  const int has_rows = (mi_row + hbs) < cm->mi_rows;
-  const int has_cols = (mi_col + hbs) < cm->mi_cols;
+  const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
+  const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+    return;
 
   // parse_decode_flag takes the following values :
   // 01 - do parse only
   // 10 - do decode only
   // 11 - do parse and decode
-  static const block_visitor_fn_t block_visit[4] = {
-    NULL, parse_decode_block, decode_block, parse_decode_block
-  };
+  static const block_visitor_fn_t block_visit[4] = { NULL, parse_decode_block,
+                                                     decode_block,
+                                                     parse_decode_block };
 
   if (parse_decode_flag & 1) {
     const int num_planes = av1_num_planes(cm);
@@ -1778,7 +1368,11 @@
     partition = get_partition(cm, mi_row, mi_col, bsize);
   }
   subsize = get_partition_subsize(bsize, partition);
-
+  if (subsize == BLOCK_INVALID) {
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Partition is invalid for block size %dx%d",
+                       block_size_wide[bsize], block_size_high[bsize]);
+  }
   // Check the bitstream is conformant: if there is subsampling on the
   // chroma planes, subsize must subsample to a valid block size.
   const struct macroblockd_plane *const pd_u = &xd->plane[1];
@@ -1837,14 +1431,14 @@
     case PARTITION_HORZ_4:
       for (int i = 0; i < 4; ++i) {
         int this_mi_row = mi_row + i * quarter_step;
-        if (i > 0 && this_mi_row >= cm->mi_rows) break;
+        if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break;
         DEC_BLOCK(this_mi_row, mi_col, subsize);
       }
       break;
     case PARTITION_VERT_4:
       for (int i = 0; i < 4; ++i) {
         int this_mi_col = mi_col + i * quarter_step;
-        if (i > 0 && this_mi_col >= cm->mi_cols) break;
+        if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break;
         DEC_BLOCK(mi_row, this_mi_col, subsize);
       }
       break;
@@ -1860,10 +1454,10 @@
     update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
-static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
-                               const size_t read_size,
-                               struct aom_internal_error_info *error_info,
-                               aom_reader *r, uint8_t allow_update_cdf) {
+static AOM_INLINE void setup_bool_decoder(
+    const uint8_t *data, const uint8_t *data_end, const size_t read_size,
+    struct aom_internal_error_info *error_info, aom_reader *r,
+    uint8_t allow_update_cdf) {
   // Validate the calculated partition length. If the buffer
   // described by the partition can't be fully read, then restrict
   // it to the portion that can be (for EC mode) or throw an error.
@@ -1878,8 +1472,8 @@
   r->allow_update_cdf = allow_update_cdf;
 }
 
-static void setup_segmentation(AV1_COMMON *const cm,
-                               struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void setup_segmentation(AV1_COMMON *const cm,
+                                          struct aom_read_bit_buffer *rb) {
   struct segmentation *const seg = &cm->seg;
 
   seg->update_map = 0;
@@ -1889,21 +1483,22 @@
   seg->enabled = aom_rb_read_bit(rb);
   if (!seg->enabled) {
     if (cm->cur_frame->seg_map)
-      memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols));
+      memset(cm->cur_frame->seg_map, 0,
+             (cm->mi_params.mi_rows * cm->mi_params.mi_cols));
 
     memset(seg, 0, sizeof(*seg));
     segfeatures_copy(&cm->cur_frame->seg, seg);
     return;
   }
   if (cm->seg.enabled && cm->prev_frame &&
-      (cm->mi_rows == cm->prev_frame->mi_rows) &&
-      (cm->mi_cols == cm->prev_frame->mi_cols)) {
+      (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) &&
+      (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) {
     cm->last_frame_seg_map = cm->prev_frame->seg_map;
   } else {
     cm->last_frame_seg_map = NULL;
   }
   // Read update flags
-  if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
     // These frames can't use previous frames, so must signal map + features
     seg->update_map = 1;
     seg->temporal_update = 0;
@@ -1944,18 +1539,18 @@
         av1_set_segdata(seg, i, j, data);
       }
     }
-    calculate_segdata(seg);
+    av1_calculate_segdata(seg);
   } else if (cm->prev_frame) {
     segfeatures_copy(seg, &cm->prev_frame->seg);
   }
   segfeatures_copy(&cm->cur_frame->seg, seg);
 }
 
-static void decode_restoration_mode(AV1_COMMON *cm,
-                                    struct aom_read_bit_buffer *rb) {
-  assert(!cm->all_lossless);
+static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm,
+                                               struct aom_read_bit_buffer *rb) {
+  assert(!cm->features.all_lossless);
   const int num_planes = av1_num_planes(cm);
-  if (cm->allow_intrabc) return;
+  if (cm->features.allow_intrabc) return;
   int all_none = 1, chroma_none = 1;
   for (int p = 0; p < num_planes; ++p) {
     RestorationInfo *rsi = &cm->rst_info[p];
@@ -2007,8 +1602,10 @@
   }
 }
 
-static void read_wiener_filter(int wiener_win, WienerInfo *wiener_info,
-                               WienerInfo *ref_wiener_info, aom_reader *rb) {
+static AOM_INLINE void read_wiener_filter(int wiener_win,
+                                          WienerInfo *wiener_info,
+                                          WienerInfo *ref_wiener_info,
+                                          aom_reader *rb) {
   memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter));
   memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter));
 
@@ -2066,10 +1663,11 @@
   memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
 }
 
-static void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
-                                SgrprojInfo *ref_sgrproj_info, aom_reader *rb) {
+static AOM_INLINE void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
+                                           SgrprojInfo *ref_sgrproj_info,
+                                           aom_reader *rb) {
   sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
-  const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+  const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
 
   if (params->r[0] == 0) {
     sgrproj_info->xqd[0] = 0;
@@ -2102,15 +1700,14 @@
   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
 }
 
-static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
-                                            MACROBLOCKD *xd,
-                                            aom_reader *const r, int plane,
-                                            int runit_idx) {
+static AOM_INLINE void loop_restoration_read_sb_coeffs(
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane,
+    int runit_idx) {
   const RestorationInfo *rsi = &cm->rst_info[plane];
   RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
   if (rsi->frame_restoration_type == RESTORE_NONE) return;
 
-  assert(!cm->all_lossless);
+  assert(!cm->features.all_lossless);
 
   const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
   WienerInfo *wiener_info = xd->wiener_info + plane;
@@ -2146,16 +1743,18 @@
   }
 }
 
-static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
+                                        struct aom_read_bit_buffer *rb) {
   const int num_planes = av1_num_planes(cm);
   struct loopfilter *lf = &cm->lf;
-  if (cm->allow_intrabc || cm->coded_lossless) {
+
+  if (cm->features.allow_intrabc || cm->features.coded_lossless) {
     // write default deltas to frame buffer
     av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
     av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
     return;
   }
-  assert(!cm->coded_lossless);
+  assert(!cm->features.coded_lossless);
   if (cm->prev_frame) {
     // write deltas to frame buffer
     memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
@@ -2197,13 +1796,13 @@
   memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
 }
 
-static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void setup_cdef(AV1_COMMON *cm,
+                                  struct aom_read_bit_buffer *rb) {
   const int num_planes = av1_num_planes(cm);
   CdefInfo *const cdef_info = &cm->cdef_info;
 
-  if (cm->allow_intrabc) return;
-  cdef_info->cdef_pri_damping = aom_rb_read_literal(rb, 2) + 3;
-  cdef_info->cdef_sec_damping = cdef_info->cdef_pri_damping;
+  if (cm->features.allow_intrabc) return;
+  cdef_info->cdef_damping = aom_rb_read_literal(rb, 2) + 3;
   cdef_info->cdef_bits = aom_rb_read_literal(rb, 2);
   cdef_info->nb_cdef_strengths = 1 << cdef_info->cdef_bits;
   for (int i = 0; i < cdef_info->nb_cdef_strengths; i++) {
@@ -2217,80 +1816,86 @@
   return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0;
 }
 
-static void setup_quantization(AV1_COMMON *const cm,
-                               struct aom_read_bit_buffer *rb) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  const int num_planes = av1_num_planes(cm);
-  cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
-  cm->y_dc_delta_q = read_delta_q(rb);
+static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params,
+                                          int num_planes,
+                                          bool separate_uv_delta_q,
+                                          struct aom_read_bit_buffer *rb) {
+  quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
+  quant_params->y_dc_delta_q = read_delta_q(rb);
   if (num_planes > 1) {
     int diff_uv_delta = 0;
-    if (seq_params->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
-    cm->u_dc_delta_q = read_delta_q(rb);
-    cm->u_ac_delta_q = read_delta_q(rb);
+    if (separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
+    quant_params->u_dc_delta_q = read_delta_q(rb);
+    quant_params->u_ac_delta_q = read_delta_q(rb);
     if (diff_uv_delta) {
-      cm->v_dc_delta_q = read_delta_q(rb);
-      cm->v_ac_delta_q = read_delta_q(rb);
+      quant_params->v_dc_delta_q = read_delta_q(rb);
+      quant_params->v_ac_delta_q = read_delta_q(rb);
     } else {
-      cm->v_dc_delta_q = cm->u_dc_delta_q;
-      cm->v_ac_delta_q = cm->u_ac_delta_q;
+      quant_params->v_dc_delta_q = quant_params->u_dc_delta_q;
+      quant_params->v_ac_delta_q = quant_params->u_ac_delta_q;
     }
   } else {
-    cm->u_dc_delta_q = 0;
-    cm->u_ac_delta_q = 0;
-    cm->v_dc_delta_q = 0;
-    cm->v_ac_delta_q = 0;
+    quant_params->u_dc_delta_q = 0;
+    quant_params->u_ac_delta_q = 0;
+    quant_params->v_dc_delta_q = 0;
+    quant_params->v_ac_delta_q = 0;
   }
-  cm->using_qmatrix = aom_rb_read_bit(rb);
-  if (cm->using_qmatrix) {
-    cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
-    cm->qm_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
-    if (!seq_params->separate_uv_delta_q)
-      cm->qm_v = cm->qm_u;
+  quant_params->using_qmatrix = aom_rb_read_bit(rb);
+  if (quant_params->using_qmatrix) {
+    quant_params->qmatrix_level_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+    quant_params->qmatrix_level_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+    if (!separate_uv_delta_q)
+      quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
     else
-      cm->qm_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+      quant_params->qmatrix_level_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
   } else {
-    cm->qm_y = 0;
-    cm->qm_u = 0;
-    cm->qm_v = 0;
+    quant_params->qmatrix_level_y = 0;
+    quant_params->qmatrix_level_u = 0;
+    quant_params->qmatrix_level_v = 0;
   }
 }
 
 // Build y/uv dequant values based on segmentation.
-static void setup_segmentation_dequant(AV1_COMMON *const cm,
-                                       MACROBLOCKD *const xd) {
+static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm,
+                                                  MACROBLOCKD *const xd) {
   const int bit_depth = cm->seq_params.bit_depth;
-  const int using_qm = cm->using_qmatrix;
   // When segmentation is disabled, only the first value is used.  The
   // remaining are don't cares.
   const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
+  CommonQuantParams *const quant_params = &cm->quant_params;
   for (int i = 0; i < max_segments; ++i) {
     const int qindex = xd->qindex[i];
-    cm->y_dequant_QTX[i][0] =
-        av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, bit_depth);
-    cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
-    cm->u_dequant_QTX[i][0] =
-        av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, bit_depth);
-    cm->u_dequant_QTX[i][1] =
-        av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, bit_depth);
-    cm->v_dequant_QTX[i][0] =
-        av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, bit_depth);
-    cm->v_dequant_QTX[i][1] =
-        av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, bit_depth);
-    const int lossless = xd->lossless[i];
+    quant_params->y_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, quant_params->y_dc_delta_q, bit_depth);
+    quant_params->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
+    quant_params->u_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, quant_params->u_dc_delta_q, bit_depth);
+    quant_params->u_dequant_QTX[i][1] =
+        av1_ac_quant_QTX(qindex, quant_params->u_ac_delta_q, bit_depth);
+    quant_params->v_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, quant_params->v_dc_delta_q, bit_depth);
+    quant_params->v_dequant_QTX[i][1] =
+        av1_ac_quant_QTX(qindex, quant_params->v_ac_delta_q, bit_depth);
+    const int use_qmatrix = av1_use_qmatrix(quant_params, xd, i);
     // NB: depends on base index so there is only 1 set per frame
     // No quant weighting when lossless or signalled not using QM
-    int qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_y;
+    const int qmlevel_y =
+        use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
     for (int j = 0; j < TX_SIZES_ALL; ++j) {
-      cm->y_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_Y, j);
+      quant_params->y_iqmatrix[i][j] =
+          av1_iqmatrix(quant_params, qmlevel_y, AOM_PLANE_Y, j);
     }
-    qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_u;
+    const int qmlevel_u =
+        use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
     for (int j = 0; j < TX_SIZES_ALL; ++j) {
-      cm->u_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_U, j);
+      quant_params->u_iqmatrix[i][j] =
+          av1_iqmatrix(quant_params, qmlevel_u, AOM_PLANE_U, j);
     }
-    qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_v;
+    const int qmlevel_v =
+        use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
     for (int j = 0; j < TX_SIZES_ALL; ++j) {
-      cm->v_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_V, j);
+      quant_params->v_iqmatrix[i][j] =
+          av1_iqmatrix(quant_params, qmlevel_v, AOM_PLANE_V, j);
     }
   }
 }
@@ -2300,7 +1905,8 @@
                              : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS);
 }
 
-static void setup_render_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void setup_render_size(AV1_COMMON *cm,
+                                         struct aom_read_bit_buffer *rb) {
   cm->render_width = cm->superres_upscaled_width;
   cm->render_height = cm->superres_upscaled_height;
   if (aom_rb_read_bit(rb))
@@ -2308,8 +1914,9 @@
 }
 
 // TODO(afergs): make "struct aom_read_bit_buffer *const rb"?
-static void setup_superres(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb,
-                           int *width, int *height) {
+static AOM_INLINE void setup_superres(AV1_COMMON *const cm,
+                                      struct aom_read_bit_buffer *rb,
+                                      int *width, int *height) {
   cm->superres_upscaled_width = *width;
   cm->superres_upscaled_height = *height;
 
@@ -2330,7 +1937,8 @@
   }
 }
 
-static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
+static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width,
+                                              int height) {
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -2345,7 +1953,8 @@
 
     // Allocations in av1_alloc_context_buffers() depend on individual
     // dimensions as well as the overall size.
-    if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
+    if (new_mi_cols > cm->mi_params.mi_cols ||
+        new_mi_rows > cm->mi_params.mi_rows) {
       if (av1_alloc_context_buffers(cm, width, height)) {
         // The cm->mi_* values have been cleared and any existing context
         // buffers have been freed. Clear cm->width and cm->height to be
@@ -2356,9 +1965,9 @@
                            "Failed to allocate context buffers");
       }
     } else {
-      av1_set_mb_mi(cm, width, height);
+      cm->mi_params.set_mb_mi(&cm->mi_params, width, height);
     }
-    av1_init_context_buffers(cm);
+    av1_init_mi_buffers(&cm->mi_params);
     cm->width = width;
     cm->height = height;
   }
@@ -2368,7 +1977,7 @@
   cm->cur_frame->height = cm->height;
 }
 
-static void setup_buffer_pool(AV1_COMMON *cm) {
+static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) {
   BufferPool *const pool = cm->buffer_pool;
   const SequenceHeader *const seq_params = &cm->seq_params;
 
@@ -2376,7 +1985,7 @@
   if (aom_realloc_frame_buffer(
           &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+          AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment,
           &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
     unlock_buffer_pool(pool);
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
@@ -2397,8 +2006,9 @@
   cm->cur_frame->buf.render_height = cm->render_height;
 }
 
-static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
-                             struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void setup_frame_size(AV1_COMMON *cm,
+                                        int frame_size_override_flag,
+                                        struct aom_read_bit_buffer *rb) {
   const SequenceHeader *const seq_params = &cm->seq_params;
   int width, height;
 
@@ -2422,8 +2032,8 @@
   setup_buffer_pool(cm);
 }
 
-static void setup_sb_size(SequenceHeader *seq_params,
-                          struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void setup_sb_size(SequenceHeader *seq_params,
+                                     struct aom_read_bit_buffer *rb) {
   set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
 }
 
@@ -2435,8 +2045,8 @@
          ref_yss == this_yss;
 }
 
-static void setup_frame_size_with_refs(AV1_COMMON *cm,
-                                       struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void setup_frame_size_with_refs(
+    AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   int width, height;
   int found = 0;
   int has_valid_ref_frame = 0;
@@ -2515,68 +2125,73 @@
     return (v << 1) - m + aom_rb_read_bit(rb);
 }
 
-static void read_tile_info_max_tile(AV1_COMMON *const cm,
-                                    struct aom_read_bit_buffer *const rb) {
-  int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
-  int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
-  int width_sb = width_mi >> cm->seq_params.mib_size_log2;
-  int height_sb = height_mi >> cm->seq_params.mib_size_log2;
+static AOM_INLINE void read_tile_info_max_tile(
+    AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  CommonTileParams *const tiles = &cm->tiles;
+  int width_mi =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
+  int height_mi =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
+  int width_sb = width_mi >> seq_params->mib_size_log2;
+  int height_sb = height_mi >> seq_params->mib_size_log2;
 
   av1_get_tile_limits(cm);
-  cm->uniform_tile_spacing_flag = aom_rb_read_bit(rb);
+  tiles->uniform_spacing = aom_rb_read_bit(rb);
 
   // Read tile columns
-  if (cm->uniform_tile_spacing_flag) {
-    cm->log2_tile_cols = cm->min_log2_tile_cols;
-    while (cm->log2_tile_cols < cm->max_log2_tile_cols) {
+  if (tiles->uniform_spacing) {
+    tiles->log2_cols = tiles->min_log2_cols;
+    while (tiles->log2_cols < tiles->max_log2_cols) {
       if (!aom_rb_read_bit(rb)) {
         break;
       }
-      cm->log2_tile_cols++;
+      tiles->log2_cols++;
     }
   } else {
     int i;
     int start_sb;
     for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) {
       const int size_sb =
-          1 + rb_read_uniform(rb, AOMMIN(width_sb, cm->max_tile_width_sb));
-      cm->tile_col_start_sb[i] = start_sb;
+          1 + rb_read_uniform(rb, AOMMIN(width_sb, tiles->max_width_sb));
+      tiles->col_start_sb[i] = start_sb;
       start_sb += size_sb;
       width_sb -= size_sb;
     }
-    cm->tile_cols = i;
-    cm->tile_col_start_sb[i] = start_sb + width_sb;
+    tiles->cols = i;
+    tiles->col_start_sb[i] = start_sb + width_sb;
   }
-  av1_calculate_tile_cols(cm);
+  av1_calculate_tile_cols(seq_params, cm->mi_params.mi_rows,
+                          cm->mi_params.mi_cols, tiles);
 
   // Read tile rows
-  if (cm->uniform_tile_spacing_flag) {
-    cm->log2_tile_rows = cm->min_log2_tile_rows;
-    while (cm->log2_tile_rows < cm->max_log2_tile_rows) {
+  if (tiles->uniform_spacing) {
+    tiles->log2_rows = tiles->min_log2_rows;
+    while (tiles->log2_rows < tiles->max_log2_rows) {
       if (!aom_rb_read_bit(rb)) {
         break;
       }
-      cm->log2_tile_rows++;
+      tiles->log2_rows++;
     }
   } else {
     int i;
     int start_sb;
     for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) {
       const int size_sb =
-          1 + rb_read_uniform(rb, AOMMIN(height_sb, cm->max_tile_height_sb));
-      cm->tile_row_start_sb[i] = start_sb;
+          1 + rb_read_uniform(rb, AOMMIN(height_sb, tiles->max_height_sb));
+      tiles->row_start_sb[i] = start_sb;
       start_sb += size_sb;
       height_sb -= size_sb;
     }
-    cm->tile_rows = i;
-    cm->tile_row_start_sb[i] = start_sb + height_sb;
+    tiles->rows = i;
+    tiles->row_start_sb[i] = start_sb + height_sb;
   }
-  av1_calculate_tile_rows(cm);
+  av1_calculate_tile_rows(seq_params, cm->mi_params.mi_rows, tiles);
 }
 
 void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) {
-  cm->single_tile_decoding = 0;
-  if (cm->large_scale_tile) {
+  cm->tiles.single_tile_decoding = 0;
+  if (cm->tiles.large_scale) {
     struct loopfilter *lf = &cm->lf;
     RestorationInfo *const rst_info = cm->rst_info;
     const CdefInfo *const cdef_info = &cm->cdef_info;
@@ -2590,24 +2205,24 @@
         rst_info[0].frame_restoration_type == RESTORE_NONE &&
         rst_info[1].frame_restoration_type == RESTORE_NONE &&
         rst_info[2].frame_restoration_type == RESTORE_NONE;
-    assert(IMPLIES(cm->coded_lossless, no_loopfilter && no_cdef));
-    assert(IMPLIES(cm->all_lossless, no_restoration));
-    cm->single_tile_decoding = no_loopfilter && no_cdef && no_restoration;
+    assert(IMPLIES(cm->features.coded_lossless, no_loopfilter && no_cdef));
+    assert(IMPLIES(cm->features.all_lossless, no_restoration));
+    cm->tiles.single_tile_decoding = no_loopfilter && no_cdef && no_restoration;
   }
 }
 
-static void read_tile_info(AV1Decoder *const pbi,
-                           struct aom_read_bit_buffer *const rb) {
+static AOM_INLINE void read_tile_info(AV1Decoder *const pbi,
+                                      struct aom_read_bit_buffer *const rb) {
   AV1_COMMON *const cm = &pbi->common;
 
   read_tile_info_max_tile(cm, rb);
 
-  cm->context_update_tile_id = 0;
-  if (cm->tile_rows * cm->tile_cols > 1) {
+  pbi->context_update_tile_id = 0;
+  if (cm->tiles.rows * cm->tiles.cols > 1) {
     // tile to use for cdf update
-    cm->context_update_tile_id =
-        aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
-    if (cm->context_update_tile_id >= cm->tile_rows * cm->tile_cols) {
+    pbi->context_update_tile_id =
+        aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols);
+    if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) {
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid context_update_tile_id");
     }
@@ -2617,8 +2232,8 @@
 }
 
 #if EXT_TILE_DEBUG
-static void read_ext_tile_info(AV1Decoder *const pbi,
-                               struct aom_read_bit_buffer *const rb) {
+static AOM_INLINE void read_ext_tile_info(
+    AV1Decoder *const pbi, struct aom_read_bit_buffer *const rb) {
   AV1_COMMON *const cm = &pbi->common;
 
   // This information is stored as a separate byte.
@@ -2626,7 +2241,7 @@
   if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod);
   assert(rb->bit_offset % CHAR_BIT == 0);
 
-  if (cm->tile_cols * cm->tile_rows > 1) {
+  if (cm->tiles.cols * cm->tiles.rows > 1) {
     // Read the number of bytes used to store tile size
     pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
     pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
@@ -2648,7 +2263,7 @@
 // Reads the next tile returning its size and adjusting '*data' accordingly
 // based on 'is_last'. On return, '*data' is updated to point to the end of the
 // raw tile buffer in the bit stream.
-static void get_ls_tile_buffer(
+static AOM_INLINE void get_ls_tile_buffer(
     const uint8_t *const data_end, struct aom_internal_error_info *error_info,
     const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
     int tile_size_bytes, int col, int row, int tile_copy_mode) {
@@ -2694,13 +2309,13 @@
 }
 
 // Returns the end of the last tile buffer
-// (tile_buffers[cm->tile_rows - 1][cm->tile_cols - 1]).
+// (tile_buffers[cm->tiles.rows - 1][cm->tiles.cols - 1]).
 static const uint8_t *get_ls_tile_buffers(
     AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
     TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
   AV1_COMMON *const cm = &pbi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   const int have_tiles = tile_cols * tile_rows > 1;
   const uint8_t *raw_data_end;  // The end of the last tile buffer
 
@@ -2795,16 +2410,16 @@
 
 // Reads the next tile returning its size and adjusting '*data' accordingly
 // based on 'is_last'.
-static void get_tile_buffer(const uint8_t *const data_end,
-                            const int tile_size_bytes, int is_last,
-                            struct aom_internal_error_info *error_info,
-                            const uint8_t **data, TileBufferDec *const buf) {
+static AOM_INLINE void get_tile_buffer(
+    const uint8_t *const data_end, const int tile_size_bytes, int is_last,
+    struct aom_internal_error_info *error_info, const uint8_t **data,
+    TileBufferDec *const buf) {
   size_t size;
 
   if (!is_last) {
     if (!read_is_valid(*data, tile_size_bytes, data_end))
       aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
-                         "Truncated packet or corrupt tile length");
+                         "Not enough data to read tile size");
 
     size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES;
     *data += tile_size_bytes;
@@ -2822,15 +2437,14 @@
   *data += size;
 }
 
-static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data,
-                             const uint8_t *data_end,
-                             TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
-                             int start_tile, int end_tile) {
+static AOM_INLINE void get_tile_buffers(
+    AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+    TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int start_tile,
+    int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   int tc = 0;
-  int first_tile_in_tg = 0;
 
   for (int r = 0; r < tile_rows; ++r) {
     for (int c = 0; c < tile_cols; ++c, ++tc) {
@@ -2844,7 +2458,6 @@
       if (data + hdr_offset >= data_end)
         aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                            "Data ended before all tiles were read.");
-      first_tile_in_tg += tc == first_tile_in_tg ? pbi->tg_size : 0;
       data += hdr_offset;
       get_tile_buffer(data_end, pbi->tile_size_bytes, is_last,
                       &pbi->common.error, &data, buf);
@@ -2852,12 +2465,13 @@
   }
 }
 
-static void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd,
-                          CB_BUFFER *cb_buffer_base, const int num_planes,
-                          int mi_row, int mi_col) {
+static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd,
+                                     CB_BUFFER *cb_buffer_base,
+                                     const int num_planes, int mi_row,
+                                     int mi_col) {
   AV1_COMMON *const cm = &pbi->common;
   int mib_size_log2 = cm->seq_params.mib_size_log2;
-  int stride = (cm->mi_cols >> mib_size_log2) + 1;
+  int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
   int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
   CB_BUFFER *cb_buffer = cb_buffer_base + offset;
 
@@ -2873,7 +2487,8 @@
   xd->color_index_map_offset[1] = 0;
 }
 
-static void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) {
+static AOM_INLINE void decoder_alloc_tile_data(AV1Decoder *pbi,
+                                               const int n_tiles) {
   AV1_COMMON *const cm = &pbi->common;
   aom_free(pbi->tile_data);
   CHECK_MEM_ERROR(cm, pbi->tile_data,
@@ -2905,8 +2520,8 @@
 }
 
 // Allocate memory for decoder row synchronization
-static void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, AV1_COMMON *cm,
-                             int rows) {
+static AOM_INLINE void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync,
+                                        AV1_COMMON *cm, int rows) {
   dec_row_mt_sync->allocated_sb_rows = rows;
 #if CONFIG_MULTITHREAD
   {
@@ -3014,12 +2629,13 @@
 #endif  // CONFIG_MULTITHREAD
 }
 
-static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
-                               TileInfo tile_info, const int mi_row) {
+static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
+                                          TileInfo tile_info,
+                                          const int mi_row) {
   AV1_COMMON *const cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
   TileDataDec *const tile_data =
-      pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col;
+      pbi->tile_data + tile_info.tile_row * cm->tiles.cols + tile_info.tile_col;
   const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
   const int sb_row_in_tile =
       (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
@@ -3064,7 +2680,8 @@
   return 0;
 }
 
-static void set_decode_func_pointers(ThreadData *td, int parse_decode_flag) {
+static AOM_INLINE void set_decode_func_pointers(ThreadData *td,
+                                                int parse_decode_flag) {
   td->read_coeffs_tx_intra_block_visit = decode_block_void;
   td->predict_and_recon_intra_block_visit = decode_block_void;
   td->read_coeffs_tx_inter_block_visit = decode_block_void;
@@ -3085,8 +2702,8 @@
   }
 }
 
-static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
-                        int tile_col) {
+static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td,
+                                   int tile_row, int tile_col) {
   TileInfo tile_info;
 
   AV1_COMMON *const cm = &pbi->common;
@@ -3128,8 +2745,9 @@
                                    int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
   ThreadData *const td = &pbi->td;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  CommonTileParams *const tiles = &cm->tiles;
+  const int tile_cols = tiles->cols;
+  const int tile_rows = tiles->rows;
   const int n_tiles = tile_cols * tile_rows;
   TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
   const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
@@ -3146,7 +2764,7 @@
   uint8_t allow_update_cdf;
   const uint8_t *raw_data_end = NULL;
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     tile_rows_start = single_row ? dec_tile_row : 0;
     tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
     tile_cols_start = single_col ? dec_tile_col : 0;
@@ -3167,20 +2785,20 @@
   // No tiles to decode.
   if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
       // First tile is larger than end_tile.
-      tile_rows_start * cm->tile_cols + tile_cols_start > end_tile ||
+      tile_rows_start * tiles->cols + tile_cols_start > end_tile ||
       // Last tile is smaller than start_tile.
-      (tile_rows_end - 1) * cm->tile_cols + tile_cols_end - 1 < start_tile)
+      (tile_rows_end - 1) * tiles->cols + tile_cols_end - 1 < start_tile)
     return data;
 
-  allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+  allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
 
   assert(tile_rows <= MAX_TILE_ROWS);
   assert(tile_cols <= MAX_TILE_COLS);
 
 #if EXT_TILE_DEBUG
-  if (cm->large_scale_tile && !pbi->ext_tile_debug)
+  if (tiles->large_scale && !pbi->ext_tile_debug)
     raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers);
-  else if (cm->large_scale_tile && pbi->ext_tile_debug)
+  else if (tiles->large_scale && pbi->ext_tile_debug)
     raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
   else
 #endif  // EXT_TILE_DEBUG
@@ -3212,17 +2830,17 @@
 
     for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
       const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
-      TileDataDec *const tile_data = pbi->tile_data + row * cm->tile_cols + col;
+      TileDataDec *const tile_data = pbi->tile_data + row * tiles->cols + col;
       const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
 
-      if (row * cm->tile_cols + col < start_tile ||
-          row * cm->tile_cols + col > end_tile)
+      if (row * tiles->cols + col < start_tile ||
+          row * tiles->cols + col > end_tile)
         continue;
 
       td->bit_reader = &tile_data->bit_reader;
       av1_zero(td->cb_buffer_base.dqcoeff);
       av1_tile_init(&td->xd.tile, cm, row, col);
-      td->xd.current_qindex = cm->base_qindex;
+      td->xd.current_qindex = cm->quant_params.base_qindex;
       setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
                          &cm->error, td->bit_reader, allow_update_cdf);
 #if CONFIG_ACCOUNTING
@@ -3235,7 +2853,8 @@
       }
 #endif
       av1_init_macroblockd(cm, &td->xd, NULL);
-      av1_init_above_context(cm, &td->xd, row);
+      av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row,
+                             &td->xd);
 
       // Initialise the tile context from the frame context
       tile_data->tctx = *cm->fc;
@@ -3250,7 +2869,7 @@
     }
   }
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     if (n_tiles == 1) {
       // Find the end of the single tile buffer
       return aom_reader_find_end(&pbi->tile_data->bit_reader);
@@ -3280,11 +2899,10 @@
   return cur_job_info;
 }
 
-static void tile_worker_hook_init(AV1Decoder *const pbi,
-                                  DecWorkerData *const thread_data,
-                                  const TileBufferDec *const tile_buffer,
-                                  TileDataDec *const tile_data,
-                                  uint8_t allow_update_cdf) {
+static AOM_INLINE void tile_worker_hook_init(
+    AV1Decoder *const pbi, DecWorkerData *const thread_data,
+    const TileBufferDec *const tile_buffer, TileDataDec *const tile_data,
+    uint8_t allow_update_cdf) {
   AV1_COMMON *cm = &pbi->common;
   ThreadData *const td = thread_data->td;
   int tile_row = tile_data->tile_info.tile_row;
@@ -3293,7 +2911,7 @@
   td->bit_reader = &tile_data->bit_reader;
   av1_zero(td->cb_buffer_base.dqcoeff);
   av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
-  td->xd.current_qindex = cm->base_qindex;
+  td->xd.current_qindex = cm->quant_params.base_qindex;
   setup_bool_decoder(tile_buffer->data, thread_data->data_end,
                      tile_buffer->size, &thread_data->error_info,
                      td->bit_reader, allow_update_cdf);
@@ -3308,7 +2926,8 @@
 #endif
   av1_init_macroblockd(cm, &td->xd, NULL);
   td->xd.error_info = &thread_data->error_info;
-  av1_init_above_context(cm, &td->xd, tile_row);
+  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+                         &td->xd);
 
   // Initialise the tile context from the frame context
   tile_data->tctx = *cm->fc;
@@ -3338,12 +2957,12 @@
   }
   thread_data->error_info.setjmp = 1;
 
-  allow_update_cdf = cm->large_scale_tile ? 0 : 1;
-  allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+  allow_update_cdf = cm->tiles.large_scale ? 0 : 1;
+  allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
 
   set_decode_func_pointers(td, 0x3);
 
-  assert(cm->tile_cols > 0);
+  assert(cm->tiles.cols > 0);
   while (!td->xd.corrupted) {
     TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 
@@ -3433,11 +3052,11 @@
        ++tile_row_idx) {
     for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end;
          ++tile_col_idx) {
-      if (tile_row_idx * cm->tile_cols + tile_col_idx < start_tile ||
-          tile_row_idx * cm->tile_cols + tile_col_idx > end_tile)
+      if (tile_row_idx * cm->tiles.cols + tile_col_idx < start_tile ||
+          tile_row_idx * cm->tiles.cols + tile_col_idx > end_tile)
         continue;
 
-      tile_data = pbi->tile_data + tile_row_idx * cm->tile_cols + tile_col_idx;
+      tile_data = pbi->tile_data + tile_row_idx * cm->tiles.cols + tile_col_idx;
       dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 
       num_threads_working = dec_row_mt_sync->num_threads_working;
@@ -3470,7 +3089,7 @@
   // No job found to process
   if (tile_row == -1 || tile_col == -1) return 0;
 
-  tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+  tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
   tile_info = tile_data->tile_info;
   dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 
@@ -3517,8 +3136,8 @@
 
 // This function is very similar to decode_tile(). It would be good to figure
 // out how to share code.
-static void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
-                              TileDataDec *const tile_data) {
+static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
+                                         TileDataDec *const tile_data) {
   AV1_COMMON *const cm = &pbi->common;
   const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
   const int num_planes = av1_num_planes(cm);
@@ -3583,12 +3202,12 @@
   }
   thread_data->error_info.setjmp = 1;
 
-  allow_update_cdf = cm->large_scale_tile ? 0 : 1;
-  allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+  allow_update_cdf = cm->tiles.large_scale ? 0 : 1;
+  allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
 
   set_decode_func_pointers(td, 0x1);
 
-  assert(cm->tile_cols > 0);
+  assert(cm->tiles.cols > 0);
   while (!td->xd.corrupted) {
     TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 
@@ -3656,7 +3275,7 @@
     int mi_row = next_job_info.mi_row;
 
     TileDataDec *tile_data =
-        pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+        pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
     AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
     TileInfo tile_info = tile_data->tile_info;
 
@@ -3685,10 +3304,10 @@
   return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size));
 }
 
-static void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
-                              int tile_rows_start, int tile_rows_end,
-                              int tile_cols_start, int tile_cols_end,
-                              int start_tile, int end_tile) {
+static AOM_INLINE void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
+                                         int tile_rows_start, int tile_rows_end,
+                                         int tile_cols_start, int tile_cols_end,
+                                         int start_tile, int end_tile) {
   AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info;
   TileJobsDec *tile_job_queue = tile_mt_info->job_queue;
   tile_mt_info->jobs_enqueued = 0;
@@ -3696,19 +3315,20 @@
 
   for (int row = tile_rows_start; row < tile_rows_end; row++) {
     for (int col = tile_cols_start; col < tile_cols_end; col++) {
-      if (row * cm->tile_cols + col < start_tile ||
-          row * cm->tile_cols + col > end_tile)
+      if (row * cm->tiles.cols + col < start_tile ||
+          row * cm->tiles.cols + col > end_tile)
         continue;
       tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col];
-      tile_job_queue->tile_data = pbi->tile_data + row * cm->tile_cols + col;
+      tile_job_queue->tile_data = pbi->tile_data + row * cm->tiles.cols + col;
       tile_job_queue++;
       tile_mt_info->jobs_enqueued++;
     }
   }
 }
 
-static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
-                           int tile_rows, int tile_cols) {
+static AOM_INLINE void alloc_dec_jobs(AV1DecTileMT *tile_mt_info,
+                                      AV1_COMMON *cm, int tile_rows,
+                                      int tile_cols) {
   tile_mt_info->alloc_tile_rows = tile_rows;
   tile_mt_info->alloc_tile_cols = tile_cols;
   int num_tiles = tile_rows * tile_cols;
@@ -3746,8 +3366,9 @@
   }
 }
 
-static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
-                                int buf_size, int use_highbd) {
+static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
+                                           ThreadData *thread_data,
+                                           int buf_size, int use_highbd) {
   for (int ref = 0; ref < 2; ref++) {
     if (use_highbd) {
       uint16_t *hbd_mc_buf;
@@ -3772,8 +3393,9 @@
   }
 }
 
-static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
-                              int num_workers) {
+static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi,
+                                         AVxWorkerHook worker_hook,
+                                         int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
   // Reset tile decoding hook
@@ -3801,8 +3423,9 @@
 #endif
 }
 
-static void launch_dec_workers(AV1Decoder *pbi, const uint8_t *data_end,
-                               int num_workers) {
+static AOM_INLINE void launch_dec_workers(AV1Decoder *pbi,
+                                          const uint8_t *data_end,
+                                          int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
   for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
@@ -3820,7 +3443,7 @@
   }
 }
 
-static void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
+static AOM_INLINE void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int corrupted = 0;
 
@@ -3832,7 +3455,7 @@
   pbi->mb.corrupted = corrupted;
 }
 
-static void decode_mt_init(AV1Decoder *pbi) {
+static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int worker_idx;
@@ -3881,10 +3504,11 @@
   }
 }
 
-static void tile_mt_queue(AV1Decoder *pbi, int tile_cols, int tile_rows,
-                          int tile_rows_start, int tile_rows_end,
-                          int tile_cols_start, int tile_cols_end,
-                          int start_tile, int end_tile) {
+static AOM_INLINE void tile_mt_queue(AV1Decoder *pbi, int tile_cols,
+                                     int tile_rows, int tile_rows_start,
+                                     int tile_rows_end, int tile_cols_start,
+                                     int tile_cols_end, int start_tile,
+                                     int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
   if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
       pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
@@ -3901,8 +3525,9 @@
                                       const uint8_t *data_end, int start_tile,
                                       int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  CommonTileParams *const tiles = &cm->tiles;
+  const int tile_cols = tiles->cols;
+  const int tile_rows = tiles->rows;
   const int n_tiles = tile_cols * tile_rows;
   TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
   const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
@@ -3917,7 +3542,7 @@
   int num_workers;
   const uint8_t *raw_data_end = NULL;
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     tile_rows_start = single_row ? dec_tile_row : 0;
     tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
     tile_cols_start = single_col ? dec_tile_col : 0;
@@ -3950,8 +3575,8 @@
 
   // get tile size in tile group
 #if EXT_TILE_DEBUG
-  if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
-  if (cm->large_scale_tile)
+  if (tiles->large_scale) assert(pbi->ext_tile_debug == 1);
+  if (tiles->large_scale)
     raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
   else
 #endif  // EXT_TILE_DEBUG
@@ -3963,7 +3588,7 @@
 
   for (int row = 0; row < tile_rows; row++) {
     for (int col = 0; col < tile_cols; col++) {
-      TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+      TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col;
       av1_tile_init(&tile_data->tile_info, cm, row, col);
     }
   }
@@ -3979,7 +3604,7 @@
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Failed to decode tile data");
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     if (n_tiles == 1) {
       // Find the end of the single tile buffer
       return aom_reader_find_end(&pbi->tile_data->bit_reader);
@@ -3992,10 +3617,10 @@
   return aom_reader_find_end(&tile_data->bit_reader);
 }
 
-static void dec_alloc_cb_buf(AV1Decoder *pbi) {
+static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
-  int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) *
-             ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1);
+  int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) *
+             ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1);
 
   if (pbi->cb_buffer_alloc_size < size) {
     av1_dec_free_cb_buf(pbi);
@@ -4006,10 +3631,10 @@
   }
 }
 
-static void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
-                              int tile_rows_end, int tile_cols_start,
-                              int tile_cols_end, int start_tile, int end_tile,
-                              int max_sb_rows) {
+static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
+                                         int tile_rows_end, int tile_cols_start,
+                                         int tile_cols_end, int start_tile,
+                                         int end_tile, int max_sb_rows) {
   AV1_COMMON *const cm = &pbi->common;
   AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 
@@ -4026,12 +3651,12 @@
 
   for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
     for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
-      if (tile_row * cm->tile_cols + tile_col < start_tile ||
-          tile_row * cm->tile_cols + tile_col > end_tile)
+      if (tile_row * cm->tiles.cols + tile_col < start_tile ||
+          tile_row * cm->tiles.cols + tile_col > end_tile)
         continue;
 
       TileDataDec *const tile_data =
-          pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+          pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
       TileInfo tile_info = tile_data->tile_info;
 
       tile_data->dec_row_mt_sync.mi_rows_parse_done = 0;
@@ -4076,8 +3701,9 @@
                                           const uint8_t *data_end,
                                           int start_tile, int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  CommonTileParams *const tiles = &cm->tiles;
+  const int tile_cols = tiles->cols;
+  const int tile_rows = tiles->rows;
   const int n_tiles = tile_cols * tile_rows;
   TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
   const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
@@ -4094,7 +3720,7 @@
   const uint8_t *raw_data_end = NULL;
   int max_sb_rows = 0;
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     tile_rows_start = single_row ? dec_tile_row : 0;
     tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
     tile_cols_start = single_col ? dec_tile_col : 0;
@@ -4129,24 +3755,26 @@
 
   // get tile size in tile group
 #if EXT_TILE_DEBUG
-  if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
-  if (cm->large_scale_tile)
+  if (tiles->large_scale) assert(pbi->ext_tile_debug == 1);
+  if (tiles->large_scale)
     raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
   else
 #endif  // EXT_TILE_DEBUG
     get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 
   if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
-    for (int i = 0; i < pbi->allocated_tiles; i++) {
-      TileDataDec *const tile_data = pbi->tile_data + i;
-      av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+    if (pbi->tile_data != NULL) {
+      for (int i = 0; i < pbi->allocated_tiles; i++) {
+        TileDataDec *const tile_data = pbi->tile_data + i;
+        av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+      }
     }
     decoder_alloc_tile_data(pbi, n_tiles);
   }
 
   for (int row = 0; row < tile_rows; row++) {
     for (int col = 0; col < tile_cols; col++) {
-      TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+      TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col;
       av1_tile_init(&tile_data->tile_info, cm, row, col);
 
       max_sb_rows = AOMMAX(max_sb_rows,
@@ -4181,7 +3809,7 @@
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Failed to decode tile data");
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     if (n_tiles == 1) {
       // Find the end of the single tile buffer
       return aom_reader_find_end(&pbi->tile_data->bit_reader);
@@ -4194,7 +3822,7 @@
   return aom_reader_find_end(&tile_data->bit_reader);
 }
 
-static void error_handler(void *data) {
+static AOM_INLINE void error_handler(void *data) {
   AV1_COMMON *const cm = (AV1_COMMON *)data;
   aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
@@ -4203,9 +3831,9 @@
 // seq_params->bit_depth based on the values of those fields and
 // seq_params->profile. Reports errors by calling rb->error_handler() or
 // aom_internal_error().
-static void read_bitdepth(struct aom_read_bit_buffer *rb,
-                          SequenceHeader *seq_params,
-                          struct aom_internal_error_info *error_info) {
+static AOM_INLINE void read_bitdepth(
+    struct aom_read_bit_buffer *rb, SequenceHeader *seq_params,
+    struct aom_internal_error_info *error_info) {
   const int high_bitdepth = aom_rb_read_bit(rb);
   if (seq_params->profile == PROFILE_2 && high_bitdepth) {
     const int twelve_bit = aom_rb_read_bit(rb);
@@ -4216,6 +3844,12 @@
     aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
                        "Unsupported profile/bit-depth combination");
   }
+#if !CONFIG_AV1_HIGHBITDEPTH
+  if (seq_params->bit_depth > AOM_BITS_8) {
+    aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Bit-depth %d not supported", seq_params->bit_depth);
+  }
+#endif
 }
 
 void av1_read_film_grain_params(AV1_COMMON *cm,
@@ -4383,7 +4017,8 @@
   pars->clip_to_restricted_range = aom_rb_read_bit(rb);
 }
 
-static void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void read_film_grain(AV1_COMMON *cm,
+                                       struct aom_read_bit_buffer *rb) {
   if (cm->seq_params.film_grain_params_present &&
       (cm->show_frame || cm->showable_frame)) {
     av1_read_film_grain_params(cm, rb);
@@ -4472,64 +4107,59 @@
   seq_params->separate_uv_delta_q = aom_rb_read_bit(rb);
 }
 
-void av1_read_timing_info_header(AV1_COMMON *cm,
+void av1_read_timing_info_header(aom_timing_info_t *timing_info,
+                                 struct aom_internal_error_info *error,
                                  struct aom_read_bit_buffer *rb) {
-  cm->timing_info.num_units_in_display_tick = aom_rb_read_unsigned_literal(
-      rb, 32);  // Number of units in a display tick
-  cm->timing_info.time_scale =
-      aom_rb_read_unsigned_literal(rb, 32);  // Time scale
-  if (cm->timing_info.num_units_in_display_tick == 0 ||
-      cm->timing_info.time_scale == 0) {
+  timing_info->num_units_in_display_tick =
+      aom_rb_read_unsigned_literal(rb,
+                                   32);  // Number of units in a display tick
+  timing_info->time_scale = aom_rb_read_unsigned_literal(rb, 32);  // Time scale
+  if (timing_info->num_units_in_display_tick == 0 ||
+      timing_info->time_scale == 0) {
     aom_internal_error(
-        &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        error, AOM_CODEC_UNSUP_BITSTREAM,
         "num_units_in_display_tick and time_scale must be greater than 0.");
   }
-  cm->timing_info.equal_picture_interval =
+  timing_info->equal_picture_interval =
       aom_rb_read_bit(rb);  // Equal picture interval bit
-  if (cm->timing_info.equal_picture_interval) {
+  if (timing_info->equal_picture_interval) {
     const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
     if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
       aom_internal_error(
-          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          error, AOM_CODEC_UNSUP_BITSTREAM,
           "num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.");
     }
-    cm->timing_info.num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1;
+    timing_info->num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1;
   }
 }
 
-void av1_read_decoder_model_info(AV1_COMMON *cm,
+void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info,
                                  struct aom_read_bit_buffer *rb) {
-  cm->buffer_model.encoder_decoder_buffer_delay_length =
+  decoder_model_info->encoder_decoder_buffer_delay_length =
       aom_rb_read_literal(rb, 5) + 1;
-  cm->buffer_model.num_units_in_decoding_tick = aom_rb_read_unsigned_literal(
-      rb, 32);  // Number of units in a decoding tick
-  cm->buffer_model.buffer_removal_time_length = aom_rb_read_literal(rb, 5) + 1;
-  cm->buffer_model.frame_presentation_time_length =
+  decoder_model_info->num_units_in_decoding_tick =
+      aom_rb_read_unsigned_literal(rb,
+                                   32);  // Number of units in a decoding tick
+  decoder_model_info->buffer_removal_time_length =
+      aom_rb_read_literal(rb, 5) + 1;
+  decoder_model_info->frame_presentation_time_length =
       aom_rb_read_literal(rb, 5) + 1;
 }
 
-void av1_read_op_parameters_info(AV1_COMMON *const cm,
-                                 struct aom_read_bit_buffer *rb, int op_num) {
-  // The cm->op_params array has MAX_NUM_OPERATING_POINTS + 1 elements.
-  if (op_num > MAX_NUM_OPERATING_POINTS) {
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                       "AV1 does not support %d decoder model operating points",
-                       op_num + 1);
-  }
-
-  cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_unsigned_literal(
-      rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
-
-  cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_unsigned_literal(
-      rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
-
-  cm->op_params[op_num].low_delay_mode_flag = aom_rb_read_bit(rb);
+void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params,
+                                 int buffer_delay_length,
+                                 struct aom_read_bit_buffer *rb) {
+  op_params->decoder_buffer_delay =
+      aom_rb_read_unsigned_literal(rb, buffer_delay_length);
+  op_params->encoder_buffer_delay =
+      aom_rb_read_unsigned_literal(rb, buffer_delay_length);
+  op_params->low_delay_mode_flag = aom_rb_read_bit(rb);
 }
 
-static void av1_read_temporal_point_info(AV1_COMMON *const cm,
-                                         struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void read_temporal_point_info(
+    AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) {
   cm->frame_presentation_time = aom_rb_read_unsigned_literal(
-      rb, cm->buffer_model.frame_presentation_time_length);
+      rb, cm->seq_params.decoder_model_info.frame_presentation_time_length);
 }
 
 void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
@@ -4681,20 +4311,22 @@
   }
 
   if (params->wmtype <= AFFINE) {
-    int good_shear_params = get_shear_params(params);
+    int good_shear_params = av1_get_shear_params(params);
     if (!good_shear_params) return 0;
   }
 
   return 1;
 }
 
-static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static AOM_INLINE void read_global_motion(AV1_COMMON *cm,
+                                          struct aom_read_bit_buffer *rb) {
   for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
     const WarpedMotionParams *ref_params =
         cm->prev_frame ? &cm->prev_frame->global_motion[frame]
                        : &default_warp_params;
-    int good_params = read_global_motion_params(
-        &cm->global_motion[frame], ref_params, rb, cm->allow_high_precision_mv);
+    int good_params =
+        read_global_motion_params(&cm->global_motion[frame], ref_params, rb,
+                                  cm->features.allow_high_precision_mv);
     if (!good_params) {
 #if WARPED_MOTION_DEBUG
       printf("Warning: unexpected global motion shear params from aomenc\n");
@@ -4711,7 +4343,7 @@
         cm->height == ref_buf->y_crop_height) {
       read_global_motion_params(&cm->global_motion[frame],
                                 &cm->prev_frame->global_motion[frame], rb,
-                                cm->allow_high_precision_mv);
+                                cm->features.allow_high_precision_mv);
     } else {
       cm->global_motion[frame] = default_warp_params;
     }
@@ -4731,7 +4363,7 @@
 
 // Release the references to the frame buffers in cm->ref_frame_map and reset
 // all elements of cm->ref_frame_map to NULL.
-static void reset_ref_frame_map(AV1_COMMON *const cm) {
+static AOM_INLINE void reset_ref_frame_map(AV1_COMMON *const cm) {
   BufferPool *const pool = cm->buffer_pool;
 
   for (int i = 0; i < REF_FRAMES; i++) {
@@ -4740,51 +4372,21 @@
   }
 }
 
-// Generate next_ref_frame_map.
-static void generate_next_ref_frame_map(AV1Decoder *const pbi) {
-  AV1_COMMON *const cm = &pbi->common;
-  BufferPool *const pool = cm->buffer_pool;
-
-  lock_buffer_pool(pool);
-  // cm->next_ref_frame_map holds references to frame buffers. After storing a
-  // frame buffer index in cm->next_ref_frame_map, we need to increase the
-  // frame buffer's ref_count.
-  int ref_index = 0;
-  for (int mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
-    if (mask & 1) {
-      cm->next_ref_frame_map[ref_index] = cm->cur_frame;
-    } else {
-      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
-    }
-    if (cm->next_ref_frame_map[ref_index] != NULL)
-      ++cm->next_ref_frame_map[ref_index]->ref_count;
-    ++ref_index;
-  }
-
-  for (; ref_index < REF_FRAMES; ++ref_index) {
-    cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
-    if (cm->next_ref_frame_map[ref_index] != NULL)
-      ++cm->next_ref_frame_map[ref_index]->ref_count;
-  }
-  unlock_buffer_pool(pool);
-  pbi->hold_ref_buf = 1;
-}
-
 // If the refresh_frame_flags bitmask is set, update reference frame id values
 // and mark frames as valid for reference.
-static void update_ref_frame_id(AV1_COMMON *const cm, int frame_id) {
-  assert(cm->seq_params.frame_id_numbers_present_flag);
+static AOM_INLINE void update_ref_frame_id(AV1Decoder *const pbi) {
+  AV1_COMMON *const cm = &pbi->common;
   int refresh_frame_flags = cm->current_frame.refresh_frame_flags;
   for (int i = 0; i < REF_FRAMES; i++) {
     if ((refresh_frame_flags >> i) & 1) {
-      cm->ref_frame_id[i] = frame_id;
-      cm->valid_for_referencing[i] = 1;
+      cm->ref_frame_id[i] = cm->current_frame_id;
+      pbi->valid_for_referencing[i] = 1;
     }
   }
 }
 
-static void show_existing_frame_reset(AV1Decoder *const pbi,
-                                      int existing_frame_idx) {
+static AOM_INLINE void show_existing_frame_reset(AV1Decoder *const pbi,
+                                                 int existing_frame_idx) {
   AV1_COMMON *const cm = &pbi->common;
 
   assert(cm->show_existing_frame);
@@ -4804,28 +4406,16 @@
 
   // Note that the displayed frame must be valid for referencing in order to
   // have been selected.
-  if (cm->seq_params.frame_id_numbers_present_flag) {
-    update_ref_frame_id(cm, cm->ref_frame_id[existing_frame_idx]);
-  }
+  cm->current_frame_id = cm->ref_frame_id[existing_frame_idx];
+  update_ref_frame_id(pbi);
 
-  cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
-
-  generate_next_ref_frame_map(pbi);
-
-  // Reload the adapted CDFs from when we originally coded this keyframe
-  *cm->fc = cm->next_ref_frame_map[existing_frame_idx]->frame_context;
+  cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 }
 
 static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
   RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
   int i;
 
-  // We have not stored any references to frame buffers in
-  // cm->next_ref_frame_map, so we can directly reset it to all NULL.
-  for (i = 0; i < REF_FRAMES; ++i) {
-    cm->next_ref_frame_map[i] = NULL;
-  }
-
   lock_buffer_pool(cm->buffer_pool);
   reset_ref_frame_map(cm);
   assert(cm->cur_frame->ref_count == 1);
@@ -4849,6 +4439,7 @@
   AV1_COMMON *const cm = &pbi->common;
   const SequenceHeader *const seq_params = &cm->seq_params;
   CurrentFrame *const current_frame = &cm->current_frame;
+  FeatureFlags *const features = &cm->features;
   MACROBLOCKD *const xd = &pbi->mb;
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
@@ -4858,8 +4449,6 @@
                        "No sequence header");
   }
 
-  cm->last_frame_type = current_frame->frame_type;
-
   if (seq_params->reduced_still_picture_hdr) {
     cm->show_existing_frame = 0;
     cm->show_frame = 1;
@@ -4870,7 +4459,7 @@
       pbi->decoding_first_frame = 1;
       reset_frame_buffers(cm);
     }
-    cm->error_resilient_mode = 1;
+    features->error_resilient_mode = 1;
   } else {
     cm->show_existing_frame = aom_rb_read_bit(rb);
     pbi->reset_decoder_state = 0;
@@ -4889,8 +4478,8 @@
                            "Buffer does not contain a decoded frame");
       }
       if (seq_params->decoder_model_info_present_flag &&
-          cm->timing_info.equal_picture_interval == 0) {
-        av1_read_temporal_point_info(cm, rb);
+          seq_params->timing_info.equal_picture_interval == 0) {
+        read_temporal_point_info(cm, rb);
       }
       if (seq_params->frame_id_numbers_present_flag) {
         int frame_id_length = seq_params->frame_id_length;
@@ -4898,16 +4487,15 @@
         /* Compare display_frame_id with ref_frame_id and check valid for
          * referencing */
         if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
-            cm->valid_for_referencing[existing_frame_idx] == 0)
+            pbi->valid_for_referencing[existing_frame_idx] == 0)
           aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                              "Reference buffer frame ID mismatch");
       }
       lock_buffer_pool(pool);
       assert(frame_to_show->ref_count > 0);
       // cm->cur_frame should be the buffer referenced by the return value
-      // of the get_free_fb() call in av1_receive_compressed_data(), and
-      // generate_next_ref_frame_map() has not been called, so ref_count
-      // should still be 1.
+      // of the get_free_fb() call in assign_cur_frame_new_fb() (called by
+      // av1_receive_compressed_data()), so the ref_count should be 1.
       assert(cm->cur_frame->ref_count == 1);
       // assign_frame_buffer_p() decrements ref_count directly rather than
       // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has
@@ -4922,9 +4510,17 @@
       cm->lf.filter_level[1] = 0;
       cm->show_frame = 1;
 
+      // Section 6.8.2: It is a requirement of bitstream conformance that when
+      // show_existing_frame is used to show a previous frame, that the value
+      // of showable_frame for the previous frame was equal to 1.
       if (!frame_to_show->showable_frame) {
-        aom_merge_corrupted_flag(&xd->corrupted, 1);
+        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                           "Buffer does not contain a showable frame");
       }
+      // Section 6.8.2: It is a requirement of bitstream conformance that when
+      // show_existing_frame is used to show a previous frame with
+      // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that the
+      // frame is output via the show_existing_frame mechanism at most once.
       if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0;
 
       cm->film_grain_params = frame_to_show->film_grain_params;
@@ -4960,40 +4556,47 @@
     cm->showable_frame = current_frame->frame_type != KEY_FRAME;
     if (cm->show_frame) {
       if (seq_params->decoder_model_info_present_flag &&
-          cm->timing_info.equal_picture_interval == 0)
-        av1_read_temporal_point_info(cm, rb);
+          seq_params->timing_info.equal_picture_interval == 0)
+        read_temporal_point_info(cm, rb);
     } else {
       // See if this frame can be used as show_existing_frame in future
       cm->showable_frame = aom_rb_read_bit(rb);
     }
     cm->cur_frame->showable_frame = cm->showable_frame;
-    cm->error_resilient_mode =
+    features->error_resilient_mode =
         frame_is_sframe(cm) ||
                 (current_frame->frame_type == KEY_FRAME && cm->show_frame)
             ? 1
             : aom_rb_read_bit(rb);
   }
 
-  cm->disable_cdf_update = aom_rb_read_bit(rb);
+  if (current_frame->frame_type == KEY_FRAME && cm->show_frame) {
+    /* All frames need to be marked as not valid for referencing */
+    for (int i = 0; i < REF_FRAMES; i++) {
+      pbi->valid_for_referencing[i] = 0;
+    }
+  }
+  features->disable_cdf_update = aom_rb_read_bit(rb);
   if (seq_params->force_screen_content_tools == 2) {
-    cm->allow_screen_content_tools = aom_rb_read_bit(rb);
+    features->allow_screen_content_tools = aom_rb_read_bit(rb);
   } else {
-    cm->allow_screen_content_tools = seq_params->force_screen_content_tools;
+    features->allow_screen_content_tools =
+        seq_params->force_screen_content_tools;
   }
 
-  if (cm->allow_screen_content_tools) {
+  if (features->allow_screen_content_tools) {
     if (seq_params->force_integer_mv == 2) {
-      cm->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
+      features->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
     } else {
-      cm->cur_frame_force_integer_mv = seq_params->force_integer_mv;
+      features->cur_frame_force_integer_mv = seq_params->force_integer_mv;
     }
   } else {
-    cm->cur_frame_force_integer_mv = 0;
+    features->cur_frame_force_integer_mv = 0;
   }
 
   int frame_size_override_flag = 0;
-  cm->allow_intrabc = 0;
-  cm->primary_ref_frame = PRIMARY_REF_NONE;
+  features->allow_intrabc = 0;
+  features->primary_ref_frame = PRIMARY_REF_NONE;
 
   if (!seq_params->reduced_still_picture_hdr) {
     if (seq_params->frame_id_numbers_present_flag) {
@@ -5025,17 +4628,15 @@
       }
       /* Check if some frames need to be marked as not valid for referencing */
       for (int i = 0; i < REF_FRAMES; i++) {
-        if (current_frame->frame_type == KEY_FRAME && cm->show_frame) {
-          cm->valid_for_referencing[i] = 0;
-        } else if (cm->current_frame_id - (1 << diff_len) > 0) {
+        if (cm->current_frame_id - (1 << diff_len) > 0) {
           if (cm->ref_frame_id[i] > cm->current_frame_id ||
               cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
-            cm->valid_for_referencing[i] = 0;
+            pbi->valid_for_referencing[i] = 0;
         } else {
           if (cm->ref_frame_id[i] > cm->current_frame_id &&
               cm->ref_frame_id[i] < (1 << frame_id_length) +
                                         cm->current_frame_id - (1 << diff_len))
-            cm->valid_for_referencing[i] = 0;
+            pbi->valid_for_referencing[i] = 0;
         }
       }
     }
@@ -5046,8 +4647,8 @@
         rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
     current_frame->frame_number = current_frame->order_hint;
 
-    if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
-      cm->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
+    if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
+      features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
     }
   }
 
@@ -5056,7 +4657,7 @@
     if (cm->buffer_removal_time_present) {
       for (int op_num = 0;
            op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
-        if (cm->op_params[op_num].decoder_model_param_present_flag) {
+        if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
           if ((((seq_params->operating_point_idc[op_num] >>
                  cm->temporal_layer_id) &
                 0x1) &&
@@ -5064,14 +4665,13 @@
                  (cm->spatial_layer_id + 8)) &
                 0x1)) ||
               seq_params->operating_point_idc[op_num] == 0) {
-            cm->op_frame_timing[op_num].buffer_removal_time =
-                aom_rb_read_unsigned_literal(
-                    rb, cm->buffer_model.buffer_removal_time_length);
+            cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal(
+                rb, seq_params->decoder_model_info.buffer_removal_time_length);
           } else {
-            cm->op_frame_timing[op_num].buffer_removal_time = 0;
+            cm->buffer_removal_times[op_num] = 0;
           }
         } else {
-          cm->op_frame_timing[op_num].buffer_removal_time = 0;
+          cm->buffer_removal_times[op_num] = 0;
         }
       }
     }
@@ -5109,7 +4709,7 @@
 
   if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) {
     // Read all ref frame order hints if error_resilient_mode == 1
-    if (cm->error_resilient_mode &&
+    if (features->error_resilient_mode &&
         seq_params->order_hint_info.enable_order_hint) {
       for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
         // Read order hint from bit stream
@@ -5122,6 +4722,7 @@
             lock_buffer_pool(pool);
             decrease_ref_count(buf, pool);
             unlock_buffer_pool(pool);
+            cm->ref_frame_map[ref_idx] = NULL;
           }
           // If no corresponding buffer exists, allocate a new buffer with all
           // pixels set to neutral grey.
@@ -5136,7 +4737,7 @@
                   &buf->buf, seq_params->max_frame_width,
                   seq_params->max_frame_height, seq_params->subsampling_x,
                   seq_params->subsampling_y, seq_params->use_highbitdepth,
-                  AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+                  AOM_BORDER_IN_PIXELS, features->byte_alignment,
                   &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
             decrease_ref_count(buf, pool);
             unlock_buffer_pool(pool);
@@ -5144,8 +4745,23 @@
                                "Failed to allocate frame buffer");
           }
           unlock_buffer_pool(pool);
-          set_planes_to_neutral_grey(seq_params, &buf->buf, 0);
+          // According to the specification, valid bitstreams are required to
+          // never use missing reference frames so the filling process for
+          // missing frames is not normatively defined and RefValid for missing
+          // frames is set to 0.
 
+          // To make libaom more robust when the bitstream has been corrupted
+          // by the loss of some frames of data, this code adds a neutral grey
+          // buffer in place of missing frames, i.e.
+          //
+          set_planes_to_neutral_grey(seq_params, &buf->buf, 0);
+          //
+          // and allows the frames to be used for referencing, i.e.
+          //
+          pbi->valid_for_referencing[ref_idx] = 1;
+          //
+          // Please note such behavior is not normative and other decoders may
+          // use a different approach.
           cm->ref_frame_map[ref_idx] = buf;
           buf->order_hint = order_hint;
         }
@@ -5156,19 +4772,19 @@
   if (current_frame->frame_type == KEY_FRAME) {
     setup_frame_size(cm, frame_size_override_flag, rb);
 
-    if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
-      cm->allow_intrabc = aom_rb_read_bit(rb);
-    cm->allow_ref_frame_mvs = 0;
+    if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+      features->allow_intrabc = aom_rb_read_bit(rb);
+    features->allow_ref_frame_mvs = 0;
     cm->prev_frame = NULL;
   } else {
-    cm->allow_ref_frame_mvs = 0;
+    features->allow_ref_frame_mvs = 0;
 
     if (current_frame->frame_type == INTRA_ONLY_FRAME) {
       cm->cur_frame->film_grain_params_present =
           seq_params->film_grain_params_present;
       setup_frame_size(cm, frame_size_override_flag, rb);
-      if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
-        cm->allow_intrabc = aom_rb_read_bit(rb);
+      if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+        features->allow_intrabc = aom_rb_read_bit(rb);
 
     } else if (pbi->need_resync != 1) { /* Skip if need resync */
       int frame_refs_short_signaling = 0;
@@ -5219,6 +4835,10 @@
         } else {
           ref = cm->remapped_ref_idx[i];
         }
+        // Check valid for referencing
+        if (pbi->valid_for_referencing[ref] == 0)
+          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                             "Reference frame not valid for referencing");
 
         cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
 
@@ -5231,31 +4851,30 @@
                 (1 << frame_id_length)) %
                (1 << frame_id_length));
           // Compare values derived from delta_frame_id_minus_1 and
-          // refresh_frame_flags. Also, check valid for referencing
-          if (ref_frame_id != cm->ref_frame_id[ref] ||
-              cm->valid_for_referencing[ref] == 0)
+          // refresh_frame_flags.
+          if (ref_frame_id != cm->ref_frame_id[ref])
             aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                                "Reference buffer frame ID mismatch");
         }
       }
 
-      if (!cm->error_resilient_mode && frame_size_override_flag) {
+      if (!features->error_resilient_mode && frame_size_override_flag) {
         setup_frame_size_with_refs(cm, rb);
       } else {
         setup_frame_size(cm, frame_size_override_flag, rb);
       }
 
-      if (cm->cur_frame_force_integer_mv) {
-        cm->allow_high_precision_mv = 0;
+      if (features->cur_frame_force_integer_mv) {
+        features->allow_high_precision_mv = 0;
       } else {
-        cm->allow_high_precision_mv = aom_rb_read_bit(rb);
+        features->allow_high_precision_mv = aom_rb_read_bit(rb);
       }
-      cm->interp_filter = read_frame_interp_filter(rb);
-      cm->switchable_motion_mode = aom_rb_read_bit(rb);
+      features->interp_filter = read_frame_interp_filter(rb);
+      features->switchable_motion_mode = aom_rb_read_bit(rb);
     }
 
     cm->prev_frame = get_primary_ref_frame_buf(cm);
-    if (cm->primary_ref_frame != PRIMARY_REF_NONE &&
+    if (features->primary_ref_frame != PRIMARY_REF_NONE &&
         get_primary_ref_frame_buf(cm) == NULL) {
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Reference frame containing this frame's initial "
@@ -5265,9 +4884,9 @@
     if (!(current_frame->frame_type == INTRA_ONLY_FRAME) &&
         pbi->need_resync != 1) {
       if (frame_might_allow_ref_frame_mvs(cm))
-        cm->allow_ref_frame_mvs = aom_rb_read_bit(rb);
+        features->allow_ref_frame_mvs = aom_rb_read_bit(rb);
       else
-        cm->allow_ref_frame_mvs = 0;
+        features->allow_ref_frame_mvs = 0;
 
       for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
@@ -5289,18 +4908,16 @@
 
   cm->cur_frame->frame_type = current_frame->frame_type;
 
-  if (seq_params->frame_id_numbers_present_flag) {
-    update_ref_frame_id(cm, cm->current_frame_id);
-  }
+  update_ref_frame_id(pbi);
 
-  const int might_bwd_adapt =
-      !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+  const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) &&
+                              !(features->disable_cdf_update);
   if (might_bwd_adapt) {
-    cm->refresh_frame_context = aom_rb_read_bit(rb)
-                                    ? REFRESH_FRAME_CONTEXT_DISABLED
-                                    : REFRESH_FRAME_CONTEXT_BACKWARD;
+    features->refresh_frame_context = aom_rb_read_bit(rb)
+                                          ? REFRESH_FRAME_CONTEXT_DISABLED
+                                          : REFRESH_FRAME_CONTEXT_BACKWARD;
   } else {
-    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+    features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
   }
 
   cm->cur_frame->buf.bit_depth = seq_params->bit_depth;
@@ -5321,9 +4938,7 @@
                        " state");
   }
 
-  generate_next_ref_frame_map(pbi);
-
-  if (cm->allow_intrabc) {
+  if (features->allow_intrabc) {
     // Set parameters corresponding to no filtering.
     struct loopfilter *lf = &cm->lf;
     lf->filter_level[0] = 0;
@@ -5338,24 +4953,30 @@
   }
 
   read_tile_info(pbi, rb);
-  if (!is_min_tile_width_satisfied(cm)) {
+  if (!av1_is_min_tile_width_satisfied(cm)) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Minimum tile width requirement not satisfied");
   }
 
-  setup_quantization(cm, rb);
+  CommonQuantParams *const quant_params = &cm->quant_params;
+  setup_quantization(quant_params, av1_num_planes(cm),
+                     cm->seq_params.separate_uv_delta_q, rb);
   xd->bd = (int)seq_params->bit_depth;
 
-  if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
-      cm->num_allocated_above_context_mi_col < cm->mi_cols ||
-      cm->num_allocated_above_contexts < cm->tile_rows) {
-    av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
-    if (av1_alloc_above_context_buffers(cm, cm->tile_rows))
+  CommonContexts *const above_contexts = &cm->above_contexts;
+  if (above_contexts->num_planes < av1_num_planes(cm) ||
+      above_contexts->num_mi_cols < cm->mi_params.mi_cols ||
+      above_contexts->num_tile_rows < cm->tiles.rows) {
+    av1_free_above_context_buffers(above_contexts);
+    if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
+                                        cm->mi_params.mi_cols,
+                                        av1_num_planes(cm))) {
       aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate context buffers");
+    }
   }
 
-  if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+  if (features->primary_ref_frame == PRIMARY_REF_NONE) {
     av1_setup_past_independence(cm);
   }
 
@@ -5366,11 +4987,11 @@
   cm->delta_q_info.delta_lf_present_flag = 0;
   cm->delta_q_info.delta_lf_multi = 0;
   cm->delta_q_info.delta_q_present_flag =
-      cm->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
+      quant_params->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
   if (cm->delta_q_info.delta_q_present_flag) {
-    xd->current_qindex = cm->base_qindex;
+    xd->current_qindex = quant_params->base_qindex;
     cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2);
-    if (!cm->allow_intrabc)
+    if (!features->allow_intrabc)
       cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb);
     if (cm->delta_q_info.delta_lf_present_flag) {
       cm->delta_q_info.delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
@@ -5379,58 +5000,57 @@
     }
   }
 
-  xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
+  xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv;
 
   for (int i = 0; i < MAX_SEGMENTS; ++i) {
-    const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex);
-    xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
-                      cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
-                      cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
+    const int qindex = av1_get_qindex(&cm->seg, i, quant_params->base_qindex);
+    xd->lossless[i] =
+        qindex == 0 && quant_params->y_dc_delta_q == 0 &&
+        quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 &&
+        quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0;
     xd->qindex[i] = qindex;
   }
-  cm->coded_lossless = is_coded_lossless(cm, xd);
-  cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+  features->coded_lossless = is_coded_lossless(cm, xd);
+  features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
   setup_segmentation_dequant(cm, xd);
-  if (cm->coded_lossless) {
+  if (features->coded_lossless) {
     cm->lf.filter_level[0] = 0;
     cm->lf.filter_level[1] = 0;
   }
-  if (cm->coded_lossless || !seq_params->enable_cdef) {
+  if (features->coded_lossless || !seq_params->enable_cdef) {
     cm->cdef_info.cdef_bits = 0;
     cm->cdef_info.cdef_strengths[0] = 0;
     cm->cdef_info.cdef_uv_strengths[0] = 0;
   }
-  if (cm->all_lossless || !seq_params->enable_restoration) {
+  if (features->all_lossless || !seq_params->enable_restoration) {
     cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
     cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
     cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
   }
   setup_loopfilter(cm, rb);
 
-  if (!cm->coded_lossless && seq_params->enable_cdef) {
+  if (!features->coded_lossless && seq_params->enable_cdef) {
     setup_cdef(cm, rb);
   }
-  if (!cm->all_lossless && seq_params->enable_restoration) {
+  if (!features->all_lossless && seq_params->enable_restoration) {
     decode_restoration_mode(cm, rb);
   }
 
-  cm->tx_mode = read_tx_mode(cm, rb);
+  features->tx_mode = read_tx_mode(rb, features->coded_lossless);
   current_frame->reference_mode = read_frame_reference_mode(cm, rb);
-  if (current_frame->reference_mode != SINGLE_REFERENCE)
-    setup_compound_reference_mode(cm);
 
   av1_setup_skip_mode_allowed(cm);
   current_frame->skip_mode_info.skip_mode_flag =
       current_frame->skip_mode_info.skip_mode_allowed ? aom_rb_read_bit(rb) : 0;
 
   if (frame_might_allow_warped_motion(cm))
-    cm->allow_warped_motion = aom_rb_read_bit(rb);
+    features->allow_warped_motion = aom_rb_read_bit(rb);
   else
-    cm->allow_warped_motion = 0;
+    features->allow_warped_motion = 0;
 
-  cm->reduced_tx_set_used = aom_rb_read_bit(rb);
+  features->reduced_tx_set_used = aom_rb_read_bit(rb);
 
-  if (cm->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
+  if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Frame wrongly requests reference frame MVs");
   }
@@ -5442,7 +5062,7 @@
   read_film_grain(cm, rb);
 
 #if EXT_TILE_DEBUG
-  if (pbi->ext_tile_debug && cm->large_scale_tile) {
+  if (pbi->ext_tile_debug && cm->tiles.large_scale) {
     read_ext_tile_info(pbi, rb);
     av1_set_single_tile_decoding_mode(cm);
   }
@@ -5472,16 +5092,14 @@
   return (BITSTREAM_PROFILE)profile;
 }
 
-static void superres_post_decode(AV1Decoder *pbi) {
+static AOM_INLINE void superres_post_decode(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
 
   if (!av1_superres_scaled(cm)) return;
-  assert(!cm->all_lossless);
+  assert(!cm->features.all_lossless);
 
-  lock_buffer_pool(pool);
   av1_superres_upscale(cm, pool);
-  unlock_buffer_pool(pool);
 }
 
 uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
@@ -5494,8 +5112,8 @@
   MACROBLOCKD *const xd = &pbi->mb;
 
 #if CONFIG_BITSTREAM_DEBUG
-  bitstream_queue_set_frame_read(cm->current_frame.frame_number * 2 +
-                                 cm->show_frame);
+  aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number * 2 +
+                                     cm->show_frame);
 #endif
 #if CONFIG_MISMATCH_DEBUG
   mismatch_move_frame_idx_r();
@@ -5511,9 +5129,7 @@
 
   if (trailing_bits_present) av1_check_trailing_bits(pbi, rb);
 
-  // If cm->single_tile_decoding = 0, the independent decoding of a single tile
-  // or a section of a frame is not allowed.
-  if (!cm->single_tile_decoding &&
+  if (!cm->tiles.single_tile_decoding &&
       (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) {
     pbi->dec_tile_row = -1;
     pbi->dec_tile_col = -1;
@@ -5542,13 +5158,13 @@
     return uncomp_hdr_size;
   }
 
-  cm->setup_mi(cm);
+  cm->mi_params.setup_mi(&cm->mi_params);
 
   av1_setup_motion_field(cm);
 
   av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
                          cm->seq_params.subsampling_y, num_planes);
-  if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
     // use the default frame context values
     *cm->fc = *cm->default_frame_context;
   } else {
@@ -5563,7 +5179,7 @@
 }
 
 // Once-per-frame initialization
-static void setup_frame_info(AV1Decoder *pbi) {
+static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
 
   if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
@@ -5584,21 +5200,22 @@
                                     const uint8_t **p_data_end, int start_tile,
                                     int end_tile, int initialize_flag) {
   AV1_COMMON *const cm = &pbi->common;
+  CommonTileParams *const tiles = &cm->tiles;
   MACROBLOCKD *const xd = &pbi->mb;
   const int tile_count_tg = end_tile - start_tile + 1;
 
   if (initialize_flag) setup_frame_info(pbi);
   const int num_planes = av1_num_planes(cm);
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
   av1_loop_filter_frame_init(cm, 0, num_planes);
 #endif
 
-  if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) &&
+  if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) &&
       pbi->row_mt)
     *p_data_end =
         decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile);
   else if (pbi->max_threads > 1 && tile_count_tg > 1 &&
-           !(cm->large_scale_tile && !pbi->ext_tile_debug))
+           !(tiles->large_scale && !pbi->ext_tile_debug))
     *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
   else
     *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
@@ -5608,22 +5225,22 @@
     set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
   }
 
-  if (end_tile != cm->tile_rows * cm->tile_cols - 1) {
+  if (end_tile != tiles->rows * tiles->cols - 1) {
     return;
   }
 
-  if (!cm->allow_intrabc && !cm->single_tile_decoding) {
+  if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) {
     if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
       if (pbi->num_workers > 1) {
         av1_loop_filter_frame_mt(
             &cm->cur_frame->buf, cm, &pbi->mb, 0, num_planes, 0,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
             1,
 #endif
             pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync);
       } else {
         av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->mb,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                               1,
 #endif
                               0, num_planes, 0);
@@ -5635,7 +5252,7 @@
         cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
     const int do_cdef =
-        !cm->skip_loop_filter && !cm->coded_lossless &&
+        !pbi->skip_loop_filter && !cm->features.coded_lossless &&
         (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] ||
          cm->cdef_info.cdef_uv_strengths[0]);
     const int do_superres = av1_superres_scaled(cm);
@@ -5681,14 +5298,14 @@
       }
     }
   }
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
   av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
 #endif
 
   if (!xd->corrupted) {
-    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-      assert(cm->context_update_tile_id < pbi->allocated_tiles);
-      *cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx;
+    if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+      assert(pbi->context_update_tile_id < pbi->allocated_tiles);
+      *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx;
       av1_reset_cdf_symbol_counters(cm->fc);
     }
   } else {
@@ -5703,7 +5320,7 @@
 #endif
 
   // Non frame parallel update frame context here.
-  if (!cm->large_scale_tile) {
+  if (!tiles->large_scale) {
     cm->cur_frame->frame_context = *cm->fc;
   }
 }
diff --git a/libaom/av1/decoder/decodeframe.h b/libaom/av1/decoder/decodeframe.h
index 13b9696..95b3c9f 100644
--- a/libaom/av1/decoder/decodeframe.h
+++ b/libaom/av1/decoder/decodeframe.h
@@ -56,19 +56,21 @@
                            struct aom_internal_error_info *error_info);
 
 // Implements the timing_info() function in the spec. Reports errors by calling
-// rb->error_handler().
-void av1_read_timing_info_header(AV1_COMMON *cm,
+// rb->error_handler() or aom_internal_error().
+void av1_read_timing_info_header(aom_timing_info_t *timing_info,
+                                 struct aom_internal_error_info *error,
                                  struct aom_read_bit_buffer *rb);
 
 // Implements the decoder_model_info() function in the spec. Reports errors by
 // calling rb->error_handler().
-void av1_read_decoder_model_info(AV1_COMMON *cm,
+void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info,
                                  struct aom_read_bit_buffer *rb);
 
 // Implements the operating_parameters_info() function in the spec. Reports
-// errors by calling rb->error_handler() or aom_internal_error().
-void av1_read_op_parameters_info(AV1_COMMON *const cm,
-                                 struct aom_read_bit_buffer *rb, int op_num);
+// errors by calling rb->error_handler().
+void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params,
+                                 int buffer_delay_length,
+                                 struct aom_read_bit_buffer *rb);
 
 struct aom_read_bit_buffer *av1_init_read_bit_buffer(
     struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
diff --git a/libaom/av1/decoder/decodemv.c b/libaom/av1/decoder/decodemv.c
index 2791f3a..e97cec4 100644
--- a/libaom/av1/decoder/decodemv.c
+++ b/libaom/av1/decoder/decodemv.c
@@ -36,40 +36,57 @@
   return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR);
 }
 
-static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd,
-                      int mi_col, int mi_row) {
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  if (cm->coded_lossless) return;
-  if (cm->allow_intrabc) {
+static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
+  const int skip = xd->mi[0]->skip;
+  if (cm->features.coded_lossless) return;
+  if (cm->features.allow_intrabc) {
     assert(cm->cdef_info.cdef_bits == 0);
     return;
   }
 
-  if (!(mi_col & (cm->seq_params.mib_size - 1)) &&
-      !(mi_row & (cm->seq_params.mib_size - 1))) {  // Top left?
-    xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] =
-        xd->cdef_preset[3] = -1;
+  // At the start of a superblock, mark that we haven't yet read CDEF strengths
+  // for any of the CDEF units contained in this superblock.
+  const int sb_mask = (cm->seq_params.mib_size - 1);
+  const int mi_row_in_sb = (xd->mi_row & sb_mask);
+  const int mi_col_in_sb = (xd->mi_col & sb_mask);
+  if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
+    xd->cdef_transmitted[0] = xd->cdef_transmitted[1] =
+        xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false;
   }
-  // Read CDEF param at the first non-skip coding block
-  const int mask = (1 << (6 - MI_SIZE_LOG2));
-  const int m = ~(mask - 1);
-  const int index = cm->seq_params.sb_size == BLOCK_128X128
-                        ? !!(mi_col & mask) + 2 * !!(mi_row & mask)
+
+  // CDEF unit size is 64x64 irrespective of the superblock size.
+  const int cdef_size = 1 << (6 - MI_SIZE_LOG2);
+
+  // Find index of this CDEF unit in this superblock.
+  const int index_mask = cdef_size;
+  const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
+  const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
+  const int index = (cm->seq_params.sb_size == BLOCK_128X128)
+                        ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
                         : 0;
-  cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)]
-      ->cdef_strength = xd->cdef_preset[index] =
-      xd->cdef_preset[index] == -1 && !mbmi->skip
-          ? aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR)
-          : xd->cdef_preset[index];
+
+  // Read CDEF strength from the first non-skip coding block in this CDEF unit.
+  if (!xd->cdef_transmitted[index] && !skip) {
+    // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO
+    // of the 1st block in this CDEF unit.
+    const int first_block_mask = ~(cdef_size - 1);
+    CommonModeInfoParams *const mi_params = &cm->mi_params;
+    const int grid_idx =
+        get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask,
+                        xd->mi_col & first_block_mask);
+    MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
+    mbmi->cdef_strength =
+        aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR);
+    xd->cdef_transmitted[index] = true;
+  }
 }
 
 static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             aom_reader *r, MB_MODE_INFO *const mbmi,
-                             int mi_col, int mi_row) {
+                             aom_reader *r, MB_MODE_INFO *const mbmi) {
   int sign, abs, reduced_delta_qindex = 0;
   BLOCK_SIZE bsize = mbmi->sb_type;
-  const int b_col = mi_col & (cm->seq_params.mib_size - 1);
-  const int b_row = mi_row & (cm->seq_params.mib_size - 1);
+  const int b_col = xd->mi_col & (cm->seq_params.mib_size - 1);
+  const int b_row = xd->mi_row & (cm->seq_params.mib_size - 1);
   const int read_delta_q_flag = (b_col == 0 && b_row == 0);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
@@ -129,20 +146,20 @@
   return uv_mode;
 }
 
-static int read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r,
-                           int *signs_out) {
-  const int joint_sign =
+static uint8_t read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r,
+                               int8_t *signs_out) {
+  const int8_t joint_sign =
       aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs");
-  int idx = 0;
+  uint8_t idx = 0;
   // Magnitudes are only coded for nonzero values
   if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
     aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
-    idx = aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u")
+    idx = (uint8_t)aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u")
           << CFL_ALPHABET_SIZE_LOG2;
   }
   if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
     aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
-    idx += aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v");
+    idx += (uint8_t)aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v");
   }
   *signs_out = joint_sign;
   return idx;
@@ -183,7 +200,7 @@
   if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
     for (int idx = 0; idx < 2; ++idx) {
       if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
-        uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+        uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
         int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
         mbmi->ref_mv_idx = idx + drl_idx;
         if (!drl_idx) return;
@@ -196,7 +213,7 @@
     // mode is factored in.
     for (int idx = 1; idx < 3; ++idx) {
       if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
-        uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+        uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
         int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
         mbmi->ref_mv_idx = idx + drl_idx - 1;
         if (!drl_idx) return;
@@ -207,11 +224,11 @@
 
 static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
                                     MB_MODE_INFO *mbmi, aom_reader *r) {
-  if (cm->switchable_motion_mode == 0) return SIMPLE_TRANSLATION;
+  if (cm->features.switchable_motion_mode == 0) return SIMPLE_TRANSLATION;
   if (mbmi->skip_mode) return SIMPLE_TRANSLATION;
 
-  const MOTION_MODE last_motion_mode_allowed =
-      motion_mode_allowed(xd->global_motion, xd, mbmi, cm->allow_warped_motion);
+  const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
+      xd->global_motion, xd, mbmi, cm->features.allow_warped_motion);
   int motion_mode;
 
   if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION;
@@ -260,9 +277,9 @@
 }
 
 static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd,
-                           int mi_row, int mi_col, aom_reader *r, int skip) {
+                           aom_reader *r, int skip) {
   int cdf_num;
-  const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num);
+  const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
   if (skip) return pred;
 
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
@@ -286,8 +303,8 @@
 
   for (int y = 0; y < y_mis; y++)
     for (int x = 0; x < x_mis; x++)
-      segment_id =
-          AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
+      segment_id = AOMMIN(
+          segment_id, segment_ids[mi_offset + y * cm->mi_params.mi_cols + x]);
 
   assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
   return segment_id;
@@ -299,37 +316,40 @@
 
   for (int y = 0; y < y_mis; y++)
     for (int x = 0; x < x_mis; x++)
-      cm->cur_frame->seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+      cm->cur_frame->seg_map[mi_offset + y * cm->mi_params.mi_cols + x] =
+          segment_id;
 }
 
 static int read_intra_segment_id(AV1_COMMON *const cm,
-                                 const MACROBLOCKD *const xd, int mi_row,
-                                 int mi_col, int bsize, aom_reader *r,
-                                 int skip) {
+                                 const MACROBLOCKD *const xd, int bsize,
+                                 aom_reader *r, int skip) {
   struct segmentation *const seg = &cm->seg;
   if (!seg->enabled) return 0;  // Default for disabled segmentation
-
   assert(seg->update_map && !seg->temporal_update);
 
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
-  const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
-  const int segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, skip);
+  const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+  const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+  const int segment_id = read_segment_id(cm, xd, r, skip);
   set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
-static void copy_segment_id(const AV1_COMMON *cm,
+static void copy_segment_id(const CommonModeInfoParams *const mi_params,
                             const uint8_t *last_segment_ids,
                             uint8_t *current_segment_ids, int mi_offset,
                             int x_mis, int y_mis) {
   for (int y = 0; y < y_mis; y++)
     for (int x = 0; x < x_mis; x++)
-      current_segment_ids[mi_offset + y * cm->mi_cols + x] =
-          last_segment_ids ? last_segment_ids[mi_offset + y * cm->mi_cols + x]
-                           : 0;
+      current_segment_ids[mi_offset + y * mi_params->mi_cols + x] =
+          last_segment_ids
+              ? last_segment_ids[mi_offset + y * mi_params->mi_cols + x]
+              : 0;
 }
 
 static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset,
@@ -340,22 +360,24 @@
 }
 
 static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, int preskip,
-                                 aom_reader *r) {
+                                 int preskip, aom_reader *r) {
   struct segmentation *const seg = &cm->seg;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
   const int bw = mi_size_wide[mbmi->sb_type];
   const int bh = mi_size_high[mbmi->sb_type];
 
   // TODO(slavarnway): move x_mis, y_mis into xd ?????
-  const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
+  const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+  const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
 
   if (!seg->enabled) return 0;  // Default for disabled segmentation
 
   if (!seg->update_map) {
-    copy_segment_id(cm, cm->last_frame_seg_map, cm->cur_frame->seg_map,
+    copy_segment_id(mi_params, cm->last_frame_seg_map, cm->cur_frame->seg_map,
                     mi_offset, x_mis, y_mis);
     return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
   }
@@ -368,7 +390,7 @@
       if (seg->temporal_update) {
         mbmi->seg_id_predicted = 0;
       }
-      segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 1);
+      segment_id = read_segment_id(cm, xd, r, 1);
       set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
       return segment_id;
     }
@@ -383,10 +405,10 @@
     if (mbmi->seg_id_predicted) {
       segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
     } else {
-      segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0);
+      segment_id = read_segment_id(cm, xd, r, 0);
     }
   } else {
-    segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0);
+    segment_id = read_segment_id(cm, xd, r, 0);
   }
   set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
@@ -538,11 +560,11 @@
 }
 
 static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                   int mi_row, int mi_col, aom_reader *r) {
+                                   aom_reader *r) {
   const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  assert(av1_allow_palette(cm->allow_screen_content_tools, bsize));
+  assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
 
@@ -559,9 +581,7 @@
       read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r);
     }
   }
-  if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
-      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y)) {
+  if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) {
     const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
     const int modev = aom_read_symbol(
         r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR);
@@ -601,9 +621,8 @@
 void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
                       int blk_col, TX_SIZE tx_size, aom_reader *r) {
   MB_MODE_INFO *mbmi = xd->mi[0];
-  const int txk_type_idx =
-      av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
-  TX_TYPE *tx_type = &mbmi->txk_type[txk_type_idx];
+  uint8_t *tx_type =
+      &xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
   *tx_type = DCT_DCT;
 
   // No need to read transform type if block is skipped.
@@ -615,11 +634,12 @@
   if (qindex == 0) return;
 
   const int inter_block = is_inter_block(mbmi);
-  if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1) {
-    const TxSetType tx_set_type =
-        av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used);
+  if (get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) >
+      1) {
+    const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+        tx_size, inter_block, cm->features.reduced_tx_set_used);
     const int eset =
-        get_ext_tx_set(tx_size, inter_block, cm->reduced_tx_set_used);
+        get_ext_tx_set(tx_size, inter_block, cm->features.reduced_tx_set_used);
     // eset == 0 should correspond to a set with only DCT_DCT and
     // there is no need to read the tx_type
     assert(eset != 0);
@@ -665,7 +685,7 @@
 }
 
 static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                              int mi_row, int mi_col, aom_reader *r) {
+                              aom_reader *r) {
   MB_MODE_INFO *const mbmi = xd->mi[0];
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
@@ -680,22 +700,21 @@
     int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
 
     av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count,
-                     xd->ref_mv_stack, ref_mvs, /*global_mvs=*/NULL, mi_row,
-                     mi_col, inter_mode_ctx);
+                     xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL,
+                     inter_mode_ctx);
 
     int_mv nearestmv, nearmv;
 
     av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0);
     int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
     if (dv_ref.as_int == 0)
-      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, mi_row,
-                      mi_col);
+      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, xd->mi_row);
     // Ref DV should not have sub-pel.
     int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
     dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
     dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8;
-    valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, mi_row,
-                                     mi_col, bsize, r);
+    valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row,
+                                     xd->mi_col, bsize, r);
     if (!valid_dv) {
       // Intra bc motion vectors are not valid - signal corrupt frame
       aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
@@ -707,18 +726,19 @@
 // If delta q is present, reads delta_q index.
 // Also reads delta_q loop filter levels, if present.
 static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                const int mi_row, const int mi_col,
                                 aom_reader *r) {
   DeltaQInfo *const delta_q_info = &cm->delta_q_info;
 
   if (delta_q_info->delta_q_present_flag) {
     MB_MODE_INFO *const mbmi = xd->mi[0];
-    xd->current_qindex += read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) *
-                          delta_q_info->delta_q_res;
+    xd->current_qindex +=
+        read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res;
     /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
     xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
     FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
     if (delta_q_info->delta_lf_present_flag) {
+      const int mi_row = xd->mi_row;
+      const int mi_col = xd->mi_col;
       if (delta_q_info->delta_lf_multi) {
         const int frame_lf_count =
             av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
@@ -744,8 +764,7 @@
 }
 
 static void read_intra_frame_mode_info(AV1_COMMON *const cm,
-                                       MACROBLOCKD *const xd, int mi_row,
-                                       int mi_col, aom_reader *r) {
+                                       MACROBLOCKD *const xd, aom_reader *r) {
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const MB_MODE_INFO *above_mi = xd->above_mbmi;
   const MB_MODE_INFO *left_mi = xd->left_mbmi;
@@ -755,18 +774,16 @@
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   if (seg->segid_preskip)
-    mbmi->segment_id =
-        read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, 0);
+    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0);
 
   mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
 
   if (!seg->segid_preskip)
-    mbmi->segment_id =
-        read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, mbmi->skip);
+    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip);
 
-  read_cdef(cm, r, xd, mi_col, mi_row);
+  read_cdef(cm, r, xd);
 
-  read_delta_q_params(cm, xd, mi_row, mi_col, r);
+  read_delta_q_params(cm, xd, r);
 
   mbmi->current_qindex = xd->current_qindex;
 
@@ -776,12 +793,14 @@
   mbmi->palette_mode_info.palette_size[1] = 0;
   mbmi->filter_intra_mode_info.use_filter_intra = 0;
 
-  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + mi_col;
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   if (av1_allow_intrabc(cm)) {
-    read_intrabc_info(cm, xd, mi_row, mi_col, r);
+    read_intrabc_info(cm, xd, r);
     if (is_intrabc_block(mbmi)) return;
   }
 
@@ -793,10 +812,7 @@
           ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
           : 0;
 
-  if (!cm->seq_params.monochrome &&
-      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y)) {
-    xd->cfl.is_chroma_reference = 1;
+  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
     mbmi->uv_mode =
         read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
     if (mbmi->uv_mode == UV_CFL_PRED) {
@@ -810,12 +826,11 @@
   } else {
     // Avoid decoding angle_info if there is is no chroma prediction
     mbmi->uv_mode = UV_DC_PRED;
-    xd->cfl.is_chroma_reference = 0;
   }
   xd->cfl.store_y = store_cfl_required(cm, xd);
 
-  if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-    read_palette_mode_info(cm, xd, mi_row, mi_col, r);
+  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize))
+    read_palette_mode_info(cm, xd, r);
 
   read_filter_intra_mode_info(cm, xd, r);
 }
@@ -966,19 +981,19 @@
       // Decode forward references.
       if (!bit) {
         const int bit1 = READ_REF_BIT(comp_ref_p1);
-        ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 1 : 0];
+        ref_frame[!idx] = bit1 ? LAST2_FRAME : LAST_FRAME;
       } else {
         const int bit2 = READ_REF_BIT(comp_ref_p2);
-        ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2];
+        ref_frame[!idx] = bit2 ? GOLDEN_FRAME : LAST3_FRAME;
       }
 
       // Decode backward references.
       const int bit_bwd = READ_REF_BIT(comp_bwdref_p);
       if (!bit_bwd) {
         const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1);
-        ref_frame[idx] = cm->comp_bwd_ref[bit1_bwd];
+        ref_frame[idx] = bit1_bwd ? ALTREF2_FRAME : BWDREF_FRAME;
       } else {
-        ref_frame[idx] = cm->comp_bwd_ref[2];
+        ref_frame[idx] = ALTREF_FRAME;
       }
     } else if (mode == SINGLE_REFERENCE) {
       const int bit0 = READ_REF_BIT(single_ref_p1);
@@ -1008,38 +1023,39 @@
   }
 }
 
-static INLINE void read_mb_interp_filter(AV1_COMMON *const cm,
-                                         MACROBLOCKD *const xd,
+static INLINE void read_mb_interp_filter(const MACROBLOCKD *const xd,
+                                         InterpFilter interp_filter,
+                                         bool enable_dual_filter,
                                          MB_MODE_INFO *const mbmi,
                                          aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   if (!av1_is_interp_needed(xd)) {
-    set_default_interp_filters(mbmi, cm->interp_filter);
+    set_default_interp_filters(mbmi, interp_filter);
     return;
   }
 
-  if (cm->interp_filter != SWITCHABLE) {
-    mbmi->interp_filters = av1_broadcast_interp_filter(cm->interp_filter);
+  if (interp_filter != SWITCHABLE) {
+    mbmi->interp_filters = av1_broadcast_interp_filter(interp_filter);
   } else {
     InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
     for (int dir = 0; dir < 2; ++dir) {
       const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
       ref0_filter[dir] = (InterpFilter)aom_read_symbol(
           r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR);
-      if (cm->seq_params.enable_dual_filter == 0) {
+      if (!enable_dual_filter) {
         ref0_filter[1] = ref0_filter[0];
         break;
       }
     }
     // The index system works as: (0, 1) -> (vertical, horizontal) filter types
-    mbmi->interp_filters =
-        av1_make_interp_filters(ref0_filter[0], ref0_filter[1]);
+    mbmi->interp_filters.as_filters.x_filter = ref0_filter[1];
+    mbmi->interp_filters.as_filters.y_filter = ref0_filter[0];
   }
 }
 
-static void read_intra_block_mode_info(AV1_COMMON *const cm, const int mi_row,
-                                       const int mi_col, MACROBLOCKD *const xd,
+static void read_intra_block_mode_info(AV1_COMMON *const cm,
+                                       MACROBLOCKD *const xd,
                                        MB_MODE_INFO *const mbmi,
                                        aom_reader *r) {
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -1056,11 +1072,7 @@
       use_angle_delta && av1_is_directional_mode(mbmi->mode)
           ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
           : 0;
-  const int has_chroma =
-      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y);
-  xd->cfl.is_chroma_reference = has_chroma;
-  if (!cm->seq_params.monochrome && has_chroma) {
+  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
     mbmi->uv_mode =
         read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
     if (mbmi->uv_mode == UV_CFL_PRED) {
@@ -1080,8 +1092,8 @@
 
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
-  if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-    read_palette_mode_info(cm, xd, mi_row, mi_col, r);
+  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize))
+    read_palette_mode_info(cm, xd, r);
 
   read_filter_intra_mode_info(cm, xd, r);
 }
@@ -1095,12 +1107,13 @@
                             PREDICTION_MODE mode,
                             MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2],
                             int_mv ref_mv[2], int_mv nearest_mv[2],
-                            int_mv near_mv[2], int mi_row, int mi_col,
-                            int is_compound, int allow_hp, aom_reader *r) {
+                            int_mv near_mv[2], int is_compound, int allow_hp,
+                            aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   MB_MODE_INFO *mbmi = xd->mi[0];
   BLOCK_SIZE bsize = mbmi->sb_type;
-  if (cm->cur_frame_force_integer_mv) {
+  FeatureFlags *const features = &cm->features;
+  if (features->cur_frame_force_integer_mv) {
     allow_hp = MV_SUBPEL_NONE;
   }
   switch (mode) {
@@ -1118,11 +1131,11 @@
       break;
     }
     case GLOBALMV: {
-      mv[0].as_int =
-          gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
-                               cm->allow_high_precision_mv, bsize, mi_col,
-                               mi_row, cm->cur_frame_force_integer_mv)
-              .as_int;
+      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+                                          features->allow_high_precision_mv,
+                                          bsize, xd->mi_col, xd->mi_row,
+                                          features->cur_frame_force_integer_mv)
+                         .as_int;
       break;
     }
     case NEW_NEWMV: {
@@ -1175,16 +1188,16 @@
     }
     case GLOBAL_GLOBALMV: {
       assert(is_compound);
-      mv[0].as_int =
-          gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
-                               cm->allow_high_precision_mv, bsize, mi_col,
-                               mi_row, cm->cur_frame_force_integer_mv)
-              .as_int;
-      mv[1].as_int =
-          gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
-                               cm->allow_high_precision_mv, bsize, mi_col,
-                               mi_row, cm->cur_frame_force_integer_mv)
-              .as_int;
+      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+                                          features->allow_high_precision_mv,
+                                          bsize, xd->mi_col, xd->mi_row,
+                                          features->cur_frame_force_integer_mv)
+                         .as_int;
+      mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
+                                          features->allow_high_precision_mv,
+                                          bsize, xd->mi_col, xd->mi_row,
+                                          features->cur_frame_force_integer_mv)
+                         .as_int;
       break;
     }
     default: { return 0; }
@@ -1249,11 +1262,12 @@
 
 static void read_inter_block_mode_info(AV1Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       MB_MODE_INFO *const mbmi, int mi_row,
-                                       int mi_col, aom_reader *r) {
+                                       MB_MODE_INFO *const mbmi,
+                                       aom_reader *r) {
   AV1_COMMON *const cm = &pbi->common;
+  FeatureFlags *const features = &cm->features;
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int allow_hp = cm->allow_high_precision_mv;
+  const int allow_hp = features->allow_high_precision_mv;
   int_mv nearestmv[2], nearmv[2];
   int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
   int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
@@ -1269,12 +1283,10 @@
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
   const int is_compound = has_second_ref(mbmi);
 
-  MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+  const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
   av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack,
-                   ref_mvs, /*global_mvs=*/NULL, mi_row, mi_col,
-                   inter_mode_ctx);
+                   xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx);
 
-  int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
   mbmi->ref_mv_idx = 0;
 
   if (mbmi->skip_mode) {
@@ -1285,6 +1297,8 @@
         segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) {
       mbmi->mode = GLOBALMV;
     } else {
+      const int mode_ctx =
+          av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
       if (is_compound)
         mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx);
       else
@@ -1303,32 +1317,29 @@
 
   if (!is_compound && mbmi->mode != GLOBALMV) {
     av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0],
-                          &nearmv[0], cm->cur_frame_force_integer_mv);
+                          &nearmv[0], features->cur_frame_force_integer_mv);
   }
 
   if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) {
-    int ref_mv_idx = mbmi->ref_mv_idx + 1;
+    const int ref_mv_idx = mbmi->ref_mv_idx + 1;
     nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv;
     nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv;
     nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
     nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
     lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
-                       cm->cur_frame_force_integer_mv);
+                       features->cur_frame_force_integer_mv);
     lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
-                       cm->cur_frame_force_integer_mv);
+                       features->cur_frame_force_integer_mv);
     lower_mv_precision(&nearmv[0].as_mv, allow_hp,
-                       cm->cur_frame_force_integer_mv);
+                       features->cur_frame_force_integer_mv);
     lower_mv_precision(&nearmv[1].as_mv, allow_hp,
-                       cm->cur_frame_force_integer_mv);
+                       features->cur_frame_force_integer_mv);
   } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) {
-    int_mv cur_mv =
+    nearmv[0] =
         xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
-    nearmv[0] = cur_mv;
   }
 
-  int_mv ref_mv[2];
-  ref_mv[0] = nearestmv[0];
-  ref_mv[1] = nearestmv[1];
+  int_mv ref_mv[2] = { nearestmv[0], nearestmv[1] };
 
   if (is_compound) {
     int ref_mv_idx = mbmi->ref_mv_idx;
@@ -1353,9 +1364,9 @@
 
   if (mbmi->skip_mode) assert(mbmi->mode == NEAREST_NEARESTMV);
 
-  int mv_corrupted_flag =
+  const int mv_corrupted_flag =
       !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
-                 nearestmv, nearmv, mi_row, mi_col, is_compound, allow_hp, r);
+                 nearestmv, nearmv, is_compound, allow_hp, r);
   aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag);
 
   mbmi->use_wedge_interintra = 0;
@@ -1373,13 +1384,12 @@
       mbmi->angle_delta[PLANE_TYPE_Y] = 0;
       mbmi->angle_delta[PLANE_TYPE_UV] = 0;
       mbmi->filter_intra_mode_info.use_filter_intra = 0;
-      if (is_interintra_wedge_used(bsize)) {
+      if (av1_is_wedge_used(bsize)) {
         mbmi->use_wedge_interintra = aom_read_symbol(
             r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR);
         if (mbmi->use_wedge_interintra) {
-          mbmi->interintra_wedge_index =
-              aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR);
-          mbmi->interintra_wedge_sign = 0;
+          mbmi->interintra_wedge_index = (int8_t)aom_read_symbol(
+              r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR);
         }
       }
     }
@@ -1392,9 +1402,10 @@
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
   if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
-      !has_second_ref(mbmi))
-    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
-  av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+      !has_second_ref(mbmi)) {
+    mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
+  }
+  av1_count_overlappable_neighbors(cm, xd);
 
   if (mbmi->ref_frame[1] != INTRA_FRAME)
     mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
@@ -1411,14 +1422,14 @@
 
     if (masked_compound_used) {
       const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
-      mbmi->comp_group_idx = aom_read_symbol(
+      mbmi->comp_group_idx = (uint8_t)aom_read_symbol(
           r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR);
     }
 
     if (mbmi->comp_group_idx == 0) {
       if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
         const int comp_index_ctx = get_comp_index_context(cm, xd);
-        mbmi->compound_idx = aom_read_symbol(
+        mbmi->compound_idx = (uint8_t)aom_read_symbol(
             r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
         mbmi->interinter_comp.type =
             mbmi->compound_idx ? COMPOUND_AVERAGE : COMPOUND_DISTWTD;
@@ -1434,19 +1445,20 @@
       assert(masked_compound_used);
 
       // compound_diffwtd, wedge
-      if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+      if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
         mbmi->interinter_comp.type =
             COMPOUND_WEDGE + aom_read_symbol(r,
                                              ec_ctx->compound_type_cdf[bsize],
                                              MASKED_COMPOUND_TYPES, ACCT_STR);
-      else
+      } else {
         mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
+      }
 
       if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
         assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
-        mbmi->interinter_comp.wedge_index =
-            aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR);
-        mbmi->interinter_comp.wedge_sign = aom_read_bit(r, ACCT_STR);
+        mbmi->interinter_comp.wedge_index = (int8_t)aom_read_symbol(
+            r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR);
+        mbmi->interinter_comp.wedge_sign = (int8_t)aom_read_bit(r, ACCT_STR);
       } else {
         assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
         mbmi->interinter_comp.mask_type =
@@ -1455,19 +1467,24 @@
     }
   }
 
-  read_mb_interp_filter(cm, xd, mbmi, r);
+  read_mb_interp_filter(xd, features->interp_filter,
+                        cm->seq_params.enable_dual_filter, mbmi, r);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
 
   if (mbmi->motion_mode == WARPED_CAUSAL) {
     mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
     mbmi->wm_params.invalid = 0;
 
-    if (mbmi->num_proj_ref > 1)
-      mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
-                                         mbmi->num_proj_ref, bsize);
+    if (mbmi->num_proj_ref > 1) {
+      mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                             mbmi->num_proj_ref, bsize);
+    }
 
-    if (find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
-                        mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-                        &mbmi->wm_params, mi_row, mi_col)) {
+    if (av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+                            mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+                            &mbmi->wm_params, mi_row, mi_col)) {
 #if WARPED_MOTION_DEBUG
       printf("Warning: unexpected warped model from aomenc\n");
 #endif
@@ -1475,9 +1492,6 @@
     }
   }
 
-  xd->cfl.is_chroma_reference =
-      is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
-                          cm->seq_params.subsampling_y);
   xd->cfl.store_y = store_cfl_required(cm, xd);
 
 #if DEC_MISMATCH_DEBUG
@@ -1486,15 +1500,14 @@
 }
 
 static void read_inter_frame_mode_info(AV1Decoder *const pbi,
-                                       MACROBLOCKD *const xd, int mi_row,
-                                       int mi_col, aom_reader *r) {
+                                       MACROBLOCKD *const xd, aom_reader *r) {
   AV1_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   int inter_block = 1;
 
   mbmi->mv[0].as_int = 0;
   mbmi->mv[1].as_int = 0;
-  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 1, r);
+  mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r);
 
   mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
 
@@ -1504,30 +1517,31 @@
     mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
 
   if (!cm->seg.segid_preskip)
-    mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r);
+    mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r);
 
-  read_cdef(cm, r, xd, mi_col, mi_row);
+  read_cdef(cm, r, xd);
 
-  read_delta_q_params(cm, xd, mi_row, mi_col, r);
+  read_delta_q_params(cm, xd, r);
 
   if (!mbmi->skip_mode)
     inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
 
   mbmi->current_qindex = xd->current_qindex;
 
-  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
   xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
 
   if (inter_block)
-    read_inter_block_mode_info(pbi, xd, mbmi, mi_row, mi_col, r);
+    read_inter_block_mode_info(pbi, xd, mbmi, r);
   else
-    read_intra_block_mode_info(cm, mi_row, mi_col, xd, mbmi, r);
+    read_intra_block_mode_info(cm, xd, mbmi, r);
 }
 
 static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col,
                                  int x_mis, int y_mis) {
-  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
+  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
   MV_REF *frame_mvs =
       cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
   x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
@@ -1543,17 +1557,19 @@
   }
 }
 
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
-                        int mi_col, aom_reader *r, int x_mis, int y_mis) {
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r,
+                        int x_mis, int y_mis) {
   AV1_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mi = xd->mi[0];
   mi->use_intrabc = 0;
 
   if (frame_is_intra_only(cm)) {
-    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
-    intra_copy_frame_mvs(cm, mi_row, mi_col, x_mis, y_mis);
+    read_intra_frame_mode_info(cm, xd, r);
+    if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs)
+      intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis);
   } else {
-    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
-    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+    read_inter_frame_mode_info(pbi, xd, r);
+    if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs)
+      av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis);
   }
 }
diff --git a/libaom/av1/decoder/decodemv.h b/libaom/av1/decoder/decodemv.h
index 1625e5b..289e66a 100644
--- a/libaom/av1/decoder/decodemv.h
+++ b/libaom/av1/decoder/decodemv.h
@@ -20,10 +20,8 @@
 extern "C" {
 #endif
 
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
-
-                        int mi_row, int mi_col, aom_reader *r, int x_mis,
-                        int y_mis);
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r,
+                        int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/decoder/decoder.c b/libaom/av1/decoder/decoder.c
index bff4b7a..fc5f2cd 100644
--- a/libaom/av1/decoder/decoder.c
+++ b/libaom/av1/decoder/decoder.c
@@ -26,8 +26,8 @@
 #include "aom_util/aom_thread.h"
 
 #include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/av1_loopfilter.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
@@ -45,29 +45,50 @@
   av1_init_wedge_masks();
 }
 
-static void dec_setup_mi(AV1_COMMON *cm) {
-  cm->mi = cm->mip;
-  cm->mi_grid_visible = cm->mi_grid_base;
-  memset(cm->mi_grid_base, 0,
-         cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base));
+static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+                          int height) {
+  // Ensure that the decoded width and height are both multiples of
+  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
+  // subsampling is used).
+  // This simplifies the implementation of various experiments,
+  // eg. cdef, which operates on units of 8x8 luma pixels.
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+
+  mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
+  mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
+  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
+
+  mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2;
+  mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
+  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
+
+  mi_params->mi_alloc_bsize = BLOCK_4X4;
+  mi_params->mi_alloc_stride = mi_params->mi_stride;
+
+  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
+         mi_size_high[mi_params->mi_alloc_bsize]);
+
+#if CONFIG_LPF_MASK
+  av1_alloc_loop_filter_mask(mi_params);
+#endif
 }
 
-static int av1_dec_alloc_mi(AV1_COMMON *cm, int mi_size) {
-  cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
-  if (!cm->mip) return 1;
-  cm->mi_alloc_size = mi_size;
-  cm->mi_grid_base =
-      (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
-  if (!cm->mi_grid_base) return 1;
-  return 0;
+static void dec_setup_mi(CommonModeInfoParams *mi_params) {
+  const int mi_grid_size =
+      mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
+  memset(mi_params->mi_grid_base, 0,
+         mi_grid_size * sizeof(*mi_params->mi_grid_base));
 }
 
-static void dec_free_mi(AV1_COMMON *cm) {
-  aom_free(cm->mip);
-  cm->mip = NULL;
-  aom_free(cm->mi_grid_base);
-  cm->mi_grid_base = NULL;
-  cm->mi_alloc_size = 0;
+static void dec_free_mi(CommonModeInfoParams *mi_params) {
+  aom_free(mi_params->mi_alloc);
+  mi_params->mi_alloc = NULL;
+  aom_free(mi_params->mi_grid_base);
+  mi_params->mi_grid_base = NULL;
+  mi_params->mi_alloc_size = 0;
+  aom_free(mi_params->tx_type_map);
+  mi_params->tx_type_map = NULL;
 }
 
 AV1Decoder *av1_decoder_create(BufferPool *const pool) {
@@ -102,7 +123,6 @@
   // Initialize the references to not point to any frame buffers.
   for (int i = 0; i < REF_FRAMES; i++) {
     cm->ref_frame_map[i] = NULL;
-    cm->next_ref_frame_map[i] = NULL;
   }
 
   cm->current_frame.frame_number = 0;
@@ -111,13 +131,13 @@
 
   cm->seq_params.bit_depth = AOM_BITS_8;
 
-  cm->alloc_mi = av1_dec_alloc_mi;
-  cm->free_mi = dec_free_mi;
-  cm->setup_mi = dec_setup_mi;
+  cm->mi_params.free_mi = dec_free_mi;
+  cm->mi_params.setup_mi = dec_setup_mi;
+  cm->mi_params.set_mb_mi = dec_set_mb_mi;
 
   av1_loop_filter_init(cm);
 
-  av1_qm_init(cm);
+  av1_qm_init(&cm->quant_params, av1_num_planes(cm));
   av1_loop_restoration_precal();
 #if CONFIG_ACCOUNTING
   pbi->acct_enabled = 1;
@@ -205,19 +225,16 @@
   aom_accounting_clear(&pbi->accounting);
 #endif
   av1_free_mc_tmp_buf(&pbi->td);
-
+  aom_img_metadata_array_free(pbi->metadata);
   aom_free(pbi);
 }
 
-void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
-                       int mi_col, aom_reader *r, BLOCK_SIZE bsize,
-                       palette_visitor_fn_t visit) {
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+                       aom_reader *r, palette_visitor_fn_t visit) {
   if (!is_inter_block(xd->mi[0])) {
     for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common));
          ++plane) {
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                              pd->subsampling_y)) {
+      if (plane == 0 || xd->is_chroma_ref) {
         if (xd->mi[0]->palette_mode_info.palette_size[plane])
           visit(xd, plane, r);
       } else {
@@ -319,22 +336,12 @@
   return cm->error.error_code;
 }
 
-static void release_frame_buffers(AV1Decoder *pbi) {
+static void release_current_frame(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
 
   cm->cur_frame->buf.corrupted = 1;
   lock_buffer_pool(pool);
-  // Release all the reference buffers in cm->next_ref_frame_map if the worker
-  // thread is holding them.
-  if (pbi->hold_ref_buf) {
-    for (int ref_index = 0; ref_index < REF_FRAMES; ++ref_index) {
-      decrease_ref_count(cm->next_ref_frame_map[ref_index], pool);
-      cm->next_ref_frame_map[ref_index] = NULL;
-    }
-    pbi->hold_ref_buf = 0;
-  }
-  // Release current frame.
   decrease_ref_count(cm->cur_frame, pool);
   unlock_buffer_pool(pool);
   cm->cur_frame = NULL;
@@ -345,7 +352,7 @@
 //
 // This functions returns void. It reports failure by setting
 // cm->error.error_code.
-static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
+static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
   int ref_index = 0, mask;
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
@@ -354,33 +361,19 @@
     lock_buffer_pool(pool);
 
     // In ext-tile decoding, the camera frame header is only decoded once. So,
-    // we don't release the references here.
+    // we don't update the references here.
     if (!pbi->camera_frame_header_ready) {
-      // If we are not holding reference buffers in cm->next_ref_frame_map,
-      // assert that the following two for loops are no-ops.
-      assert(IMPLIES(!pbi->hold_ref_buf,
-                     cm->current_frame.refresh_frame_flags == 0));
-      assert(IMPLIES(!pbi->hold_ref_buf,
-                     cm->show_existing_frame && !pbi->reset_decoder_state));
-
-      // The following two for loops need to release the reference stored in
-      // cm->ref_frame_map[ref_index] before transferring the reference stored
-      // in cm->next_ref_frame_map[ref_index] to cm->ref_frame_map[ref_index].
+      // The following for loop needs to release the reference stored in
+      // cm->ref_frame_map[ref_index] before storing a reference to
+      // cm->cur_frame in cm->ref_frame_map[ref_index].
       for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
-        decrease_ref_count(cm->ref_frame_map[ref_index], pool);
-        cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
-        cm->next_ref_frame_map[ref_index] = NULL;
+        if (mask & 1) {
+          decrease_ref_count(cm->ref_frame_map[ref_index], pool);
+          cm->ref_frame_map[ref_index] = cm->cur_frame;
+          ++cm->cur_frame->ref_count;
+        }
         ++ref_index;
       }
-
-      const int check_on_show_existing_frame =
-          !cm->show_existing_frame || pbi->reset_decoder_state;
-      for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
-           ++ref_index) {
-        decrease_ref_count(cm->ref_frame_map[ref_index], pool);
-        cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
-        cm->next_ref_frame_map[ref_index] = NULL;
-      }
     }
 
     if (cm->show_existing_frame || cm->show_frame) {
@@ -411,10 +404,6 @@
 
     unlock_buffer_pool(pool);
   } else {
-    // The code here assumes we are not holding reference buffers in
-    // cm->next_ref_frame_map. If this assertion fails, we are leaking the
-    // frame buffer references in cm->next_ref_frame_map.
-    assert(IMPLIES(!pbi->camera_frame_header_ready, !pbi->hold_ref_buf));
     // Nothing was decoded, so just drop this frame buffer
     lock_buffer_pool(pool);
     decrease_ref_count(cm->cur_frame, pool);
@@ -423,8 +412,6 @@
   cm->cur_frame = NULL;
 
   if (!pbi->camera_frame_header_ready) {
-    pbi->hold_ref_buf = 0;
-
     // Invalidate these references until the next frame starts.
     for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
       cm->remapped_ref_idx[ref_index] = INVALID_IDX;
@@ -457,8 +444,6 @@
     return 1;
   }
 
-  if (!pbi->camera_frame_header_ready) pbi->hold_ref_buf = 0;
-
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
@@ -475,7 +460,7 @@
       winterface->sync(&pbi->tile_workers[i]);
     }
 
-    release_frame_buffers(pbi);
+    release_current_frame(pbi);
     aom_clear_system_state();
     return -1;
   }
@@ -487,7 +472,7 @@
 
   if (frame_decoded < 0) {
     assert(cm->error.error_code != AOM_CODEC_OK);
-    release_frame_buffers(pbi);
+    release_current_frame(pbi);
     cm->error.setjmp = 0;
     return 1;
   }
@@ -502,8 +487,8 @@
 #endif
 
   // Note: At this point, this function holds a reference to cm->cur_frame
-  // in the buffer pool. This reference is consumed by swap_frame_buffers().
-  swap_frame_buffers(pbi, frame_decoded);
+  // in the buffer pool. This reference is consumed by update_frame_buffers().
+  update_frame_buffers(pbi, frame_decoded);
 
   if (frame_decoded) {
     pbi->decoding_first_frame = 0;
@@ -518,8 +503,9 @@
 
   if (!cm->show_existing_frame) {
     if (cm->seg.enabled) {
-      if (cm->prev_frame && (cm->mi_rows == cm->prev_frame->mi_rows) &&
-          (cm->mi_cols == cm->prev_frame->mi_cols)) {
+      if (cm->prev_frame &&
+          (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) &&
+          (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) {
         cm->last_frame_seg_map = cm->prev_frame->seg_map;
       } else {
         cm->last_frame_seg_map = NULL;
diff --git a/libaom/av1/decoder/decoder.h b/libaom/av1/decoder/decoder.h
index 685c931..4580de2 100644
--- a/libaom/av1/decoder/decoder.h
+++ b/libaom/av1/decoder/decoder.h
@@ -19,8 +19,8 @@
 #include "aom_scale/yv12config.h"
 #include "aom_util/aom_thread.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/thread_common.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/decoder/dthread.h"
 #if CONFIG_ACCOUNTING
 #include "av1/decoder/accounting.h"
@@ -41,7 +41,6 @@
 
 typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
                                                  MACROBLOCKD *const xd,
-                                                 int mi_row, int mi_col,
                                                  BLOCK_SIZE bsize);
 
 typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
@@ -197,9 +196,7 @@
   int allow_lowbitdepth;
   int max_threads;
   int inv_tile_order;
-  int need_resync;   // wait for key/intra-only frame.
-  int hold_ref_buf;  // Boolean: whether we are holding reference buffers in
-                     // common.next_ref_frame_map.
+  int need_resync;  // wait for key/intra-only frame.
   int reset_decoder_state;
 
   int tile_size_bytes;
@@ -209,9 +206,6 @@
   int acct_enabled;
   Accounting accounting;
 #endif
-  int tg_size;   // Number of tiles in the current tilegroup
-  int tg_start;  // First tile in the current tilegroup
-  int tg_size_bit_offset;
   int sequence_header_ready;
   int sequence_header_changed;
 #if CONFIG_INSPECTION
@@ -221,6 +215,8 @@
   int operating_point;
   int current_operating_point;
   int seen_frame_header;
+  // The expected start_tile (tg_start syntax element) of the next tile group.
+  int next_start_tile;
 
   // State if the camera frame header is already decoded while
   // large_scale_tile = 1.
@@ -247,6 +243,13 @@
 #endif
 
   AV1DecRowMTInfo frame_row_mt_info;
+  aom_metadata_array_t *metadata;
+
+  int context_update_tile_id;
+  int skip_loop_filter;
+  int skip_film_grain;
+  int is_annexb;
+  int valid_for_referencing[REF_FRAMES];
 } AV1Decoder;
 
 // Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error
@@ -314,9 +317,8 @@
 typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane,
                                      aom_reader *r);
 
-void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
-                       int mi_col, aom_reader *r, BLOCK_SIZE bsize,
-                       palette_visitor_fn_t visit);
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+                       aom_reader *r, palette_visitor_fn_t visit);
 
 typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
                                    int mi_row, int mi_col, aom_reader *r,
diff --git a/libaom/av1/decoder/decodetxb.c b/libaom/av1/decoder/decodetxb.c
index 223e32e..541f4c9 100644
--- a/libaom/av1/decoder/decodetxb.c
+++ b/libaom/av1/decoder/decodetxb.c
@@ -43,7 +43,7 @@
 }
 
 static INLINE int rec_eob_pos(const int eob_token, const int extra) {
-  int eob = k_eob_group_start[eob_token];
+  int eob = av1_eob_group_start[eob_token];
   if (eob > 2) {
     eob += extra;
   }
@@ -148,9 +148,7 @@
   if (all_zero) {
     *max_scan_line = 0;
     if (plane == 0) {
-      const int txk_type_idx =
-          av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
-      mbmi->txk_type[txk_type_idx] = DCT_DCT;
+      xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] = DCT_DCT;
     }
     return 0;
   }
@@ -159,14 +157,12 @@
     // only y plane's tx_type is transmitted
     av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
   }
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
+  const TX_TYPE tx_type =
+      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                      cm->features.reduced_tx_set_used);
   const TX_CLASS tx_class = tx_type_to_class[tx_type];
-  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
   const qm_val_t *iqmatrix =
-      IS_2D_TRANSFORM(tx_type)
-          ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
-          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+      av1_get_iqmatrix(&cm->quant_params, xd, plane, tx_size, tx_type);
   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
   const int16_t *const scan = scan_order->scan;
   int eob_extra = 0;
@@ -220,7 +216,7 @@
       break;
   }
 
-  const int eob_offset_bits = k_eob_offset_bits[eob_pt];
+  const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
   if (eob_offset_bits > 0) {
     const int eob_ctx = eob_pt - 3;
     int bit = aom_read_symbol(
@@ -336,25 +332,42 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
 
   const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(bsize < BLOCK_SIZES_ALL);
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
 
   TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + col,
-              pd->left_context + row, &txb_ctx);
+  get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col,
+              pd->left_entropy_context + row, &txb_ctx);
   const uint8_t cul_level =
       av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size);
-  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row);
+  av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col,
+                           row);
 
   if (is_inter_block(mbmi)) {
-    PLANE_TYPE plane_type = get_plane_type(plane);
+    const PLANE_TYPE plane_type = get_plane_type(plane);
     // tx_type will be read out in av1_read_coeffs_txb_facade
-    const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size,
-                                            cm->reduced_tx_set_used);
+    const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
+                                            cm->features.reduced_tx_set_used);
 
-    if (plane == 0)
-      update_txk_array(mbmi->txk_type, mbmi->sb_type, row, col, tx_size,
-                       tx_type);
+    if (plane == 0) {
+      const int txw = tx_size_wide_unit[tx_size];
+      const int txh = tx_size_high_unit[tx_size];
+      // The 16x16 unit is due to the constraint from tx_64x64 which sets the
+      // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block
+      // size, the constraint takes effect in 32x16 / 16x32 size too. To solve
+      // the intricacy, cover all the 16x16 units inside a 64 level transform.
+      if (txw == tx_size_wide_unit[TX_64X64] ||
+          txh == tx_size_high_unit[TX_64X64]) {
+        const int tx_unit = tx_size_wide_unit[TX_16X16];
+        const int stride = xd->tx_type_map_stride;
+        for (int idy = 0; idy < txh; idy += tx_unit) {
+          for (int idx = 0; idx < txw; idx += tx_unit) {
+            xd->tx_type_map[(row + idy) * stride + col + idx] = tx_type;
+          }
+        }
+      }
+    }
   }
 
 #if TXCOEFF_TIMER
diff --git a/libaom/av1/decoder/decodetxb.h b/libaom/av1/decoder/decodetxb.h
index fe04f6a..39bf0bf 100644
--- a/libaom/av1/decoder/decodetxb.h
+++ b/libaom/av1/decoder/decodetxb.h
@@ -14,8 +14,8 @@
 
 #include "config/aom_config.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/txb_common.h"
 #include "aom_dsp/bitreader.h"
 
diff --git a/libaom/av1/decoder/dthread.h b/libaom/av1/decoder/dthread.h
index c1b8719..f82b9d8 100644
--- a/libaom/av1/decoder/dthread.h
+++ b/libaom/av1/decoder/dthread.h
@@ -39,7 +39,6 @@
   const uint8_t *data_end;
   size_t data_size;
   void *user_priv;
-  int worker_id;
   int received_frame;
   int frame_context_ready;  // Current frame's context is ready to read.
   int frame_decoded;        // Finished decoding current frame.
diff --git a/libaom/av1/decoder/inspection.c b/libaom/av1/decoder/inspection.c
index eeed1d3..d121a70 100644
--- a/libaom/av1/decoder/inspection.c
+++ b/libaom/av1/decoder/inspection.c
@@ -36,16 +36,18 @@
 int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) {
   struct AV1Decoder *pbi = (struct AV1Decoder *)decoder;
   AV1_COMMON *const cm = &pbi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const CommonQuantParams *quant_params = &cm->quant_params;
 
-  if (fd->mi_rows != cm->mi_rows || fd->mi_cols != cm->mi_cols) {
+  if (fd->mi_rows != mi_params->mi_rows || fd->mi_cols != mi_params->mi_cols) {
     ifd_clear(fd);
-    ifd_init_mi_rc(fd, cm->mi_rows, cm->mi_cols);
+    ifd_init_mi_rc(fd, mi_params->mi_rows, mi_params->mi_cols);
   }
   fd->show_existing_frame = cm->show_existing_frame;
   fd->frame_number = cm->current_frame.frame_number;
   fd->show_frame = cm->show_frame;
   fd->frame_type = cm->current_frame.frame_type;
-  fd->base_qindex = cm->base_qindex;
+  fd->base_qindex = quant_params->base_qindex;
   // Set width and height of the first tile until generic support can be added
   TileInfo tile_info;
   av1_tile_set_row(&tile_info, cm, 0);
@@ -61,15 +63,16 @@
   int i, j;
   for (i = 0; i < MAX_SEGMENTS; i++) {
     for (j = 0; j < 2; j++) {
-      fd->y_dequant[i][j] = cm->y_dequant_QTX[i][j];
-      fd->u_dequant[i][j] = cm->u_dequant_QTX[i][j];
-      fd->v_dequant[i][j] = cm->v_dequant_QTX[i][j];
+      fd->y_dequant[i][j] = quant_params->y_dequant_QTX[i][j];
+      fd->u_dequant[i][j] = quant_params->u_dequant_QTX[i][j];
+      fd->v_dequant[i][j] = quant_params->v_dequant_QTX[i][j];
     }
   }
-  for (j = 0; j < cm->mi_rows; j++) {
-    for (i = 0; i < cm->mi_cols; i++) {
-      const MB_MODE_INFO *mbmi = cm->mi_grid_visible[j * cm->mi_stride + i];
-      insp_mi_data *mi = &fd->mi_grid[j * cm->mi_cols + i];
+  for (j = 0; j < mi_params->mi_rows; j++) {
+    for (i = 0; i < mi_params->mi_cols; i++) {
+      const MB_MODE_INFO *mbmi =
+          mi_params->mi_grid_base[j * mi_params->mi_stride + i];
+      insp_mi_data *mi = &fd->mi_grid[j * mi_params->mi_cols + i];
       // Segment
       mi->segment_id = mbmi->segment_id;
       // Motion Vectors
@@ -116,8 +119,16 @@
 
       if (skip_not_transform && mi->skip) mi->tx_size = -1;
 
-      mi->tx_type =
-          (mi->skip ? 0 : mbmi->txk_type[av1_get_txk_type_index(bsize, r, c)]);
+      if (mi->skip) {
+        const int tx_type_row = j - j % tx_size_high_unit[mi->tx_size];
+        const int tx_type_col = i - i % tx_size_wide_unit[mi->tx_size];
+        const int tx_type_map_idx =
+            tx_type_row * mi_params->mi_stride + tx_type_col;
+        mi->tx_type = mi_params->tx_type_map[tx_type_map_idx];
+      } else {
+        mi->tx_type = 0;
+      }
+
       if (skip_not_transform &&
           (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)]))
         mi->tx_type = -1;
diff --git a/libaom/av1/decoder/obu.c b/libaom/av1/decoder/obu.c
index ac66df5..791e596 100644
--- a/libaom/av1/decoder/obu.c
+++ b/libaom/av1/decoder/obu.c
@@ -25,25 +25,6 @@
 #include "av1/decoder/decodeframe.h"
 #include "av1/decoder/obu.h"
 
-// Picture prediction structures (0-12 are predefined) in scalability metadata.
-enum {
-  SCALABILITY_L1T2 = 0,
-  SCALABILITY_L1T3 = 1,
-  SCALABILITY_L2T1 = 2,
-  SCALABILITY_L2T2 = 3,
-  SCALABILITY_L2T3 = 4,
-  SCALABILITY_S2T1 = 5,
-  SCALABILITY_S2T2 = 6,
-  SCALABILITY_S2T3 = 7,
-  SCALABILITY_L2T1h = 8,
-  SCALABILITY_L2T2h = 9,
-  SCALABILITY_L2T3h = 10,
-  SCALABILITY_S2T1h = 11,
-  SCALABILITY_S2T2h = 12,
-  SCALABILITY_S2T3h = 13,
-  SCALABILITY_SS = 14
-} UENUM1BYTE(SCALABILITY_STRUCTURES);
-
 aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
     int operating_point_idc, unsigned int *number_spatial_layers,
     unsigned int *number_temporal_layers) {
@@ -106,10 +87,14 @@
 }
 
 // Returns whether two sequence headers are consistent with each other.
-// TODO(huisu,wtc@google.com): make sure the code matches the spec exactly.
+// Note that the 'op_params' field is not compared per Section 7.5 in the spec:
+//   Within a particular coded video sequence, the contents of
+//   sequence_header_obu must be bit-identical each time the sequence header
+//   appears except for the contents of operating_parameters_info.
 static int are_seq_headers_consistent(const SequenceHeader *seq_params_old,
                                       const SequenceHeader *seq_params_new) {
-  return !memcmp(seq_params_old, seq_params_new, sizeof(SequenceHeader));
+  return !memcmp(seq_params_old, seq_params_new,
+                 offsetof(SequenceHeader, op_params));
 }
 
 // On success, sets pbi->sequence_header_ready to 1 and returns the number of
@@ -144,7 +129,7 @@
   }
 
   if (seq_params->reduced_still_picture_hdr) {
-    cm->timing_info_present = 0;
+    seq_params->timing_info_present = 0;
     seq_params->decoder_model_info_present_flag = 0;
     seq_params->display_model_info_present_flag = 0;
     seq_params->operating_points_cnt_minus_1 = 0;
@@ -154,16 +139,16 @@
       return 0;
     }
     seq_params->tier[0] = 0;
-    cm->op_params[0].decoder_model_param_present_flag = 0;
-    cm->op_params[0].display_model_param_present_flag = 0;
+    seq_params->op_params[0].decoder_model_param_present_flag = 0;
+    seq_params->op_params[0].display_model_param_present_flag = 0;
   } else {
-    cm->timing_info_present = aom_rb_read_bit(rb);  // timing_info_present_flag
-    if (cm->timing_info_present) {
-      av1_read_timing_info_header(cm, rb);
+    seq_params->timing_info_present = aom_rb_read_bit(rb);
+    if (seq_params->timing_info_present) {
+      av1_read_timing_info_header(&seq_params->timing_info, &cm->error, rb);
 
       seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
       if (seq_params->decoder_model_info_present_flag)
-        av1_read_decoder_model_info(cm, rb);
+        av1_read_decoder_model_info(&seq_params->decoder_model_info, rb);
     } else {
       seq_params->decoder_model_info_present_flag = 0;
     }
@@ -184,51 +169,57 @@
       else
         seq_params->tier[i] = 0;
       if (seq_params->decoder_model_info_present_flag) {
-        cm->op_params[i].decoder_model_param_present_flag = aom_rb_read_bit(rb);
-        if (cm->op_params[i].decoder_model_param_present_flag)
-          av1_read_op_parameters_info(cm, rb, i);
+        seq_params->op_params[i].decoder_model_param_present_flag =
+            aom_rb_read_bit(rb);
+        if (seq_params->op_params[i].decoder_model_param_present_flag)
+          av1_read_op_parameters_info(&seq_params->op_params[i],
+                                      seq_params->decoder_model_info
+                                          .encoder_decoder_buffer_delay_length,
+                                      rb);
       } else {
-        cm->op_params[i].decoder_model_param_present_flag = 0;
+        seq_params->op_params[i].decoder_model_param_present_flag = 0;
       }
-      if (cm->timing_info_present &&
-          (cm->timing_info.equal_picture_interval ||
-           cm->op_params[i].decoder_model_param_present_flag)) {
-        cm->op_params[i].bitrate =
-            max_level_bitrate(seq_params->profile, seq_params->seq_level_idx[i],
-                              seq_params->tier[i]);
+      if (seq_params->timing_info_present &&
+          (seq_params->timing_info.equal_picture_interval ||
+           seq_params->op_params[i].decoder_model_param_present_flag)) {
+        seq_params->op_params[i].bitrate = av1_max_level_bitrate(
+            seq_params->profile, seq_params->seq_level_idx[i],
+            seq_params->tier[i]);
         // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
         // the check
-        if (cm->op_params[i].bitrate == 0)
+        if (seq_params->op_params[i].bitrate == 0)
           aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                              "AV1 does not support this combination of "
                              "profile, level, and tier.");
         // Buffer size in bits/s is bitrate in bits/s * 1 s
-        cm->op_params[i].buffer_size = cm->op_params[i].bitrate;
+        seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
       }
-      if (cm->timing_info_present && cm->timing_info.equal_picture_interval &&
-          !cm->op_params[i].decoder_model_param_present_flag) {
+      if (seq_params->timing_info_present &&
+          seq_params->timing_info.equal_picture_interval &&
+          !seq_params->op_params[i].decoder_model_param_present_flag) {
         // When the decoder_model_parameters are not sent for this op, set
         // the default ones that can be used with the resource availability mode
-        cm->op_params[i].decoder_buffer_delay = 70000;
-        cm->op_params[i].encoder_buffer_delay = 20000;
-        cm->op_params[i].low_delay_mode_flag = 0;
+        seq_params->op_params[i].decoder_buffer_delay = 70000;
+        seq_params->op_params[i].encoder_buffer_delay = 20000;
+        seq_params->op_params[i].low_delay_mode_flag = 0;
       }
 
       if (seq_params->display_model_info_present_flag) {
-        cm->op_params[i].display_model_param_present_flag = aom_rb_read_bit(rb);
-        if (cm->op_params[i].display_model_param_present_flag) {
-          cm->op_params[i].initial_display_delay =
+        seq_params->op_params[i].display_model_param_present_flag =
+            aom_rb_read_bit(rb);
+        if (seq_params->op_params[i].display_model_param_present_flag) {
+          seq_params->op_params[i].initial_display_delay =
               aom_rb_read_literal(rb, 4) + 1;
-          if (cm->op_params[i].initial_display_delay > 10)
+          if (seq_params->op_params[i].initial_display_delay > 10)
             aom_internal_error(
                 &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                 "AV1 does not support more than 10 decoded frames delay");
         } else {
-          cm->op_params[i].initial_display_delay = 10;
+          seq_params->op_params[i].initial_display_delay = 10;
         }
       } else {
-        cm->op_params[i].display_model_param_present_flag = 0;
-        cm->op_params[i].initial_display_delay = 10;
+        seq_params->op_params[i].display_model_param_present_flag = 0;
+        seq_params->op_params[i].initial_display_delay = 10;
       }
     }
   }
@@ -297,11 +288,12 @@
                                       int *start_tile, int *end_tile,
                                       int tile_start_implicit) {
   AV1_COMMON *const cm = &pbi->common;
+  CommonTileParams *const tiles = &cm->tiles;
   uint32_t saved_bit_offset = rb->bit_offset;
   int tile_start_and_end_present_flag = 0;
-  const int num_tiles = pbi->common.tile_rows * pbi->common.tile_cols;
+  const int num_tiles = tiles->rows * tiles->cols;
 
-  if (!pbi->common.large_scale_tile && num_tiles > 1) {
+  if (!tiles->large_scale && num_tiles > 1) {
     tile_start_and_end_present_flag = aom_rb_read_bit(rb);
     if (tile_start_implicit && tile_start_and_end_present_flag) {
       aom_internal_error(
@@ -310,20 +302,35 @@
       return -1;
     }
   }
-  if (pbi->common.large_scale_tile || num_tiles == 1 ||
+  if (tiles->large_scale || num_tiles == 1 ||
       !tile_start_and_end_present_flag) {
     *start_tile = 0;
     *end_tile = num_tiles - 1;
   } else {
-    int tile_bits = cm->log2_tile_rows + cm->log2_tile_cols;
+    int tile_bits = tiles->log2_rows + tiles->log2_cols;
     *start_tile = aom_rb_read_literal(rb, tile_bits);
     *end_tile = aom_rb_read_literal(rb, tile_bits);
   }
-  if (*start_tile > *end_tile) {
+  if (*start_tile != pbi->next_start_tile) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "tg_end must be greater than or equal to tg_start");
+                       "tg_start (%d) must be equal to %d", *start_tile,
+                       pbi->next_start_tile);
     return -1;
   }
+  if (*start_tile > *end_tile) {
+    aom_internal_error(
+        &cm->error, AOM_CODEC_CORRUPT_FRAME,
+        "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile,
+        *start_tile);
+    return -1;
+  }
+  if (*end_tile >= num_tiles) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "tg_end (%d) must be less than NumTiles (%d)", *end_tile,
+                       num_tiles);
+    return -1;
+  }
+  pbi->next_start_tile = (*end_tile == num_tiles - 1) ? 0 : *end_tile + 1;
 
   return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
 }
@@ -350,8 +357,7 @@
 
   tg_payload_size = (uint32_t)(*p_data_end - data);
 
-  // TODO(shan):  For now, assume all tile groups received in order
-  *is_last_tg = end_tile == cm->tile_rows * cm->tile_cols - 1;
+  *is_last_tg = end_tile == cm->tiles.rows * cm->tiles.cols - 1;
   return header_size + tg_payload_size;
 }
 
@@ -383,7 +389,7 @@
                              cm->seq_params.subsampling_y,
                              (cm->seq_params.use_highbitdepth &&
                               (cm->seq_params.bit_depth > AOM_BITS_8)),
-                             0, cm->byte_alignment))
+                             0, cm->features.byte_alignment))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate the tile list output buffer");
 }
@@ -479,7 +485,7 @@
                                               int *frame_decoding_finished) {
   AV1_COMMON *const cm = &pbi->common;
   uint32_t tile_list_payload_size = 0;
-  const int num_tiles = cm->tile_cols * cm->tile_rows;
+  const int num_tiles = cm->tiles.cols * cm->tiles.rows;
   const int start_tile = 0;
   const int end_tile = num_tiles - 1;
   int i = 0;
@@ -515,13 +521,14 @@
       cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
-    av1_set_reference_dec(cm, 0, 1, &pbi->ext_refs.refs[ref_idx]);
+    av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1,
+                          &pbi->ext_refs.refs[ref_idx]);
 
     pbi->dec_tile_row = aom_rb_read_literal(rb, 8);
     pbi->dec_tile_col = aom_rb_read_literal(rb, 8);
     if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 ||
-        pbi->dec_tile_row >= cm->tile_rows ||
-        pbi->dec_tile_col >= cm->tile_cols) {
+        pbi->dec_tile_row >= cm->tiles.rows ||
+        pbi->dec_tile_col >= cm->tiles.cols) {
       cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
@@ -552,129 +559,295 @@
   return tile_list_payload_size;
 }
 
-static void read_metadata_itut_t35(const uint8_t *data, size_t sz) {
-  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
-  for (size_t i = 0; i < sz; i++) {
-    aom_rb_read_literal(&rb, 8);
+// Returns the last nonzero byte index in 'data'. If there is no nonzero byte in
+// 'data', returns -1.
+static int get_last_nonzero_byte_index(const uint8_t *data, size_t sz) {
+  // Scan backward and return on the first nonzero byte.
+  int i = (int)sz - 1;
+  while (i >= 0 && data[i] == 0) {
+    --i;
   }
+  return i;
 }
 
-static void read_metadata_hdr_cll(const uint8_t *data, size_t sz) {
-  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
-  aom_rb_read_literal(&rb, 16);  // max_cll
-  aom_rb_read_literal(&rb, 16);  // max_fall
+// Allocates metadata that was read and adds it to the decoder's metadata array.
+static void alloc_read_metadata(AV1Decoder *const pbi,
+                                OBU_METADATA_TYPE metadata_type,
+                                const uint8_t *data, size_t sz,
+                                aom_metadata_insert_flags_t insert_flag) {
+  AV1_COMMON *const cm = &pbi->common;
+  aom_metadata_t *metadata =
+      aom_img_metadata_alloc(metadata_type, data, sz, insert_flag);
+  if (!metadata) {
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating metadata");
+  }
+  if (!pbi->metadata) {
+    pbi->metadata = aom_img_metadata_array_alloc(1);
+    if (!pbi->metadata) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate metadata array");
+    }
+  } else {
+    aom_metadata_t **metadata_array =
+        (aom_metadata_t **)realloc(pbi->metadata->metadata_array,
+                                   (pbi->metadata->sz + 1) * sizeof(metadata));
+    if (!metadata_array) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error allocating metadata");
+    }
+    pbi->metadata->metadata_array = metadata_array;
+    pbi->metadata->sz++;
+  }
+  pbi->metadata->metadata_array[pbi->metadata->sz - 1] = metadata;
 }
 
-static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) {
-  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
-  for (int i = 0; i < 3; i++) {
-    aom_rb_read_literal(&rb, 16);  // primary_i_chromaticity_x
-    aom_rb_read_literal(&rb, 16);  // primary_i_chromaticity_y
+// On success, returns the number of bytes read from 'data'. On failure, calls
+// aom_internal_error() and does not return.
+static size_t read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data,
+                                     size_t sz) {
+  const int kMinItuT35PayloadSize = 2;
+  AV1_COMMON *const cm = &pbi->common;
+  if (sz == 0) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "itu_t_t35_country_code is missing");
   }
+  int bytes_read = get_last_nonzero_byte_index(data, sz);
+  if (bytes_read < 0) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "No trailing bits found on metadata");
+  }
+  if (*data == 0xFF && bytes_read < kMinItuT35PayloadSize) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "itu_t_t35_country_code_extension_byte is missing");
+  }
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, (size_t)bytes_read,
+                      AOM_MIF_ANY_FRAME);
+  return (size_t)bytes_read;
+}
 
-  aom_rb_read_literal(&rb, 16);  // white_point_chromaticity_x
-  aom_rb_read_literal(&rb, 16);  // white_point_chromaticity_y
+// On success, returns the number of bytes read from 'data'. On failure, calls
+// aom_internal_error() and does not return.
+static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data,
+                                    size_t sz) {
+  const int kHdrCllPayloadSize = 4;
+  AV1_COMMON *const cm = &pbi->common;
+  if (sz == 0) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "HDR CLL metadata payload is missing");
+  }
+  int bytes_read = get_last_nonzero_byte_index(data, sz);
+  if (bytes_read < 0) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "No trailing bits found on metadata");
+  }
+  if (bytes_read != kHdrCllPayloadSize) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Incorrect HDR CLL metadata payload size");
+  }
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, (size_t)bytes_read,
+                      AOM_MIF_ANY_FRAME);
+  return (size_t)bytes_read;
+}
 
-  aom_rb_read_unsigned_literal(&rb, 32);  // luminance_max
-  aom_rb_read_unsigned_literal(&rb, 32);  // luminance_min
+// On success, returns the number of bytes read from 'data'. On failure, calls
+// aom_internal_error() and does not return.
+static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data,
+                                     size_t sz) {
+  const int kMdcvPayloadSize = 24;
+  AV1_COMMON *const cm = &pbi->common;
+  if (sz == 0) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "HDR MDCV metadata payload is missing");
+  }
+  int bytes_read = get_last_nonzero_byte_index(data, sz);
+  if (bytes_read < 0) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "No trailing bits found on HDR MDCV metadata");
+  }
+  if (bytes_read != kMdcvPayloadSize) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Incorrect HDR MDCV metadata payload size");
+  }
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, (size_t)bytes_read,
+                      AOM_MIF_ANY_FRAME);
+  return (size_t)bytes_read;
 }
 
 static void scalability_structure(struct aom_read_bit_buffer *rb) {
-  int spatial_layers_cnt = aom_rb_read_literal(rb, 2);
-  int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
-  int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
-  int temporal_group_description_present_flag = aom_rb_read_bit(rb);
+  const int spatial_layers_cnt_minus_1 = aom_rb_read_literal(rb, 2);
+  const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
+  const int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
+  const int temporal_group_description_present_flag = aom_rb_read_bit(rb);
   aom_rb_read_literal(rb, 3);  // reserved
 
   if (spatial_layer_dimensions_present_flag) {
-    int i;
-    for (i = 0; i < spatial_layers_cnt + 1; i++) {
+    for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) {
       aom_rb_read_literal(rb, 16);
       aom_rb_read_literal(rb, 16);
     }
   }
   if (spatial_layer_description_present_flag) {
-    int i;
-    for (i = 0; i < spatial_layers_cnt + 1; i++) {
+    for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) {
       aom_rb_read_literal(rb, 8);
     }
   }
   if (temporal_group_description_present_flag) {
-    int i, j, temporal_group_size;
-    temporal_group_size = aom_rb_read_literal(rb, 8);
-    for (i = 0; i < temporal_group_size; i++) {
+    const int temporal_group_size = aom_rb_read_literal(rb, 8);
+    for (int i = 0; i < temporal_group_size; i++) {
       aom_rb_read_literal(rb, 3);
       aom_rb_read_bit(rb);
       aom_rb_read_bit(rb);
-      int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
-      for (j = 0; j < temporal_group_ref_cnt; j++) {
+      const int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
+      for (int j = 0; j < temporal_group_ref_cnt; j++) {
         aom_rb_read_literal(rb, 8);
       }
     }
   }
 }
 
-static void read_metadata_scalability(const uint8_t *data, size_t sz) {
-  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
-  int scalability_mode_idc = aom_rb_read_literal(&rb, 8);
+static void read_metadata_scalability(struct aom_read_bit_buffer *rb) {
+  const int scalability_mode_idc = aom_rb_read_literal(rb, 8);
   if (scalability_mode_idc == SCALABILITY_SS) {
-    scalability_structure(&rb);
+    scalability_structure(rb);
   }
 }
 
-static void read_metadata_timecode(const uint8_t *data, size_t sz) {
-  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
-  aom_rb_read_literal(&rb, 5);                     // counting_type f(5)
-  int full_timestamp_flag = aom_rb_read_bit(&rb);  // full_timestamp_flag f(1)
-  aom_rb_read_bit(&rb);                            // discontinuity_flag (f1)
-  aom_rb_read_bit(&rb);                            // cnt_dropped_flag f(1)
-  aom_rb_read_literal(&rb, 9);                     // n_frames f(9)
+static void read_metadata_timecode(struct aom_read_bit_buffer *rb) {
+  aom_rb_read_literal(rb, 5);  // counting_type f(5)
+  const int full_timestamp_flag =
+      aom_rb_read_bit(rb);     // full_timestamp_flag f(1)
+  aom_rb_read_bit(rb);         // discontinuity_flag f(1)
+  aom_rb_read_bit(rb);         // cnt_dropped_flag f(1)
+  aom_rb_read_literal(rb, 9);  // n_frames f(9)
   if (full_timestamp_flag) {
-    aom_rb_read_literal(&rb, 6);  // seconds_value f(6)
-    aom_rb_read_literal(&rb, 6);  // minutes_value f(6)
-    aom_rb_read_literal(&rb, 5);  // hours_value f(5)
+    aom_rb_read_literal(rb, 6);  // seconds_value f(6)
+    aom_rb_read_literal(rb, 6);  // minutes_value f(6)
+    aom_rb_read_literal(rb, 5);  // hours_value f(5)
   } else {
-    int seconds_flag = aom_rb_read_bit(&rb);  // seconds_flag f(1)
+    const int seconds_flag = aom_rb_read_bit(rb);  // seconds_flag f(1)
     if (seconds_flag) {
-      aom_rb_read_literal(&rb, 6);              // seconds_value f(6)
-      int minutes_flag = aom_rb_read_bit(&rb);  // minutes_flag f(1)
+      aom_rb_read_literal(rb, 6);                    // seconds_value f(6)
+      const int minutes_flag = aom_rb_read_bit(rb);  // minutes_flag f(1)
       if (minutes_flag) {
-        aom_rb_read_literal(&rb, 6);            // minutes_value f(6)
-        int hours_flag = aom_rb_read_bit(&rb);  // hours_flag f(1)
+        aom_rb_read_literal(rb, 6);                  // minutes_value f(6)
+        const int hours_flag = aom_rb_read_bit(rb);  // hours_flag f(1)
         if (hours_flag) {
-          aom_rb_read_literal(&rb, 5);  // hours_value f(5)
+          aom_rb_read_literal(rb, 5);  // hours_value f(5)
         }
       }
     }
   }
   // time_offset_length f(5)
-  int time_offset_length = aom_rb_read_literal(&rb, 5);
+  const int time_offset_length = aom_rb_read_literal(rb, 5);
   if (time_offset_length) {
-    aom_rb_read_literal(&rb, time_offset_length);  // f(time_offset_length)
+    // time_offset_value f(time_offset_length)
+    aom_rb_read_literal(rb, time_offset_length);
   }
 }
 
-// Not fully implemented. Always succeeds and returns sz.
-static size_t read_metadata(const uint8_t *data, size_t sz) {
+// Returns the last nonzero byte in 'data'. If there is no nonzero byte in
+// 'data', returns 0.
+//
+// Call this function to check the following requirement in the spec:
+//   This implies that when any payload data is present for this OBU type, at
+//   least one byte of the payload data (including the trailing bit) shall not
+//   be equal to 0.
+static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) {
+  // Scan backward and return on the first nonzero byte.
+  size_t i = sz;
+  while (i != 0) {
+    --i;
+    if (data[i] != 0) return data[i];
+  }
+  return 0;
+}
+
+// Checks the metadata for correct syntax but ignores the parsed metadata.
+//
+// On success, returns the number of bytes read from 'data'. On failure, sets
+// pbi->common.error.error_code and returns 0, or calls aom_internal_error()
+// and does not return.
+static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) {
+  AV1_COMMON *const cm = &pbi->common;
   size_t type_length;
   uint64_t type_value;
-  OBU_METADATA_TYPE metadata_type;
   if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {
+    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    return 0;
+  }
+  const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value;
+  if (metadata_type == 0 || metadata_type >= 6) {
+    // If metadata_type is reserved for future use or a user private value,
+    // ignore the entire OBU and just check trailing bits.
+    if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) {
+      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return 0;
+    }
     return sz;
   }
-  metadata_type = (OBU_METADATA_TYPE)type_value;
   if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) {
-    read_metadata_itut_t35(data + type_length, sz - type_length);
+    size_t bytes_read =
+        type_length +
+        read_metadata_itut_t35(pbi, data + type_length, sz - type_length);
+    // itu_t_t35_payload_bytes is byte aligned and the first
+    // trailing byte should be 0x80.
+    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
+      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return 0;
+    }
+    return sz;
   } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) {
-    read_metadata_hdr_cll(data + type_length, sz - type_length);
+    size_t bytes_read =
+        type_length +
+        read_metadata_hdr_cll(pbi, data + type_length, sz - type_length);
+    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
+      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return 0;
+    }
+    return sz;
   } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) {
-    read_metadata_hdr_mdcv(data + type_length, sz - type_length);
-  } else if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) {
-    read_metadata_scalability(data + type_length, sz - type_length);
-  } else if (metadata_type == OBU_METADATA_TYPE_TIMECODE) {
-    read_metadata_timecode(data + type_length, sz - type_length);
+    size_t bytes_read =
+        type_length +
+        read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length);
+    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
+      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return 0;
+    }
+    return sz;
   }
 
+  struct aom_read_bit_buffer rb;
+  av1_init_read_bit_buffer(pbi, &rb, data + type_length, data + sz);
+  if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) {
+    read_metadata_scalability(&rb);
+  } else {
+    assert(metadata_type == OBU_METADATA_TYPE_TIMECODE);
+    read_metadata_timecode(&rb);
+  }
+  if (av1_check_trailing_bits(pbi, &rb) != 0) {
+    // cm->error.error_code is already set.
+    return 0;
+  }
+  assert((rb.bit_offset & 7) == 0);
+  return type_length + (rb.bit_offset >> 3);
+}
+
+// On success, returns 'sz'. On failure, sets pbi->common.error.error_code and
+// returns 0.
+static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data,
+                           size_t sz) {
+  // The spec allows a padding OBU to be header-only (i.e., obu_size = 0). So
+  // check trailing bits only if sz > 0.
+  if (sz > 0) {
+    // The payload of a padding OBU is byte aligned. Therefore the first
+    // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
+    const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz);
+    if (last_nonzero_byte != 0x80) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return 0;
+    }
+  }
   return sz;
 }
 
@@ -691,14 +864,15 @@
   ObuHeader obu_header;
   memset(&obu_header, 0, sizeof(obu_header));
   pbi->seen_frame_header = 0;
+  pbi->next_start_tile = 0;
 
   if (data_end < data) {
     cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return -1;
   }
 
-  // Reset pbi->camera_frame_header_ready to 0 if cm->large_scale_tile = 0.
-  if (!cm->large_scale_tile) pbi->camera_frame_header_ready = 0;
+  // Reset pbi->camera_frame_header_ready to 0 if cm->tiles.large_scale = 0.
+  if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0;
 
   // decode frame as a series of OBUs
   while (!frame_decoding_finished && cm->error.error_code == AOM_CODEC_OK) {
@@ -716,7 +890,7 @@
     }
 
     aom_codec_err_t status =
-        aom_read_obu_header_and_size(data, bytes_available, cm->is_annexb,
+        aom_read_obu_header_and_size(data, bytes_available, pbi->is_annexb,
                                      &obu_header, &payload_size, &bytes_read);
 
     if (status != AOM_CODEC_OK) {
@@ -756,6 +930,7 @@
       case OBU_TEMPORAL_DELIMITER:
         decoded_payload_size = read_temporal_delimiter_obu();
         pbi->seen_frame_header = 0;
+        pbi->next_start_tile = 0;
         break;
       case OBU_SEQUENCE_HEADER:
         decoded_payload_size = read_sequence_header_obu(pbi, &rb);
@@ -769,13 +944,25 @@
       case OBU_FRAME_HEADER:
       case OBU_REDUNDANT_FRAME_HEADER:
       case OBU_FRAME:
+        if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) {
+          if (!pbi->seen_frame_header) {
+            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            return -1;
+          }
+        } else {
+          // OBU_FRAME_HEADER or OBU_FRAME.
+          if (pbi->seen_frame_header) {
+            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            return -1;
+          }
+        }
         // Only decode first frame header received
         if (!pbi->seen_frame_header ||
-            (cm->large_scale_tile && !pbi->camera_frame_header_ready)) {
+            (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) {
           frame_header_size = read_frame_header_obu(
               pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
           pbi->seen_frame_header = 1;
-          if (!pbi->ext_tile_debug && cm->large_scale_tile)
+          if (!pbi->ext_tile_debug && cm->tiles.large_scale)
             pbi->camera_frame_header_ready = 1;
         } else {
           // TODO(wtc): Verify that the frame_header_obu is identical to the
@@ -837,7 +1024,8 @@
         if (frame_decoding_finished) pbi->seen_frame_header = 0;
         break;
       case OBU_METADATA:
-        decoded_payload_size = read_metadata(data, payload_size);
+        decoded_payload_size = read_metadata(pbi, data, payload_size);
+        if (cm->error.error_code != AOM_CODEC_OK) return -1;
         break;
       case OBU_TILE_LIST:
         if (CONFIG_NORMAL_TILE_MODE) {
@@ -852,7 +1040,7 @@
           return -1;
         }
 
-        cm->large_scale_tile = 1;
+        cm->tiles.large_scale = 1;
         av1_set_single_tile_decoding_mode(cm);
         decoded_payload_size =
             read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
@@ -860,8 +1048,16 @@
         if (cm->error.error_code != AOM_CODEC_OK) return -1;
         break;
       case OBU_PADDING:
+        decoded_payload_size = read_padding(&pbi->common, data, payload_size);
+        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        break;
       default:
         // Skip unrecognized OBUs
+        if (payload_size > 0 &&
+            get_last_nonzero_byte(data, payload_size) == 0) {
+          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
         decoded_payload_size = payload_size;
         break;
     }
diff --git a/libaom/av1/encoder/aq_complexity.c b/libaom/av1/encoder/aq_complexity.c
index 16edbc6..3658006 100644
--- a/libaom/av1/encoder/aq_complexity.c
+++ b/libaom/av1/encoder/aq_complexity.c
@@ -41,14 +41,29 @@
 
 static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
   // Approximate base quatizer (truncated to int)
-  const int base_quant = av1_ac_quant_Q3(q_index, 0, bit_depth) / 4;
+  const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4;
   return (base_quant > 10) + (base_quant > 25);
 }
 
+static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+         cpi->refresh_alt_ref_frame ||
+         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+// Segmentation only makes sense if the target bits per SB is above a threshold.
+// Below this the overheads will usually outweigh any benefit.
+static bool is_sb_aq_enabled(const AV1_COMP *const cpi) {
+  return cpi->rc.sb64_target_rate >= 256;
+}
+
 void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
   struct segmentation *const seg = &cm->seg;
-  int resolution_change =
+  const int resolution_change =
       cm->prev_frame && (cm->width != cm->prev_frame->width ||
                          cm->height != cm->prev_frame->height);
 
@@ -56,27 +71,24 @@
   aom_clear_system_state();
 
   if (resolution_change) {
-    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_clearall_segfeatures(seg);
     av1_disable_segmentation(seg);
     return;
   }
 
-  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
-      cpi->refresh_alt_ref_frame ||
-      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+  if (is_frame_aq_enabled(cpi)) {
     int segment;
     const int aq_strength =
-        get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
+        get_aq_c_strength(base_qindex, cm->seq_params.bit_depth);
 
     // Clear down the segment map.
-    memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
+    memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG,
+           cm->mi_params.mi_rows * cm->mi_params.mi_cols);
 
     av1_clearall_segfeatures(seg);
 
-    // Segmentation only makes sense if the target bits per SB is above a
-    // threshold. Below this the overheads will usually outweigh any benefit.
-    if (cpi->rc.sb64_target_rate < 256) {
+    if (!is_sb_aq_enabled(cpi)) {
       av1_disable_segmentation(seg);
       return;
     }
@@ -93,17 +105,17 @@
       if (segment == DEFAULT_AQ2_SEG) continue;
 
       qindex_delta = av1_compute_qdelta_by_rate(
-          &cpi->rc, cm->current_frame.frame_type, cm->base_qindex,
+          &cpi->rc, cm->current_frame.frame_type, base_qindex,
           aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth);
 
       // For AQ complexity mode, we dont allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
       // Q delta is sometimes applied without going back around the rd loop.
       // This could lead to an illegal combination of partition size and q.
-      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
-        qindex_delta = -cm->base_qindex + 1;
+      if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -base_qindex + 1;
       }
-      if ((cm->base_qindex + qindex_delta) > 0) {
+      if ((base_qindex + qindex_delta) > 0) {
         av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
         av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
       }
@@ -118,12 +130,13 @@
 // bits for the block vs a target average and its spatial complexity.
 void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
                             int mi_row, int mi_col, int projected_rate) {
+  if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return;
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
 
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]);
-  const int ymis = AOMMIN(cm->mi_rows - mi_row, mi_size_high[bs]);
+  const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col;
+  const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]);
+  const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]);
   int x, y;
   int i;
   unsigned char segment;
@@ -139,12 +152,12 @@
     const int target_rate = (int)(num / denom);
     double logvar;
     double low_var_thresh;
-    const int aq_strength =
-        get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
+    const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
+                                              cm->seq_params.bit_depth);
 
     aom_clear_system_state();
     low_var_thresh =
-        (cpi->oxcf.pass == 2)
+        (is_stat_consumption_stage_twopass(cpi))
             ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH)
             : DEFAULT_LV_THRESH;
 
@@ -166,7 +179,7 @@
   // Fill in the entires in the segment map corresponding to this SB64.
   for (y = 0; y < ymis; y++) {
     for (x = 0; x < xmis; x++) {
-      cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+      cpi->enc_seg.map[mi_offset + y * cm->mi_params.mi_cols + x] = segment;
     }
   }
 }
diff --git a/libaom/av1/encoder/aq_cyclicrefresh.c b/libaom/av1/encoder/aq_cyclicrefresh.c
index bfb2a90..b888494 100644
--- a/libaom/av1/encoder/aq_cyclicrefresh.c
+++ b/libaom/av1/encoder/aq_cyclicrefresh.c
@@ -19,46 +19,6 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/system_state.h"
 
-struct CYCLIC_REFRESH {
-  // Percentage of blocks per frame that are targeted as candidates
-  // for cyclic refresh.
-  int percent_refresh;
-  // Maximum q-delta as percentage of base q.
-  int max_qdelta_perc;
-  // Superblock starting index for cycling through the frame.
-  int sb_index;
-  // Controls how long block will need to wait to be refreshed again, in
-  // excess of the cycle time, i.e., in the case of all zero motion, block
-  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
-  int time_for_refresh;
-  // Target number of (4x4) blocks that are set for delta-q.
-  int target_num_seg_blocks;
-  // Actual number of (4x4) blocks that were applied delta-q.
-  int actual_num_seg1_blocks;
-  int actual_num_seg2_blocks;
-  // RD mult. parameters for segment 1.
-  int rdmult;
-  // Cyclic refresh map.
-  int8_t *map;
-  // Map of the last q a block was coded at.
-  uint8_t *last_coded_q_map;
-  // Thresholds applied to the projected rate/distortion of the coding block,
-  // when deciding whether block should be refreshed.
-  int64_t thresh_rate_sb;
-  int64_t thresh_dist_sb;
-  // Threshold applied to the motion vector (in units of 1/8 pel) of the
-  // coding block, when deciding whether block should be refreshed.
-  int16_t motion_thresh;
-  // Rate target ratio to set q delta.
-  double rate_ratio_qdelta;
-  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
-  int rate_boost_fac;
-  double low_content_avg;
-  int qindex_delta[3];
-  double weight_segment;
-  int apply_cyclic_refresh;
-};
-
 CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
@@ -77,7 +37,7 @@
   }
   assert(MAXQ <= 255);
   memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
-
+  cr->avg_frame_low_motion = 0.0;
   return cr;
 }
 
@@ -136,28 +96,27 @@
 int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
                                           double correction_factor) {
   const AV1_COMMON *const cm = &cpi->common;
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+  const int base_qindex = cm->quant_params.base_qindex;
+  const int bit_depth = cm->seq_params.bit_depth;
   const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  int estimated_bits;
-  int mbs = cm->MBs;
-  int num4x4bl = mbs << 4;
+  const int mbs = cm->mi_params.MBs;
+  const int num4x4bl = mbs << 4;
   // Weight for non-base segments: use actual number of blocks refreshed in
   // previous/just encoded frame. Note number of blocks here is in 4x4 units.
-  double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
-  double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
+  const double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
+  const double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
   // Take segment weighted average for estimated bits.
-  estimated_bits =
+  const int estimated_bits =
       (int)((1.0 - weight_segment1 - weight_segment2) *
-                av1_estimate_bits_at_q(cm->current_frame.frame_type,
-                                       cm->base_qindex, mbs, correction_factor,
-                                       cm->seq_params.bit_depth) +
+                av1_estimate_bits_at_q(frame_type, base_qindex, mbs,
+                                       correction_factor, bit_depth) +
             weight_segment1 * av1_estimate_bits_at_q(
-                                  cm->current_frame.frame_type,
-                                  cm->base_qindex + cr->qindex_delta[1], mbs,
-                                  correction_factor, cm->seq_params.bit_depth) +
+                                  frame_type, base_qindex + cr->qindex_delta[1],
+                                  mbs, correction_factor, bit_depth) +
             weight_segment2 * av1_estimate_bits_at_q(
-                                  cm->current_frame.frame_type,
-                                  cm->base_qindex + cr->qindex_delta[2], mbs,
-                                  correction_factor, cm->seq_params.bit_depth));
+                                  frame_type, base_qindex + cr->qindex_delta[2],
+                                  mbs, correction_factor, bit_depth));
   return estimated_bits;
 }
 
@@ -171,7 +130,7 @@
   const AV1_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   int bits_per_mb;
-  int num4x4bl = cm->MBs << 4;
+  int num4x4bl = cm->mi_params.MBs << 4;
   // Weight for segment prior to encoding: take the average of the target
   // number for the frame to be encoded and the actual from the previous frame.
   double weight_segment =
@@ -204,15 +163,13 @@
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
-  const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
-  const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
-  const int block_index = mi_row * cm->mi_cols + mi_col;
+  const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
   const int refresh_this_block =
       candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
-  int x = 0;
-  int y = 0;
 
   // If this block is labeled for refresh, check if we should reset the
   // segment_id.
@@ -240,45 +197,60 @@
 
   // Update entries in the cyclic refresh map with new_map_value, and
   // copy mbmi->segment_id into global segmentation map.
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++) {
-      int map_offset = block_index + y * cm->mi_cols + x;
+  for (int y = 0; y < ymis; y++)
+    for (int x = 0; x < xmis; x++) {
+      int map_offset = block_index + y * cm->mi_params.mi_cols + x;
       cr->map[map_offset] = new_map_value;
-      cpi->segmentation_map[map_offset] = mbmi->segment_id;
+      cpi->enc_seg.map[map_offset] = mbmi->segment_id;
     }
 }
 
-// Update the actual number of blocks that were applied the segment delta q.
+// Update some stats after encode frame is done.
 void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  unsigned char *const seg_map = cpi->segmentation_map;
-  int mi_row, mi_col;
+  unsigned char *const seg_map = cpi->enc_seg.map;
+  cr->cnt_zeromv = 0;
   cr->actual_num_seg1_blocks = 0;
   cr->actual_num_seg2_blocks = 0;
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
-      if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) ==
-          CR_SEGMENT_ID_BOOST1)
-        cr->actual_num_seg1_blocks++;
-      else if (cyclic_refresh_segment_id(
-                   seg_map[mi_row * cm->mi_cols + mi_col]) ==
-               CR_SEGMENT_ID_BOOST2)
-        cr->actual_num_seg2_blocks++;
+  for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) {
+    for (int mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) {
+      MB_MODE_INFO **mi =
+          mi_params->mi_grid_base + mi_row * mi_params->mi_stride + mi_col;
+      MV mv = mi[0]->mv[0].as_mv;
+      if (cm->seg.enabled) {
+        int map_index = mi_row * mi_params->mi_cols + mi_col;
+        if (cyclic_refresh_segment_id(seg_map[map_index]) ==
+            CR_SEGMENT_ID_BOOST1)
+          cr->actual_num_seg1_blocks++;
+        else if (cyclic_refresh_segment_id(seg_map[map_index]) ==
+                 CR_SEGMENT_ID_BOOST2)
+          cr->actual_num_seg2_blocks++;
+      }
+      // Count inter blocks with small (near-zero) motion toward cnt_zeromv.
+      if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16)
+        cr->cnt_zeromv++;
     }
+  }
+  cr->cnt_zeromv =
+      100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols);
+  cr->avg_frame_low_motion =
+      (3 * cr->avg_frame_low_motion + (double)cr->cnt_zeromv) / 4;
 }
 
 // Set golden frame update interval, for 1 pass CBR mode.
 void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  // Set minimum gf_interval for GF update to a multiple (== 2) of refresh
-  // period. Depending on past encoding stats, GF flag may be reset and update
-  // may not occur until next baseline_gf_interval.
+  // Set minimum gf_interval for GF update to a multiple of the refresh period,
+  // with some max limit. Depending on past encoding stats, GF flag may be
+  // reset and update may not occur until next baseline_gf_interval.
   if (cr->percent_refresh > 0)
-    rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh);
+    rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40);
   else
-    rc->baseline_gf_interval = 40;
+    rc->baseline_gf_interval = 20;
+  if (cr->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8;
 }
 
 // Update the segmentation map, and related quantities: cyclic refresh map,
@@ -289,18 +261,20 @@
 // encoding of the superblock).
 static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  unsigned char *const seg_map = cpi->segmentation_map;
+  unsigned char *const seg_map = cpi->enc_seg.map;
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
-  memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
-  sb_cols =
-      (cm->mi_cols + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size;
-  sb_rows =
-      (cm->mi_rows + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size;
+  memset(seg_map, CR_SEGMENT_ID_BASE, mi_params->mi_rows * mi_params->mi_cols);
+  sb_cols = (mi_params->mi_cols + cm->seq_params.mib_size - 1) /
+            cm->seq_params.mib_size;
+  sb_rows = (mi_params->mi_rows + cm->seq_params.mib_size - 1) /
+            cm->seq_params.mib_size;
   sbs_in_frame = sb_cols * sb_rows;
   // Number of target blocks to get the q delta (segment 1).
-  block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  block_count =
+      cr->percent_refresh * mi_params->mi_rows * mi_params->mi_cols / 100;
   // Set the segmentation map: cycle through the superblocks, starting at
   // cr->mb_index, and stopping when either block_count blocks have been found
   // to be refreshed, or we have passed through whole frame.
@@ -315,19 +289,22 @@
     int sb_col_index = i - sb_row_index * sb_cols;
     int mi_row = sb_row_index * cm->seq_params.mib_size;
     int mi_col = sb_col_index * cm->seq_params.mib_size;
-    int qindex_thresh =
-        cpi->oxcf.content == AOM_CONTENT_SCREEN
-            ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
-            : 0;
-    assert(mi_row >= 0 && mi_row < cm->mi_rows);
-    assert(mi_col >= 0 && mi_col < cm->mi_cols);
-    bl_index = mi_row * cm->mi_cols + mi_col;
+    // TODO(any): Ensure the population of
+    // cpi->common.features.allow_screen_content_tools and use the same instead
+    // of cpi->oxcf.content == AOM_CONTENT_SCREEN
+    int qindex_thresh = cpi->oxcf.content == AOM_CONTENT_SCREEN
+                            ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2,
+                                             cm->quant_params.base_qindex)
+                            : 0;
+    assert(mi_row >= 0 && mi_row < mi_params->mi_rows);
+    assert(mi_col >= 0 && mi_col < mi_params->mi_cols);
+    bl_index = mi_row * mi_params->mi_cols + mi_col;
     // Loop through all MI blocks in superblock and update map.
-    xmis = AOMMIN(cm->mi_cols - mi_col, cm->seq_params.mib_size);
-    ymis = AOMMIN(cm->mi_rows - mi_row, cm->seq_params.mib_size);
+    xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params.mib_size);
+    ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params.mib_size);
     for (y = 0; y < ymis; y++) {
       for (x = 0; x < xmis; x++) {
-        const int bl_index2 = bl_index + y * cm->mi_cols + x;
+        const int bl_index2 = bl_index + y * mi_params->mi_cols + x;
         // If the block is as a candidate for clean up then mark it
         // for possible boost/refresh (segment 1). The segment id may get
         // reset to 0 later if block gets coded anything other than GLOBALMV.
@@ -343,7 +320,7 @@
     if (sum_map >= xmis * ymis / 2) {
       for (y = 0; y < ymis; y++)
         for (x = 0; x < xmis; x++) {
-          seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+          seg_map[bl_index + y * mi_params->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
         }
       cr->target_num_seg_blocks += xmis * ymis;
     }
@@ -361,14 +338,19 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  int num4x4bl = cm->MBs << 4;
+  int num4x4bl = cm->mi_params.MBs << 4;
   int target_refresh = 0;
   double weight_segment_target = 0;
   double weight_segment = 0;
   int qp_thresh = AOMMIN(20, rc->best_quality << 1);
+  int qp_max_thresh = 118 * MAXQ >> 7;
   cr->apply_cyclic_refresh = 1;
   if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf) ||
-      rc->avg_frame_qindex[INTER_FRAME] < qp_thresh) {
+      cpi->svc.temporal_layer_id > 0 ||
+      rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+      (rc->frames_since_key > 20 &&
+       rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
+      (cr->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) {
     cr->apply_cyclic_refresh = 0;
     return;
   }
@@ -387,7 +369,7 @@
     cr->rate_ratio_qdelta = 2.0;
   }
   // Adjust some parameters for low resolutions.
-  if (cm->width <= 352 && cm->height <= 288) {
+  if (cm->width * cm->height <= 352 * 288) {
     if (rc->avg_frame_bandwidth < 3000) {
       cr->motion_thresh = 16;
       cr->rate_boost_fac = 13;
@@ -411,8 +393,9 @@
   // Weight for segment prior to encoding: take the average of the target
   // number for the frame to be encoded and the actual from the previous frame.
   // Use the target if its less. To be used for setting the base qp for the
-  // frame in vp9_rc_regulate_q.
-  target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  // frame in av1_rc_regulate_q.
+  target_refresh =
+      cr->percent_refresh * cm->mi_params.mi_rows * cm->mi_params.mi_cols / 100;
   weight_segment_target = (double)(target_refresh) / num4x4bl;
   weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks +
                              cr->actual_num_seg2_blocks) >>
@@ -432,30 +415,23 @@
   int resolution_change =
       cm->prev_frame && (cm->width != cm->prev_frame->width ||
                          cm->height != cm->prev_frame->height);
-  if (resolution_change) {
-    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
-    av1_clearall_segfeatures(seg);
-    aom_clear_system_state();
-    av1_disable_segmentation(seg);
-    return;
-  }
+  if (resolution_change) av1_cyclic_refresh_reset_resize(cpi);
   if (cm->current_frame.frame_number == 0) cr->low_content_avg = 0.0;
   if (!cr->apply_cyclic_refresh) {
     // Set segmentation map to 0 and disable.
-    unsigned char *const seg_map = cpi->segmentation_map;
-    memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+    unsigned char *const seg_map = cpi->enc_seg.map;
+    memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_disable_segmentation(&cm->seg);
     if (cm->current_frame.frame_type == KEY_FRAME) {
       memset(cr->last_coded_q_map, MAXQ,
-             cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+             cm->mi_params.mi_rows * cm->mi_params.mi_cols *
+                 sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
     }
     return;
   } else {
-    int qindex_delta = 0;
-    int qindex2;
-    const double q =
-        av1_convert_qindex_to_q(cm->base_qindex, cm->seq_params.bit_depth);
+    const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
+                                             cm->seq_params.bit_depth);
     aom_clear_system_state();
     // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
@@ -486,19 +462,22 @@
     av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
 
     // Set the q delta for segment BOOST1.
-    qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+    const CommonQuantParams *const quant_params = &cm->quant_params;
+    int qindex_delta =
+        compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta);
     cr->qindex_delta[1] = qindex_delta;
 
     // Compute rd-mult for segment BOOST1.
-    qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
-
+    const int qindex2 = clamp(
+        quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta,
+        0, MAXQ);
     cr->rdmult = av1_compute_rd_mult(cpi, qindex2);
 
     av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
 
     // Set a more aggressive (higher) q delta for segment BOOST2.
     qindex_delta = compute_deltaq(
-        cpi, cm->base_qindex,
+        cpi, quant_params->base_qindex,
         AOMMIN(CR_MAX_RATE_TARGET_RATIO,
                0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
     cr->qindex_delta[2] = qindex_delta;
@@ -516,7 +495,7 @@
 void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
   const AV1_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+  memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
   cr->sb_index = 0;
   cpi->refresh_golden_frame = 1;
 }
diff --git a/libaom/av1/encoder/aq_cyclicrefresh.h b/libaom/av1/encoder/aq_cyclicrefresh.h
index ddabae6..ee62f6a 100644
--- a/libaom/av1/encoder/aq_cyclicrefresh.h
+++ b/libaom/av1/encoder/aq_cyclicrefresh.h
@@ -27,9 +27,50 @@
 // Maximum rate target ratio for setting segment delta-qp.
 #define CR_MAX_RATE_TARGET_RATIO 4.0
 
+struct CYCLIC_REFRESH {
+  // Percentage of blocks per frame that are targeted as candidates
+  // for cyclic refresh.
+  int percent_refresh;
+  // Maximum q-delta as percentage of base q.
+  int max_qdelta_perc;
+  // Superblock starting index for cycling through the frame.
+  int sb_index;
+  // Controls how long block will need to wait to be refreshed again, in
+  // excess of the cycle time, i.e., in the case of all zero motion, block
+  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+  int time_for_refresh;
+  // Target number of (4x4) blocks that are set for delta-q.
+  int target_num_seg_blocks;
+  // Actual number of (4x4) blocks that were applied delta-q.
+  int actual_num_seg1_blocks;
+  int actual_num_seg2_blocks;
+  // RD mult. parameters for segment 1.
+  int rdmult;
+  // Cyclic refresh map.
+  int8_t *map;
+  // Map of the last q a block was coded at.
+  uint8_t *last_coded_q_map;
+  // Thresholds applied to the projected rate/distortion of the coding block,
+  // when deciding whether block should be refreshed.
+  int64_t thresh_rate_sb;
+  int64_t thresh_dist_sb;
+  // Threshold applied to the motion vector (in units of 1/8 pel) of the
+  // coding block, when deciding whether block should be refreshed.
+  int16_t motion_thresh;
+  // Rate target ratio to set q delta.
+  double rate_ratio_qdelta;
+  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+  int rate_boost_fac;
+  double low_content_avg;
+  int qindex_delta[3];
+  double weight_segment;
+  int apply_cyclic_refresh;
+  int cnt_zeromv;
+  double avg_frame_low_motion;
+};
+
 struct AV1_COMP;
 
-struct CYCLIC_REFRESH;
 typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
 
 CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
@@ -54,7 +95,7 @@
                                        int mi_col, BLOCK_SIZE bsize,
                                        int64_t rate, int64_t dist, int skip);
 
-// Update the actual number of blocks that were applied the segment delta q.
+// Update some stats after encode frame is done.
 void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
 
 // Set golden frame update interval, for 1 pass CBR mode.
diff --git a/libaom/av1/encoder/aq_variance.c b/libaom/av1/encoder/aq_variance.c
index d572948..4176da2 100644
--- a/libaom/av1/encoder/aq_variance.c
+++ b/libaom/av1/encoder/aq_variance.c
@@ -44,6 +44,7 @@
 
 void av1_vaq_frame_setup(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
   struct segmentation *seg = &cm->seg;
   int i;
 
@@ -57,13 +58,13 @@
   avg_ratio = rate_ratio[avg_energy];
 
   if (resolution_change) {
-    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_clearall_segfeatures(seg);
     aom_clear_system_state();
     av1_disable_segmentation(seg);
     return;
   }
-  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     cpi->vaq_refresh = 1;
@@ -77,15 +78,15 @@
       // Set up avg segment id to be 1.0 and adjust the other segments around
       // it.
       int qindex_delta = av1_compute_qdelta_by_rate(
-          &cpi->rc, cm->current_frame.frame_type, cm->base_qindex,
+          &cpi->rc, cm->current_frame.frame_type, base_qindex,
           rate_ratio[i] / avg_ratio, cm->seq_params.bit_depth);
 
       // We don't allow qindex 0 in a segment if the base value is not 0.
       // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
       // Q delta is sometimes applied without going back around the rd loop.
       // This could lead to an illegal combination of partition size and q.
-      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
-        qindex_delta = -cm->base_qindex + 1;
+      if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -base_qindex + 1;
       }
 
       av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
@@ -174,29 +175,31 @@
                                    BLOCK_SIZE bs) {
   double energy, energy_midpoint;
   aom_clear_system_state();
-  energy_midpoint = (cpi->oxcf.pass == 2) ? cpi->twopass.frame_avg_haar_energy
-                                          : DEFAULT_E_MIDPOINT;
+  energy_midpoint = (is_stat_consumption_stage_twopass(cpi))
+                        ? cpi->twopass.frame_avg_haar_energy
+                        : DEFAULT_E_MIDPOINT;
   energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint;
   return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
 }
 
-int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
-                                         int block_var_level) {
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+                                                int block_var_level) {
   int rate_level;
   const AV1_COMMON *const cm = &cpi->common;
 
-  if (DELTAQ_MODULATION == 1) {
+  if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
     ENERGY_IN_BOUNDS(block_var_level);
     rate_level = SEGMENT_ID(block_var_level);
   } else {
     rate_level = block_var_level;
   }
+  const int base_qindex = cm->quant_params.base_qindex;
   int qindex_delta = av1_compute_qdelta_by_rate(
-      &cpi->rc, cm->current_frame.frame_type, cm->base_qindex,
+      &cpi->rc, cm->current_frame.frame_type, base_qindex,
       deltaq_rate_ratio[rate_level], cm->seq_params.bit_depth);
 
-  if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
-    qindex_delta = -cm->base_qindex + 1;
+  if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+    qindex_delta = -base_qindex + 1;
   }
-  return qindex_delta;
+  return base_qindex + qindex_delta;
 }
diff --git a/libaom/av1/encoder/aq_variance.h b/libaom/av1/encoder/aq_variance.h
index 2d22b66..543eb0b 100644
--- a/libaom/av1/encoder/aq_variance.h
+++ b/libaom/av1/encoder/aq_variance.h
@@ -21,8 +21,8 @@
 void av1_vaq_frame_setup(AV1_COMP *cpi);
 
 int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
-int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
-                                         int block_var_level);
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+                                                int block_var_level);
 int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
                                    BLOCK_SIZE bs);
 
diff --git a/libaom/av1/encoder/arm/neon/av1_error_neon.c b/libaom/av1/encoder/arm/neon/av1_error_neon.c
new file mode 100644
index 0000000..22da1a8
--- /dev/null
+++ b/libaom/av1/encoder/arm/neon/av1_error_neon.c
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  int64x2_t error = vdupq_n_s64(0);
+  int64x2_t sqcoeff = vdupq_n_s64(0);
+
+  assert(block_size >= 8);
+  assert((block_size % 8) == 0);
+
+  do {
+    const int16x8_t c = load_tran_low_to_s16q(coeff);
+    const int16x8_t d = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t diff = vsubq_s16(c, d);
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    // accumulating them in 64-bits.
+    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+    error = vaddq_s64(error, err2);
+
+    const int16x4_t coeff_lo = vget_low_s16(c);
+    const int16x4_t coeff_hi = vget_high_s16(c);
+    const int32x4_t sqcoeff0 = vmull_s16(coeff_lo, coeff_lo);
+    const int32x4_t sqcoeff1 = vmlal_s16(sqcoeff0, coeff_hi, coeff_hi);
+    const int64x2_t sqcoeff2 =
+        vaddl_s32(vget_low_s32(sqcoeff1), vget_high_s32(sqcoeff1));
+    sqcoeff = vaddq_s64(sqcoeff, sqcoeff2);
+
+    coeff += 8;
+    dqcoeff += 8;
+    block_size -= 8;
+  } while (block_size != 0);
+
+#if defined(__aarch64__)
+  *ssz = vaddvq_s64(sqcoeff);
+  return vaddvq_s64(error);
+#else
+  *ssz = vgetq_lane_s64(sqcoeff, 0) + vgetq_lane_s64(sqcoeff, 1);
+  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+#endif
+}
+
+int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+                                int block_size) {
+  int64x2_t error = vdupq_n_s64(0);
+
+  assert(block_size >= 8);
+  assert((block_size % 8) == 0);
+
+  do {
+    const int16x8_t c = vld1q_s16(coeff);
+    const int16x8_t d = vld1q_s16(dqcoeff);
+    const int16x8_t diff = vsubq_s16(c, d);
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    // accumulating them in 64-bits.
+    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+    error = vaddq_s64(error, err2);
+    coeff += 8;
+    dqcoeff += 8;
+    block_size -= 8;
+  } while (block_size != 0);
+
+  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+}
diff --git a/libaom/av1/encoder/arm/neon/quantize_neon.c b/libaom/av1/encoder/arm/neon/quantize_neon.c
index 36e7d33..c2f50a2 100644
--- a/libaom/av1/encoder/arm/neon/quantize_neon.c
+++ b/libaom/av1/encoder/arm/neon/quantize_neon.c
@@ -17,102 +17,199 @@
 
 #include "av1/common/quant_common.h"
 #include "av1/common/seg_common.h"
+#include "av1/common/arm/mem_neon.h"
 
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/rd.h"
 
-void av1_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
-                          int skip_block, const int16_t *zbin_ptr,
-                          const int16_t *round_ptr, const int16_t *quant_ptr,
-                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const int16_t *iscan) {
+void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
   (void)scan;
 
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    int i;
-    const int16x8_t v_zero = vdupq_n_s16(0);
-    const int16x8_t v_one = vdupq_n_s16(1);
-    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
-    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
-    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
-    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
-    // adjust for dc
-    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
-    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
-    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
-    // process dc and the first seven ac coeffs
-    {
-      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
-      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
-      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
-      const int32x4_t v_tmp_lo =
-          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-      const int32x4_t v_tmp_hi =
-          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-      const int16x8_t v_tmp2 =
-          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-      vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
-      vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
-      v_round = vmovq_n_s16(round_ptr[1]);
-      v_quant = vmovq_n_s16(quant_ptr[1]);
-      v_dequant = vmovq_n_s16(dequant_ptr[1]);
-    }
-    // now process the rest of the ac coeffs
-    for (i = 8; i < count; i += 8) {
-      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
-      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
-      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
-      const int32x4_t v_tmp_lo =
-          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-      const int32x4_t v_tmp_hi =
-          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-      const int16x8_t v_tmp2 =
-          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
-      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
-    }
-    {
-      const int16x4_t v_eobmax_3210 = vmax_s16(
-          vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
-      const int64x1_t v_eobmax_xx32 =
-          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
-      const int16x4_t v_eobmax_tmp =
-          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
-      const int64x1_t v_eobmax_xxx3 =
-          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
-      const int16x4_t v_eobmax_final =
-          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-
-      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-    }
-  } else {
-    memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-    memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
-    *eob_ptr = 0;
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  int i;
+  const int16x8_t v_zero = vdupq_n_s16(0);
+  const int16x8_t v_one = vdupq_n_s16(1);
+  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+  int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+  int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+  int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+  // adjust for dc
+  v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+  v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+  v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+  // process dc and the first seven ac coeffs
+  {
+    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    const int16x8_t v_abs = vabsq_s16(v_coeff);
+    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+    const int32x4_t v_tmp_lo =
+        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+    const int32x4_t v_tmp_hi =
+        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+    const int16x8_t v_tmp2 =
+        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+    const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+    store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
+    v_round = vmovq_n_s16(round_ptr[1]);
+    v_quant = vmovq_n_s16(quant_ptr[1]);
+    v_dequant = vmovq_n_s16(dequant_ptr[1]);
   }
+  // now process the rest of the ac coeffs
+  for (i = 8; i < count; i += 8) {
+    const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+    const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    const int16x8_t v_abs = vabsq_s16(v_coeff);
+    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+    const int32x4_t v_tmp_lo =
+        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+    const int32x4_t v_tmp_hi =
+        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+    const int16x8_t v_tmp2 =
+        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+    const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+    store_s16q_to_tran_low(&qcoeff_ptr[i], v_qcoeff);
+    store_s16q_to_tran_low(&dqcoeff_ptr[i], v_dqcoeff);
+  }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+  }
+#endif  // __aarch64__
+}
+
+static INLINE void calculate_dqcoeff_lp_and_store(const int16x8_t qcoeff,
+                                                  const int16x8_t dequant,
+                                                  int16_t *dqcoeff) {  // dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[i]) for 8 lanes
+  const int32x4_t dqcoeff_0 =
+      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));  // low 4 lanes, widened 32-bit products
+  const int32x4_t dqcoeff_1 =
+      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));  // high 4 lanes
+  // vmovn_s32 keeps only the low 16 bits of each product before the store.
+  vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+}
+
+void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t count,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan) {
+  // Low-precision (int16 coeff) fast quantizer, 8 coefficients per iteration.
+  // NOTE(review): appears to assume count is a positive multiple of 8 -- confirm.
+  const int16x8_t v_zero = vdupq_n_s16(0);
+  const int16x8_t v_one = vdupq_n_s16(1);
+  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);  // per-lane max of (scan index + 1); -1 == none seen
+  int16x8_t v_round = vmovq_n_s16(round_ptr[1]);  // index [1] holds the AC parameters
+  int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+  int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+
+  // adjust for dc
+  v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+  v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+  v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+  // process dc and the first seven ac coeffs
+  {
+    const int16x8_t v_iscan = vld1q_s16(&scan[0]);
+    const int16x8_t v_coeff = vld1q_s16(coeff_ptr);
+    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1 per lane
+    const int16x8_t v_abs = vabsq_s16(v_coeff);
+    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);  // saturating add of the rounding term
+    const int32x4_t v_tmp_lo =
+        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+    const int32x4_t v_tmp_hi =
+        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+    const int16x8_t v_tmp2 =
+        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));  // >> 16: Q16 quantizer scale
+    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);  // lanes that quantized to zero
+    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);  // scan index + 1 where nonzero, else 0
+    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);  // (q ^ s) - s restores the sign
+    calculate_dqcoeff_lp_and_store(v_qcoeff, v_dequant, dqcoeff_ptr);
+    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+    vst1q_s16(qcoeff_ptr, v_qcoeff);
+    v_round = vmovq_n_s16(round_ptr[1]);  // all lanes back to AC parameters for the loop below
+    v_quant = vmovq_n_s16(quant_ptr[1]);
+    v_dequant = vmovq_n_s16(dequant_ptr[1]);
+  }
+  // now process the rest of the ac coeffs
+  for (int i = 8; i < count; i += 8) {
+    const int16x8_t v_iscan = vld1q_s16(&scan[i]);
+    const int16x8_t v_coeff = vld1q_s16(coeff_ptr + i);
+    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1 per lane
+    const int16x8_t v_abs = vabsq_s16(v_coeff);
+    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+    const int32x4_t v_tmp_lo =
+        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+    const int32x4_t v_tmp_hi =
+        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+    const int16x8_t v_tmp2 =
+        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));  // >> 16: Q16 quantizer scale
+    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);  // restore sign
+    calculate_dqcoeff_lp_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i);
+    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+    vst1q_s16(qcoeff_ptr + i, v_qcoeff);
+  }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);  // single-instruction horizontal max
+#else  // AArch32: log2-step horizontal max across the 8 lanes
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+    // Lane 0 now holds the maximum across all 8 lanes.
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+  }
+#endif  // __aarch64__
 }
diff --git a/libaom/av1/encoder/av1_fwd_txfm1d.c b/libaom/av1/encoder/av1_fwd_txfm1d.c
index 98505e0..6601c19 100644
--- a/libaom/av1/encoder/av1_fwd_txfm1d.c
+++ b/libaom/av1/encoder/av1_fwd_txfm1d.c
@@ -13,8 +13,8 @@
 #include "av1/encoder/av1_fwd_txfm1d.h"
 #include "av1/common/av1_txfm.h"
 
-void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                   const int8_t *stage_range) {
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
   const int32_t size = 4;
   const int32_t *cospi;
 
@@ -56,8 +56,8 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                   const int8_t *stage_range) {
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
   const int32_t size = 8;
   const int32_t *cospi;
 
@@ -141,8 +141,8 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   const int32_t size = 16;
   const int32_t *cospi;
 
@@ -312,8 +312,8 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   const int32_t size = 32;
   const int32_t *cospi;
 
@@ -673,8 +673,8 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   int bit = cos_bit;
   const int32_t *sinpi = sinpi_arr(bit);
   int32_t x0, x1, x2, x3;
@@ -732,8 +732,8 @@
   av1_range_check_buf(6, input, output, 4, stage_range[6]);
 }
 
-void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   const int32_t size = 8;
   const int32_t *cospi;
 
@@ -846,8 +846,8 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                     const int8_t *stage_range) {
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range) {
   const int32_t size = 16;
   const int32_t *cospi;
 
@@ -1093,8 +1093,8 @@
   av1_range_check_buf(0, input, output, 32, stage_range[0]);
 }
 
-void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
+void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
   const int32_t size = 64;
   const int32_t *cospi;
 
diff --git a/libaom/av1/encoder/av1_fwd_txfm1d.h b/libaom/av1/encoder/av1_fwd_txfm1d.h
index 9dcf165..9ef54fe 100644
--- a/libaom/av1/encoder/av1_fwd_txfm1d.h
+++ b/libaom/av1/encoder/av1_fwd_txfm1d.h
@@ -18,22 +18,22 @@
 extern "C" {
 #endif
 
-void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                   const int8_t *stage_range);
-void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                   const int8_t *stage_range);
-void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                    const int8_t *stage_range);
-void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
-                     const int8_t *stage_range);
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range);
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range);
+void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range);
 void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range);
 void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
diff --git a/libaom/av1/encoder/av1_fwd_txfm1d_cfg.h b/libaom/av1/encoder/av1_fwd_txfm1d_cfg.h
index 98b6530..2777cc2 100644
--- a/libaom/av1/encoder/av1_fwd_txfm1d_cfg.h
+++ b/libaom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -13,7 +13,7 @@
 #define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
 #include "av1/common/enums.h"
 #include "av1/encoder/av1_fwd_txfm1d.h"
-extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL];
-extern const int8_t fwd_cos_bit_col[5][5];
-extern const int8_t fwd_cos_bit_row[5][5];
+extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL];
+extern const int8_t av1_fwd_cos_bit_col[5][5];
+extern const int8_t av1_fwd_cos_bit_row[5][5];
 #endif  // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
diff --git a/libaom/av1/encoder/av1_fwd_txfm2d.c b/libaom/av1/encoder/av1_fwd_txfm2d.c
index d282b5f..bcb829d 100644
--- a/libaom/av1/encoder/av1_fwd_txfm2d.c
+++ b/libaom/av1/encoder/av1_fwd_txfm2d.c
@@ -22,14 +22,14 @@
 
 static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
-    case TXFM_TYPE_DCT4: return av1_fdct4_new;
-    case TXFM_TYPE_DCT8: return av1_fdct8_new;
-    case TXFM_TYPE_DCT16: return av1_fdct16_new;
-    case TXFM_TYPE_DCT32: return av1_fdct32_new;
-    case TXFM_TYPE_DCT64: return av1_fdct64_new;
-    case TXFM_TYPE_ADST4: return av1_fadst4_new;
-    case TXFM_TYPE_ADST8: return av1_fadst8_new;
-    case TXFM_TYPE_ADST16: return av1_fadst16_new;
+    case TXFM_TYPE_DCT4: return av1_fdct4;
+    case TXFM_TYPE_DCT8: return av1_fdct8;
+    case TXFM_TYPE_DCT16: return av1_fdct16;
+    case TXFM_TYPE_DCT32: return av1_fdct32;
+    case TXFM_TYPE_DCT64: return av1_fdct64;
+    case TXFM_TYPE_ADST4: return av1_fadst4;
+    case TXFM_TYPE_ADST8: return av1_fadst8;
+    case TXFM_TYPE_ADST16: return av1_fadst16;
     case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
     case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
@@ -327,7 +327,7 @@
 static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 };
 static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 };
 
-const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
+const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = {
   fwd_shift_4x4,   fwd_shift_8x8,   fwd_shift_16x16, fwd_shift_32x32,
   fwd_shift_64x64, fwd_shift_4x8,   fwd_shift_8x4,   fwd_shift_8x16,
   fwd_shift_16x8,  fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
@@ -335,23 +335,23 @@
   fwd_shift_32x8,  fwd_shift_16x64, fwd_shift_64x16,
 };
 
-const int8_t fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/]
-                            [MAX_TXWH_IDX /*txh_idx*/] = {
-                              { 13, 13, 13, 0, 0 },
-                              { 13, 13, 13, 12, 0 },
-                              { 13, 13, 13, 12, 13 },
-                              { 0, 13, 13, 12, 13 },
-                              { 0, 0, 13, 12, 13 }
-                            };
+const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/]
+                                [MAX_TXWH_IDX /*txh_idx*/] = {
+                                  { 13, 13, 13, 0, 0 },
+                                  { 13, 13, 13, 12, 0 },
+                                  { 13, 13, 13, 12, 13 },
+                                  { 0, 13, 13, 12, 13 },
+                                  { 0, 0, 13, 12, 13 }
+                                };
 
-const int8_t fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/]
-                            [MAX_TXWH_IDX /*txh_idx*/] = {
-                              { 13, 13, 12, 0, 0 },
-                              { 13, 13, 13, 12, 0 },
-                              { 13, 13, 12, 13, 12 },
-                              { 0, 12, 13, 12, 11 },
-                              { 0, 0, 12, 11, 10 }
-                            };
+const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/]
+                                [MAX_TXWH_IDX /*txh_idx*/] = {
+                                  { 13, 13, 12, 0, 0 },
+                                  { 13, 13, 13, 12, 0 },
+                                  { 13, 13, 12, 13, 12 },
+                                  { 0, 12, 13, 12, 11 },
+                                  { 0, 0, 12, 11, 10 }
+                                };
 
 static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 };
 static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 };
@@ -379,7 +379,7 @@
                                                                 9 } };
 #endif
 
-const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
+static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
   fdct4_range_mult2,  fdct8_range_mult2,   fdct16_range_mult2,
   fdct32_range_mult2, fdct64_range_mult2,  fadst4_range_mult2,
   fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2,
@@ -416,11 +416,11 @@
   set_flip_cfg(tx_type, cfg);
   const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
   const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
-  const int txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
-  const int txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0];
-  cfg->shift = fwd_txfm_shift_ls[tx_size];
-  cfg->cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  cfg->cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  cfg->shift = av1_fwd_txfm_shift_ls[tx_size];
+  cfg->cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  cfg->cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
   cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
   cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
diff --git a/libaom/av1/encoder/av1_multi_thread.c b/libaom/av1/encoder/av1_multi_thread.c
index 1260c7a..d170b0c 100644
--- a/libaom/av1/encoder/av1_multi_thread.c
+++ b/libaom/av1/encoder/av1_multi_thread.c
@@ -19,8 +19,8 @@
   struct AV1Common *cm = &cpi->common;
   MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
   int tile_row, tile_col;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
 
   multi_thread_ctxt->allocated_tile_cols = tile_cols;
   multi_thread_ctxt->allocated_tile_rows = tile_rows;
diff --git a/libaom/av1/encoder/av1_quantize.c b/libaom/av1/encoder/av1_quantize.c
index ff1342c..569784a 100644
--- a/libaom/av1/encoder/av1_quantize.c
+++ b/libaom/av1/encoder/av1_quantize.c
@@ -57,7 +57,7 @@
       const int rc = scan[i];
       const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
       const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
+      const int coeff_sign = AOMSIGN(coeff);
       int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       int tmp32 = 0;
       if ((abs_coeff << (1 + log_scale)) >= thresh) {
@@ -84,7 +84,7 @@
       const int dequant =
           (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
           AOM_QM_BITS;
-      const int coeff_sign = (coeff >> 31);
+      const int coeff_sign = AOMSIGN(coeff);
       int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       int tmp32 = 0;
       if (abs_coeff * wt >=
@@ -104,6 +104,7 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void highbd_quantize_fp_helper_c(
     const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -131,7 +132,7 @@
       const int dequant =
           (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
           AOM_QM_BITS;
-      const int coeff_sign = (coeff >> 31);
+      const int coeff_sign = AOMSIGN(coeff);
       const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       int abs_qcoeff = 0;
       if (abs_coeff * wt >=
@@ -158,7 +159,7 @@
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
       const int rc01 = (rc != 0);
-      const int coeff_sign = (coeff >> 31);
+      const int coeff_sign = AOMSIGN(coeff);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int log_scaled_round = log_scaled_round_arr[rc01];
       if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) {
@@ -178,6 +179,7 @@
   }
   *eob_ptr = eob + 1;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -190,6 +192,35 @@
                        eob_ptr, scan, iscan, NULL, NULL, 0);
 }
 
+void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t *round_ptr, const int16_t *quant_ptr,
+                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                       const int16_t *scan) {
+  int eob = -1;  // scan index of the last nonzero quantized coefficient
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // zero-initialize outputs
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // C reference for the low-precision (int16 coeff) fast quantizer:
+  // quantize every coefficient in scan order and track the end-of-block.
+  for (int i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];  // raster position of the i-th scanned coeff
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = AOMSIGN(coeff);  // 0 or -1
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+
+    int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);  // saturate, matching the SIMD paths
+    tmp = (tmp * quant_ptr[rc != 0]) >> 16;  // Q16 quantizer scale; [0]=DC, [1]=AC
+
+    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;  // restore sign
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+    if (tmp) eob = i;
+  }
+  *eob_ptr = eob + 1;  // count of coeffs up to and including the last nonzero
+}
+
 void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const int16_t *zbin_ptr, const int16_t *round_ptr,
                              const int16_t *quant_ptr,
@@ -228,19 +259,10 @@
   } else {
     switch (qparam->log_scale) {
       case 0:
-        if (n_coeffs < 16) {
-          // TODO(jingning): Need SIMD implementation for smaller block size
-          // quantization.
-          quantize_fp_helper_c(
-              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
-              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
-              p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, NULL, NULL, 0);
-        } else {
-          av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
-                          p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                          dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                          sc->iscan);
-        }
+        av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                        p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                        dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                        sc->iscan);
         break;
       case 1:
         av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
@@ -269,7 +291,7 @@
     // TODO(sarahparker) These quantize_b optimizations need SIMD
     // implementations
     if (qm_ptr != NULL && iqm_ptr != NULL) {
-      quantize_b_adaptive_helper_c(
+      aom_quantize_b_adaptive_helper_c(
           coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
           p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
           sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
@@ -288,7 +310,7 @@
               eob_ptr, sc->scan, sc->iscan);
           break;
         case 2:
-          aom_quantize_b_64x64_adaptive_c(
+          aom_quantize_b_64x64_adaptive(
               coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
               p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
               eob_ptr, sc->scan, sc->iscan);
@@ -298,10 +320,10 @@
     }
   } else {
     if (qm_ptr != NULL && iqm_ptr != NULL) {
-      quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                          p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                          dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                          sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+      aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                              p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                              dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                              sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
     } else {
       switch (qparam->log_scale) {
         case 0:
@@ -336,7 +358,7 @@
                         const qm_val_t *iqm_ptr, const int log_scale) {
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
+  const int coeff_sign = AOMSIGN(coeff);
   const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
   int64_t tmp;
   int eob = -1;
@@ -376,6 +398,7 @@
               eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
@@ -390,15 +413,6 @@
         p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
         sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
   } else {
-    if (n_coeffs < 16) {
-      // TODO(jingning): Need SIMD implementation for smaller block size
-      // quantization.
-      av1_highbd_quantize_fp_c(
-          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
-          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
-          sc->scan, sc->iscan, qparam->log_scale);
-      return;
-    }
     av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
                            dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
@@ -416,35 +430,26 @@
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
   if (qparam->use_quant_b_adapt) {
     if (qm_ptr != NULL && iqm_ptr != NULL) {
-      highbd_quantize_b_adaptive_helper_c(
+      aom_highbd_quantize_b_adaptive_helper_c(
           coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
           p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
           sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
     } else {
       switch (qparam->log_scale) {
         case 0:
-          if (LIKELY(n_coeffs >= 8)) {
-            aom_highbd_quantize_b_adaptive_c(
-                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-                eob_ptr, sc->scan, sc->iscan);
-          } else {
-            // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
-            // quantization
-            aom_highbd_quantize_b_adaptive_c(
-                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-                eob_ptr, sc->scan, sc->iscan);
-          }
+          aom_highbd_quantize_b_adaptive(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
           break;
         case 1:
-          aom_highbd_quantize_b_32x32_adaptive_c(
+          aom_highbd_quantize_b_32x32_adaptive(
               coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
               p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
               eob_ptr, sc->scan, sc->iscan);
           break;
         case 2:
-          aom_highbd_quantize_b_64x64_adaptive_c(
+          aom_highbd_quantize_b_64x64_adaptive(
               coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
               p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
               eob_ptr, sc->scan, sc->iscan);
@@ -454,26 +459,17 @@
     }
   } else {
     if (qm_ptr != NULL && iqm_ptr != NULL) {
-      highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                                 p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                                 dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                                 sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+      aom_highbd_quantize_b_helper_c(
+          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+          sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
     } else {
       switch (qparam->log_scale) {
         case 0:
-          if (LIKELY(n_coeffs >= 8)) {
-            aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX,
-                                  p->round_QTX, p->quant_QTX,
-                                  p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
-                                  p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
-          } else {
-            // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
-            // quantization
-            aom_highbd_quantize_b_c(
-                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-                eob_ptr, sc->scan, sc->iscan);
-          }
+          aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                                p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                                dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                                sc->iscan);
           break;
         case 1:
           aom_highbd_quantize_b_32x32(
@@ -507,7 +503,7 @@
     const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS);
     const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS);
     const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
+    const int coeff_sign = AOMSIGN(coeff);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
     const int64_t tmpw = tmp * wt;
@@ -555,6 +551,7 @@
                               dequant_ptr, eob_ptr, scan, iscan, NULL, NULL,
                               log_scale);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
   uint32_t t;
@@ -567,7 +564,7 @@
 }
 
 static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
-  const int quant = av1_dc_quant_Q3(q, 0, bit_depth);
+  const int quant = av1_dc_quant_QTX(q, 0, bit_depth);
   switch (bit_depth) {
     case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
     case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
@@ -582,7 +579,7 @@
                          int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
                          int v_ac_delta_q, QUANTS *const quants,
                          Dequants *const deq) {
-  int i, q, quant_Q3, quant_QTX;
+  int i, q, quant_QTX;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
     const int qzbin_factor = get_qzbin_factor(q, bit_depth);
@@ -590,9 +587,6 @@
 
     for (i = 0; i < 2; ++i) {
       int qrounding_factor_fp = 64;
-      // y quantizer setup with original coeff shift of Q3
-      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, y_dc_delta_q, bit_depth)
-                        : av1_ac_quant_Q3(q, 0, bit_depth);
       // y quantizer with TX scale
       quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
                          : av1_ac_quant_QTX(q, 0, bit_depth);
@@ -603,11 +597,7 @@
       quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
       quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
       deq->y_dequant_QTX[q][i] = quant_QTX;
-      deq->y_dequant_Q3[q][i] = quant_Q3;
 
-      // u quantizer setup with original coeff shift of Q3
-      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, u_dc_delta_q, bit_depth)
-                        : av1_ac_quant_Q3(q, u_ac_delta_q, bit_depth);
       // u quantizer with TX scale
       quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth)
                          : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth);
@@ -618,11 +608,7 @@
       quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
       quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
       deq->u_dequant_QTX[q][i] = quant_QTX;
-      deq->u_dequant_Q3[q][i] = quant_Q3;
 
-      // v quantizer setup with original coeff shift of Q3
-      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, v_dc_delta_q, bit_depth)
-                        : av1_ac_quant_Q3(q, v_ac_delta_q, bit_depth);
       // v quantizer with TX scale
       quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
                          : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
@@ -633,7 +619,6 @@
       quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
       quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
       deq->v_dequant_QTX[q][i] = quant_QTX;
-      deq->v_dequant_Q3[q][i] = quant_Q3;
     }
 
     for (i = 2; i < 8; i++) {  // 8: SIMD width
@@ -644,7 +629,6 @@
       quants->y_zbin[q][i] = quants->y_zbin[q][1];
       quants->y_round[q][i] = quants->y_round[q][1];
       deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
-      deq->y_dequant_Q3[q][i] = deq->y_dequant_Q3[q][1];
 
       quants->u_quant[q][i] = quants->u_quant[q][1];
       quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
@@ -653,7 +637,6 @@
       quants->u_zbin[q][i] = quants->u_zbin[q][1];
       quants->u_round[q][i] = quants->u_round[q][1];
       deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
-      deq->u_dequant_Q3[q][i] = deq->u_dequant_Q3[q][1];
       quants->v_quant[q][i] = quants->u_quant[q][1];
       quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
       quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
@@ -661,88 +644,88 @@
       quants->v_zbin[q][i] = quants->v_zbin[q][1];
       quants->v_round[q][i] = quants->v_round[q][1];
       deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
-      deq->v_dequant_Q3[q][i] = deq->v_dequant_Q3[q][1];
     }
   }
 }
 
-void av1_init_quantizer(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  QUANTS *const quants = &cpi->quants;
-  Dequants *const dequants = &cpi->dequants;
-  av1_build_quantizer(cm->seq_params.bit_depth, cm->y_dc_delta_q,
-                      cm->u_dc_delta_q, cm->u_ac_delta_q, cm->v_dc_delta_q,
-                      cm->v_ac_delta_q, quants, dequants);
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+                        const CommonQuantParams *quant_params,
+                        aom_bit_depth_t bit_depth) {
+  QUANTS *const quants = &enc_quant_dequant_params->quants;
+  Dequants *const dequants = &enc_quant_dequant_params->dequants;
+  av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q,
+                      quant_params->u_dc_delta_q, quant_params->u_ac_delta_q,
+                      quant_params->v_dc_delta_q, quant_params->v_ac_delta_q,
+                      quants, dequants);
 }
 
 void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
                                int segment_id) {
   const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *const quant_params = &cm->quant_params;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const QUANTS *const quants = &cpi->quants;
+  const QUANTS *const quants = &cpi->enc_quant_dequant_params.quants;
+  const Dequants *const dequants = &cpi->enc_quant_dequant_params.dequants;
 
-  int current_qindex = AOMMAX(
-      0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q
-                                      ? cm->base_qindex + xd->delta_qindex
-                                      : cm->base_qindex));
+  const int current_qindex =
+      AOMMAX(0, AOMMIN(QINDEX_RANGE - 1,
+                       cm->delta_q_info.delta_q_present_flag
+                           ? quant_params->base_qindex + xd->delta_qindex
+                           : quant_params->base_qindex));
   const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
-  const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
-  int qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
-                    ? NUM_QM_LEVELS - 1
-                    : cm->qm_y;
+  const int rdmult =
+      av1_compute_rd_mult(cpi, qindex + quant_params->y_dc_delta_q);
+  const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
 
   // Y
+  const int qmlevel_y =
+      use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
   x->plane[0].quant_QTX = quants->y_quant[qindex];
   x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
   x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
   x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
   x->plane[0].zbin_QTX = quants->y_zbin[qindex];
   x->plane[0].round_QTX = quants->y_round[qindex];
-  x->plane[0].dequant_QTX = cpi->dequants.y_dequant_QTX[qindex];
-  memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0],
-         sizeof(cm->gqmatrix[qmlevel][0]));
-  memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0],
-         sizeof(cm->giqmatrix[qmlevel][0]));
-  xd->plane[0].dequant_Q3 = cpi->dequants.y_dequant_Q3[qindex];
+  x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex];
+  memcpy(&xd->plane[0].seg_qmatrix[segment_id],
+         quant_params->gqmatrix[qmlevel_y][0],
+         sizeof(quant_params->gqmatrix[qmlevel_y][0]));
+  memcpy(&xd->plane[0].seg_iqmatrix[segment_id],
+         quant_params->giqmatrix[qmlevel_y][0],
+         sizeof(quant_params->giqmatrix[qmlevel_y][0]));
 
   // U
-  qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
-                ? NUM_QM_LEVELS - 1
-                : cm->qm_u;
-  {
-    x->plane[1].quant_QTX = quants->u_quant[qindex];
-    x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
-    x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
-    x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
-    x->plane[1].zbin_QTX = quants->u_zbin[qindex];
-    x->plane[1].round_QTX = quants->u_round[qindex];
-    x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex];
-    memcpy(&xd->plane[1].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
-           sizeof(cm->gqmatrix[qmlevel][1]));
-    memcpy(&xd->plane[1].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1],
-           sizeof(cm->giqmatrix[qmlevel][1]));
-    x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex];
-    xd->plane[1].dequant_Q3 = cpi->dequants.u_dequant_Q3[qindex];
-  }
+  const int qmlevel_u =
+      use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
+  x->plane[1].quant_QTX = quants->u_quant[qindex];
+  x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
+  x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
+  x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
+  x->plane[1].zbin_QTX = quants->u_zbin[qindex];
+  x->plane[1].round_QTX = quants->u_round[qindex];
+  x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex];
+  memcpy(&xd->plane[1].seg_qmatrix[segment_id],
+         quant_params->gqmatrix[qmlevel_u][1],
+         sizeof(quant_params->gqmatrix[qmlevel_u][1]));
+  memcpy(&xd->plane[1].seg_iqmatrix[segment_id],
+         quant_params->giqmatrix[qmlevel_u][1],
+         sizeof(quant_params->giqmatrix[qmlevel_u][1]));
   // V
-  qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
-                ? NUM_QM_LEVELS - 1
-                : cm->qm_v;
-  {
-    x->plane[2].quant_QTX = quants->v_quant[qindex];
-    x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
-    x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
-    x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
-    x->plane[2].zbin_QTX = quants->v_zbin[qindex];
-    x->plane[2].round_QTX = quants->v_round[qindex];
-    x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex];
-    memcpy(&xd->plane[2].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][2],
-           sizeof(cm->gqmatrix[qmlevel][2]));
-    memcpy(&xd->plane[2].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][2],
-           sizeof(cm->giqmatrix[qmlevel][2]));
-    x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex];
-    xd->plane[2].dequant_Q3 = cpi->dequants.v_dequant_Q3[qindex];
-  }
+  const int qmlevel_v =
+      use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
+  x->plane[2].quant_QTX = quants->v_quant[qindex];
+  x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
+  x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
+  x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
+  x->plane[2].zbin_QTX = quants->v_zbin[qindex];
+  x->plane[2].round_QTX = quants->v_round[qindex];
+  x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex];
+  memcpy(&xd->plane[2].seg_qmatrix[segment_id],
+         quant_params->gqmatrix[qmlevel_v][2],
+         sizeof(quant_params->gqmatrix[qmlevel_v][2]));
+  memcpy(&xd->plane[2].seg_iqmatrix[segment_id],
+         quant_params->giqmatrix[qmlevel_v][2],
+         sizeof(quant_params->giqmatrix[qmlevel_v][2]));
   x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->qindex = qindex;
 
@@ -757,24 +740,29 @@
   av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
 }
 
-void av1_set_quantizer(AV1_COMMON *cm, int q) {
+void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel,
+                       int q) {
   // quantizer has to be reinitialized with av1_init_quantizer() if any
   // delta_q changes.
-  cm->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);
-  cm->y_dc_delta_q = 0;
-  cm->u_dc_delta_q = 0;
-  cm->u_ac_delta_q = 0;
-  cm->v_dc_delta_q = 0;
-  cm->v_ac_delta_q = 0;
-  cm->qm_y = aom_get_qmlevel(cm->base_qindex, cm->min_qmlevel, cm->max_qmlevel);
-  cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q,
-                             cm->min_qmlevel, cm->max_qmlevel);
+  CommonQuantParams *quant_params = &cm->quant_params;
+  quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);
+  quant_params->y_dc_delta_q = 0;
+  quant_params->u_dc_delta_q = 0;
+  quant_params->u_ac_delta_q = 0;
+  quant_params->v_dc_delta_q = 0;
+  quant_params->v_ac_delta_q = 0;
+  quant_params->qmatrix_level_y =
+      aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel);
+  quant_params->qmatrix_level_u =
+      aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q,
+                      min_qmlevel, max_qmlevel);
 
   if (!cm->seq_params.separate_uv_delta_q)
-    cm->qm_v = cm->qm_u;
+    quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
   else
-    cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q,
-                               cm->min_qmlevel, cm->max_qmlevel);
+    quant_params->qmatrix_level_v =
+        aom_get_qmlevel(quant_params->base_qindex + quant_params->v_ac_delta_q,
+                        min_qmlevel, max_qmlevel);
 }
 
 // Table that converts 0-63 Q-range values passed in outside to the Qindex
diff --git a/libaom/av1/encoder/av1_quantize.h b/libaom/av1/encoder/av1_quantize.h
index 6419265..40fb4be 100644
--- a/libaom/av1/encoder/av1_quantize.h
+++ b/libaom/av1/encoder/av1_quantize.h
@@ -31,6 +31,8 @@
   const qm_val_t *qmatrix;
   const qm_val_t *iqmatrix;
   int use_quant_b_adapt;
+  int use_optimize_b;
+  int xform_quant_idx;
 } QUANT_PARAM;
 
 typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -79,12 +81,16 @@
   DECLARE_ALIGNED(16, int16_t,
                   u_dequant_QTX[QINDEX_RANGE][8]);  // 8: SIMD width
   DECLARE_ALIGNED(16, int16_t,
-                  v_dequant_QTX[QINDEX_RANGE][8]);              // 8: SIMD width
-  DECLARE_ALIGNED(16, int16_t, y_dequant_Q3[QINDEX_RANGE][8]);  // 8: SIMD width
-  DECLARE_ALIGNED(16, int16_t, u_dequant_Q3[QINDEX_RANGE][8]);  // 8: SIMD width
-  DECLARE_ALIGNED(16, int16_t, v_dequant_Q3[QINDEX_RANGE][8]);  // 8: SIMD width
+                  v_dequant_QTX[QINDEX_RANGE][8]);  // 8: SIMD width
 } Dequants;
 
+typedef struct {
+  // Quantization parameters for internal quantizer setup.
+  QUANTS quants;
+  // Dequantization parameters for internal quantizer setup.
+  Dequants dequants;
+} EncQuantDequantParams;
+
 struct AV1_COMP;
 struct AV1Common;
 
@@ -98,9 +104,12 @@
                          int v_ac_delta_q, QUANTS *const quants,
                          Dequants *const deq);
 
-void av1_init_quantizer(struct AV1_COMP *cpi);
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+                        const CommonQuantParams *quant_params,
+                        aom_bit_depth_t bit_depth);
 
-void av1_set_quantizer(struct AV1Common *cm, int q);
+void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel,
+                       int max_qmlevel, int q);
 
 int av1_quantizer_to_qindex(int quantizer);
 
@@ -124,6 +133,7 @@
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                             const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
@@ -144,6 +154,7 @@
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                    const SCAN_ORDER *sc,
                                    const QUANT_PARAM *qparam);
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/bitstream.c b/libaom/av1/encoder/bitstream.c
index cbac2b2..daa8ce1 100644
--- a/libaom/av1/encoder/bitstream.c
+++ b/libaom/av1/encoder/bitstream.c
@@ -60,25 +60,25 @@
   }
 }
 
-static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
-                                             MACROBLOCKD *xd,
-                                             const RestorationUnitInfo *rui,
-                                             aom_writer *const w, int plane,
-                                             FRAME_COUNTS *counts);
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
+    aom_writer *const w, int plane, FRAME_COUNTS *counts);
 
-static void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
-                                  const MB_MODE_INFO *mi,
-                                  const MB_MODE_INFO *above_mi,
-                                  const MB_MODE_INFO *left_mi,
-                                  PREDICTION_MODE mode, aom_writer *w) {
+static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+                                             const MB_MODE_INFO *mi,
+                                             const MB_MODE_INFO *above_mi,
+                                             const MB_MODE_INFO *left_mi,
+                                             PREDICTION_MODE mode,
+                                             aom_writer *w) {
   assert(!is_intrabc_block(mi));
   (void)mi;
   aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
                    INTRA_MODES);
 }
 
-static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
-                             FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) {
+static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+                                        FRAME_CONTEXT *ec_ctx,
+                                        const int16_t mode_ctx) {
   const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
 
   aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
@@ -95,19 +95,17 @@
   }
 }
 
-static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
-                          const MB_MODE_INFO_EXT *mbmi_ext, aom_writer *w) {
-  uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-
+static AOM_INLINE void write_drl_idx(
+    FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
   assert(mbmi->ref_mv_idx < 3);
 
   const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
   if (new_mv) {
     int idx;
     for (idx = 0; idx < 2; ++idx) {
-      if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-        uint8_t drl_ctx =
-            av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+      if (mbmi_ext_frame->ref_mv_count > idx + 1) {
+        uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx);
 
         aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx],
                          2);
@@ -121,9 +119,8 @@
     int idx;
     // TODO(jingning): Temporary solution to compensate the NEARESTMV offset.
     for (idx = 1; idx < 3; ++idx) {
-      if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-        uint8_t drl_ctx =
-            av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+      if (mbmi_ext_frame->ref_mv_count > idx + 1) {
+        uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx);
         aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1),
                          ec_ctx->drl_cdf[drl_ctx], 2);
         if (mbmi->ref_mv_idx == (idx - 1)) return;
@@ -133,18 +130,20 @@
   }
 }
 
-static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
-                                      PREDICTION_MODE mode,
-                                      const int16_t mode_ctx) {
+static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
+                                                 PREDICTION_MODE mode,
+                                                 const int16_t mode_ctx) {
   assert(is_inter_compound_mode(mode));
   aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode),
                    xd->tile_ctx->inter_compound_mode_cdf[mode_ctx],
                    INTER_COMPOUND_MODES);
 }
 
-static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi,
-                                TX_SIZE tx_size, int depth, int blk_row,
-                                int blk_col, aom_writer *w) {
+static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
+                                           const MB_MODE_INFO *mbmi,
+                                           TX_SIZE tx_size, int depth,
+                                           int blk_row, int blk_col,
+                                           aom_writer *w) {
   FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
   const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
   const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
@@ -193,7 +192,8 @@
   }
 }
 
-static void write_selected_tx_size(const MACROBLOCKD *xd, aom_writer *w) {
+static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd,
+                                              aom_writer *w) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->sb_type;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
@@ -251,8 +251,9 @@
   return skip_mode;
 }
 
-static void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                           int segment_id, aom_writer *w, const int is_inter) {
+static AOM_INLINE void write_is_inter(const AV1_COMMON *cm,
+                                      const MACROBLOCKD *xd, int segment_id,
+                                      aom_writer *w, const int is_inter) {
   if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
       assert(is_inter);
@@ -264,12 +265,13 @@
   }
 }
 
-static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                              const MB_MODE_INFO *mbmi, aom_writer *w) {
+static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                         const MB_MODE_INFO *mbmi,
+                                         aom_writer *w) {
   MOTION_MODE last_motion_mode_allowed =
-      cm->switchable_motion_mode
+      cm->features.switchable_motion_mode
           ? motion_mode_allowed(cm->global_motion, xd, mbmi,
-                                cm->allow_warped_motion)
+                                cm->features.allow_warped_motion)
           : SIMPLE_TRANSLATION;
   assert(mbmi->motion_mode <= last_motion_mode_allowed);
   switch (last_motion_mode_allowed) {
@@ -285,8 +287,8 @@
   }
 }
 
-static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex,
-                               aom_writer *w) {
+static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd,
+                                          int delta_qindex, aom_writer *w) {
   int sign = delta_qindex < 0;
   int abs = sign ? -delta_qindex : delta_qindex;
   int rem_bits, thr;
@@ -307,8 +309,9 @@
   }
 }
 
-static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                                int lf_id, int delta_lflevel, aom_writer *w) {
+static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
+                                           const MACROBLOCKD *xd, int lf_id,
+                                           int delta_lflevel, aom_writer *w) {
   int sign = delta_lflevel < 0;
   int abs = sign ? -delta_lflevel : delta_lflevel;
   int rem_bits, thr;
@@ -336,8 +339,8 @@
   }
 }
 
-static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
-                            int num) {
+static AOM_INLINE void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp,
+                                       int n, int num) {
   const TOKENEXTRA *p = *tp;
   write_uniform(w, n, p->token);  // The first color index.
   ++p;
@@ -349,13 +352,11 @@
   *tp = p;
 }
 
-static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x,
-                            const TOKENEXTRA **tp,
-                            const TOKENEXTRA *const tok_end, MACROBLOCKD *xd,
-                            MB_MODE_INFO *mbmi, int plane,
-                            BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
-                            int block, int blk_row, int blk_col,
-                            TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+static AOM_INLINE void pack_txb_tokens(
+    aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TOKENEXTRA **tp,
+    const TOKENEXTRA *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+    int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block,
+    int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) {
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
 
@@ -369,20 +370,7 @@
                                                          blk_col)];
 
   if (tx_size == plane_tx_size || plane) {
-    const int txb_offset =
-        x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-    tran_low_t *tcoeff_txb =
-        x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
-    uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
-    uint8_t *txb_skip_ctx_txb =
-        x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
-    int *dc_sign_ctx_txb =
-        x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
-    tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
-    const uint16_t eob = eob_txb[block];
-    TXB_CTX txb_ctx = { txb_skip_ctx_txb[block], dc_sign_ctx_txb[block] };
-    av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff,
-                         eob, &txb_ctx);
+    av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size);
 #if CONFIG_RD_DEBUG
     TOKEN_STATS tmp_token_stats;
     init_token_stats(&tmp_token_stats);
@@ -411,20 +399,20 @@
   }
 }
 
-static INLINE void set_spatial_segment_id(const AV1_COMMON *const cm,
-                                          uint8_t *segment_ids,
-                                          BLOCK_SIZE bsize, int mi_row,
-                                          int mi_col, int segment_id) {
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+static INLINE void set_spatial_segment_id(
+    const CommonModeInfoParams *const mi_params, uint8_t *segment_ids,
+    BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) {
+  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
-  const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
-  const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
-  int x, y;
+  const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
 
-  for (y = 0; y < ymis; ++y)
-    for (x = 0; x < xmis; ++x)
-      segment_ids[mi_offset + y * cm->mi_cols + x] = segment_id;
+  for (int y = 0; y < ymis; ++y) {
+    for (int x = 0; x < xmis; ++x) {
+      segment_ids[mi_offset + y * mi_params->mi_cols + x] = segment_id;
+    }
+  }
 }
 
 int av1_neg_interleave(int x, int ref, int max) {
@@ -451,27 +439,28 @@
   }
 }
 
-static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi,
-                             aom_writer *w, const struct segmentation *seg,
-                             struct segmentation_probs *segp, int mi_row,
-                             int mi_col, int skip) {
+static AOM_INLINE void write_segment_id(
+    AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, aom_writer *w,
+    const struct segmentation *seg, struct segmentation_probs *segp, int skip) {
   if (!seg->enabled || !seg->update_map) return;
 
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int cdf_num;
-  const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num);
+  const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
 
   if (skip) {
     // Still need to transmit tx size for intra blocks even if skip is
     // true. Changing segment_id may make the tx size become invalid, e.g
     // changing from lossless to lossy.
-    assert(is_inter_block(mbmi) || !cpi->has_lossless_segment);
+    assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment);
 
-    set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row,
-                           mi_col, pred);
-    set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row,
-                           mi_col, pred);
+    set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
+                           mbmi->sb_type, mi_row, mi_col, pred);
+    set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->sb_type,
+                           mi_row, mi_col, pred);
     /* mbmi is read only but we need to update segment_id */
     ((MB_MODE_INFO *)mbmi)->segment_id = pred;
     return;
@@ -481,16 +470,16 @@
       av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
   aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
   aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
-  set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row,
-                         mi_col, mbmi->segment_id);
+  set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->sb_type,
+                         mi_row, mi_col, mbmi->segment_id);
 }
 
 #define WRITE_REF_BIT(bname, pname) \
   aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2)
 
 // This function encodes the reference frame
-static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             aom_writer *w) {
+static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm,
+                                        const MACROBLOCKD *xd, aom_writer *w) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_compound = has_second_ref(mbmi);
   const int segment_id = mbmi->segment_id;
@@ -593,10 +582,9 @@
   }
 }
 
-static void write_filter_intra_mode_info(const AV1_COMMON *cm,
-                                         const MACROBLOCKD *xd,
-                                         const MB_MODE_INFO *const mbmi,
-                                         aom_writer *w) {
+static AOM_INLINE void write_filter_intra_mode_info(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+    aom_writer *w) {
   if (av1_filter_intra_allowed(cm, mbmi)) {
     aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra,
                      xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2);
@@ -609,25 +597,26 @@
   }
 }
 
-static void write_angle_delta(aom_writer *w, int angle_delta,
-                              aom_cdf_prob *cdf) {
+static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta,
+                                         aom_cdf_prob *cdf) {
   aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf,
                    2 * MAX_ANGLE_DELTA + 1);
 }
 
-static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
-                                   aom_writer *w) {
-  AV1_COMMON *const cm = &cpi->common;
+static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm,
+                                              const MACROBLOCKD *xd,
+                                              aom_writer *w) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   if (!av1_is_interp_needed(xd)) {
-    assert(mbmi->interp_filters ==
-           av1_broadcast_interp_filter(
-               av1_unswitchable_filter(cm->interp_filter)));
+    int_interpfilters filters = av1_broadcast_interp_filter(
+        av1_unswitchable_filter(cm->features.interp_filter));
+    assert(mbmi->interp_filters.as_int == filters.as_int);
+    (void)filters;
     return;
   }
-  if (cm->interp_filter == SWITCHABLE) {
+  if (cm->features.interp_filter == SWITCHABLE) {
     int dir;
     for (dir = 0; dir < 2; ++dir) {
       const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
@@ -644,9 +633,9 @@
 // Transmit color values with delta encoding. Write the first value as
 // literal, and the deltas between each value and the previous one. "min_val" is
 // the smallest possible value of the deltas.
-static void delta_encode_palette_colors(const int *colors, int num,
-                                        int bit_depth, int min_val,
-                                        aom_writer *w) {
+static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num,
+                                                   int bit_depth, int min_val,
+                                                   aom_writer *w) {
   if (num <= 0) return;
   assert(colors[0] < (1 << bit_depth));
   aom_write_literal(w, colors[0], bit_depth);
@@ -676,9 +665,9 @@
 // Transmit luma palette color values. First signal if each color in the color
 // cache is used. Those colors that are not in the cache are transmitted with
 // delta encoding.
-static void write_palette_colors_y(const MACROBLOCKD *const xd,
-                                   const PALETTE_MODE_INFO *const pmi,
-                                   int bit_depth, aom_writer *w) {
+static AOM_INLINE void write_palette_colors_y(
+    const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi,
+    int bit_depth, aom_writer *w) {
   const int n = pmi->palette_size[0];
   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
   const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
@@ -700,9 +689,9 @@
 // Write chroma palette color values. U channel is handled similarly to the luma
 // channel. For v channel, either use delta encoding or transmit raw values
 // directly, whichever costs less.
-static void write_palette_colors_uv(const MACROBLOCKD *const xd,
-                                    const PALETTE_MODE_INFO *const pmi,
-                                    int bit_depth, aom_writer *w) {
+static AOM_INLINE void write_palette_colors_uv(
+    const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi,
+    int bit_depth, aom_writer *w) {
   const int n = pmi->palette_size[1];
   const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;
   const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
@@ -759,12 +748,13 @@
   }
 }
 
-static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                                    const MB_MODE_INFO *const mbmi, int mi_row,
-                                    int mi_col, aom_writer *w) {
+static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm,
+                                               const MACROBLOCKD *xd,
+                                               const MB_MODE_INFO *const mbmi,
+                                               aom_writer *w) {
   const int num_planes = av1_num_planes(cm);
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  assert(av1_allow_palette(cm->allow_screen_content_tools, bsize));
+  assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
   const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
 
@@ -783,9 +773,7 @@
   }
 
   const int uv_dc_pred =
-      num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
-      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y);
+      num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref;
   if (uv_dc_pred) {
     const int n = pmi->palette_size[1];
     const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
@@ -801,27 +789,21 @@
 }
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
-                       int blk_row, int blk_col, int plane, TX_SIZE tx_size,
-                       aom_writer *w) {
+                       TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) {
   MB_MODE_INFO *mbmi = xd->mi[0];
+  const FeatureFlags *const features = &cm->features;
   const int is_inter = is_inter_block(mbmi);
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-
-  // Only y plane's tx_type is transmitted
-  if (plane > 0) return;
-  PLANE_TYPE plane_type = get_plane_type(plane);
-  TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
-                                    cm->reduced_tx_set_used);
-
-  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
-  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
-      ((!cm->seg.enabled && cm->base_qindex > 0) ||
+  if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 &&
+      ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) ||
        (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
       !mbmi->skip &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    const TxSetType tx_set_type =
-        av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
-    const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+    const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+    const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+        tx_size, is_inter, features->reduced_tx_set_used);
+    const int eset =
+        get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used);
     // eset == 0 should correspond to a set with only DCT_DCT and there
     // is no need to send the tx_type
     assert(eset > 0);
@@ -845,22 +827,26 @@
   }
 }
 
-static void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
-                                     PREDICTION_MODE mode, aom_writer *w) {
+static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx,
+                                                BLOCK_SIZE bsize,
+                                                PREDICTION_MODE mode,
+                                                aom_writer *w) {
   aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
                    INTRA_MODES);
 }
 
-static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
-                                UV_PREDICTION_MODE uv_mode,
-                                PREDICTION_MODE y_mode,
-                                CFL_ALLOWED_TYPE cfl_allowed, aom_writer *w) {
+static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
+                                           UV_PREDICTION_MODE uv_mode,
+                                           PREDICTION_MODE y_mode,
+                                           CFL_ALLOWED_TYPE cfl_allowed,
+                                           aom_writer *w) {
   aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode],
                    UV_INTRA_MODES - !cfl_allowed);
 }
 
-static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx,
-                             int joint_sign, aom_writer *w) {
+static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx,
+                                        uint8_t idx, int8_t joint_sign,
+                                        aom_writer *w) {
   aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS);
   // Magnitudes are only signaled for nonzero codes.
   if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
@@ -873,39 +859,54 @@
   }
 }
 
-static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w,
-                       int skip, int mi_col, int mi_row) {
-  if (cm->coded_lossless || cm->allow_intrabc) return;
+static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd,
+                                  aom_writer *w, int skip) {
+  if (cm->features.coded_lossless || cm->features.allow_intrabc) return;
 
-  const int m = ~((1 << (6 - MI_SIZE_LOG2)) - 1);
-  const MB_MODE_INFO *mbmi =
-      cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)];
-  // Initialise when at top left part of the superblock
-  if (!(mi_row & (cm->seq_params.mib_size - 1)) &&
-      !(mi_col & (cm->seq_params.mib_size - 1))) {  // Top left?
-    xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] =
-        xd->cdef_preset[3] = -1;
+  // At the start of a superblock, mark that we haven't yet written CDEF
+  // strengths for any of the CDEF units contained in this superblock.
+  const int sb_mask = (cm->seq_params.mib_size - 1);
+  const int mi_row_in_sb = (xd->mi_row & sb_mask);
+  const int mi_col_in_sb = (xd->mi_col & sb_mask);
+  if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
+    xd->cdef_transmitted[0] = xd->cdef_transmitted[1] =
+        xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false;
   }
 
-  // Emit CDEF param at first non-skip coding block
-  const int mask = 1 << (6 - MI_SIZE_LOG2);
-  const int index = cm->seq_params.sb_size == BLOCK_128X128
-                        ? !!(mi_col & mask) + 2 * !!(mi_row & mask)
+  // CDEF unit size is 64x64 irrespective of the superblock size.
+  const int cdef_size = 1 << (6 - MI_SIZE_LOG2);
+
+  // Find index of this CDEF unit in this superblock.
+  const int index_mask = cdef_size;
+  const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
+  const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
+  const int index = (cm->seq_params.sb_size == BLOCK_128X128)
+                        ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
                         : 0;
-  if (xd->cdef_preset[index] == -1 && !skip) {
+
+  // Write CDEF strength to the first non-skip coding block in this CDEF unit.
+  if (!xd->cdef_transmitted[index] && !skip) {
+    // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO
+    // of the 1st block in this CDEF unit.
+    const int first_block_mask = ~(cdef_size - 1);
+    const CommonModeInfoParams *const mi_params = &cm->mi_params;
+    const int grid_idx =
+        get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask,
+                        xd->mi_col & first_block_mask);
+    const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
     aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits);
-    xd->cdef_preset[index] = mbmi->cdef_strength;
+    xd->cdef_transmitted[index] = true;
   }
 }
 
-static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
-                                   const struct segmentation *const seg,
-                                   struct segmentation_probs *const segp,
-                                   int mi_row, int mi_col, int skip,
-                                   int preskip) {
+static AOM_INLINE void write_inter_segment_id(
+    AV1_COMP *cpi, aom_writer *w, const struct segmentation *const seg,
+    struct segmentation_probs *const segp, int skip, int preskip) {
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   AV1_COMMON *const cm = &cpi->common;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
 
   if (seg->update_map) {
     if (preskip) {
@@ -913,7 +914,7 @@
     } else {
       if (seg->segid_preskip) return;
       if (skip) {
-        write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1);
+        write_segment_id(cpi, mbmi, w, seg, segp, 1);
         if (seg->temporal_update) mbmi->seg_id_predicted = 0;
         return;
       }
@@ -923,22 +924,22 @@
       aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
       aom_write_symbol(w, pred_flag, pred_cdf, 2);
       if (!pred_flag) {
-        write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+        write_segment_id(cpi, mbmi, w, seg, segp, 0);
       }
       if (pred_flag) {
-        set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type,
-                               mi_row, mi_col, mbmi->segment_id);
+        set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
+                               mbmi->sb_type, mi_row, mi_col, mbmi->segment_id);
       }
     } else {
-      write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+      write_segment_id(cpi, mbmi, w, seg, segp, 0);
     }
   }
 }
 
 // If delta q is present, writes delta_q index.
 // Also writes delta_q loop filter levels, if present.
-static void write_delta_q_params(AV1_COMP *cpi, const int mi_row,
-                                 const int mi_col, int skip, aom_writer *w) {
+static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip,
+                                            aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
 
@@ -948,8 +949,8 @@
     const MB_MODE_INFO *const mbmi = xd->mi[0];
     const BLOCK_SIZE bsize = mbmi->sb_type;
     const int super_block_upper_left =
-        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+        ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+        ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
 
     if ((bsize != cm->seq_params.sb_size || skip == 0) &&
         super_block_upper_left) {
@@ -982,9 +983,9 @@
   }
 }
 
-static void write_intra_prediction_modes(AV1_COMP *cpi, const int mi_row,
-                                         const int mi_col, int is_keyframe,
-                                         aom_writer *w) {
+static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi,
+                                                    int is_keyframe,
+                                                    aom_writer *w) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1010,9 +1011,7 @@
   }
 
   // UV mode and UV angle delta.
-  if (!cm->seq_params.monochrome &&
-      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y)) {
+  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
     const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
     write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
     if (uv_mode == UV_CFL_PRED)
@@ -1024,16 +1023,57 @@
   }
 
   // Palette.
-  if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
-    write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+    write_palette_mode_info(cm, xd, mbmi, w);
   }
 
   // Filter intra.
   write_filter_intra_mode_info(cm, xd, mbmi, w);
 }
 
-static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
-                                const int mi_col, aom_writer *w) {
+static INLINE int16_t mode_context_analyzer(
+    const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) {
+  if (rf[1] <= INTRA_FRAME) return mode_context;
+
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+  const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
+      newmv_ctx, COMP_NEWMV_CTXS - 1)];
+  return comp_ctx;
+}
+
+static INLINE int_mv get_ref_mv_from_stack(
+    int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) {
+  const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack;
+
+  if (ref_frame[1] > INTRA_FRAME) {
+    assert(ref_idx == 0 || ref_idx == 1);
+    return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+                   : curr_ref_mv_stack[ref_mv_idx].this_mv;
+  }
+
+  assert(ref_idx == 0);
+  return ref_mv_idx < mbmi_ext_frame->ref_mv_count
+             ? curr_ref_mv_stack[ref_mv_idx].this_mv
+             : mbmi_ext_frame->global_mvs[ref_frame_type];
+}
+
+static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  int ref_mv_idx = mbmi->ref_mv_idx;
+  if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+    assert(has_second_ref(mbmi));
+    ref_mv_idx += 1;
+  }
+  return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
+                               x->mbmi_ext_frame);
+}
+
+static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1041,16 +1081,16 @@
   const struct segmentation *const seg = &cm->seg;
   struct segmentation_probs *const segp = &ec_ctx->seg;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame;
   const PREDICTION_MODE mode = mbmi->mode;
   const int segment_id = mbmi->segment_id;
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int allow_hp = cm->allow_high_precision_mv;
+  const int allow_hp = cm->features.allow_high_precision_mv;
   const int is_inter = is_inter_block(mbmi);
   const int is_compound = has_second_ref(mbmi);
   int ref;
 
-  write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
+  write_inter_segment_id(cpi, w, seg, segp, 0, 1);
 
   write_skip_mode(cm, xd, segment_id, mbmi, w);
 
@@ -1058,18 +1098,18 @@
   const int skip =
       mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
 
-  write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
+  write_inter_segment_id(cpi, w, seg, segp, skip, 0);
 
-  write_cdef(cm, xd, w, skip, mi_col, mi_row);
+  write_cdef(cm, xd, w, skip);
 
-  write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+  write_delta_q_params(cpi, skip, w);
 
   if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
 
   if (mbmi->skip_mode) return;
 
   if (!is_inter) {
-    write_intra_prediction_modes(cpi, mi_row, mi_col, 0, w);
+    write_intra_prediction_modes(cpi, 0, w);
   } else {
     int16_t mode_ctx;
 
@@ -1078,7 +1118,7 @@
     write_ref_frames(cm, xd, w);
 
     mode_ctx =
-        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+        mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame);
 
     // If segment skip is not enabled code the mode.
     if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
@@ -1088,7 +1128,7 @@
         write_inter_mode(w, mode, ec_ctx, mode_ctx);
 
       if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode))
-        write_drl_idx(ec_ctx, mbmi, mbmi_ext, w);
+        write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w);
       else
         assert(mbmi->ref_mv_idx == 0);
     }
@@ -1096,17 +1136,17 @@
     if (mode == NEWMV || mode == NEW_NEWMV) {
       for (ref = 0; ref < 1 + is_compound; ++ref) {
         nmv_context *nmvc = &ec_ctx->nmvc;
-        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        const int_mv ref_mv = get_ref_mv(x, ref);
         av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
                       allow_hp);
       }
     } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
       nmv_context *nmvc = &ec_ctx->nmvc;
-      const int_mv ref_mv = av1_get_ref_mv(x, 1);
+      const int_mv ref_mv = get_ref_mv(x, 1);
       av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
     } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
       nmv_context *nmvc = &ec_ctx->nmvc;
-      const int_mv ref_mv = av1_get_ref_mv(x, 0);
+      const int_mv ref_mv = get_ref_mv(x, 0);
       av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
     }
 
@@ -1120,13 +1160,12 @@
         aom_write_symbol(w, mbmi->interintra_mode,
                          ec_ctx->interintra_mode_cdf[bsize_group],
                          INTERINTRA_MODES);
-        if (is_interintra_wedge_used(bsize)) {
+        if (av1_is_wedge_used(bsize)) {
           aom_write_symbol(w, mbmi->use_wedge_interintra,
                            ec_ctx->wedge_interintra_cdf[bsize], 2);
           if (mbmi->use_wedge_interintra) {
             aom_write_symbol(w, mbmi->interintra_wedge_index,
-                             ec_ctx->wedge_idx_cdf[bsize], 16);
-            assert(mbmi->interintra_wedge_sign == 0);
+                             ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
           }
         }
       }
@@ -1177,7 +1216,7 @@
         if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
           assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
           aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
-                           ec_ctx->wedge_idx_cdf[bsize], 16);
+                           ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
           aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
         } else {
           assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
@@ -1186,13 +1225,13 @@
         }
       }
     }
-    write_mb_interp_filter(cpi, xd, w);
+    write_mb_interp_filter(cm, xd, w);
   }
 }
 
-static void write_intrabc_info(MACROBLOCKD *xd,
-                               const MB_MODE_INFO_EXT *mbmi_ext,
-                               aom_writer *w) {
+static AOM_INLINE void write_intrabc_info(
+    MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+    aom_writer *w) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   int use_intrabc = is_intrabc_block(mbmi);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
@@ -1201,15 +1240,14 @@
     assert(mbmi->mode == DC_PRED);
     assert(mbmi->uv_mode == UV_DC_PRED);
     assert(mbmi->motion_mode == SIMPLE_TRANSLATION);
-    int_mv dv_ref = mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv;
+    int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv;
     av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc);
   }
 }
 
-static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
-                              const MB_MODE_INFO_EXT *mbmi_ext,
-                              const int mi_row, const int mi_col,
-                              aom_writer *w) {
+static AOM_INLINE void write_mb_modes_kf(
+    AV1_COMP *cpi, MACROBLOCKD *xd,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   const struct segmentation *const seg = &cm->seg;
@@ -1217,27 +1255,27 @@
   const MB_MODE_INFO *const mbmi = xd->mi[0];
 
   if (seg->segid_preskip && seg->update_map)
-    write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+    write_segment_id(cpi, mbmi, w, seg, segp, 0);
 
   const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
 
   if (!seg->segid_preskip && seg->update_map)
-    write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, skip);
+    write_segment_id(cpi, mbmi, w, seg, segp, skip);
 
-  write_cdef(cm, xd, w, skip, mi_col, mi_row);
+  write_cdef(cm, xd, w, skip);
 
-  write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+  write_delta_q_params(cpi, skip, w);
 
   if (av1_allow_intrabc(cm)) {
-    write_intrabc_info(xd, mbmi_ext, w);
+    write_intrabc_info(xd, mbmi_ext_frame, w);
     if (is_intrabc_block(mbmi)) return;
   }
 
-  write_intra_prediction_modes(cpi, mi_row, mi_col, 1, w);
+  write_intra_prediction_modes(cpi, 1, w);
 }
 
 #if CONFIG_RD_DEBUG
-static void dump_mode_info(MB_MODE_INFO *mi) {
+static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) {
   printf("\nmi->mi_row == %d\n", mi->mi_row);
   printf("&& mi->mi_col == %d\n", mi->mi_col);
   printf("&& mi->sb_type == %d\n", mi->sb_type);
@@ -1273,12 +1311,15 @@
 #endif
 
 #if ENC_MISMATCH_DEBUG
-static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
-  AV1_COMMON *const cm = &cpi->common;
-  const MB_MODE_INFO *const *mbmi =
-      *(cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col));
-  const MB_MODE_INFO_EXT *const *mbmi_ext =
-      cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+static AOM_INLINE void enc_dump_logs(
+    const AV1_COMMON *const cm,
+    const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) {
+  const MB_MODE_INFO *const mbmi = *(
+      cm->mi_params.mi_grid_base + (mi_row * cm->mi_params.mi_stride + mi_col));
+  const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame =
+      mbmi_ext_info->frame_base + get_mi_ext_idx(mi_row, mi_col,
+                                                 cm->mi_params.mi_alloc_bsize,
+                                                 mbmi_ext_info->stride);
   if (is_inter_block(mbmi)) {
 #define FRAME_TO_CHECK 11
     if (cm->current_frame.frame_number == FRAME_TO_CHECK &&
@@ -1297,8 +1338,8 @@
 
       const int16_t mode_ctx =
           is_comp_ref ? 0
-                      : av1_mode_context_analyzer(mbmi_ext->mode_context,
-                                                  mbmi->ref_frame);
+                      : mode_context_analyzer(mbmi_ext_frame->mode_context,
+                                              mbmi->ref_frame);
 
       const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
       int16_t zeromv_ctx = -1;
@@ -1326,30 +1367,13 @@
 }
 #endif  // ENC_MISMATCH_DEBUG
 
-static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile,
-                         aom_writer *w, int mi_row, int mi_col) {
+static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-  int bh, bw;
-  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
   MB_MODE_INFO *m = xd->mi[0];
 
-  assert(m->sb_type <= cm->seq_params.sb_size ||
-         (m->sb_type >= BLOCK_SIZES && m->sb_type < BLOCK_SIZES_ALL));
-
-  bh = mi_size_high[m->sb_type];
-  bw = mi_size_wide[m->sb_type];
-
-  cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
-
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
-
-  xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-
   if (frame_is_intra_only(cm)) {
-    write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext, mi_row, mi_col, w);
+    write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext_frame, w);
   } else {
     // has_subpel_mv_component needs the ref frame buffers set up to look
     // up if they are scaled. has_subpel_mv_component is in turn needed by
@@ -1357,52 +1381,41 @@
     set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]);
 
 #if ENC_MISMATCH_DEBUG
-    enc_dump_logs(cpi, mi_row, mi_col);
+    enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col);
 #endif  // ENC_MISMATCH_DEBUG
 
-    pack_inter_mode_mvs(cpi, mi_row, mi_col, w);
+    pack_inter_mode_mvs(cpi, w);
   }
 }
 
-static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x,
-                                  MB_MODE_INFO *const mbmi, aom_writer *w,
-                                  const TOKENEXTRA **tok,
-                                  const TOKENEXTRA *const tok_end,
-                                  TOKEN_STATS *token_stats, const int row,
-                                  const int col, int *block, const int plane) {
+static AOM_INLINE void write_inter_txb_coeff(
+    AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi,
+    aom_writer *w, const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+    TOKEN_STATS *token_stats, const int row, const int col, int *block,
+    const int plane) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  const BLOCK_SIZE bsizec =
-      scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
-
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
-
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  assert(plane_bsize < BLOCK_SIZES_ALL);
   const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
   const int step =
       tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
   const int bkw = tx_size_wide_unit[max_tx_size];
   const int bkh = tx_size_high_unit[max_tx_size];
-
   const BLOCK_SIZE max_unit_bsize =
-      get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
-  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-
-  int blk_row, blk_col;
-
-  const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
-
-  const int unit_height =
-      AOMMIN(mu_blocks_high + (row >> pd->subsampling_y), num_4x4_h);
-  const int unit_width =
-      AOMMIN(mu_blocks_wide + (col >> pd->subsampling_x), num_4x4_w);
-  for (blk_row = row >> pd->subsampling_y; blk_row < unit_height;
-       blk_row += bkh) {
-    for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
-         blk_col += bkw) {
+      get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+  const int num_4x4_w = mi_size_wide[plane_bsize];
+  const int num_4x4_h = mi_size_high[plane_bsize];
+  const int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+  const int mu_blocks_high = mi_size_high[max_unit_bsize];
+  const int unit_height = AOMMIN(mu_blocks_high + (row >> ss_y), num_4x4_h);
+  const int unit_width = AOMMIN(mu_blocks_wide + (col >> ss_x), num_4x4_w);
+  for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) {
+    for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) {
       pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
                       cm->seq_params.bit_depth, *block, blk_row, blk_col,
                       max_tx_size, token_stats);
@@ -1411,103 +1424,101 @@
   }
 }
 
-static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
-                           aom_writer *w, const TOKENEXTRA **tok,
-                           const TOKENEXTRA *const tok_end, int mi_row,
-                           int mi_col) {
+static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w,
+                                      const TOKENEXTRA **tok,
+                                      const TOKENEXTRA *const tok_end) {
   AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-  const int mi_offset = mi_row * cm->mi_stride + mi_col;
-  MB_MODE_INFO *const mbmi = *(cm->mi_grid_visible + mi_offset);
-  int plane;
-  int bh, bw;
   MACROBLOCK *const x = &cpi->td.mb;
-  (void)tok;
-  (void)tok_end;
-  xd->mi = cm->mi_grid_visible + mi_offset;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
 
-  assert(mbmi->sb_type <= cm->seq_params.sb_size ||
-         (mbmi->sb_type >= BLOCK_SIZES && mbmi->sb_type < BLOCK_SIZES_ALL));
+  assert(!mbmi->skip);
 
-  bh = mi_size_high[mbmi->sb_type];
-  bw = mi_size_wide[mbmi->sb_type];
-  cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+  const int is_inter = is_inter_block(mbmi);
+  if (!is_inter) {
+    av1_write_coeffs_mb(cm, x, w, bsize);
+  } else {
+    int block[MAX_MB_PLANE] = { 0 };
+    assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+                                         xd->plane[0].subsampling_y));
+    const int num_4x4_w = mi_size_wide[bsize];
+    const int num_4x4_h = mi_size_high[bsize];
+    TOKEN_STATS token_stats;
+    init_token_stats(&token_stats);
 
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+    const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+    assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64,
+                                                  xd->plane[0].subsampling_x,
+                                                  xd->plane[0].subsampling_y));
+    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+    int mu_blocks_high = mi_size_high[max_unit_bsize];
+    mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide);
+    mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high);
 
-  if (!mbmi->skip) {
-    if (!is_inter_block(mbmi))
-      av1_write_coeffs_mb(cm, x, mi_row, mi_col, w, mbmi->sb_type);
-
-    if (is_inter_block(mbmi)) {
-      int block[MAX_MB_PLANE] = { 0 };
-      const BLOCK_SIZE plane_bsize = mbmi->sb_type;
-      assert(plane_bsize == get_plane_block_size(mbmi->sb_type,
-                                                 xd->plane[0].subsampling_x,
-                                                 xd->plane[0].subsampling_y));
-      const int num_4x4_w =
-          block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-      const int num_4x4_h =
-          block_size_high[plane_bsize] >> tx_size_high_log2[0];
-      int row, col;
-      TOKEN_STATS token_stats;
-      init_token_stats(&token_stats);
-
-      const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
-      assert(max_unit_bsize ==
-             get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
-                                  xd->plane[0].subsampling_y));
-      int mu_blocks_wide =
-          block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-      int mu_blocks_high =
-          block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-
-      mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide);
-      mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high);
-
-      for (row = 0; row < num_4x4_h; row += mu_blocks_high) {
-        for (col = 0; col < num_4x4_w; col += mu_blocks_wide) {
-          for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) {
-            const struct macroblockd_plane *const pd = &xd->plane[plane];
-            if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
-                                     pd->subsampling_x, pd->subsampling_y)) {
-              continue;
-            }
-            write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats,
-                                  row, col, &block[plane], plane);
-          }
+    const int num_planes = av1_num_planes(cm);
+    for (int row = 0; row < num_4x4_h; row += mu_blocks_high) {
+      for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) {
+        for (int plane = 0; plane < num_planes; ++plane) {
+          if (plane && !xd->is_chroma_ref) break;
+          write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row,
+                                col, &block[plane], plane);
         }
       }
-#if CONFIG_RD_DEBUG
-      for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) {
-        if (mbmi->sb_type >= BLOCK_8X8 &&
-            rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
-          dump_mode_info(mbmi);
-          assert(0);
-        }
-      }
-#endif  // CONFIG_RD_DEBUG
     }
+#if CONFIG_RD_DEBUG
+    for (int plane = 0; plane < num_planes; ++plane) {
+      if (mbmi->sb_type >= BLOCK_8X8 &&
+          rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+        dump_mode_info(mbmi);
+        assert(0);
+      }
+    }
+#endif  // CONFIG_RD_DEBUG
   }
 }
 
-static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
-                          aom_writer *w, const TOKENEXTRA **tok,
-                          const TOKENEXTRA *const tok_end, int mi_row,
-                          int mi_col) {
-  write_mbmi_b(cpi, tile, w, mi_row, mi_col);
-
-  AV1_COMMON *cm = &cpi->common;
+static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
+                                     aom_writer *w, const TOKENEXTRA **tok,
+                                     const TOKENEXTRA *const tok_end,
+                                     int mi_row, int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int grid_idx = mi_row * mi_params->mi_stride + mi_col;
+  xd->mi = mi_params->mi_grid_base + grid_idx;
+  cpi->td.mb.mbmi_ext_frame =
+      cpi->mbmi_ext_info.frame_base +
+      get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
+                     cpi->mbmi_ext_info.stride);
+  xd->tx_type_map = mi_params->tx_type_map + grid_idx;
+  xd->tx_type_map_stride = mi_params->mi_stride;
+
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(bsize <= cm->seq_params.sb_size ||
+         (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL));
+
+  const int bh = mi_size_high[bsize];
+  const int bw = mi_size_wide[bsize];
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+                 mi_params->mi_cols);
+
+  xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  write_mbmi_b(cpi, w);
+
   for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
     const uint8_t palette_size_plane =
         mbmi->palette_mode_info.palette_size[plane];
     assert(!mbmi->skip_mode || !palette_size_plane);
     if (palette_size_plane > 0) {
       assert(mbmi->use_intrabc == 0);
-      assert(av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type));
+      assert(av1_allow_palette(cm->features.allow_screen_content_tools,
+                               mbmi->sb_type));
+      assert(!plane || xd->is_chroma_ref);
       int rows, cols;
       av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
                                &cols);
@@ -1516,44 +1527,46 @@
     }
   }
 
-  BLOCK_SIZE bsize = mbmi->sb_type;
-  int is_inter_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
-  int skip = mbmi->skip;
-  int segment_id = mbmi->segment_id;
-  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+  const int is_inter_tx = is_inter_block(mbmi);
+  const int skip = mbmi->skip;
+  const int segment_id = mbmi->segment_id;
+  if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
       !(is_inter_tx && skip) && !xd->lossless[segment_id]) {
     if (is_inter_tx) {  // This implies skip flag is 0.
       const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
       const int txbh = tx_size_high_unit[max_tx_size];
       const int txbw = tx_size_wide_unit[max_tx_size];
-      const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
-      const int height = block_size_high[bsize] >> tx_size_high_log2[0];
-      int idx, idy;
-      for (idy = 0; idy < height; idy += txbh)
-        for (idx = 0; idx < width; idx += txbw)
+      const int width = mi_size_wide[bsize];
+      const int height = mi_size_high[bsize];
+      for (int idy = 0; idy < height; idy += txbh) {
+        for (int idx = 0; idx < width; idx += txbw) {
           write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
+        }
+      }
     } else {
       write_selected_tx_size(xd, w);
-      set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, 0, xd);
+      set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd);
     }
   } else {
-    set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
-                  skip && is_inter_block(mbmi), xd);
+    set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, skip && is_inter_tx,
+                  xd);
   }
 
-  write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+  if (!mbmi->skip) {
+    write_tokens_b(cpi, w, tok, tok_end);
+  }
 }
 
-static void write_partition(const AV1_COMMON *const cm,
-                            const MACROBLOCKD *const xd, int hbs, int mi_row,
-                            int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize,
-                            aom_writer *w) {
+static AOM_INLINE void write_partition(const AV1_COMMON *const cm,
+                                       const MACROBLOCKD *const xd, int hbs,
+                                       int mi_row, int mi_col, PARTITION_TYPE p,
+                                       BLOCK_SIZE bsize, aom_writer *w) {
   const int is_partition_point = bsize >= BLOCK_8X8;
 
   if (!is_partition_point) return;
 
-  const int has_rows = (mi_row + hbs) < cm->mi_rows;
-  const int has_cols = (mi_col + hbs) < cm->mi_cols;
+  const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
+  const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
@@ -1581,19 +1594,21 @@
   }
 }
 
-static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
-                           aom_writer *const w, const TOKENEXTRA **tok,
-                           const TOKENEXTRA *const tok_end, int mi_row,
-                           int mi_col, BLOCK_SIZE bsize) {
+static AOM_INLINE void write_modes_sb(
+    AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w,
+    const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row,
+    int mi_col, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  assert(bsize < BLOCK_SIZES_ALL);
   const int hbs = mi_size_wide[bsize] / 2;
   const int quarter_step = mi_size_wide[bsize] / 4;
   int i;
   const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -1620,12 +1635,12 @@
       break;
     case PARTITION_HORZ:
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      if (mi_row + hbs < cm->mi_rows)
+      if (mi_row + hbs < mi_params->mi_rows)
         write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
       break;
     case PARTITION_VERT:
       write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      if (mi_col + hbs < cm->mi_cols)
+      if (mi_col + hbs < mi_params->mi_cols)
         write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
       break;
     case PARTITION_SPLIT:
@@ -1658,7 +1673,7 @@
     case PARTITION_HORZ_4:
       for (i = 0; i < 4; ++i) {
         int this_mi_row = mi_row + i * quarter_step;
-        if (i > 0 && this_mi_row >= cm->mi_rows) break;
+        if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
 
         write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col);
       }
@@ -1666,7 +1681,7 @@
     case PARTITION_VERT_4:
       for (i = 0; i < 4; ++i) {
         int this_mi_col = mi_col + i * quarter_step;
-        if (i > 0 && this_mi_col >= cm->mi_cols) break;
+        if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
 
         write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col);
       }
@@ -1678,29 +1693,31 @@
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
-static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
-                        aom_writer *const w, int tile_row, int tile_col) {
+static AOM_INLINE void write_modes(AV1_COMP *const cpi,
+                                   const TileInfo *const tile,
+                                   aom_writer *const w, int tile_row,
+                                   int tile_col) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   const int mi_row_start = tile->mi_row_start;
   const int mi_row_end = tile->mi_row_end;
   const int mi_col_start = tile->mi_col_start;
   const int mi_col_end = tile->mi_col_end;
-  int mi_row, mi_col, sb_row_in_tile;
+  const int num_planes = av1_num_planes(cm);
 
   av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
-  av1_init_above_context(cm, xd, tile->tile_row);
+  av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd);
 
   if (cpi->common.delta_q_info.delta_q_present_flag) {
-    xd->current_qindex = cpi->common.base_qindex;
+    xd->current_qindex = cpi->common.quant_params.base_qindex;
     if (cpi->common.delta_q_info.delta_lf_present_flag) {
-      av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+      av1_reset_loop_filter_delta(xd, num_planes);
     }
   }
 
-  for (mi_row = mi_row_start; mi_row < mi_row_end;
+  for (int mi_row = mi_row_start; mi_row < mi_row_end;
        mi_row += cm->seq_params.mib_size) {
-    sb_row_in_tile =
+    const int sb_row_in_tile =
         (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
     const TOKENEXTRA *tok =
         cpi->tplist[tile_row][tile_col][sb_row_in_tile].start;
@@ -1709,8 +1726,9 @@
 
     av1_zero_left_context(xd);
 
-    for (mi_col = mi_col_start; mi_col < mi_col_end;
+    for (int mi_col = mi_col_start; mi_col < mi_col_end;
          mi_col += cm->seq_params.mib_size) {
+      cpi->td.mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
       write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
                      cm->seq_params.sb_size);
     }
@@ -1718,11 +1736,11 @@
   }
 }
 
-static void encode_restoration_mode(AV1_COMMON *cm,
-                                    struct aom_write_bit_buffer *wb) {
-  assert(!cm->all_lossless);
+static AOM_INLINE void encode_restoration_mode(
+    AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+  assert(!cm->features.all_lossless);
   if (!cm->seq_params.enable_restoration) return;
-  if (cm->allow_intrabc) return;
+  if (cm->features.allow_intrabc) return;
   const int num_planes = av1_num_planes(cm);
   int all_none = 1, chroma_none = 1;
   for (int p = 0; p < num_planes; ++p) {
@@ -1789,8 +1807,10 @@
   }
 }
 
-static void write_wiener_filter(int wiener_win, const WienerInfo *wiener_info,
-                                WienerInfo *ref_wiener_info, aom_writer *wb) {
+static AOM_INLINE void write_wiener_filter(int wiener_win,
+                                           const WienerInfo *wiener_info,
+                                           WienerInfo *ref_wiener_info,
+                                           aom_writer *wb) {
   if (wiener_win == WIENER_WIN)
     aom_write_primitive_refsubexpfin(
         wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
@@ -1832,11 +1852,11 @@
   memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
 }
 
-static void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
-                                 SgrprojInfo *ref_sgrproj_info,
-                                 aom_writer *wb) {
+static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
+                                            SgrprojInfo *ref_sgrproj_info,
+                                            aom_writer *wb) {
   aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
-  const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+  const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
 
   if (params->r[0] == 0) {
     assert(sgrproj_info->xqd[0] == 0);
@@ -1863,17 +1883,15 @@
   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
 }
 
-static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
-                                             MACROBLOCKD *xd,
-                                             const RestorationUnitInfo *rui,
-                                             aom_writer *const w, int plane,
-                                             FRAME_COUNTS *counts) {
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
+    aom_writer *const w, int plane, FRAME_COUNTS *counts) {
   const RestorationInfo *rsi = cm->rst_info + plane;
   RestorationType frame_rtype = rsi->frame_restoration_type;
   if (frame_rtype == RESTORE_NONE) return;
 
   (void)counts;
-  assert(!cm->all_lossless);
+  assert(!cm->features.all_lossless);
 
   const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
   WienerInfo *ref_wiener_info = &xd->wiener_info[plane];
@@ -1916,11 +1934,41 @@
   }
 }
 
-static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
-  assert(!cm->coded_lossless);
-  if (cm->allow_intrabc) return;
+// Only write out the ref delta section if any of the elements
+// will signal a delta.
+static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) {
+  struct loopfilter *lf = &cm->lf;
+  if (!lf->mode_ref_delta_update) {
+    return 0;
+  }
+  const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
+  int8_t last_ref_deltas[REF_FRAMES];
+  int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+  if (buf == NULL) {
+    av1_set_default_ref_deltas(last_ref_deltas);
+    av1_set_default_mode_deltas(last_mode_deltas);
+  } else {
+    memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
+    memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
+  }
+  for (int i = 0; i < REF_FRAMES; i++) {
+    if (lf->ref_deltas[i] != last_ref_deltas[i]) {
+      return true;
+    }
+  }
+  for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+    if (lf->mode_deltas[i] != last_mode_deltas[i]) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
+                                         struct aom_write_bit_buffer *wb) {
+  assert(!cm->features.coded_lossless);
+  if (cm->features.allow_intrabc) return;
   const int num_planes = av1_num_planes(cm);
-  int i;
   struct loopfilter *lf = &cm->lf;
 
   // Encode the loop filter level and type
@@ -1934,52 +1982,48 @@
   }
   aom_wb_write_literal(wb, lf->sharpness_level, 3);
 
-  // Write out loop filter deltas applied at the MB level based on mode or
-  // ref frame (if they are enabled).
   aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
 
-  if (lf->mode_ref_delta_enabled) {
-    aom_wb_write_bit(wb, lf->mode_ref_delta_update);
+  // Write out loop filter deltas applied at the MB level based on mode or
+  // ref frame (if they are enabled), only if there is information to write.
+  int meaningful = is_mode_ref_delta_meaningful(cm);
+  aom_wb_write_bit(wb, meaningful);
+  if (!meaningful) {
+    return;
+  }
 
-    if (lf->mode_ref_delta_update) {
-      const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
-      int8_t last_ref_deltas[REF_FRAMES];
-      if (buf == NULL) {
-        av1_set_default_ref_deltas(last_ref_deltas);
-      } else {
-        memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
-      }
-      for (i = 0; i < REF_FRAMES; i++) {
-        const int delta = lf->ref_deltas[i];
-        const int changed = delta != last_ref_deltas[i];
-        aom_wb_write_bit(wb, changed);
-        if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
-      }
-
-      int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
-      if (buf == NULL) {
-        av1_set_default_mode_deltas(last_mode_deltas);
-      } else {
-        memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
-      }
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        const int delta = lf->mode_deltas[i];
-        const int changed = delta != last_mode_deltas[i];
-        aom_wb_write_bit(wb, changed);
-        if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
-      }
-    }
+  const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
+  int8_t last_ref_deltas[REF_FRAMES];
+  int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+  if (buf == NULL) {
+    av1_set_default_ref_deltas(last_ref_deltas);
+    av1_set_default_mode_deltas(last_mode_deltas);
+  } else {
+    memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
+    memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
+  }
+  for (int i = 0; i < REF_FRAMES; i++) {
+    const int delta = lf->ref_deltas[i];
+    const int changed = delta != last_ref_deltas[i];
+    aom_wb_write_bit(wb, changed);
+    if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+  }
+  for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+    const int delta = lf->mode_deltas[i];
+    const int changed = delta != last_mode_deltas[i];
+    aom_wb_write_bit(wb, changed);
+    if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
   }
 }
 
-static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
-  assert(!cm->coded_lossless);
+static AOM_INLINE void encode_cdef(const AV1_COMMON *cm,
+                                   struct aom_write_bit_buffer *wb) {
+  assert(!cm->features.coded_lossless);
   if (!cm->seq_params.enable_cdef) return;
-  if (cm->allow_intrabc) return;
+  if (cm->features.allow_intrabc) return;
   const int num_planes = av1_num_planes(cm);
   int i;
-  aom_wb_write_literal(wb, cm->cdef_info.cdef_pri_damping - 3, 2);
-  assert(cm->cdef_info.cdef_pri_damping == cm->cdef_info.cdef_sec_damping);
+  aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2);
   aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2);
   for (i = 0; i < cm->cdef_info.nb_cdef_strengths; i++) {
     aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[i],
@@ -1990,7 +2034,8 @@
   }
 }
 
-static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) {
+static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb,
+                                     int delta_q) {
   if (delta_q != 0) {
     aom_wb_write_bit(wb, 1);
     aom_wb_write_inv_signed_literal(wb, delta_q, 6);
@@ -1999,36 +2044,36 @@
   }
 }
 
-static void encode_quantization(const AV1_COMMON *const cm,
-                                struct aom_write_bit_buffer *wb) {
-  const int num_planes = av1_num_planes(cm);
-
-  aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
-  write_delta_q(wb, cm->y_dc_delta_q);
+static AOM_INLINE void encode_quantization(
+    const CommonQuantParams *const quant_params, int num_planes,
+    bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) {
+  aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS);
+  write_delta_q(wb, quant_params->y_dc_delta_q);
   if (num_planes > 1) {
-    int diff_uv_delta = (cm->u_dc_delta_q != cm->v_dc_delta_q) ||
-                        (cm->u_ac_delta_q != cm->v_ac_delta_q);
-    if (cm->seq_params.separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
-    write_delta_q(wb, cm->u_dc_delta_q);
-    write_delta_q(wb, cm->u_ac_delta_q);
+    int diff_uv_delta =
+        (quant_params->u_dc_delta_q != quant_params->v_dc_delta_q) ||
+        (quant_params->u_ac_delta_q != quant_params->v_ac_delta_q);
+    if (separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
+    write_delta_q(wb, quant_params->u_dc_delta_q);
+    write_delta_q(wb, quant_params->u_ac_delta_q);
     if (diff_uv_delta) {
-      write_delta_q(wb, cm->v_dc_delta_q);
-      write_delta_q(wb, cm->v_ac_delta_q);
+      write_delta_q(wb, quant_params->v_dc_delta_q);
+      write_delta_q(wb, quant_params->v_ac_delta_q);
     }
   }
-  aom_wb_write_bit(wb, cm->using_qmatrix);
-  if (cm->using_qmatrix) {
-    aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS);
-    aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS);
-    if (!cm->seq_params.separate_uv_delta_q)
-      assert(cm->qm_u == cm->qm_v);
+  aom_wb_write_bit(wb, quant_params->using_qmatrix);
+  if (quant_params->using_qmatrix) {
+    aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS);
+    aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS);
+    if (!separate_uv_delta_q)
+      assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v);
     else
-      aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS);
+      aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS);
   }
 }
 
-static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                           struct aom_write_bit_buffer *wb) {
   int i, j;
   struct segmentation *seg = &cm->seg;
 
@@ -2036,7 +2081,7 @@
   if (!seg->enabled) return;
 
   // Write update flags
-  if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
     assert(seg->update_map == 1);
     seg->temporal_update = 0;
     assert(seg->update_data == 1);
@@ -2073,15 +2118,16 @@
   }
 }
 
-static void write_frame_interp_filter(InterpFilter filter,
-                                      struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_frame_interp_filter(
+    InterpFilter filter, struct aom_write_bit_buffer *wb) {
   aom_wb_write_bit(wb, filter == SWITCHABLE);
   if (filter != SWITCHABLE)
     aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
 }
 
 // Same function as write_uniform but writing to uncompresses header wb
-static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) {
+static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n,
+                                        int v) {
   const int l = get_unsigned_bits(n);
   const int m = (1 << l) - n;
   if (l == 0) return;
@@ -2093,50 +2139,50 @@
   }
 }
 
-static void write_tile_info_max_tile(const AV1_COMMON *const cm,
-                                     struct aom_write_bit_buffer *wb) {
-  int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
-  int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+static AOM_INLINE void write_tile_info_max_tile(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+  int width_mi =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params.mib_size_log2);
+  int height_mi =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
   int width_sb = width_mi >> cm->seq_params.mib_size_log2;
   int height_sb = height_mi >> cm->seq_params.mib_size_log2;
   int size_sb, i;
+  const CommonTileParams *const tiles = &cm->tiles;
 
-  aom_wb_write_bit(wb, cm->uniform_tile_spacing_flag);
+  aom_wb_write_bit(wb, tiles->uniform_spacing);
 
-  if (cm->uniform_tile_spacing_flag) {
-    // Uniform spaced tiles with power-of-two number of rows and columns
-    // tile columns
-    int ones = cm->log2_tile_cols - cm->min_log2_tile_cols;
+  if (tiles->uniform_spacing) {
+    int ones = tiles->log2_cols - tiles->min_log2_cols;
     while (ones--) {
       aom_wb_write_bit(wb, 1);
     }
-    if (cm->log2_tile_cols < cm->max_log2_tile_cols) {
+    if (tiles->log2_cols < tiles->max_log2_cols) {
       aom_wb_write_bit(wb, 0);
     }
 
     // rows
-    ones = cm->log2_tile_rows - cm->min_log2_tile_rows;
+    ones = tiles->log2_rows - tiles->min_log2_rows;
     while (ones--) {
       aom_wb_write_bit(wb, 1);
     }
-    if (cm->log2_tile_rows < cm->max_log2_tile_rows) {
+    if (tiles->log2_rows < tiles->max_log2_rows) {
       aom_wb_write_bit(wb, 0);
     }
   } else {
     // Explicit tiles with configurable tile widths and heights
     // columns
-    for (i = 0; i < cm->tile_cols; i++) {
-      size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
-      wb_write_uniform(wb, AOMMIN(width_sb, cm->max_tile_width_sb),
-                       size_sb - 1);
+    for (i = 0; i < tiles->cols; i++) {
+      size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+      wb_write_uniform(wb, AOMMIN(width_sb, tiles->max_width_sb), size_sb - 1);
       width_sb -= size_sb;
     }
     assert(width_sb == 0);
 
     // rows
-    for (i = 0; i < cm->tile_rows; i++) {
-      size_sb = cm->tile_row_start_sb[i + 1] - cm->tile_row_start_sb[i];
-      wb_write_uniform(wb, AOMMIN(height_sb, cm->max_tile_height_sb),
+    for (i = 0; i < tiles->rows; i++) {
+      size_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
+      wb_write_uniform(wb, AOMMIN(height_sb, tiles->max_height_sb),
                        size_sb - 1);
       height_sb -= size_sb;
     }
@@ -2144,30 +2190,30 @@
   }
 }
 
-static void write_tile_info(const AV1_COMMON *const cm,
-                            struct aom_write_bit_buffer *saved_wb,
-                            struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm,
+                                       struct aom_write_bit_buffer *saved_wb,
+                                       struct aom_write_bit_buffer *wb) {
   write_tile_info_max_tile(cm, wb);
 
   *saved_wb = *wb;
-  if (cm->tile_rows * cm->tile_cols > 1) {
+  if (cm->tiles.rows * cm->tiles.cols > 1) {
     // tile id used for cdf update
-    aom_wb_write_literal(wb, 0, cm->log2_tile_cols + cm->log2_tile_rows);
+    aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows);
     // Number of bytes in tile size - 1
     aom_wb_write_literal(wb, 3, 2);
   }
 }
 
-static void write_ext_tile_info(const AV1_COMMON *const cm,
-                                struct aom_write_bit_buffer *saved_wb,
-                                struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_ext_tile_info(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb,
+    struct aom_write_bit_buffer *wb) {
   // This information is stored as a separate byte.
   int mod = wb->bit_offset % CHAR_BIT;
   if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod);
   assert(aom_wb_is_byte_aligned(wb));
 
   *saved_wb = *wb;
-  if (cm->tile_rows * cm->tile_cols > 1) {
+  if (cm->tiles.rows * cm->tiles.cols > 1) {
     // Note that the last item in the uncompressed header is the data
     // describing tile configuration.
     // Number of bytes in tile column size - 1
@@ -2235,8 +2281,8 @@
   return 0;
 }
 
-static void write_render_size(const AV1_COMMON *cm,
-                              struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_render_size(const AV1_COMMON *cm,
+                                         struct aom_write_bit_buffer *wb) {
   const int scaling_active = av1_resize_scaled(cm);
   aom_wb_write_bit(wb, scaling_active);
   if (scaling_active) {
@@ -2245,8 +2291,8 @@
   }
 }
 
-static void write_superres_scale(const AV1_COMMON *const cm,
-                                 struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm,
+                                            struct aom_write_bit_buffer *wb) {
   const SequenceHeader *const seq_params = &cm->seq_params;
   if (!seq_params->enable_superres) {
     assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
@@ -2267,8 +2313,9 @@
   }
 }
 
-static void write_frame_size(const AV1_COMMON *cm, int frame_size_override,
-                             struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_frame_size(const AV1_COMMON *cm,
+                                        int frame_size_override,
+                                        struct aom_write_bit_buffer *wb) {
   const int coded_width = cm->superres_upscaled_width - 1;
   const int coded_height = cm->superres_upscaled_height - 1;
 
@@ -2284,8 +2331,8 @@
   write_render_size(cm, wb);
 }
 
-static void write_frame_size_with_refs(const AV1_COMMON *const cm,
-                                       struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_frame_size_with_refs(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
   int found = 0;
 
   MV_REFERENCE_FRAME ref_frame;
@@ -2311,14 +2358,14 @@
   }
 }
 
-static void write_profile(BITSTREAM_PROFILE profile,
-                          struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile,
+                                     struct aom_write_bit_buffer *wb) {
   assert(profile >= PROFILE_0 && profile < MAX_PROFILES);
   aom_wb_write_literal(wb, profile, PROFILE_BITS);
 }
 
-static void write_bitdepth(const SequenceHeader *const seq_params,
-                           struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params,
+                                      struct aom_write_bit_buffer *wb) {
   // Profile 0/1: [0] for 8 bit, [1]  10-bit
   // Profile   2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit
   aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 0 : 1);
@@ -2327,8 +2374,8 @@
   }
 }
 
-static void write_color_config(const SequenceHeader *const seq_params,
-                               struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_color_config(
+    const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) {
   write_bitdepth(seq_params, wb);
   const int is_monochrome = seq_params->monochrome;
   // monochrome bit
@@ -2393,67 +2440,49 @@
   aom_wb_write_bit(wb, seq_params->separate_uv_delta_q);
 }
 
-static void write_timing_info_header(AV1_COMMON *const cm,
-                                     struct aom_write_bit_buffer *wb) {
-  aom_wb_write_unsigned_literal(wb, cm->timing_info.num_units_in_display_tick,
-                                32);  // Number of units in tick
-  aom_wb_write_unsigned_literal(wb, cm->timing_info.time_scale,
-                                32);  // Time scale
-  aom_wb_write_bit(
-      wb,
-      cm->timing_info.equal_picture_interval);  // Equal picture interval bit
-  if (cm->timing_info.equal_picture_interval) {
-    aom_wb_write_uvlc(
-        wb,
-        cm->timing_info.num_ticks_per_picture - 1);  // ticks per picture
+static AOM_INLINE void write_timing_info_header(
+    const aom_timing_info_t *const timing_info,
+    struct aom_write_bit_buffer *wb) {
+  aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32);
+  aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32);
+  aom_wb_write_bit(wb, timing_info->equal_picture_interval);
+  if (timing_info->equal_picture_interval) {
+    aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1);
   }
 }
 
-static void write_decoder_model_info(AV1_COMMON *const cm,
-                                     struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_decoder_model_info(
+    const aom_dec_model_info_t *const decoder_model_info,
+    struct aom_write_bit_buffer *wb) {
   aom_wb_write_literal(
-      wb, cm->buffer_model.encoder_decoder_buffer_delay_length - 1, 5);
-  aom_wb_write_unsigned_literal(wb, cm->buffer_model.num_units_in_decoding_tick,
-                                32);  // Number of units in decoding tick
-  aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_time_length - 1, 5);
-  aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_time_length - 1,
+      wb, decoder_model_info->encoder_decoder_buffer_delay_length - 1, 5);
+  aom_wb_write_unsigned_literal(
+      wb, decoder_model_info->num_units_in_decoding_tick, 32);
+  aom_wb_write_literal(wb, decoder_model_info->buffer_removal_time_length - 1,
                        5);
+  aom_wb_write_literal(
+      wb, decoder_model_info->frame_presentation_time_length - 1, 5);
 }
 
-static void write_dec_model_op_parameters(AV1_COMMON *const cm,
-                                          struct aom_write_bit_buffer *wb,
-                                          int op_num) {
-  if (op_num > MAX_NUM_OPERATING_POINTS)
-    aom_internal_error(
-        &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-        "Encoder does not support %d decoder model operating points", op_num);
-
-  //  aom_wb_write_bit(wb, cm->op_params[op_num].has_parameters);
-  //  if (!cm->op_params[op_num].has_parameters) return;
-
-  aom_wb_write_unsigned_literal(
-      wb, cm->op_params[op_num].decoder_buffer_delay,
-      cm->buffer_model.encoder_decoder_buffer_delay_length);
-
-  aom_wb_write_unsigned_literal(
-      wb, cm->op_params[op_num].encoder_buffer_delay,
-      cm->buffer_model.encoder_decoder_buffer_delay_length);
-
-  aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag);
-
-  cm->op_frame_timing[op_num].buffer_removal_time =
-      0;  // reset the decoded frame counter
+static AOM_INLINE void write_dec_model_op_parameters(
+    const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length,
+    struct aom_write_bit_buffer *wb) {
+  aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay,
+                                buffer_delay_length);
+  aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay,
+                                buffer_delay_length);
+  aom_wb_write_bit(wb, op_params->low_delay_mode_flag);
 }
 
-static void write_tu_pts_info(AV1_COMMON *const cm,
-                              struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm,
+                                         struct aom_write_bit_buffer *wb) {
   aom_wb_write_unsigned_literal(
       wb, cm->frame_presentation_time,
-      cm->buffer_model.frame_presentation_time_length);
+      cm->seq_params.decoder_model_info.frame_presentation_time_length);
 }
 
-static void write_film_grain_params(const AV1_COMP *const cpi,
-                                    struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_film_grain_params(
+    const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) {
   const AV1_COMMON *const cm = &cpi->common;
   const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params;
 
@@ -2557,8 +2586,8 @@
   aom_wb_write_bit(wb, pars->clip_to_restricted_range);
 }
 
-static void write_sb_size(const SequenceHeader *const seq_params,
-                          struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params,
+                                     struct aom_write_bit_buffer *wb) {
   (void)seq_params;
   (void)wb;
   assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]);
@@ -2568,8 +2597,8 @@
   aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
 }
 
-static void write_sequence_header(const SequenceHeader *const seq_params,
-                                  struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_sequence_header(
+    const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) {
   aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4);
   aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4);
   aom_wb_write_literal(wb, seq_params->max_frame_width - 1,
@@ -2634,10 +2663,9 @@
   aom_wb_write_bit(wb, seq_params->enable_restoration);
 }
 
-static void write_global_motion_params(const WarpedMotionParams *params,
-                                       const WarpedMotionParams *ref_params,
-                                       struct aom_write_bit_buffer *wb,
-                                       int allow_hp) {
+static AOM_INLINE void write_global_motion_params(
+    const WarpedMotionParams *params, const WarpedMotionParams *ref_params,
+    struct aom_write_bit_buffer *wb, int allow_hp) {
   const TransformationType type = params->wmtype;
 
   aom_wb_write_bit(wb, type != IDENTITY);
@@ -2688,8 +2716,8 @@
   }
 }
 
-static void write_global_motion(AV1_COMP *cpi,
-                                struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_global_motion(AV1_COMP *cpi,
+                                           struct aom_write_bit_buffer *wb) {
   AV1_COMMON *const cm = &cpi->common;
   int frame;
   for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
@@ -2697,7 +2725,7 @@
         cm->prev_frame ? &cm->prev_frame->global_motion[frame]
                        : &default_warp_params;
     write_global_motion_params(&cm->global_motion[frame], ref_params, wb,
-                               cm->allow_high_precision_mv);
+                               cm->features.allow_high_precision_mv);
     // TODO(sarahparker, debargha): The logic in the commented out code below
     // does not work currently and causes mismatches when resize is on.
     // Fix it before turning the optimization back on.
@@ -2707,7 +2735,7 @@
         cpi->source->y_crop_height == ref_buf->y_crop_height) {
       write_global_motion_params(&cm->global_motion[frame],
                                  &cm->prev_frame->global_motion[frame], wb,
-                                 cm->allow_high_precision_mv);
+                                 cm->features.allow_high_precision_mv);
     } else {
       assert(cm->global_motion[frame].wmtype == IDENTITY &&
              "Invalid warp type for frames of different resolutions");
@@ -2792,13 +2820,15 @@
 }
 
 // New function based on HLS R18
-static void write_uncompressed_header_obu(AV1_COMP *cpi,
-                                          struct aom_write_bit_buffer *saved_wb,
-                                          struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_uncompressed_header_obu(
+    AV1_COMP *cpi, struct aom_write_bit_buffer *saved_wb,
+    struct aom_write_bit_buffer *wb) {
   AV1_COMMON *const cm = &cpi->common;
   const SequenceHeader *const seq_params = &cm->seq_params;
+  const CommonQuantParams *quant_params = &cm->quant_params;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   CurrentFrame *const current_frame = &cm->current_frame;
+  FeatureFlags *const features = &cm->features;
 
   current_frame->frame_refs_short_signaling = 0;
 
@@ -2813,7 +2843,7 @@
       aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
 
       if (seq_params->decoder_model_info_present_flag &&
-          cm->timing_info.equal_picture_interval == 0) {
+          seq_params->timing_info.equal_picture_interval == 0) {
         write_tu_pts_info(cm, wb);
       }
       if (seq_params->frame_id_numbers_present_flag) {
@@ -2831,34 +2861,35 @@
     aom_wb_write_bit(wb, cm->show_frame);
     if (cm->show_frame) {
       if (seq_params->decoder_model_info_present_flag &&
-          cm->timing_info.equal_picture_interval == 0)
+          seq_params->timing_info.equal_picture_interval == 0)
         write_tu_pts_info(cm, wb);
     } else {
       aom_wb_write_bit(wb, cm->showable_frame);
     }
     if (frame_is_sframe(cm)) {
-      assert(cm->error_resilient_mode);
+      assert(features->error_resilient_mode);
     } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) {
-      aom_wb_write_bit(wb, cm->error_resilient_mode);
+      aom_wb_write_bit(wb, features->error_resilient_mode);
     }
   }
-  aom_wb_write_bit(wb, cm->disable_cdf_update);
+  aom_wb_write_bit(wb, features->disable_cdf_update);
 
   if (seq_params->force_screen_content_tools == 2) {
-    aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+    aom_wb_write_bit(wb, features->allow_screen_content_tools);
   } else {
-    assert(cm->allow_screen_content_tools ==
+    assert(features->allow_screen_content_tools ==
            seq_params->force_screen_content_tools);
   }
 
-  if (cm->allow_screen_content_tools) {
+  if (features->allow_screen_content_tools) {
     if (seq_params->force_integer_mv == 2) {
-      aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv);
+      aom_wb_write_bit(wb, features->cur_frame_force_integer_mv);
     } else {
-      assert(cm->cur_frame_force_integer_mv == seq_params->force_integer_mv);
+      assert(features->cur_frame_force_integer_mv ==
+             seq_params->force_integer_mv);
     }
   } else {
-    assert(cm->cur_frame_force_integer_mv == 0);
+    assert(features->cur_frame_force_integer_mv == 0);
   }
 
   int frame_size_override_flag = 0;
@@ -2890,8 +2921,8 @@
           wb, current_frame->order_hint,
           seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
 
-    if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
-      aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS);
+    if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
+      aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS);
     }
   }
 
@@ -2900,7 +2931,7 @@
     if (cm->buffer_removal_time_present) {
       for (int op_num = 0;
            op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
-        if (cm->op_params[op_num].decoder_model_param_present_flag) {
+        if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
           if (((seq_params->operating_point_idc[op_num] >>
                 cm->temporal_layer_id) &
                    0x1 &&
@@ -2909,10 +2940,10 @@
                    0x1) ||
               seq_params->operating_point_idc[op_num] == 0) {
             aom_wb_write_unsigned_literal(
-                wb, cm->op_frame_timing[op_num].buffer_removal_time,
-                cm->buffer_model.buffer_removal_time_length);
-            cm->op_frame_timing[op_num].buffer_removal_time++;
-            if (cm->op_frame_timing[op_num].buffer_removal_time == 0) {
+                wb, cm->buffer_removal_times[op_num],
+                seq_params->decoder_model_info.buffer_removal_time_length);
+            cm->buffer_removal_times[op_num]++;
+            if (cm->buffer_removal_times[op_num] == 0) {
               aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                                  "buffer_removal_time overflowed");
             }
@@ -2931,7 +2962,7 @@
 
   if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) {
     // Write all ref frame order hints if error_resilient_mode == 1
-    if (cm->error_resilient_mode &&
+    if (features->error_resilient_mode &&
         seq_params->order_hint_info.enable_order_hint) {
       for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
         aom_wb_write_literal(
@@ -2943,15 +2974,15 @@
 
   if (current_frame->frame_type == KEY_FRAME) {
     write_frame_size(cm, frame_size_override_flag, wb);
-    assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
-    if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
-      aom_wb_write_bit(wb, cm->allow_intrabc);
+    assert(!av1_superres_scaled(cm) || !features->allow_intrabc);
+    if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+      aom_wb_write_bit(wb, features->allow_intrabc);
   } else {
     if (current_frame->frame_type == INTRA_ONLY_FRAME) {
       write_frame_size(cm, frame_size_override_flag, wb);
-      assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
-      if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
-        aom_wb_write_bit(wb, cm->allow_intrabc);
+      assert(!av1_superres_scaled(cm) || !features->allow_intrabc);
+      if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+        aom_wb_write_bit(wb, features->allow_intrabc);
     } else if (current_frame->frame_type == INTER_FRAME ||
                frame_is_sframe(cm)) {
       MV_REFERENCE_FRAME ref_frame;
@@ -3007,46 +3038,47 @@
         }
       }
 
-      if (!cm->error_resilient_mode && frame_size_override_flag) {
+      if (!features->error_resilient_mode && frame_size_override_flag) {
         write_frame_size_with_refs(cm, wb);
       } else {
         write_frame_size(cm, frame_size_override_flag, wb);
       }
 
-      if (!cm->cur_frame_force_integer_mv)
-        aom_wb_write_bit(wb, cm->allow_high_precision_mv);
-      write_frame_interp_filter(cm->interp_filter, wb);
-      aom_wb_write_bit(wb, cm->switchable_motion_mode);
+      if (!features->cur_frame_force_integer_mv)
+        aom_wb_write_bit(wb, features->allow_high_precision_mv);
+      write_frame_interp_filter(features->interp_filter, wb);
+      aom_wb_write_bit(wb, features->switchable_motion_mode);
       if (frame_might_allow_ref_frame_mvs(cm)) {
-        aom_wb_write_bit(wb, cm->allow_ref_frame_mvs);
+        aom_wb_write_bit(wb, features->allow_ref_frame_mvs);
       } else {
-        assert(cm->allow_ref_frame_mvs == 0);
+        assert(features->allow_ref_frame_mvs == 0);
       }
     }
   }
 
-  const int might_bwd_adapt =
-      !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
-  if (cm->large_scale_tile)
-    assert(cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+  const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) &&
+                              !(features->disable_cdf_update);
+  if (cm->tiles.large_scale)
+    assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
 
   if (might_bwd_adapt) {
     aom_wb_write_bit(
-        wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+        wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
   }
 
   write_tile_info(cm, saved_wb, wb);
-  encode_quantization(cm, wb);
+  encode_quantization(quant_params, av1_num_planes(cm),
+                      cm->seq_params.separate_uv_delta_q, wb);
   encode_segmentation(cm, xd, wb);
 
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
-  if (delta_q_info->delta_q_present_flag) assert(cm->base_qindex > 0);
-  if (cm->base_qindex > 0) {
+  if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0);
+  if (quant_params->base_qindex > 0) {
     aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag);
     if (delta_q_info->delta_q_present_flag) {
       aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2);
-      xd->current_qindex = cm->base_qindex;
-      if (cm->allow_intrabc)
+      xd->current_qindex = quant_params->base_qindex;
+      if (features->allow_intrabc)
         assert(delta_q_info->delta_lf_present_flag == 0);
       else
         aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag);
@@ -3058,10 +3090,10 @@
     }
   }
 
-  if (cm->all_lossless) {
+  if (features->all_lossless) {
     assert(!av1_superres_scaled(cm));
   } else {
-    if (!cm->coded_lossless) {
+    if (!features->coded_lossless) {
       encode_loopfilter(cm, wb);
       encode_cdef(cm, wb);
     }
@@ -3069,10 +3101,10 @@
   }
 
   // Write TX mode
-  if (cm->coded_lossless)
-    assert(cm->tx_mode == ONLY_4X4);
+  if (features->coded_lossless)
+    assert(features->tx_mode == ONLY_4X4);
   else
-    aom_wb_write_bit(wb, cm->tx_mode == TX_MODE_SELECT);
+    aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT);
 
   if (!frame_is_intra_only(cm)) {
     const int use_hybrid_pred =
@@ -3085,11 +3117,11 @@
     aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag);
 
   if (frame_might_allow_warped_motion(cm))
-    aom_wb_write_bit(wb, cm->allow_warped_motion);
+    aom_wb_write_bit(wb, features->allow_warped_motion);
   else
-    assert(!cm->allow_warped_motion);
+    assert(!features->allow_warped_motion);
 
-  aom_wb_write_bit(wb, cm->reduced_tx_set_used);
+  aom_wb_write_bit(wb, features->reduced_tx_set_used);
 
   if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
 
@@ -3097,7 +3129,7 @@
       (cm->show_frame || cm->showable_frame))
     write_film_grain_params(cpi, wb);
 
-  if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb);
+  if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb);
 }
 
 static int choose_size_bytes(uint32_t size, int spare_msbs) {
@@ -3120,7 +3152,8 @@
     return 1;
 }
 
-static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) {
+static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz,
+                                       const int val) {
   switch (sz) {
     case 1: dst[0] = (uint8_t)(val & 0xff); break;
     case 2: mem_put_le16(dst, val); break;
@@ -3130,7 +3163,7 @@
   }
 }
 
-static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
+static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst,
                        const uint32_t data_size, const uint32_t max_tile_size,
                        const uint32_t max_tile_col_size,
                        int *const tile_size_bytes,
@@ -3139,7 +3172,7 @@
   int tsb;
   int tcsb;
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     // The top bit in the tile size field indicates tile copy mode, so we
     // have 1 less bit to code the tile size
     tsb = choose_size_bytes(max_tile_size, 1);
@@ -3160,25 +3193,25 @@
   uint32_t wpos = 0;
   uint32_t rpos = 0;
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     int tile_row;
     int tile_col;
 
-    for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+    for (tile_col = 0; tile_col < tiles->cols; tile_col++) {
       // All but the last column has a column header
-      if (tile_col < cm->tile_cols - 1) {
+      if (tile_col < tiles->cols - 1) {
         uint32_t tile_col_size = mem_get_le32(dst + rpos);
         rpos += 4;
 
         // Adjust the tile column size by the number of bytes removed
         // from the tile size fields.
-        tile_col_size -= (4 - tsb) * cm->tile_rows;
+        tile_col_size -= (4 - tsb) * tiles->rows;
 
         mem_put_varsize(dst + wpos, tcsb, tile_col_size);
         wpos += tcsb;
       }
 
-      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+      for (tile_row = 0; tile_row < tiles->rows; tile_row++) {
         // All, including the last row has a header
         uint32_t tile_header = mem_get_le32(dst + rpos);
         rpos += 4;
@@ -3206,7 +3239,7 @@
 
     return wpos;
   }
-  const int n_tiles = cm->tile_cols * cm->tile_rows;
+  const int n_tiles = tiles->cols * tiles->rows;
   int n;
 
   for (n = 0; n < n_tiles; n++) {
@@ -3234,11 +3267,12 @@
   return wpos;
 }
 
-uint32_t av1_write_obu_header(AV1_COMP *const cpi, OBU_TYPE obu_type,
-                              int obu_extension, uint8_t *const dst) {
-  if (cpi->keep_level_stats &&
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+                              OBU_TYPE obu_type, int obu_extension,
+                              uint8_t *const dst) {
+  if (level_params->keep_level_stats &&
       (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER))
-    ++cpi->frame_header_count;
+    ++level_params->frame_header_count;
 
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
@@ -3257,11 +3291,12 @@
   return size;
 }
 
-int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
-                        uint8_t *dest) {
-  const uint32_t obu_size = obu_payload_size;
-  const uint32_t offset = obu_header_size;
+int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+                            uint8_t *dest) {
+  const size_t offset = obu_header_size;
   size_t coded_obu_size = 0;
+  const uint32_t obu_size = (uint32_t)obu_payload_size;
+  assert(obu_size == obu_payload_size);
 
   if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset,
                       &coded_obu_size) != 0) {
@@ -3271,18 +3306,17 @@
   return AOM_CODEC_OK;
 }
 
-static size_t obu_memmove(uint32_t obu_header_size, uint32_t obu_payload_size,
+static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size,
                           uint8_t *data) {
   const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
-  const uint32_t move_dst_offset =
-      (uint32_t)length_field_size + obu_header_size;
-  const uint32_t move_src_offset = obu_header_size;
-  const uint32_t move_size = obu_payload_size;
+  const size_t move_dst_offset = length_field_size + obu_header_size;
+  const size_t move_src_offset = obu_header_size;
+  const size_t move_size = obu_payload_size;
   memmove(data + move_dst_offset, data + move_src_offset, move_size);
   return length_field_size;
 }
 
-static void add_trailing_bits(struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) {
   if (aom_wb_is_byte_aligned(wb)) {
     aom_wb_write_literal(wb, 0x80, 8);
   } else {
@@ -3291,74 +3325,80 @@
   }
 }
 
-static void write_bitstream_level(AV1_LEVEL seq_level_idx,
-                                  struct aom_write_bit_buffer *wb) {
+static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx,
+                                             struct aom_write_bit_buffer *wb) {
   assert(is_valid_seq_level_idx(seq_level_idx));
   aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
 }
 
-uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
-  AV1_COMMON *const cm = &cpi->common;
+uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+                                       uint8_t *const dst) {
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
 
-  write_profile(cm->seq_params.profile, &wb);
+  write_profile(seq_params->profile, &wb);
 
   // Still picture or not
-  aom_wb_write_bit(&wb, cm->seq_params.still_picture);
-  assert(IMPLIES(!cm->seq_params.still_picture,
-                 !cm->seq_params.reduced_still_picture_hdr));
+  aom_wb_write_bit(&wb, seq_params->still_picture);
+  assert(IMPLIES(!seq_params->still_picture,
+                 !seq_params->reduced_still_picture_hdr));
   // whether to use reduced still picture header
-  aom_wb_write_bit(&wb, cm->seq_params.reduced_still_picture_hdr);
+  aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr);
 
-  if (cm->seq_params.reduced_still_picture_hdr) {
-    assert(cm->timing_info_present == 0);
-    assert(cm->seq_params.decoder_model_info_present_flag == 0);
-    assert(cm->seq_params.display_model_info_present_flag == 0);
-    write_bitstream_level(cm->seq_params.seq_level_idx[0], &wb);
+  if (seq_params->reduced_still_picture_hdr) {
+    assert(seq_params->timing_info_present == 0);
+    assert(seq_params->decoder_model_info_present_flag == 0);
+    assert(seq_params->display_model_info_present_flag == 0);
+    write_bitstream_level(seq_params->seq_level_idx[0], &wb);
   } else {
-    aom_wb_write_bit(&wb, cm->timing_info_present);  // timing info present flag
+    aom_wb_write_bit(
+        &wb, seq_params->timing_info_present);  // timing info present flag
 
-    if (cm->timing_info_present) {
+    if (seq_params->timing_info_present) {
       // timing_info
-      write_timing_info_header(cm, &wb);
-      aom_wb_write_bit(&wb, cm->seq_params.decoder_model_info_present_flag);
-      if (cm->seq_params.decoder_model_info_present_flag) {
-        write_decoder_model_info(cm, &wb);
+      write_timing_info_header(&seq_params->timing_info, &wb);
+      aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag);
+      if (seq_params->decoder_model_info_present_flag) {
+        write_decoder_model_info(&seq_params->decoder_model_info, &wb);
       }
     }
-    aom_wb_write_bit(&wb, cm->seq_params.display_model_info_present_flag);
-    aom_wb_write_literal(&wb, cm->seq_params.operating_points_cnt_minus_1,
+    aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag);
+    aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1,
                          OP_POINTS_CNT_MINUS_1_BITS);
     int i;
-    for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) {
-      aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i],
+    for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+      aom_wb_write_literal(&wb, seq_params->operating_point_idc[i],
                            OP_POINTS_IDC_BITS);
-      write_bitstream_level(cm->seq_params.seq_level_idx[i], &wb);
-      if (cm->seq_params.seq_level_idx[i] >= SEQ_LEVEL_4_0)
-        aom_wb_write_bit(&wb, cm->seq_params.tier[i]);
-      if (cm->seq_params.decoder_model_info_present_flag) {
-        aom_wb_write_bit(&wb,
-                         cm->op_params[i].decoder_model_param_present_flag);
-        if (cm->op_params[i].decoder_model_param_present_flag)
-          write_dec_model_op_parameters(cm, &wb, i);
+      write_bitstream_level(seq_params->seq_level_idx[i], &wb);
+      if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
+        aom_wb_write_bit(&wb, seq_params->tier[i]);
+      if (seq_params->decoder_model_info_present_flag) {
+        aom_wb_write_bit(
+            &wb, seq_params->op_params[i].decoder_model_param_present_flag);
+        if (seq_params->op_params[i].decoder_model_param_present_flag) {
+          write_dec_model_op_parameters(
+              &seq_params->op_params[i],
+              seq_params->decoder_model_info
+                  .encoder_decoder_buffer_delay_length,
+              &wb);
+        }
       }
-      if (cm->seq_params.display_model_info_present_flag) {
-        aom_wb_write_bit(&wb,
-                         cm->op_params[i].display_model_param_present_flag);
-        if (cm->op_params[i].display_model_param_present_flag) {
-          assert(cm->op_params[i].initial_display_delay <= 10);
-          aom_wb_write_literal(&wb, cm->op_params[i].initial_display_delay - 1,
-                               4);
+      if (seq_params->display_model_info_present_flag) {
+        aom_wb_write_bit(
+            &wb, seq_params->op_params[i].display_model_param_present_flag);
+        if (seq_params->op_params[i].display_model_param_present_flag) {
+          assert(seq_params->op_params[i].initial_display_delay <= 10);
+          aom_wb_write_literal(
+              &wb, seq_params->op_params[i].initial_display_delay - 1, 4);
         }
       }
     }
   }
-  write_sequence_header(&cm->seq_params, &wb);
+  write_sequence_header(seq_params, &wb);
 
-  write_color_config(&cm->seq_params, &wb);
+  write_color_config(seq_params, &wb);
 
-  aom_wb_write_bit(&wb, cm->seq_params.film_grain_params_present);
+  aom_wb_write_bit(&wb, seq_params->film_grain_params_present);
 
   add_trailing_bits(&wb);
 
@@ -3401,27 +3441,32 @@
   size_t total_length;
 } FrameHeaderInfo;
 
+extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+                                                const char *filename);
+
 static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
                                        struct aom_write_bit_buffer *saved_wb,
                                        uint8_t obu_extension_header,
                                        const FrameHeaderInfo *fh_info,
                                        int *const largest_tile_id) {
   AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  AV1LevelParams *const level_params = &cpi->level_params;
   aom_writer mode_bc;
   int tile_row, tile_col;
   // Store the location and size of each tile's data in the bitstream:
   TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
   uint32_t total_size = 0;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = tiles->cols;
+  const int tile_rows = tiles->rows;
   unsigned int tile_size = 0;
   unsigned int max_tile_size = 0;
   unsigned int max_tile_col_size = 0;
-  const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+  const int n_log2_tiles = tiles->log2_rows + tiles->log2_cols;
   // Fixed size tile groups for the moment
-  const int num_tg_hdrs = cm->num_tg;
+  const int num_tg_hdrs = cpi->num_tg;
   const int tg_size =
-      (cm->large_scale_tile)
+      (tiles->large_scale)
           ? 1
           : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
   int tile_count = 0;
@@ -3433,11 +3478,12 @@
 
   *largest_tile_id = 0;
 
-  if (cm->large_scale_tile) {
+  if (tiles->large_scale) {
     // For large_scale_tile case, we always have only one tile group, so it can
     // be written as an OBU_FRAME.
     const OBU_TYPE obu_type = OBU_FRAME;
-    const uint32_t tg_hdr_size = av1_write_obu_header(cpi, obu_type, 0, data);
+    const uint32_t tg_hdr_size =
+        av1_write_obu_header(level_params, obu_type, 0, data);
     data += tg_hdr_size;
 
     const uint32_t frame_header_size =
@@ -3445,9 +3491,8 @@
     data += frame_header_size;
     total_size += frame_header_size;
 
-#define EXT_TILE_DEBUG 0
-#if EXT_TILE_DEBUG
-    {
+    // (yunqing) This test ensures the correctness of large scale tile coding.
+    if (cpi->oxcf.ext_tile_debug) {
       char fn[20] = "./fh";
       fn[4] = cm->current_frame.frame_number / 100 + '0';
       fn[5] = (cm->current_frame.frame_number % 100) / 10 + '0';
@@ -3456,8 +3501,6 @@
       av1_print_uncompressed_frame_header(data - frame_header_size,
                                           frame_header_size, fn);
     }
-#endif  // EXT_TILE_DEBUG
-#undef EXT_TILE_DEBUG
 
     int tile_size_bytes = 0;
     int tile_col_size_bytes = 0;
@@ -3485,9 +3528,9 @@
         // even for the last one, unless no tiling is used at all.
         total_size += data_offset;
         cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
-        mode_bc.allow_update_cdf = !cm->large_scale_tile;
+        mode_bc.allow_update_cdf = !tiles->large_scale;
         mode_bc.allow_update_cdf =
-            mode_bc.allow_update_cdf && !cm->disable_cdf_update;
+            mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
         aom_start_encode(&mode_bc, buf->data + data_offset);
         write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
         aom_stop_encode(&mode_bc);
@@ -3504,7 +3547,7 @@
           // tile header: size of this tile, or copy offset
           uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
           const int tile_copy_mode =
-              ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256)
+              ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256)
                   ? 1
                   : 0;
 
@@ -3541,7 +3584,7 @@
     }
 
     if (have_tiles) {
-      total_size = remux_tiles(cm, data, total_size - frame_header_size,
+      total_size = remux_tiles(tiles, data, total_size - frame_header_size,
                                max_tile_size, max_tile_col_size,
                                &tile_size_bytes, &tile_col_size_bytes);
       total_size += frame_header_size;
@@ -3554,7 +3597,7 @@
     const uint32_t obu_payload_size = total_size - tg_hdr_size;
     const size_t length_field_size =
         obu_memmove(tg_hdr_size, obu_payload_size, dst);
-    if (write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) !=
+    if (av1_write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) !=
         AOM_CODEC_OK) {
       assert(0);
     }
@@ -3591,8 +3634,8 @@
         // tile group header
         const OBU_TYPE obu_type =
             (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP;
-        curr_tg_data_size =
-            av1_write_obu_header(cpi, obu_type, obu_extension_header, data);
+        curr_tg_data_size = av1_write_obu_header(level_params, obu_type,
+                                                 obu_extension_header, data);
         obu_header_size = curr_tg_data_size;
 
         if (num_tg_hdrs == 1) {
@@ -3602,7 +3645,7 @@
         curr_tg_data_size += write_tile_group_header(
             data + curr_tg_data_size, tile_idx,
             AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1),
-            n_log2_tiles, cm->num_tg > 1);
+            n_log2_tiles, cpi->num_tg > 1);
         total_size += curr_tg_data_size;
         tile_data_start += curr_tg_data_size;
         new_tg = 0;
@@ -3626,7 +3669,7 @@
       cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
       mode_bc.allow_update_cdf = 1;
       mode_bc.allow_update_cdf =
-          mode_bc.allow_update_cdf && !cm->disable_cdf_update;
+          mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
       const int num_planes = av1_num_planes(cm);
       av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
 
@@ -3651,7 +3694,7 @@
         const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size;
         const size_t length_field_size =
             obu_memmove(obu_header_size, obu_payload_size, data);
-        if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+        if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
             AOM_CODEC_OK) {
           assert(0);
         }
@@ -3664,7 +3707,7 @@
           saved_wb->bit_buffer += length_field_size;
         }
 
-        if (!first_tg && cm->error_resilient_mode) {
+        if (!first_tg && cm->features.error_resilient_mode) {
           // Make room for a duplicate Frame Header OBU.
           memmove(data + fh_info->total_length, data, curr_tg_data_size);
 
@@ -3678,7 +3721,7 @@
 
           // Rewrite the OBU header to change the OBU type to Redundant Frame
           // Header.
-          av1_write_obu_header(cpi, OBU_REDUNDANT_FRAME_HEADER,
+          av1_write_obu_header(level_params, OBU_REDUNDANT_FRAME_HEADER,
                                obu_extension_header,
                                &data[fh_info->obu_header_byte_offset]);
 
@@ -3699,7 +3742,7 @@
     // cdf update. The encoder currently sets it to the largest tile
     // (but is up to the encoder)
     aom_wb_overwrite_literal(saved_wb, *largest_tile_id,
-                             cm->log2_tile_cols + cm->log2_tile_rows);
+                             tiles->log2_cols + tiles->log2_rows);
     // If more than one tile group. tile_size_bytes takes the default value 4
     // and does not need to be set. For a single tile group it is set in the
     // section below.
@@ -3709,7 +3752,7 @@
       const uint32_t tile_data_size = total_size - tile_data_offset;
 
       total_size =
-          remux_tiles(cm, tile_data_start, tile_data_size, max_tile_size,
+          remux_tiles(tiles, tile_data_start, tile_data_size, max_tile_size,
                       max_tile_col_size, &tile_size_bytes, &unused);
       total_size += tile_data_offset;
       assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
@@ -3744,33 +3787,92 @@
   return total_size;
 }
 
+static size_t av1_write_metadata_obu(const aom_metadata_t *metadata,
+                                     uint8_t *const dst) {
+  size_t coded_metadata_size = 0;
+  const uint64_t metadata_type = (uint64_t)metadata->type;
+  if (aom_uleb_encode(metadata_type, sizeof(metadata_type), dst,
+                      &coded_metadata_size) != 0) {
+    return 0;
+  }
+  memcpy(dst + coded_metadata_size, metadata->payload, metadata->sz);
+  // Add trailing bits.
+  dst[coded_metadata_size + metadata->sz] = 0x80;
+  return (uint32_t)(coded_metadata_size + metadata->sz + 1);
+}
+
+static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) {
+  if (!cpi->source) return 0;
+  AV1_COMMON *const cm = &cpi->common;
+  aom_metadata_array_t *arr = cpi->source->metadata;
+  if (!arr) return 0;
+  size_t obu_header_size = 0;
+  size_t obu_payload_size = 0;
+  size_t total_bytes_written = 0;
+  size_t length_field_size = 0;
+  for (size_t i = 0; i < arr->sz; i++) {
+    aom_metadata_t *current_metadata = arr->metadata_array[i];
+    if (current_metadata && current_metadata->payload) {
+      if ((cm->current_frame.frame_type == KEY_FRAME &&
+           current_metadata->insert_flag == AOM_MIF_KEY_FRAME) ||
+          (cm->current_frame.frame_type != KEY_FRAME &&
+           current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) ||
+          current_metadata->insert_flag == AOM_MIF_ANY_FRAME) {
+        obu_header_size =
+            av1_write_obu_header(&cpi->level_params, OBU_METADATA, 0, dst);
+        obu_payload_size =
+            av1_write_metadata_obu(current_metadata, dst + obu_header_size);
+        length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst);
+        if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) ==
+            AOM_CODEC_OK) {
+          const size_t obu_size = obu_header_size + obu_payload_size;
+          dst += obu_size + length_field_size;
+          total_bytes_written += obu_size + length_field_size;
+        } else {
+          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+                             "Error writing metadata OBU size");
+        }
+      }
+    }
+  }
+  return total_bytes_written;
+}
+
 int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
                        int *const largest_tile_id) {
   uint8_t *data = dst;
   uint32_t data_size;
   AV1_COMMON *const cm = &cpi->common;
+  AV1LevelParams *const level_params = &cpi->level_params;
   uint32_t obu_header_size = 0;
   uint32_t obu_payload_size = 0;
   FrameHeaderInfo fh_info = { NULL, 0, 0 };
   const uint8_t obu_extension_header =
       cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0;
 
+  // If no non-zero delta_q has been used, reset delta_q_present_flag
+  if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) {
+    cm->delta_q_info.delta_q_present_flag = 0;
+  }
+
 #if CONFIG_BITSTREAM_DEBUG
   bitstream_queue_reset_write();
 #endif
 
-  cpi->frame_header_count = 0;
+  level_params->frame_header_count = 0;
 
   // The TD is now written outside the frame encode loop
 
   // write sequence header obu if KEY_FRAME, preceded by 4-byte size
   if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
-    obu_header_size = av1_write_obu_header(cpi, OBU_SEQUENCE_HEADER, 0, data);
+    obu_header_size =
+        av1_write_obu_header(level_params, OBU_SEQUENCE_HEADER, 0, data);
 
-    obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size);
+    obu_payload_size =
+        av1_write_sequence_header_obu(&cm->seq_params, data + obu_header_size);
     const size_t length_field_size =
         obu_memmove(obu_header_size, obu_payload_size, data);
-    if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+    if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
     }
@@ -3778,20 +3880,23 @@
     data += obu_header_size + obu_payload_size + length_field_size;
   }
 
+  // write metadata obus before the frame obu that has the show_frame flag set
+  if (cm->show_frame) data += av1_write_metadata_array(cpi, data);
+
   const int write_frame_header =
-      (cm->num_tg > 1 || encode_show_existing_frame(cm));
+      (cpi->num_tg > 1 || encode_show_existing_frame(cm));
   struct aom_write_bit_buffer saved_wb;
   if (write_frame_header) {
     // Write Frame Header OBU.
     fh_info.frame_header = data;
-    obu_header_size =
-        av1_write_obu_header(cpi, OBU_FRAME_HEADER, obu_extension_header, data);
+    obu_header_size = av1_write_obu_header(level_params, OBU_FRAME_HEADER,
+                                           obu_extension_header, data);
     obu_payload_size =
         write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
 
     const size_t length_field_size =
         obu_memmove(obu_header_size, obu_payload_size, data);
-    if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+    if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
     }
diff --git a/libaom/av1/encoder/bitstream.h b/libaom/av1/encoder/bitstream.h
index b05d0d5..45151e2 100644
--- a/libaom/av1/encoder/bitstream.h
+++ b/libaom/av1/encoder/bitstream.h
@@ -23,22 +23,23 @@
 // Writes only the OBU Sequence Header payload, and returns the size of the
 // payload written to 'dst'. This function does not write the OBU header, the
 // optional extension, or the OBU size to 'dst'.
-uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst);
+uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+                                       uint8_t *const dst);
 
 // Writes the OBU header byte, and the OBU header extension byte when
 // 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
-uint32_t av1_write_obu_header(AV1_COMP *const cpi, OBU_TYPE obu_type,
-                              int obu_extension, uint8_t *const dst);
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+                              OBU_TYPE obu_type, int obu_extension,
+                              uint8_t *const dst);
 
-int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
-                        uint8_t *dest);
+int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+                            uint8_t *dest);
 
 int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
                        int *const largest_tile_id);
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
-                       int blk_row, int blk_col, int plane, TX_SIZE tx_size,
-                       aom_writer *w);
+                       TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/block.h b/libaom/av1/encoder/block.h
index 96b0991..5a74567 100644
--- a/libaom/av1/encoder/block.h
+++ b/libaom/av1/encoder/block.h
@@ -15,23 +15,48 @@
 #include "av1/common/entropymv.h"
 #include "av1/common/entropy.h"
 #include "av1/common/mvref_common.h"
-#include "av1/encoder/hash.h"
-#if CONFIG_DIST_8X8
-#include "aom/aomcx.h"
+
+#include "av1/encoder/enc_enums.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_cnn_weights.h"
 #endif
 
+#include "av1/encoder/hash.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define MC_FLOW_BSIZE_1D 16
+#define MC_FLOW_NUM_PELS (MC_FLOW_BSIZE_1D * MC_FLOW_BSIZE_1D)
+#define MAX_MC_FLOW_BLK_IN_SB (MAX_SB_SIZE / MC_FLOW_BSIZE_1D)
+#define MAX_WINNER_MODE_COUNT_INTRA 3
+#define MAX_WINNER_MODE_COUNT_INTER 1
+typedef struct {
+  MB_MODE_INFO mbmi;
+  RD_STATS rd_cost;
+  int64_t rd;
+  int rate_y;
+  int rate_uv;
+  uint8_t color_index_map[64 * 64];
+  THR_MODES mode_index;
+} WinnerModeStats;
+
 typedef struct {
   unsigned int sse;
   int sum;
   unsigned int var;
 } DIFF;
 
+enum {
+  NO_TRELLIS_OPT,          // No trellis optimization
+  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
+  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode pass
+  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
+
 typedef struct macroblock_plane {
-  DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, src_diff[MAX_SB_SQUARE]);
   tran_low_t *qcoeff;
   tran_low_t *coeff;
   uint16_t *eobs;
@@ -67,28 +92,32 @@
 typedef struct {
   tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
   uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
-  uint8_t txb_skip_ctx[MAX_MB_PLANE]
-                      [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
-  int dc_sign_ctx[MAX_MB_PLANE]
-                 [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+  // Transform block entropy contexts.
+  // Bits 0~3: txb_skip_ctx; bits 4~5: dc_sign_ctx.
+  uint8_t entropy_ctx[MAX_MB_PLANE]
+                     [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
 } CB_COEFF_BUFFER;
 
 typedef struct {
   // TODO(angiebird): Reduce the buffer size according to sb_type
-  CB_COEFF_BUFFER *cb_coef_buff;
-  CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+  uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
   int_mv global_mvs[REF_FRAMES];
-  int cb_offset;
   int16_t mode_context[MODE_CTX_REF_FRAMES];
   uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
 } MB_MODE_INFO_EXT;
 
+// Structure to store best mode information at frame level. This
+// frame level information will be used during bitstream preparation stage.
 typedef struct {
-  int col_min;
-  int col_max;
-  int row_min;
-  int row_max;
-} MvLimits;
+  CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE];
+  uint16_t weight[USABLE_REF_MV_STACK_SIZE];
+  // TODO(Ravi/Remya): Reduce the buffer size of global_mvs
+  int_mv global_mvs[REF_FRAMES];
+  int cb_offset;
+  int16_t mode_context;
+  uint8_t ref_mv_count;
+} MB_MODE_INFO_EXT_FRAME;
 
 typedef struct {
   uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
@@ -99,7 +128,7 @@
   TX_SIZE tx_size;
   TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
   uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
+  uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
   RD_STATS rd_stats;
   uint32_t hash_value;
 } MB_RD_INFO;
@@ -122,6 +151,7 @@
   uint8_t txb_entropy_ctx;
   uint8_t valid;
   uint8_t fast;  // This is not being used now.
+  uint8_t perform_block_coeff_opt;
 } TXB_RD_INFO;
 
 #define TX_SIZE_RD_RECORD_BUFFER_LEN 256
@@ -143,6 +173,7 @@
   RD_STATS rd_stats_y;
   RD_STATS rd_stats_uv;
   uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
   uint8_t skip;
   uint8_t disable_skip;
   uint8_t early_skipped;
@@ -151,58 +182,41 @@
 // 4: NEAREST, NEW, NEAR, GLOBAL
 #define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
 
-// Region size for mode decision sampling in the first pass of partition
-// search(two_pass_partition_search speed feature), in units of mi size(4).
-// Used by the mode pruning in two_pass_partition_search feature.
-#define FIRST_PARTITION_PASS_SAMPLE_REGION 8
-#define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3
-#define FIRST_PARTITION_PASS_STATS_TABLES                     \
-  (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) * \
-      (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
-#define FIRST_PARTITION_PASS_STATS_STRIDE \
-  (MAX_MIB_SIZE_LOG2 - FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
-
-static INLINE int av1_first_partition_pass_stats_index(int mi_row, int mi_col) {
-  const int row =
-      (mi_row & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2;
-  const int col =
-      (mi_col & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2;
-  return (row << FIRST_PARTITION_PASS_STATS_STRIDE) + col;
-}
-
-typedef struct {
-  uint8_t ref0_counts[REF_FRAMES];  // Counters for ref_frame[0].
-  uint8_t ref1_counts[REF_FRAMES];  // Counters for ref_frame[1].
-  int sample_counts;                // Number of samples collected.
-  uint8_t interintra_motion_mode_count[REF_FRAMES];  // Counter for interintra
-                                                     // motion mode
-} FIRST_PARTITION_PASS_STATS;
-
-#define MAX_INTERP_FILTER_STATS 64
-typedef struct {
-  InterpFilters filters;
-  int_mv mv[2];
-  int8_t ref_frames[2];
-  COMPOUND_TYPE comp_type;
-  int64_t rd;
-  int skip_txfm_sb;
-  int64_t skip_sse_sb;
-  unsigned int pred_sse;
-} INTERPOLATION_FILTER_STATS;
-
 #define MAX_COMP_RD_STATS 64
 typedef struct {
   int32_t rate[COMPOUND_TYPES];
   int64_t dist[COMPOUND_TYPES];
-  int64_t comp_model_rd[COMPOUND_TYPES];
+  int32_t model_rate[COMPOUND_TYPES];
+  int64_t model_dist[COMPOUND_TYPES];
+  int comp_rs2[COMPOUND_TYPES];
   int_mv mv[2];
   MV_REFERENCE_FRAME ref_frames[2];
   PREDICTION_MODE mode;
-  InterpFilters filter;
+  int_interpfilters filter;
   int ref_mv_idx;
   int is_global[2];
+  INTERINTER_COMPOUND_DATA interinter_comp;
 } COMP_RD_STATS;
 
+// Struct for buffers used by av1_compound_type_rd() function.
+// For sizes and alignment of these arrays, refer to
+// alloc_compound_type_rd_buffers() function.
+typedef struct {
+  uint8_t *pred0;
+  uint8_t *pred1;
+  int16_t *residual1;          // src - pred1
+  int16_t *diff10;             // pred1 - pred0
+  uint8_t *tmp_best_mask_buf;  // backup of the best segmentation mask
+} CompoundTypeRdBuffers;
+
+enum {
+  MV_COST_ENTROPY,    // Use the entropy rate of the mv as the cost
+  MV_COST_L1_LOWRES,  // Use the l1 norm of the mv as the cost (<480p)
+  MV_COST_L1_MIDRES,  // Use the l1 norm of the mv as the cost (>=480p)
+  MV_COST_L1_HDRES,   // Use the l1 norm of the mv as the cost (>=720p)
+  MV_COST_NONE        // Use 0 as cost irrespective of the current mv
+} UENUM1BYTE(MV_COST_TYPE);
+
 struct inter_modes_info;
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
@@ -213,24 +227,9 @@
   // to select transform kernel.
   int rd_model;
 
-  // Indicate if the encoder is running in the first pass partition search.
-  // In that case, apply certain speed features therein to reduce the overhead
-  // cost in the first pass search.
-  int cb_partition_scan;
-
-  FIRST_PARTITION_PASS_STATS
-  first_partition_pass_stats[FIRST_PARTITION_PASS_STATS_TABLES];
-
-  // [comp_idx][saved stat_idx]
-  INTERPOLATION_FILTER_STATS interp_filter_stats[2][MAX_INTERP_FILTER_STATS];
-  int interp_filter_stats_idx[2];
-
-  // prune_comp_search_by_single_result (3:MAX_REF_MV_SERCH)
+  // prune_comp_search_by_single_result (3:MAX_REF_MV_SEARCH)
   SimpleRDState simple_rd_state[SINGLE_REF_MODES][3];
 
-  // Activate constrained coding block partition search range.
-  int use_cb_search_range;
-
   // Inter macroblock RD search info.
   MB_RD_RECORD mb_rd_record;
 
@@ -245,6 +244,11 @@
 
   MACROBLOCKD e_mbd;
   MB_MODE_INFO_EXT *mbmi_ext;
+  MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame;
+  // Array of mode stats for winner mode processing
+  WinnerModeStats winner_mode_stats[AOMMAX(MAX_WINNER_MODE_COUNT_INTRA,
+                                           MAX_WINNER_MODE_COUNT_INTER)];
+  int winner_mode_count;
   int skip_block;
   int qindex;
 
@@ -253,16 +257,10 @@
   int errorperbit;
   // The equivalend SAD error of one (whole) bit at the current quantizer
   // for large blocks.
-  int sadperbit16;
-  // The equivalend SAD error of one (whole) bit at the current quantizer
-  // for sub-8x8 blocks.
-  int sadperbit4;
+  int sadperbit;
   int rdmult;
-  int cb_rdmult;
   int mb_energy;
   int sb_energy_level;
-  int *m_search_count_ptr;
-  int *ex_search_count_ptr;
 
   unsigned int txb_split_count;
 #if CONFIG_SPEED_STATS
@@ -279,8 +277,11 @@
   unsigned int simple_motion_pred_sse;
   unsigned int pred_sse[REF_FRAMES];
   int pred_mv_sad[REF_FRAMES];
+  int best_pred_mv_sad;
 
   int nmv_vec_cost[MV_JOINTS];
+  int nmv_costs[2][MV_VALS];
+  int nmv_costs_hp[2][MV_VALS];
   int *nmvcost[2];
   int *nmvcost_hp[2];
   int **mv_cost_stack;
@@ -291,6 +292,7 @@
   uint8_t *left_pred_buf;
 
   PALETTE_BUFFER *palette_buffer;
+  CompoundTypeRdBuffers comp_rd_buffer;
 
   CONV_BUF_TYPE *tmp_conv_dst;
   uint8_t *tmp_obmc_bufs[2];
@@ -305,31 +307,23 @@
 
   struct inter_modes_info *inter_modes_info;
 
-  // buffer for hash value calculation of a block
-  // used only in av1_get_block_hash_value()
-  // [first hash/second hash]
-  // [two buffers used ping-pong]
-  uint32_t *hash_value_buffer[2][2];
-
-  CRC_CALCULATOR crc_calculator1;
-  CRC_CALCULATOR crc_calculator2;
-  int g_crc_initialized;
+  // Contains the hash table, hash function, and buffer used for intrabc
+  IntraBCHashInfo intrabc_hash_info;
 
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
-  MvLimits mv_limits;
+  FullMvLimits mv_limits;
 
   uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
-  int skip;
-  int skip_chroma_rd;
+  // Force the coding block to skip transform and quantization.
+  int force_skip;
   int skip_cost[SKIP_CONTEXTS][2];
 
   int skip_mode;  // 0: off; 1: on
   int skip_mode_cost[SKIP_CONTEXTS][2];
 
-  int compound_idx;
-
   LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
   LV_MAP_EOB_COST eob_costs[7][2];
   uint16_t cb_offset;
@@ -393,14 +387,6 @@
   // Used to store sub partition's choices.
   MV pred_mv[REF_FRAMES];
 
-  // Store the best motion vector during motion search
-  int_mv best_mv;
-  // Store the second best motion vector during full-pixel motion search
-  int_mv second_best_mv;
-
-  // Store the fractional best motion vector during sub/Qpel-pixel motion search
-  int_mv fractional_best_mv[3];
-
   // Ref frames that are selected by square partition blocks within a super-
   // block, in MI resolution. They can be used to prune ref frames for
   // rectangular blocks.
@@ -410,18 +396,14 @@
   int use_default_intra_tx_type;
   // use default transform and skip transform type search for inter modes
   int use_default_inter_tx_type;
-#if CONFIG_DIST_8X8
-  int using_dist_8x8;
-  aom_tune_metric tune_metric;
-#endif  // CONFIG_DIST_8X8
   int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
   int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
-  // Bit flags for pruning tx type search, tx split, etc.
-  int tx_search_prune[EXT_TX_SET_TYPES];
   int must_find_valid_partition;
-  int tx_split_prune_flag;  // Flag to skip tx split RD search.
   int recalc_luma_mc_data;  // Flag to indicate recalculation of MC data during
                             // interpolation filter search
+  int prune_mode;
+  uint32_t tx_domain_dist_threshold;
+  int use_transform_domain_distortion;
   // The likelihood of an edge existing in the block (using partial Canny edge
   // detection). For reference, 556 is the value returned for a solid
   // vertical black/white edge.
@@ -429,12 +411,87 @@
   // The strongest edge strength seen along the x/y axis.
   uint16_t edge_strength_x;
   uint16_t edge_strength_y;
+  uint8_t compound_idx;
 
   // [Saved stat index]
   COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
   int comp_rd_stats_idx;
+
+  CB_COEFF_BUFFER *cb_coef_buff;
+
+  // Threshold used to decide the applicability of R-D optimization of
+  // quantized coeffs
+  uint32_t coeff_opt_dist_threshold;
+
+#if !CONFIG_REALTIME_ONLY
+  int quad_tree_idx;
+  int cnn_output_valid;
+  float cnn_buffer[CNN_OUT_BUF_SIZE];
+  float log_q;
+#endif
+  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+  // 0 - 128x128
+  // 1-2 - 128x64
+  // 3-4 - 64x128
+  // 5-8 - 64x64
+  // 9-16 - 64x32
+  // 17-24 - 32x64
+  // 25-40 - 32x32
+  // 41-104 - 16x16
+  uint8_t variance_low[105];
+  uint8_t content_state_sb;
+  // Strong color activity detection. Used in REALTIME coding mode to enhance
+  // the visual quality at the boundary of moving color objects.
+  uint8_t color_sensitivity[2];
+  int nonrd_prune_ref_frame_search;
+
+  // Used to control the tx size search evaluation for mode processing
+  // (normal/winner mode)
+  int tx_size_search_method;
+  // This tx_mode_search_type is used internally by the encoder, and is not
+  // written to the bitstream. It determines what kind of tx_mode should be
+  // searched. For example, we might set it to TX_MODE_LARGEST to find a good
+  // candidate, then use TX_MODE_SELECT on it
+  TX_MODE tx_mode_search_type;
+
+  // Used to control aggressiveness of skip flag prediction for mode processing
+  // (normal/winner mode)
+  unsigned int predict_skip_level;
+
+  // Copy out this SB's TPL block stats.
+  int valid_cost_b;
+  int64_t inter_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
+  int64_t intra_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
+  int_mv mv_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]
+             [INTER_REFS_PER_FRAME];
+  int cost_stride;
+
+  // The type of mv cost used during motion search
+  MV_COST_TYPE mv_cost_type;
+
+  uint8_t search_ref_frame[REF_FRAMES];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride);
+  void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob);
+#else
+  void (*fwd_txfm4x4)(const int16_t *input, int16_t *output, int stride);
+  void (*inv_txfm_add)(const int16_t *input, uint8_t *dest, int stride,
+                       int eob);
+#endif
 };
 
+// Only consider full SB, MC_FLOW_BSIZE_1D = 16.
+static INLINE int tpl_blocks_in_sb(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_64X64: return 16;
+    case BLOCK_128X128: return 64;
+    default: assert(0);
+  }
+  return -1;
+}
+
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
   static const char LUT[BLOCK_SIZES_ALL] = {
     0,  // BLOCK_4X4
diff --git a/libaom/av1/encoder/cnn.c b/libaom/av1/encoder/cnn.c
new file mode 100644
index 0000000..5d8a236
--- /dev/null
+++ b/libaom/av1/encoder/cnn.c
@@ -0,0 +1,1144 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/cnn.h"
+#include "av1/common/av1_common_int.h"
+
+#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
+
+typedef struct {
+  const float **input;
+  int in_width;
+  int in_height;
+  int in_stride;
+  const CNN_LAYER_CONFIG *layer_config;
+  float **output;
+  int out_stride;
+  int start_idx;
+  int th_step;
+} CONVOLVE_OPS;
+
+typedef float (*activation_fn)(float);
+
+static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); }
+
+static float relu(float x) { return (x < 0) ? 0 : x; }
+
+static float identity(float x) { return x; }
+
+typedef struct {
+  int allocsize;
+  int channels;
+  int width, height, stride;
+  float *buf[CNN_MAX_CHANNELS];
+} TENSOR;
+
+static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }
+
+static void free_tensor(TENSOR *tensor) {
+  if (tensor->allocsize) {
+    aom_free(tensor->buf[0]);
+    tensor->buf[0] = NULL;
+    tensor->allocsize = 0;
+  }
+}
+
+static void realloc_tensor(TENSOR *tensor, int channels, int width,
+                           int height) {
+  const int newallocsize = channels * width * height;
+  if (tensor->allocsize < newallocsize) {
+    free_tensor(tensor);
+    tensor->buf[0] =
+        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
+    tensor->allocsize = newallocsize;
+  }
+  tensor->width = width;
+  tensor->height = height;
+  tensor->stride = width;
+  tensor->channels = channels;
+  for (int c = 1; c < channels; ++c)
+    tensor->buf[c] = &tensor->buf[0][c * width * height];
+}
+
+static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
+                        TENSOR *dst) {
+  assert(src->width == dst->width);
+  assert(src->height == dst->height);
+  assert(copy_channels <= src->channels);
+  if (src->stride == dst->width && dst->stride == dst->width) {
+    for (int c = 0; c < copy_channels; ++c) {
+      memcpy(dst->buf[dst_offset + c], src->buf[c],
+             sizeof(*dst->buf[0]) * src->width * src->height);
+    }
+  } else {
+    for (int c = 0; c < copy_channels; ++c) {
+      for (int r = 0; r < dst->height; ++r) {
+        memcpy(&dst->buf[dst_offset + c][r * dst->stride],
+               &src->buf[c][r * src->stride],
+               dst->width * sizeof(*dst->buf[c]));
+      }
+    }
+  }
+}
+
+static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
+                          int channels, int width, int height, int stride) {
+  tensor->allocsize = 0;
+  tensor->channels = channels;
+  tensor->width = width;
+  tensor->height = height;
+  tensor->stride = stride;
+  if (buf) {
+    for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c];
+  } else {
+    for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL;
+  }
+}
+
+static void swap_tensor(TENSOR *t1, TENSOR *t2) {
+  TENSOR t = *t1;
+  *t1 = *t2;
+  *t2 = t;
+}
+
+// The concatenated tensor goes into dst with first the channels in
+// original dst followed by the channels in the src
+static void concat_tensor(const TENSOR *src, TENSOR *dst) {
+  assert(src->width == dst->width);
+  assert(src->height == dst->height);
+
+  const int dst_channels = dst->channels;
+  const int channels = dst->channels + src->channels;
+  const int newallocsize = channels * dst->width * dst->height;
+  if (dst->allocsize < newallocsize) {
+    TENSOR t;
+    init_tensor(&t);
+    // allocate new buffers and copy first the dst channels
+    realloc_tensor(&t, channels, dst->width, dst->height);
+    copy_tensor(dst, dst->channels, 0, &t);
+    // Swap the tensors and free the old buffers
+    swap_tensor(dst, &t);
+    free_tensor(&t);
+  }
+  for (int c = 1; c < channels; ++c)
+    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
+  // Copy the channels in src after the first dst_channels channels.
+  copy_tensor(src, src->channels, dst_channels, dst);
+}
+
+int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
+  return (t1->width == t2->width && t1->height == t2->height);
+}
+
+int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
+  return (t1->channels == t2->channels && t1->width == t2->width &&
+          t1->height == t2->height);
+}
+
+static void find_layer_output_size(int in_width, int in_height,
+                                   const CNN_LAYER_CONFIG *layer_config,
+                                   int *out_width, int *out_height) {
+  if (!layer_config->deconvolve) {
+    switch (layer_config->pad) {
+      case PADDING_SAME_ZERO:
+      case PADDING_SAME_REPLICATE:
+        *out_width = (in_width + layer_config->skip_width - 1) /
+                     layer_config->skip_width;
+        *out_height = (in_height + layer_config->skip_height - 1) /
+                      layer_config->skip_height;
+        break;
+      case PADDING_VALID:
+        *out_width =
+            (in_width - layer_config->filter_width + layer_config->skip_width) /
+            layer_config->skip_width;
+        *out_height = (in_height - layer_config->filter_height +
+                       layer_config->skip_height) /
+                      layer_config->skip_height;
+        break;
+      default: assert(0 && "Unknown padding type");
+    }
+  } else {
+    switch (layer_config->pad) {
+      case PADDING_SAME_ZERO:
+      case PADDING_SAME_REPLICATE:
+        *out_width = in_width * layer_config->skip_width;
+        *out_height = in_height * layer_config->skip_height;
+        break;
+      case PADDING_VALID:
+        *out_width = (in_width - 1) * layer_config->skip_width +
+                     layer_config->filter_width;
+        *out_height = (in_height - 1) * layer_config->skip_height +
+                      layer_config->filter_height;
+        break;
+      default: assert(0 && "Unknown padding type");
+    }
+  }
+}
+
+void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
+                           int channels_per_branch[]) {
+  int branch = layer_config->branch;
+  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+      if (layer_config->branch_copy_type == BRANCH_INPUT) {
+        channels_per_branch[b] = layer_config->in_channels;
+      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+        channels_per_branch[b] = layer_config->out_channels;
+      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
+        channels_per_branch[b] = layer_config->out_channels;
+        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
+            assert(channels_per_branch[c] > 0);
+            channels_per_branch[b] += channels_per_branch[c];
+          }
+        }
+      }
+    }
+  }
+  channels_per_branch[branch] = layer_config->out_channels;
+  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
+      assert(channels_per_branch[c] > 0);
+      channels_per_branch[branch] += channels_per_branch[c];
+    }
+  }
+}
+
+#if CONFIG_DEBUG
+static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
+  const int num_layers = cnn_config->num_layers;
+  const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;
+
+  for (int idx = 0; idx < num_layers; idx++) {
+    if (layer_configs[idx].output_num != -1) {
+      return 1;
+    }
+  }
+  return 0;
+}
+#endif
+
// Computes, for each output layer of the CNN, the output width, height and
// channel count that would result from running the network on an input of
// in_width x in_height samples (plus the configured border extension).
// out_width, out_height and out_channels are arrays indexed by each output
// layer's output_num. Branch dimension bookkeeping mirrors the propagation
// rules applied at inference time in av1_cnn_predict_c.
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  // The primary branch (0) starts from the border-extended input image.
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    // A BRANCH_INPUT copy propagates the current *input* dimensions to every
    // branch selected in input_to_branches (other than this one).
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    find_layer_output_size(i_width[branch], i_height[branch], layer_config,
                           &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    // A BRANCH_OUTPUT copy propagates the freshly computed *output*
    // dimensions instead.
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}
+
+activation_fn get_activation(ACTIVATION layer_activation) {
+  switch (layer_activation) {
+    case NONE: return identity;
+    case RELU: return relu;
+    case SOFTSIGN: return softsign;
+    case SIGMOID:
+      assert(0 && "Sigmoid has not been supported in CNN.");  // TO DO
+      return NULL;
+    default: assert(0 && "Unknown activation type"); return NULL;
+  }
+}
+
+static INLINE int get_start_shift_convolve(int width, int filt_width,
+                                           int stride) {
+  const int mod = (width % stride);
+  const int filt_off = (filt_width - 1) / 2;
+  const int dif = (mod ? mod - 1 : stride - 1);
+  return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
+}
+
// Element-wise in-place addition of two multi-channel planes:
// output[c][y][x] += add[c][y][x] over a width x height region for each of
// `channels` planes. Both operands use the same row stride.
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int ch = 0; ch < channels; ++ch) {
    float *dst = output[ch];
    const float *src = add[ch];
    for (int row = 0; row < height; ++row) {
      const int base = row * stride;
      for (int col = 0; col < width; ++col) {
        dst[base + col] += src[base + col];
      }
    }
  }
}
+
+void av1_cnn_activate_c(float **output, int channels, int width, int height,
+                        int stride, ACTIVATION layer_activation) {
+  activation_fn activation = get_activation(layer_activation);
+  for (int c = 0; c < channels; ++c) {
+    for (int i = 0; i < height; ++i)
+      for (int j = 0; j < width; ++j)
+        output[c][i * stride + j] = activation(output[c][i * stride + j]);
+  }
+}
+
// Copies the given active tensor into the output tensor of every branch whose
// bit is set in input_to_branches (excluding the active branch itself),
// reallocating each destination to fit. When channels_to_copy is positive only
// that many leading channels are copied; otherwise all channels are copied.
static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Copy layer's active tensor to output tensor of branch b if set in
      // mask. The output becomes the input of the first layer of the branch
      // because the layer of the branch is not the first layer.
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      realloc_tensor(&branch_output[b], copy_channels,
                     layer_active_tensor->width, layer_active_tensor->height);
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
}
+
+static int convolve_layer(void *arg1, void *arg2) {
+  const CONVOLVE_OPS *convolve_ops = arg1;
+  (void)arg2;
+  av1_cnn_convolve(
+      convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
+      convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
+      convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
+  return 1;
+}
+
// Runs one (non-deconvolve) convolution layer across multiple worker threads.
// Worker th computes the output channels {th, th + num_workers, ...}
// (start_idx = th, step = num_workers), so the workers partition the channel
// space without overlap. The last worker runs synchronously on the calling
// thread via execute(); the rest are launched asynchronously and joined with
// sync() afterwards.
static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;

  // Work items must outlive the loop iteration that launches them, hence the
  // function-scope array (one slot per possible worker).
  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                 in_stride,  layer_config, output,
                                 out_stride, th,           num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}
+
// Reference (C) implementation of a CNN convolution layer, optionally fused
// with max-pooling. `input`/`output` are arrays of channel planes with the
// given strides. start_idx/step select an interleaved subset of work so that
// multiple threads can share a layer (start_idx = thread id, step = thread
// count); pass 0/1 for single-threaded operation. Weight layout: weights for
// (in-channel k, out-channel i) at filter tap (l, m) live at
// k * out_channels + i + (l * filter_width + m) * cstep.
void av1_cnn_convolve_c(const float **input, int in_width, int in_height,
                        int in_stride, const CNN_LAYER_CONFIG *layer_config,
                        float **output, int out_stride, int start_idx,
                        int step) {
  assert(!layer_config->deconvolve);
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  // Fused convolution + max-pool path. NOTE(review): this path iterates all
  // output channels regardless of start_idx/step — it appears to be intended
  // for single-threaded use only; confirm against callers.
  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        for (int i = 0; i < layer_config->out_channels; ++i) {
          for (int h = 0, u = 0; h < in_height;
               h += layer_config->skip_height, ++u) {
            for (int w = 0, v = 0; w < in_width;
                 w += layer_config->skip_width, ++v) {
              // (hh, ww) scans the pooling window; the convolution at each
              // position is computed and the max is kept in output[i][u][v].
              for (int hh = h;
                   hh < AOMMIN(in_height, h + layer_config->skip_height);
                   ++hh) {
                for (int ww = w;
                     ww < AOMMIN(in_width, w + layer_config->skip_width);
                     ++ww) {
                  float sum = layer_config->bias[i];
                  for (int k = 0; k < layer_config->in_channels; ++k) {
                    int off = k * layer_config->out_channels + i;
                    for (int l = 0; l < layer_config->filter_height; ++l) {
                      const int ii = hh + l - filter_height_half;
                      for (int m = 0; m < layer_config->filter_width;
                           ++m, off += cstep) {
                        const int jj = ww + m - filter_width_half;
                        // Zero padding: taps outside the image contribute 0.
                        if (ii < 0 || ii >= in_height || jj < 0 ||
                            jj >= in_width)
                          continue;
                        sum += layer_config->weights[off] *
                               input[k][ii * in_stride + jj];
                      }
                    }
                  }
                  const float a = sum;
                  if (h == hh && w == ww)
                    output[i][u * out_stride + v] = a;
                  else
                    output[i][u * out_stride + v] =
                        AOMMAX(output[i][u * out_stride + v], a);
                }
              }
            }
          }
        }
        break;
      case PADDING_SAME_REPLICATE:
        for (int i = 0; i < layer_config->out_channels; ++i) {
          for (int h = 0, u = 0; h < in_height;
               h += layer_config->skip_height, ++u) {
            for (int w = 0, v = 0; w < in_width;
                 w += layer_config->skip_width, ++v) {
              for (int hh = h;
                   hh < AOMMIN(in_height, h + layer_config->skip_height);
                   ++hh) {
                for (int ww = w;
                     ww < AOMMIN(in_width, w + layer_config->skip_width);
                     ++ww) {
                  float sum = layer_config->bias[i];
                  for (int k = 0; k < layer_config->in_channels; ++k) {
                    int off = k * layer_config->out_channels + i;
                    for (int l = 0; l < layer_config->filter_height; ++l) {
                      // Replicate padding: clamp tap coordinates to the image.
                      const int ii =
                          CLAMPINDEX(hh + l - filter_height_half, in_height);
                      for (int m = 0; m < layer_config->filter_width;
                           ++m, off += cstep) {
                        const int jj =
                            CLAMPINDEX(ww + m - filter_width_half, in_width);
                        assert(ii >= 0 && ii < in_height && jj >= 0 &&
                               jj < in_width);
                        sum += layer_config->weights[off] *
                               input[k][ii * in_stride + jj];
                      }
                    }
                  }
                  const float a = sum;
                  if (h == hh && w == ww)
                    output[i][u * out_stride + v] = a;
                  else
                    output[i][u * out_stride + v] =
                        AOMMAX(output[i][u * out_stride + v], a);
                }
              }
            }
          }
        }
        break;
      case PADDING_VALID:
        for (int i = 0; i < layer_config->out_channels; ++i) {
          for (int h = 0, u = 0;
               h < in_height - layer_config->filter_height + 1;
               h += layer_config->skip_height, ++u) {
            for (int w = 0, v = 0;
                 w < in_width - layer_config->filter_width + 1;
                 w += layer_config->skip_width, ++v) {
              for (int hh = h;
                   hh < AOMMIN(in_height, h + layer_config->skip_height);
                   ++hh) {
                for (int ww = w;
                     ww < AOMMIN(in_width, w + layer_config->skip_width);
                     ++ww) {
                  float sum = layer_config->bias[i];
                  for (int k = 0; k < layer_config->in_channels; ++k) {
                    int off = k * layer_config->out_channels + i;
                    for (int l = 0; l < layer_config->filter_height; ++l) {
                      const int ii = hh + l;
                      for (int m = 0; m < layer_config->filter_width;
                           ++m, off += cstep) {
                        const int jj = ww + m;
                        assert(ii >= 0 && ii < in_height && jj >= 0 &&
                               jj < in_width);
                        sum += layer_config->weights[off] *
                               input[k][ii * in_stride + jj];
                      }
                    }
                  }
                  const float a = sum;
                  if (h == hh && w == ww)
                    output[i][u * out_stride + v] = a;
                  else
                    output[i][u * out_stride + v] =
                        AOMMAX(output[i][u * out_stride + v], a);
                }
              }
            }
          }
        }
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // Results in element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      // 1x1 fast path: work is split across threads by interleaving output
      // *columns* (start_idx/step), not output channels.
      const int start_h = get_start_shift_convolve(
          in_height, layer_config->filter_height, layer_config->skip_height);
      const int start_w =
          get_start_shift_convolve(in_width, layer_config->filter_width,
                                   layer_config->skip_width) +
          start_idx * layer_config->skip_width;
      const int out_w_step = AOMMAX(step, 1);
      const int in_w_step = layer_config->skip_width * out_w_step;
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int h = start_h, u = 0; h < in_height;
             h += layer_config->skip_height, ++u) {
          const int in_h = h * in_stride;
          const int out_h = u * out_stride + start_idx;
          for (int w = start_w, out_index = out_h; w < in_width;
               w += in_w_step, out_index += out_w_step) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              sum += layer_config->weights[k * layer_config->out_channels + i] *
                     input[k][in_h + w];
            }
            output[i][out_index] = sum;
          }
        }
      }
      return;
    }
    // General convolution: threads interleave over output channels
    // (i = start_idx, start_idx + channel_step, ...).
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO: {
        const int start_h = get_start_shift_convolve(
            in_height, layer_config->filter_height, layer_config->skip_height);
        const int start_w = get_start_shift_convolve(
            in_width, layer_config->filter_width, layer_config->skip_width);
        const int end_ii_shift = filter_height_half + 1;
        const int end_jj_shift = filter_width_half + 1;
        // *_filter_margin stores the number of pixels along a dimension in the
        // intersection of the complement of the image in the extended image
        // and the filter.
        const int top_filter_margin = layer_config->filter_width * ii_shift;
        const int right_filter_margin = end_jj_shift - in_width;
        for (int i = start_idx; i < layer_config->out_channels;
             i += channel_step) {
          for (int h = start_h, u = 0; h < in_height;
               h += layer_config->skip_height, ++u) {
            const int out_h = u * out_stride;
            // Skip weight taps that fall in the zero-padded margin by
            // advancing `off` instead of multiplying by zero.
            const int top_cstep =
                AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
                    cstep +
                i;
            const int start_ii = AOMMAX(0, h - ii_shift);
            const int end_ii = AOMMIN(in_height, h + end_ii_shift);
            for (int w = start_w, out_index = out_h; w < in_width;
                 w += layer_config->skip_width, ++out_index) {
              const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
              const int right_cstep =
                  AOMMAX(0, right_filter_margin + w) * cstep;
              const int start_jj = AOMMAX(0, w - jj_shift);
              const int end_jj = AOMMIN(in_width, w + end_jj_shift);
              float sum = layer_config->bias[i];
              for (int k = 0; k < layer_config->in_channels; ++k) {
                int off = k * layer_config->out_channels + top_cstep;
                for (int ii = start_ii; ii < end_ii; ++ii) {
                  off += left_cstep;
                  for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
                    sum += layer_config->weights[off] *
                           input[k][ii * in_stride + jj];
                  }
                  off += right_cstep;
                }
              }
              output[i][out_index] = sum;
            }
          }
        }
        break;
      }
      case PADDING_SAME_REPLICATE: {
        // h and w are shifted to an offset coordinate system to reduce in-loop
        // computation.
        const int start_h =
            get_start_shift_convolve(in_height, layer_config->filter_height,
                                     layer_config->skip_height) -
            ii_shift;
        const int start_w =
            get_start_shift_convolve(in_width, layer_config->filter_width,
                                     layer_config->skip_width) -
            jj_shift;
        const int end_h = in_height - ii_shift;
        const int end_w = in_width - jj_shift;
        for (int i = start_idx; i < layer_config->out_channels;
             i += channel_step) {
          for (int h = start_h, u = 0; h < end_h;
               h += layer_config->skip_height, ++u) {
            const int out_h = u * out_stride;
            const int upper_ii_index = layer_config->filter_height + h;
            for (int w = start_w, out_index = out_h; w < end_w;
                 w += layer_config->skip_width, ++out_index) {
              const int upper_jj_index = layer_config->filter_width + w;
              float sum = layer_config->bias[i];
              for (int k = 0; k < layer_config->in_channels; ++k) {
                int off = k * layer_config->out_channels + i;
                for (int ii = h; ii < upper_ii_index; ++ii) {
                  const int clamped_ii = CLAMPINDEX(ii, in_height);
                  for (int jj = w; jj < upper_jj_index; ++jj) {
                    const int clamped_jj = CLAMPINDEX(jj, in_width);
                    assert(clamped_ii >= 0 && clamped_ii < in_height &&
                           clamped_jj >= 0 && clamped_jj < in_width);
                    sum += layer_config->weights[off] *
                           input[k][clamped_ii * in_stride + clamped_jj];
                    off += cstep;
                  }
                }
              }
              output[i][out_index] = sum;
            }
          }
        }
        break;
      }
      case PADDING_VALID: {
        for (int i = start_idx; i < layer_config->out_channels;
             i += channel_step) {
          for (int h = 0, u = 0;
               h < in_height - layer_config->filter_height + 1;
               h += layer_config->skip_height, ++u) {
            const int out_h = u * out_stride;
            const int upper_ii_index = layer_config->filter_height + h;
            for (int w = 0, out_index = out_h;
                 w < in_width - layer_config->filter_width + 1;
                 w += layer_config->skip_width, ++out_index) {
              const int upper_jj_index = layer_config->filter_width + w;
              float sum = layer_config->bias[i];
              for (int k = 0; k < layer_config->in_channels; ++k) {
                int off = k * layer_config->out_channels + i;
                for (int ii = h; ii < upper_ii_index; ++ii) {
                  for (int jj = w; jj < upper_jj_index; ++jj) {
                    assert(ii >= 0 && ii < in_height && jj >= 0 &&
                           jj < in_width);
                    sum += layer_config->weights[off] *
                           input[k][ii * in_stride + jj];
                    off += cstep;
                  }
                }
              }
              output[i][out_index] = sum;
            }
          }
        }
        break;
      }
      default: assert(0 && "Unknown padding type");
    }
  }
}
+
+static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
+  const int dif = AOMMAX(filt_width - stride, 0);
+  return dif / 2;
+}
+
// Applies batch normalization to each channel plane in place:
//   image[ch] = gamma[ch] * (image[ch] - mean[ch]) / std[ch] + beta[ch]
// gamma/beta/mean/std each hold one value per channel. `stride` is the row
// stride of every plane in `image`.
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  // Bug fix: the original assert checked `beta` twice and never checked
  // `mean`, so a NULL mean array would slip past the debug check.
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}
+
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
+                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
+                          float **output, int out_stride) {
+  assert(layer_config->deconvolve);
+
+  const int cstep = layer_config->in_channels * layer_config->out_channels;
+
+  int out_width = 0;
+  int out_height = 0;
+  find_layer_output_size(in_width, in_height, layer_config, &out_width,
+                         &out_height);
+  switch (layer_config->pad) {
+    case PADDING_SAME_ZERO:
+      for (int i = 0; i < layer_config->out_channels; ++i) {
+        for (int u = 0; u < out_height; ++u) {
+          for (int v = 0; v < out_width; ++v) {
+            float sum = layer_config->bias[i];
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int h =
+                    u - l +
+                    get_start_shift_deconvolve(layer_config->filter_height,
+                                               layer_config->skip_height);
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int w =
+                      v - m +
+                      get_start_shift_deconvolve(layer_config->filter_width,
+                                                 layer_config->skip_width);
+                  if ((h % layer_config->skip_height) != 0 ||
+                      (w % layer_config->skip_width) != 0)
+                    continue;
+                  const int ii = h / layer_config->skip_height;
+                  const int jj = w / layer_config->skip_width;
+                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+                    continue;
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            output[i][u * out_stride + v] = sum;
+          }
+        }
+      }
+      break;
+    case PADDING_SAME_REPLICATE:
+      for (int i = 0; i < layer_config->out_channels; ++i) {
+        for (int u = 0; u < out_height; ++u) {
+          for (int v = 0; v < out_width; ++v) {
+            float sum = layer_config->bias[i];
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int h =
+                    u - l +
+                    get_start_shift_deconvolve(layer_config->filter_height,
+                                               layer_config->skip_height);
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int w =
+                      v - m +
+                      get_start_shift_deconvolve(layer_config->filter_width,
+                                                 layer_config->skip_width);
+                  if ((h % layer_config->skip_height) != 0 ||
+                      (w % layer_config->skip_width) != 0)
+                    continue;
+                  const int ii =
+                      CLAMPINDEX(h / layer_config->skip_height, in_height);
+                  const int jj =
+                      CLAMPINDEX(w / layer_config->skip_width, in_width);
+                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+                  continue;
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            output[i][u * out_stride + v] = sum;
+          }
+        }
+      }
+      break;
+    case PADDING_VALID:
+      for (int i = 0; i < layer_config->out_channels; ++i) {
+        for (int u = 0; u < out_height; ++u) {
+          for (int v = 0; v < out_width; ++v) {
+            float sum = layer_config->bias[i];
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int h = u - l;
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int w = v - m;
+                  if ((h % layer_config->skip_height) != 0 ||
+                      (w % layer_config->skip_width) != 0)
+                    continue;
+                  const int ii = h / layer_config->skip_height;
+                  const int jj = w / layer_config->skip_width;
+                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+                    continue;
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            output[i][u * out_stride + v] = sum;
+          }
+        }
+      }
+      break;
+    default: assert(0 && "Unknown padding type");
+  }
+}
+
// Runs the full CNN described by cnn_config on the given input planes,
// writing each output layer's result into output_struct. tensor1[b] holds the
// current input of branch b and tensor2[b] its current output; after each
// layer the two are swapped for the next layer of that branch. Convolution
// layers may be spread over thread_data's workers.
void av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  TENSOR tensor1[CNN_MAX_BRANCHES] = { 0 };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { 0 };

  // The per-output buffers are laid out back-to-back in output_buffer;
  // derive each output's channel-pointer array from the previous one.
  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      // The input planes are wrapped (not copied) into tensor1[0].
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    find_layer_output_size(i_width, i_height, layer_config, &o_width,
                           &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                     o_height);
    } else {  // Output layer
      // Output layers write directly into the caller-provided buffers.
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch,
                                     tensor2);
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    if (layer_config->activation != IDENTITY)
      av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, layer_config->activation);

    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            concat_tensor(&tensor2[b], &tensor2[branch]);
          }
        }
      } else {  // Output layer
        // For an output layer the concatenation target is the caller's
        // buffer, so re-assign it with the combined channel count first,
        // then copy each contributing branch into its channel slot.
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }
  }

  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
}
+
+// Assume output already has proper allocation
+// Assume input image buffers all have same resolution and strides
+void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+                                   int stride, const CNN_CONFIG *cnn_config,
+                                   const CNN_THREAD_DATA *thread_data,
+                                   CNN_MULTI_OUT *output) {
+  const float max_val = 255.0;
+
+  const int in_width = width + 2 * cnn_config->ext_width;
+  const int in_height = height + 2 * cnn_config->ext_height;
+  const int in_channels = cnn_config->layer_config[0].in_channels;
+  float *inputs[CNN_MAX_CHANNELS];
+  float *input_ =
+      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
+  const int in_stride = in_width;
+
+  for (int c = 0; c < in_channels; ++c) {
+    inputs[c] = input_ + c * in_stride * in_height;
+    float *input =
+        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+    if (cnn_config->strict_bounds) {
+      for (int i = 0; i < height; ++i)
+        for (int j = 0; j < width; ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+      // extend left and right
+      for (int i = 0; i < height; ++i) {
+        for (int j = -cnn_config->ext_width; j < 0; ++j)
+          input[i * in_stride + j] = input[i * in_stride];
+        for (int j = width; j < width + cnn_config->ext_width; ++j)
+          input[i * in_stride + j] = input[i * in_stride + width - 1];
+      }
+      // extend top and bottom
+      for (int i = -cnn_config->ext_height; i < 0; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[-cnn_config->ext_width], in_width * sizeof(*input));
+      for (int i = height; i < height + cnn_config->ext_height; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[(height - 1) * in_stride - cnn_config->ext_width],
+               in_width * sizeof(*input));
+    } else {
+      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+           ++i)
+        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+             ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+    }
+  }
+  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
+                  cnn_config, thread_data, output);
+
+  aom_free(input_);
+}
+
+// Assume output already has proper allocation
+// Assume input image buffers all have same resolution and strides
+void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+                                          int stride,
+                                          const CNN_CONFIG *cnn_config,
+                                          const CNN_THREAD_DATA *thread_data,
+                                          int bit_depth,
+                                          CNN_MULTI_OUT *output) {
+  const float max_val = (float)((1 << bit_depth) - 1);
+
+  const int in_width = width + 2 * cnn_config->ext_width;
+  const int in_height = height + 2 * cnn_config->ext_height;
+  const int in_channels = cnn_config->layer_config[0].in_channels;
+  float *inputs[CNN_MAX_CHANNELS];
+  float *input_ =
+      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
+  const int in_stride = in_width;
+
+  for (int c = 0; c < in_channels; ++c) {
+    inputs[c] = input_ + c * in_stride * in_height;
+    float *input =
+        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+    if (cnn_config->strict_bounds) {
+      for (int i = 0; i < height; ++i)
+        for (int j = 0; j < width; ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+      // extend left and right
+      for (int i = 0; i < height; ++i) {
+        for (int j = -cnn_config->ext_width; j < 0; ++j)
+          input[i * in_stride + j] = input[i * in_stride];
+        for (int j = width; j < width + cnn_config->ext_width; ++j)
+          input[i * in_stride + j] = input[i * in_stride + width - 1];
+      }
+      // extend top and bottom
+      for (int i = -cnn_config->ext_height; i < 0; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[-cnn_config->ext_width], in_width * sizeof(*input));
+      for (int i = height; i < height + cnn_config->ext_height; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[(height - 1) * in_stride - cnn_config->ext_width],
+               in_width * sizeof(*input));
+    } else {
+      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+           ++i)
+        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+             ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+    }
+  }
+
+  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
+                  cnn_config, thread_data, output);
+
+  aom_free(input_);
+}
+
// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
// Single-output convenience wrapper: packs |output|/|out_stride| into a
// one-entry CNN_MULTI_OUT and forwards to av1_cnn_predict_img_multi_out().
void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                         const CNN_CONFIG *cnn_config,
                         const CNN_THREAD_DATA *thread_data, float **output,
                         int out_stride) {
  int out_width = 0, out_height = 0, out_channels = 0;
  // Only out_channels is consumed below; out_width/out_height are supplied
  // because av1_find_cnn_output_size fills all three.
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  const int output_chs[1] = { out_channels };
  const int output_strides[1] = { out_stride };
  // Fields not named here (e.g. num_outputs) are zero-initialized by the
  // designated initializer.
  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                  .output_strides = output_strides,
                                  .output_buffer = output };
  av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
                                thread_data, &output_struct);
}
+
// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
// High-bitdepth single-output convenience wrapper: packs |output|/|out_stride|
// into a one-entry CNN_MULTI_OUT and forwards to
// av1_cnn_predict_img_multi_out_highbd().
void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                int stride, const CNN_CONFIG *cnn_config,
                                const CNN_THREAD_DATA *thread_data,
                                int bit_depth, float **output, int out_stride) {
  int out_width = 0, out_height = 0, out_channels = 0;
  // Only out_channels is consumed below; out_width/out_height are supplied
  // because av1_find_cnn_output_size fills all three.
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  const int output_chs[1] = { out_channels };
  const int output_strides[1] = { out_stride };
  // Fields not named here (e.g. num_outputs) are zero-initialized by the
  // designated initializer.
  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                  .output_strides = output_strides,
                                  .output_buffer = output };
  av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config,
                                       thread_data, bit_depth, &output_struct);
}
diff --git a/libaom/av1/encoder/cnn.h b/libaom/av1/encoder/cnn.h
new file mode 100644
index 0000000..706be44
--- /dev/null
+++ b/libaom/av1/encoder/cnn.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CNN_H_
+#define AOM_AV1_COMMON_CNN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <math.h>
+
+#include "aom_util/aom_thread.h"
+#include "config/av1_rtcd.h"
+
+struct AV1Common;
+
+#define CNN_MAX_HIDDEN_LAYERS 64
+#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
+#define CNN_MAX_CHANNELS 256
+#define CNN_MAX_BRANCHES 4
+#define CNN_MAX_THREADS 32
+
+#define NO_BRANCH_CONFIG \
+  { 0, 0, 0 }
+#define NO_BN_PARAMS \
+  { NULL, NULL, NULL, NULL }
+
+enum {
+  PADDING_SAME_ZERO,       // tensorflow's SAME padding with pixels outside
+                           // the image area assumed to be 0 (default)
+  PADDING_SAME_REPLICATE,  // tensorflow's SAME padding with pixels outside
+                           // the image area replicated from closest edge
+  PADDING_VALID            // tensorflow's VALID padding
+} UENUM1BYTE(PADDING_TYPE);
+
+// enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION);
+
+// Times when input tensor may be copied to branches given in input_to_branches.
+// BRANCH_NO_COPY: doesn't copy any tensor.
+// BRANCH_INPUT: copies the input tensor to branches.
+// BRANCH_OUTPUT: copies the convolved tensor to branches.
+// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
+//   tensor. If no combinations happen at this layer, then this option
+//   has the same effect as BRANCH_OUTPUT.
+enum {
+  BRANCH_NO_COPY,
+  BRANCH_INPUT,
+  BRANCH_OUTPUT,
+  BRANCH_COMBINED
+} UENUM1BYTE(BRANCH_COPY);
+
+// Types of combining branches with output of current layer:
+// BRANCH_NOC: no branch combining
+// BRANCH_ADD: Add previously stored branch tensor to output of layer
+// BRANCH_CAT: Concatenate branch tensor to output of layer
+enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
+
+// The parameters used to scale each channel in batch
+// normalization. The processing is done on a per-channel basis.
+// e.g. bn_mean[c] is the mean for all pixels in channel c. This
+// is always applied after activation. The output is given by
+// out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c] where
+// norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c]
+// here we assume that the effect of variance_epsilon is already
+// taken into account when bn_std is calculated. The pointers
+// need to be either all zero or all valid. If all zero, then
+// batchnorm is disabled, else batchnorm is applied.
+struct CNN_BATCHNORM_PARAMS {
+  const float *bn_gamma;
+  const float *bn_beta;
+  const float *bn_mean;
+  const float *bn_std;
+};
+
+struct CNN_BRANCH_CONFIG {
+  int input_to_branches;  // If nonzero, copy the active tensor to the current
+  // layer and store for future use in branches
+  // specified in the field as a binary mask. For
+  // example, if input_to_branch = 0x06, it means the
+  // input tensor to the current branch is copied to
+  // branches 1 and 2 (where 0 represents the primary
+  // branch). One restriction is that the mask
+  // cannot indicate copying to the current branch.
+  // If greater than 0, only copies the channels up
+  // to the given index.
+  int channels_to_copy;  // Within the layer, input a copy of active
+  // tensor to branches given in input_to_branches.
+  int branches_to_combine;  // mask of branches to combine with output of
+  // current layer, if
+  // branch_combine_type != BRANCH_NOC
+  // For example, if branches_to_combine = 0x0A,
+//                           it means that branches 1 and 3 are combined
+  // with the current branch.
+};
+
+struct CNN_LAYER_CONFIG {
+  int in_channels;
+  int filter_width;
+  int filter_height;
+  int out_channels;
+  int skip_width;
+  int skip_height;
+  int maxpool;            // whether to use maxpool or not (only effective when
+                          // skip width or skip_height are > 1)
+  const float *weights;   // array of length filter_height x filter_width x
+                          // in_channels x out_channels where the inner-most
+                          // scan is out_channels and the outer most scan is
+                          // filter_height.
+  const float *bias;      // array of length out_channels
+  PADDING_TYPE pad;       // padding type
+  ACTIVATION activation;  // the activation function to use after convolution
+  int deconvolve;         // whether this is a deconvolution layer.
+                          // 0: If skip_width or skip_height are > 1, then we
+                          // reduce resolution
+                          // 1: If skip_width or skip_height are > 1, then we
+                          // increase resolution
+  int branch;             // branch index in [0, CNN_MAX_BRANCHES - 1], where
+                          // 0 refers to the primary branch.
+  BRANCH_COPY branch_copy_type;
+  BRANCH_COMBINE branch_combine_type;
+  struct CNN_BRANCH_CONFIG branch_config;
+  struct CNN_BATCHNORM_PARAMS
+      bn_params;   // A struct that contains the parameters
+                   // used for batch normalization.
+  int output_num;  // The output buffer idx to which the layer output is
+                   // written. Set to -1 to disable writing it to the output. In
+                   // the case that branch_combine_type is BRANCH_CAT, all
+                   // concatenated channels will be written to output. In the
+                   // case of BRANCH_ADD, the output will be the result of
+                   // summation.
+};
+
+struct CNN_CONFIG {
+  int num_layers;  // number of CNN layers ( = number of hidden layers + 1)
+  int is_residue;  // whether the output activation is a residue
+  int ext_width, ext_height;  // extension horizontally and vertically
+  int strict_bounds;          // whether the input bounds are strict or not.
+                              // If strict, the extension area is filled by
+                              // replication; if not strict, image data is
+                              // assumed available beyond the bounds.
+  CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS];
+};
+
+struct CNN_THREAD_DATA {
+  int num_workers;
+  AVxWorker *workers;
+};
+
+struct CNN_MULTI_OUT {
+  int num_outputs;
+  const int *output_channels;
+  const int *output_strides;
+  float **output_buffer;
+};
+
+// Function to return size of output
+void av1_find_cnn_output_size(int in_width, int in_height,
+                              const CNN_CONFIG *cnn_config, int *out_width,
+                              int *out_height, int *out_channels);
+
+// Prediction functions from set of input image buffers. This function supports
+// CNN with multiple outputs.
+void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+                                   int stride, const CNN_CONFIG *cnn_config,
+                                   const CNN_THREAD_DATA *thread_data,
+                                   struct CNN_MULTI_OUT *output);
+void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+                                          int stride,
+                                          const CNN_CONFIG *cnn_config,
+                                          const CNN_THREAD_DATA *thread_data,
+                                          int bit_depth, CNN_MULTI_OUT *output);
+
+// Prediction functions from set of input image buffers. This function only
+// supports a single output.
+void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
+                         const CNN_CONFIG *cnn_config,
+                         const CNN_THREAD_DATA *thread_data, float **output,
+                         int out_stride);
+void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
+                                int stride, const CNN_CONFIG *cnn_config,
+                                const CNN_THREAD_DATA *thread_data,
+                                int bit_depth, float **output, int out_stride);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_CNN_H_
diff --git a/libaom/av1/encoder/compound_type.c b/libaom/av1/encoder/compound_type.c
new file mode 100644
index 0000000..42095b7
--- /dev/null
+++ b/libaom/av1/encoder/compound_type.c
@@ -0,0 +1,1508 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tx_search.h"
+
+typedef int64_t (*pick_interinter_mask_type)(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+    const uint8_t *const p0, const uint8_t *const p1,
+    const int16_t *const residual1, const int16_t *const diff10,
+    uint64_t *best_sse);
+
// Checks if characteristics of search match
// Returns 1 if the cached compound search stats |st| apply to the current
// block |mi| (same interp filter, references, MVs and global-motion status);
// on a match the cached rate/dist/model/rs2 stats are copied into the output
// arrays. Returns 0 (with outputs untouched for non-matching cases) otherwise.
static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
                                   const MACROBLOCK *const x,
                                   const COMP_RD_STATS *st,
                                   const MB_MODE_INFO *const mi,
                                   int32_t *comp_rate, int64_t *comp_dist,
                                   int32_t *comp_model_rate,
                                   int64_t *comp_model_dist, int *comp_rs2) {
  // TODO(ranjit): Ensure that compound type search use regular filter always
  // and check if following check can be removed
  // Check if interp filter matches with previous case
  if (st->filter.as_int != mi->interp_filters.as_int) return 0;

  const MACROBLOCKD *const xd = &x->e_mbd;
  // Match MV and reference indices
  for (int i = 0; i < 2; ++i) {
    if ((st->ref_frames[i] != mi->ref_frame[i]) ||
        (st->mv[i].as_int != mi->mv[i].as_int)) {
      return 0;
    }
    // Global-motion status of each reference must also agree.
    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
    if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
  }

  // Store the stats for COMPOUND_AVERAGE and COMPOUND_DISTWTD
  for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
       comp_type++) {
    comp_rate[comp_type] = st->rate[comp_type];
    comp_dist[comp_type] = st->dist[comp_type];
    comp_model_rate[comp_type] = st->model_rate[comp_type];
    comp_model_dist[comp_type] = st->model_dist[comp_type];
    comp_rs2[comp_type] = st->comp_rs2[comp_type];
  }

  // For compound wedge/segment, reuse data only if NEWMV is not present in
  // either of the directions
  // NOTE(review): each "* 2" memcpy copies two consecutive array entries
  // starting at COMPOUND_WEDGE — this relies on the entry following
  // COMPOUND_WEDGE in the COMPOUND_TYPE enumeration being adjacent in memory.
  if ((!have_newmv_in_inter_mode(mi->mode) &&
       !have_newmv_in_inter_mode(st->mode)) ||
      (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) {
    memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE],
           sizeof(comp_rate[COMPOUND_WEDGE]) * 2);
    memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE],
           sizeof(comp_dist[COMPOUND_WEDGE]) * 2);
    memcpy(&comp_model_rate[COMPOUND_WEDGE], &st->model_rate[COMPOUND_WEDGE],
           sizeof(comp_model_rate[COMPOUND_WEDGE]) * 2);
    memcpy(&comp_model_dist[COMPOUND_WEDGE], &st->model_dist[COMPOUND_WEDGE],
           sizeof(comp_model_dist[COMPOUND_WEDGE]) * 2);
    memcpy(&comp_rs2[COMPOUND_WEDGE], &st->comp_rs2[COMPOUND_WEDGE],
           sizeof(comp_rs2[COMPOUND_WEDGE]) * 2);
  }
  return 1;
}
+
+// Checks if similar compound type search case is accounted earlier
+// If found, returns relevant rd data
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *x,
+                                        const MB_MODE_INFO *const mbmi,
+                                        int32_t *comp_rate, int64_t *comp_dist,
+                                        int32_t *comp_model_rate,
+                                        int64_t *comp_model_dist, int *comp_rs2,
+                                        int *match_index) {
+  for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
+    if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
+                         comp_dist, comp_model_rate, comp_model_dist,
+                         comp_rs2)) {
+      *match_index = j;
+      return 1;
+    }
+  }
+  return 0;  // no match result found
+}
+
+static INLINE bool enable_wedge_search(MACROBLOCK *const x,
+                                       const AV1_COMP *const cpi) {
+  // Enable wedge search if source variance and edge strength are above
+  // the thresholds.
+  return x->source_variance >
+             cpi->sf.inter_sf.disable_wedge_search_var_thresh &&
+         x->edge_strength > cpi->sf.inter_sf.disable_wedge_search_edge_thresh;
+}
+
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge &&
+         !cpi->sf.inter_sf.disable_interinter_wedge;
+}
+
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge &&
+         !cpi->sf.inter_sf.disable_wedge_interintra_search;
+}
+
+static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                  const BLOCK_SIZE bsize, const uint8_t *pred0,
+                                  int stride0, const uint8_t *pred1,
+                                  int stride1) {
+  static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+    //                            4X4
+    BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+    // 8X16,       16X8,          16X16
+    BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+    // 16X32,      32X16,         32X32
+    BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+    // 32X64,      64X32,         64X64
+    BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+    // 64x128,     128x64,        128x128
+    BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+    // 4X16,       16X4,          8X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+    // 32X8,       16X64,         64X16
+    BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+  };
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int bw_by2 = bw >> 1;
+  const int bh_by2 = bh >> 1;
+  uint32_t esq[2][2];
+  int64_t tl, br;
+
+  const BLOCK_SIZE f_index = split_qtr[bsize];
+  assert(f_index != BLOCK_INVALID);
+
+  if (is_cur_buf_hbd(&x->e_mbd)) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+
+  // Residual variance computation over relevant quandrants in order to
+  // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1),
+  // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0)
+  // The 2nd and 3rd quadrants cancel out in TL + BR
+  // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0)
+  // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
+  // for all codebooks; experiment with other quadrant combinations for
+  // 0, 90 and 135 degrees also.
+  cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                          pred0 + bh_by2 * stride0 + bw_by2, stride0,
+                          &esq[0][1]);
+  cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                          pred1 + bh_by2 * stride1 + bw_by2, stride0,
+                          &esq[1][1]);
+
+  tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
+  br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
+  return (tl + br > 0);
+}
+
+// Choose the best wedge index and sign
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+                          const BLOCK_SIZE bsize, const uint8_t *const p0,
+                          const int16_t *const residual1,
+                          const int16_t *const diff10,
+                          int8_t *const best_wedge_sign,
+                          int8_t *const best_wedge_index, uint64_t *best_sse) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int N = bw * bh;
+  assert(N >= 64);
+  int rate;
+  int64_t dist;
+  int64_t rd, best_rd = INT64_MAX;
+  int8_t wedge_index;
+  int8_t wedge_sign;
+  const int8_t wedge_types = get_wedge_types_lookup(bsize);
+  const uint8_t *mask;
+  uint64_t sse;
+  const int hbd = is_cur_buf_hbd(xd);
+  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+
+  DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (hbd) {
+    aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+  } else {
+    aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+  }
+#else
+  (void)hbd;
+  aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+#endif
+
+  int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+                        (int64_t)aom_sum_squares_i16(residual1, N)) *
+                       (1 << WEDGE_WEIGHT_BITS) / 2;
+  int16_t *ds = residual0;
+
+  av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+    wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+    mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    // int rate2;
+    // int64_t dist2;
+    // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+    // printf("sse %"PRId64": leagacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+    // sse, rate, dist, rate2, dist2); dist = dist2;
+    // rate = rate2;
+
+    rate += x->wedge_idx_cost[bsize][wedge_index];
+    rd = RDCOST(x->rdmult, rate, dist);
+
+    if (rd < best_rd) {
+      *best_wedge_index = wedge_index;
+      *best_wedge_sign = wedge_sign;
+      best_rd = rd;
+      *best_sse = sse;
+    }
+  }
+
+  return best_rd -
+         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
// Choose the best wedge index with the specified sign
// Like pick_wedge(), but the sign is fixed by the caller, so only the index
// is searched. Returns the best RD cost minus the index-signaling cost; the
// winning index and its SSE are written through the out parameters.
static int64_t pick_wedge_fixed_sign(
    const AV1_COMP *const cpi, const MACROBLOCK *const x,
    const BLOCK_SIZE bsize, const int16_t *const residual1,
    const int16_t *const diff10, const int8_t wedge_sign,
    int8_t *const best_wedge_index, uint64_t *best_sse) {
  const MACROBLOCKD *const xd = &x->e_mbd;

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int N = bw * bh;
  assert(N >= 64);
  int rate;
  int64_t dist;
  int64_t rd, best_rd = INT64_MAX;
  int8_t wedge_index;
  const int8_t wedge_types = get_wedge_types_lookup(bsize);
  const uint8_t *mask;
  uint64_t sse;
  const int hbd = is_cur_buf_hbd(xd);
  // High-bitdepth SSE values are scaled down to an 8-bit-equivalent range.
  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
    mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
    sse = ROUND_POWER_OF_TWO(sse, bd_round);

    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
                                                  &rate, &dist);
    rate += x->wedge_idx_cost[bsize][wedge_index];
    rd = RDCOST(x->rdmult, rate, dist);

    if (rd < best_rd) {
      *best_wedge_index = wedge_index;
      best_rd = rd;
      *best_sse = sse;
    }
  }
  // Exclude the index-signaling cost; callers account for it separately.
  return best_rd -
         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
}
+
// Searches the inter-inter wedge index (and sign) for the block and stores
// the winners in mbmi->interinter_comp. With the fast_wedge_sign_estimate
// speed feature the sign is estimated up front and only the index is
// searched; otherwise both are searched jointly. Returns the model RD cost
// (minus the index-signaling cost).
static int64_t pick_interinter_wedge(
    const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
    const uint8_t *const p0, const uint8_t *const p1,
    const int16_t *const residual1, const int16_t *const diff10,
    uint64_t *best_sse) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = xd->mi[0];
  const int bw = block_size_wide[bsize];

  int64_t rd;
  int8_t wedge_index = -1;
  int8_t wedge_sign = 0;

  assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
  assert(cpi->common.seq_params.enable_masked_compound);

  if (cpi->sf.inter_sf.fast_wedge_sign_estimate) {
    // Both predictors are laid out with stride bw here.
    wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
    rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign,
                               &wedge_index, best_sse);
  } else {
    rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign,
                    &wedge_index, best_sse);
  }

  mbmi->interinter_comp.wedge_sign = wedge_sign;
  mbmi->interinter_comp.wedge_index = wedge_index;
  return rd;
}
+
// Searches the best DIFFWTD (difference-weighted) mask type for inter-inter
// compound prediction. Builds each mask type from the two predictors, scores
// it with the masked-compound RD model, stores the winner in
// mbmi->interinter_comp.mask_type, and leaves the winning mask in
// xd->seg_mask. Returns the best model RD cost.
static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
                                   MACROBLOCK *const x, const BLOCK_SIZE bsize,
                                   const uint8_t *const p0,
                                   const uint8_t *const p1,
                                   const int16_t *const residual1,
                                   const int16_t *const diff10,
                                   uint64_t *best_sse) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = xd->mi[0];
  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int N = 1 << num_pels_log2_lookup[bsize];
  int rate;
  int64_t dist;
  DIFFWTD_MASK_TYPE cur_mask_type;
  int64_t best_rd = INT64_MAX;
  DIFFWTD_MASK_TYPE best_mask_type = 0;
  const int hbd = is_cur_buf_hbd(xd);
  // High-bitdepth SSE values are scaled down to an 8-bit-equivalent range.
  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
  DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
  // Mask type 0 is built directly into xd->seg_mask; the other type goes
  // into the local buffer so the winner can be selected afterwards.
  uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
  // try each mask type and its inverse
  for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
    // build mask and inverse
    if (hbd)
      av1_build_compound_diffwtd_mask_highbd(
          tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
          CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
    else
      av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
                                      p0, bw, p1, bw, bh, bw);

    // compute rd for mask
    uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
                                                tmp_mask[cur_mask_type], N);
    sse = ROUND_POWER_OF_TWO(sse, bd_round);

    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
                                                  &rate, &dist);
    const int64_t rd0 = RDCOST(x->rdmult, rate, dist);

    if (rd0 < best_rd) {
      best_mask_type = cur_mask_type;
      best_rd = rd0;
      *best_sse = sse;
    }
  }
  mbmi->interinter_comp.mask_type = best_mask_type;
  // If the locally buffered mask won, copy it into xd->seg_mask.
  if (best_mask_type == DIFFWTD_38_INV) {
    memcpy(xd->seg_mask, seg_mask, N * 2);
  }
  return best_rd;
}
+
// Searches the best wedge index for inter-intra prediction (the wedge sign
// is fixed to 0 for inter-intra). Computes the residuals from the source and
// the two predictors, runs the fixed-sign wedge search, and stores the winner
// in mbmi->interintra_wedge_index. Returns the model RD cost.
static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
                                     const MACROBLOCK *const x,
                                     const BLOCK_SIZE bsize,
                                     const uint8_t *const p0,
                                     const uint8_t *const p1) {
  const MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = xd->mi[0];
  assert(av1_is_wedge_used(bsize));
  assert(cpi->common.seq_params.enable_interintra_compound);

  const struct buf_2d *const src = &x->plane[0].src;
  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
  DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
#if CONFIG_AV1_HIGHBITDEPTH
  if (is_cur_buf_hbd(xd)) {
    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
  } else {
    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
    aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
  }
#else
  aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
  aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
#endif
  int8_t wedge_index = -1;
  uint64_t sse;
  // Sign fixed at 0 for inter-intra; only the index is searched.
  int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0,
                                     &wedge_index, &sse);

  mbmi->interintra_wedge_index = wedge_index;
  return rd;
}
+
+// Builds the two single-reference inter predictors (preds0/preds1) used by
+// the masked compound search, and fills the residual buffers consumed by
+// the mask-picking kernels: residual1 = src - pred1, diff10 = pred1 - pred0.
+static AOM_INLINE void get_inter_predictors_masked_compound(
+    MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1,
+    int16_t *residual1, int16_t *diff10, int *strides) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  // get inter predictors to use for masked compound modes
+  av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0,
+                                                   strides);
+  av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1,
+                                                   strides);
+  const struct buf_2d *const src = &x->plane[0].src;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
+    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
+                              bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd);
+  } else {
+    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
+                       bw);
+    aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+  }
+#else
+  aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw);
+  aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+#endif
+}
+
+// Builds the intra predictor for 'interintra_mode', blends it with the
+// inter predictor held in 'tmp_buf', and estimates the resulting rd cost
+// (including the mode signaling rate) with the model_rd function. Updates
+// *best_interintra_mode / *best_interintra_rd when this mode improves on
+// the best seen so far. Leaves mbmi->interintra_mode set to the mode just
+// evaluated, not necessarily the best one.
+static INLINE void compute_best_interintra_mode(
+    const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd,
+    MACROBLOCK *const x, const int *const interintra_mode_cost,
+    const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf,
+    INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd,
+    INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int rate, skip_txfm_sb;
+  int64_t dist, skip_sse_sb;
+  const int bw = block_size_wide[bsize];
+  mbmi->interintra_mode = interintra_mode;
+  // Rate for signaling this interintra mode.
+  int rmode = interintra_mode_cost[interintra_mode];
+  av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                            intrapred, bw);
+  av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+  model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist,
+                                          &skip_txfm_sb, &skip_sse_sb, NULL,
+                                          NULL, NULL);
+  int64_t rd = RDCOST(x->rdmult, rate + rmode, dist);
+  if (rd < *best_interintra_rd) {
+    *best_interintra_rd = rd;
+    *best_interintra_mode = mbmi->interintra_mode;
+  }
+}
+
+// Estimates the luma-only rd cost of the current prediction by running a
+// transform search restricted to the largest rectangular tx size for bs,
+// then folds the skip-flag signaling cost into rd_stats->rate. Returns the
+// rd cost, or INT64_MAX when ref_best_rd is negative or the transform
+// search gives up early.
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+                                   MACROBLOCK *x, int64_t ref_best_rd,
+                                   RD_STATS *rd_stats) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  if (ref_best_rd < 0) return INT64_MAX;
+  av1_subtract_plane(x, bs, 0);
+  // Temporarily switch to the cheaper transform rd model for the estimate;
+  // restored to FULL_TXFM_RD below.
+  x->rd_model = LOW_TXFM_RD;
+  const int skip_trellis = (cpi->optimize_seg_arr[xd->mi[0]->segment_id] ==
+                            NO_ESTIMATE_YRD_TRELLIS_OPT);
+  const int64_t rd =
+      av1_uniform_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs,
+                           max_txsize_rect_lookup[bs], FTXS_NONE, skip_trellis);
+  x->rd_model = FULL_TXFM_RD;
+  if (rd != INT64_MAX) {
+    const int skip_ctx = av1_get_skip_context(xd);
+    if (rd_stats->skip) {
+      // Everything skipped: rate reduces to the skip flag itself.
+      const int s1 = x->skip_cost[skip_ctx][1];
+      rd_stats->rate = s1;
+    } else {
+      const int s0 = x->skip_cost[skip_ctx][0];
+      rd_stats->rate += s0;
+    }
+  }
+  return rd;
+}
+
+// Derives the rd threshold for the smooth interintra transform search:
+// scale down the best rd seen so far and subtract the rd cost of signaling
+// the mode, so the search can terminate early when it cannot win.
+static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x,
+                                            int total_mode_rate,
+                                            int64_t ref_best_rd) {
+  const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0);
+  const int64_t scaled_best_rd = get_rd_thresh_from_best_rd(
+      ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT),
+      INTER_INTRA_RD_THRESH_SCALE);
+  return scaled_best_rd - mode_rd;
+}
+
+// Exhaustively searches all interintra modes, picking the best wedge mask
+// for each mode, and selects the combination with the smallest total rd
+// (mask rd plus mode/wedge-index signaling cost). Writes the winning mode
+// and wedge index to *best_mode / *best_wedge_index and returns the mask rd
+// (without the signaling overhead) of that winner. mbmi is used as scratch
+// during the search.
+static AOM_INLINE int64_t compute_best_wedge_interintra(
+    const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd,
+    MACROBLOCK *const x, const int *const interintra_mode_cost,
+    const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_,
+    int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+  int64_t best_interintra_rd_wedge = INT64_MAX;
+  int64_t best_total_rd = INT64_MAX;
+  uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+  for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) {
+    mbmi->interintra_mode = mode;
+    // Build the intra predictor for this mode, then search wedge masks.
+    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                              intrapred, bw);
+    int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+    const int rate_overhead =
+        interintra_mode_cost[mode] +
+        x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index];
+    const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0);
+    if (total_rd < best_total_rd) {
+      best_total_rd = total_rd;
+      best_interintra_rd_wedge = rd;
+      *best_mode = mbmi->interintra_mode;
+      *best_wedge_index = mbmi->interintra_wedge_index;
+    }
+  }
+  return best_interintra_rd_wedge;
+}
+
+// Top-level rd search for inter-intra prediction of the current block.
+// Evaluates smooth interintra (plain blend) and/or wedge interintra,
+// depending on encoder options and speed features. On success returns 0 and
+// leaves mbmi holding the winning interintra configuration, with *rate_mv
+// and *tmp_rate2 updated for the mode and any refined motion vector.
+// Returns -1 when interintra is not searched or no candidate is good
+// enough versus ref_best_rd.
+int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                                BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+                                HandleInterModeArgs *args, int64_t ref_best_rd,
+                                int *rate_mv, int *tmp_rate2,
+                                const BUFFER_SET *orig_dst) {
+  const int try_smooth_interintra = cpi->oxcf.enable_smooth_interintra &&
+                                    !cpi->sf.inter_sf.disable_smooth_interintra;
+  const int is_wedge_used = av1_is_wedge_used(bsize);
+  const int try_wedge_interintra =
+      is_wedge_used && enable_wedge_interintra_search(x, cpi);
+  // Nothing to search: tell the caller to skip interintra entirely.
+  if (!try_smooth_interintra && !try_wedge_interintra) return -1;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int64_t rd = INT64_MAX;
+  const int bw = block_size_wide[bsize];
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+  uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+  uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+  const int *const interintra_mode_cost =
+      x->interintra_mode_cost[size_group_lookup[bsize]];
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // Single reference inter prediction
+  mbmi->ref_frame[1] = NONE_FRAME;
+  xd->plane[0].dst.buf = tmp_buf;
+  xd->plane[0].dst.stride = bw;
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+  const int num_planes = av1_num_planes(cm);
+
+  // Restore the buffers for intra prediction
+  restore_dst_buf(xd, *orig_dst, num_planes);
+  mbmi->ref_frame[1] = INTRA_FRAME;
+  // Seed the search with the winner recorded for this reference frame;
+  // INTERINTRA_MODES serves as the "no winner yet" sentinel.
+  INTERINTRA_MODE best_interintra_mode =
+      args->inter_intra_mode[mbmi->ref_frame[0]];
+
+  // Compute smooth_interintra
+  int64_t best_interintra_rd_nowedge = INT64_MAX;
+  int best_mode_rate = INT_MAX;
+  if (try_smooth_interintra) {
+    mbmi->use_wedge_interintra = 0;
+    int interintra_mode_reuse = 1;
+    if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 ||
+        best_interintra_mode == INTERINTRA_MODES) {
+      interintra_mode_reuse = 0;
+      int64_t best_interintra_rd = INT64_MAX;
+      for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+           ++cur_mode) {
+        // Skip II_SMOOTH_PRED when smooth intra prediction is disabled.
+        if ((!cpi->oxcf.enable_smooth_intra ||
+             cpi->sf.intra_sf.disable_smooth_intra) &&
+            cur_mode == II_SMOOTH_PRED)
+          continue;
+        compute_best_interintra_mode(cpi, mbmi, xd, x, interintra_mode_cost,
+                                     orig_dst, intrapred, tmp_buf,
+                                     &best_interintra_mode, &best_interintra_rd,
+                                     cur_mode, bsize);
+      }
+      args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+    }
+    assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra ||
+                       cpi->sf.inter_sf.disable_smooth_interintra,
+                   best_interintra_mode != II_SMOOTH_PRED));
+    // Recompute prediction if required
+    if (interintra_mode_reuse || best_interintra_mode != INTERINTRA_MODES - 1) {
+      mbmi->interintra_mode = best_interintra_mode;
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+    }
+
+    // Compute rd cost for best smooth_interintra
+    RD_STATS rd_stats;
+    const int rmode = interintra_mode_cost[best_interintra_mode] +
+                      (is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0);
+    const int total_mode_rate = rmode + *rate_mv;
+    const int64_t rd_thresh =
+        compute_rd_thresh(x, total_mode_rate, ref_best_rd);
+    rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats);
+    if (rd != INT64_MAX) {
+      rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist);
+    } else {
+      return -1;
+    }
+    best_interintra_rd_nowedge = rd;
+    best_mode_rate = rmode;
+    // Return early if best_interintra_rd_nowedge not good enough
+    if (ref_best_rd < INT64_MAX &&
+        (best_interintra_rd_nowedge >> INTER_INTRA_RD_THRESH_SHIFT) *
+                INTER_INTRA_RD_THRESH_SCALE >
+            ref_best_rd) {
+      return -1;
+    }
+  }
+
+  // Compute wedge interintra
+  int64_t best_interintra_rd_wedge = INT64_MAX;
+  if (try_wedge_interintra) {
+    mbmi->use_wedge_interintra = 1;
+    if (!cpi->sf.inter_sf.fast_interintra_wedge_search) {
+      // Exhaustive search of all wedge and mode combinations.
+      int best_mode = 0;
+      int best_wedge_index = 0;
+      best_interintra_rd_wedge = compute_best_wedge_interintra(
+          cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_,
+          tmp_buf_, &best_mode, &best_wedge_index, bsize);
+      mbmi->interintra_mode = best_mode;
+      mbmi->interintra_wedge_index = best_wedge_index;
+      // The search loop leaves the intra predictor for the last mode in the
+      // buffer; rebuild it for the winner unless they coincide.
+      if (best_mode != INTERINTRA_MODES - 1) {
+        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                  intrapred, bw);
+      }
+    } else if (!try_smooth_interintra) {
+      if (best_interintra_mode == INTERINTRA_MODES) {
+        mbmi->interintra_mode = INTERINTRA_MODES - 1;
+        best_interintra_mode = INTERINTRA_MODES - 1;
+        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                  intrapred, bw);
+        // Pick wedge mask based on INTERINTRA_MODES - 1
+        best_interintra_rd_wedge =
+            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+        // Find the best interintra mode for the chosen wedge mask
+        for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+             ++cur_mode) {
+          compute_best_interintra_mode(
+              cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred,
+              tmp_buf, &best_interintra_mode, &best_interintra_rd_wedge,
+              cur_mode, bsize);
+        }
+        args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+        mbmi->interintra_mode = best_interintra_mode;
+
+        // Recompute prediction if required
+        if (best_interintra_mode != INTERINTRA_MODES - 1) {
+          av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                    intrapred, bw);
+        }
+      } else {
+        // Pick wedge mask for the best interintra mode (reused)
+        mbmi->interintra_mode = best_interintra_mode;
+        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                  intrapred, bw);
+        best_interintra_rd_wedge =
+            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+      }
+    } else {
+      // Pick wedge mask for the best interintra mode from smooth_interintra
+      best_interintra_rd_wedge =
+          pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+    }
+
+    const int rate_overhead =
+        interintra_mode_cost[mbmi->interintra_mode] +
+        x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] +
+        x->wedge_interintra_cost[bsize][1];
+    best_interintra_rd_wedge += RDCOST(x->rdmult, rate_overhead + *rate_mv, 0);
+
+    const int_mv mv0 = mbmi->mv[0];
+    int_mv tmp_mv = mv0;
+    rd = INT64_MAX;
+    int tmp_rate_mv = 0;
+    // Refine motion vector for NEWMV case.
+    if (have_newmv_in_inter_mode(mbmi->mode)) {
+      int rate_sum, skip_txfm_sb;
+      int64_t dist_sum, skip_sse_sb;
+      // get negative of mask
+      const uint8_t *mask =
+          av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize);
+      av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, intrapred,
+                                        mask, bw, &tmp_rate_mv, 0);
+      if (mbmi->mv[0].as_int != tmp_mv.as_int) {
+        mbmi->mv[0].as_int = tmp_mv.as_int;
+        // Set ref_frame[1] to NONE_FRAME temporarily so that the intra
+        // predictor is not calculated again in av1_enc_build_inter_predictor().
+        mbmi->ref_frame[1] = NONE_FRAME;
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        mbmi->ref_frame[1] = INTRA_FRAME;
+        av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf,
+                               xd->plane[AOM_PLANE_Y].dst.stride, intrapred,
+                               bw);
+        model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+            cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb,
+            &skip_sse_sb, NULL, NULL, NULL);
+        rd =
+            RDCOST(x->rdmult, tmp_rate_mv + rate_overhead + rate_sum, dist_sum);
+      }
+    }
+    // Keep the refined mv only if the modeled rd improved; otherwise fall
+    // back to the original mv and rebuild the combined prediction.
+    if (rd >= best_interintra_rd_wedge) {
+      tmp_mv.as_int = mv0.as_int;
+      tmp_rate_mv = *rate_mv;
+      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+    }
+    // Evaluate closer to true rd
+    RD_STATS rd_stats;
+    const int64_t mode_rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv, 0);
+    const int64_t tmp_rd_thresh = best_interintra_rd_nowedge - mode_rd;
+    rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+    if (rd != INT64_MAX) {
+      rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv + rd_stats.rate,
+                  rd_stats.dist);
+    } else {
+      if (best_interintra_rd_nowedge == INT64_MAX) return -1;
+    }
+    best_interintra_rd_wedge = rd;
+    // Final decision between the wedge and smooth candidates.
+    if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+      mbmi->mv[0].as_int = tmp_mv.as_int;
+      *tmp_rate2 += tmp_rate_mv - *rate_mv;
+      *rate_mv = tmp_rate_mv;
+      best_mode_rate = rate_overhead;
+    } else {
+      // Wedge lost: restore the smooth-interintra state and prediction.
+      mbmi->use_wedge_interintra = 0;
+      mbmi->interintra_mode = best_interintra_mode;
+      mbmi->mv[0].as_int = mv0.as_int;
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+    }
+  }
+
+  if (best_interintra_rd_nowedge == INT64_MAX &&
+      best_interintra_rd_wedge == INT64_MAX) {
+    return -1;
+  }
+
+  *tmp_rate2 += best_mode_rate;
+
+  // Build the chroma planes of the final prediction.
+  if (num_planes > 1) {
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                  AOM_PLANE_U, num_planes - 1);
+  }
+  return 0;
+}
+
+// Allocates the scratch buffers used by the compound type rd search.
+// Allocation results are intentionally not checked here (hence "_no_check");
+// the caller owns the buffers and is responsible for releasing them.
+static void alloc_compound_type_rd_buffers_no_check(
+    CompoundTypeRdBuffers *const bufs) {
+  // 2x size: buffers are also used via CONVERT_TO_BYTEPTR for high
+  // bit-depth, presumably storing 16-bit samples — TODO confirm.
+  bufs->pred0 =
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0));
+  bufs->pred1 =
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1));
+  // 32-byte alignment on the difference buffers consumed by SIMD kernels.
+  bufs->residual1 =
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1));
+  bufs->diff10 =
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10));
+  bufs->tmp_best_mask_buf = (uint8_t *)aom_malloc(
+      2 * MAX_SB_SQUARE * sizeof(*bufs->tmp_best_mask_buf));
+}
+
+// Determines which compound types should be evaluated for this block, based
+// on mode_search_mask, sequence/encoder options and speed features. Fills
+// valid_comp_types[] in evaluation order and returns the count. Sets
+// *try_average_and_distwtd_comp when COMPOUND_AVERAGE and COMPOUND_DISTWTD
+// are to be evaluated jointly (they are then excluded from the list).
+static INLINE int compute_valid_comp_types(
+    MACROBLOCK *x, const AV1_COMP *const cpi, int *try_average_and_distwtd_comp,
+    BLOCK_SIZE bsize, int masked_compound_used, int mode_search_mask,
+    COMPOUND_TYPE *valid_comp_types) {
+  const AV1_COMMON *cm = &cpi->common;
+  int valid_type_count = 0;
+  int comp_type, valid_check;
+  int8_t enable_masked_type[MASKED_COMPOUND_TYPES] = { 0, 0 };
+
+  const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE));
+  // Distance-weighted compound also requires sequence-level support and the
+  // corresponding speed feature to be enabled.
+  const int try_distwtd_comp =
+      ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
+       cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 &&
+       cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+  *try_average_and_distwtd_comp = try_average_comp && try_distwtd_comp;
+
+  // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases
+  for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
+       comp_type++) {
+    valid_check =
+        (comp_type == COMPOUND_AVERAGE) ? try_average_comp : try_distwtd_comp;
+    if (!*try_average_and_distwtd_comp && valid_check &&
+        is_interinter_compound_used(comp_type, bsize))
+      valid_comp_types[valid_type_count++] = comp_type;
+  }
+  // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases
+  if (masked_compound_used) {
+    // enable_masked_type[0] corresponds to COMPOUND_WEDGE
+    // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD
+    enable_masked_type[0] = enable_wedge_interinter_search(x, cpi);
+    enable_masked_type[1] = cpi->oxcf.enable_diff_wtd_comp;
+    for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD;
+         comp_type++) {
+      if ((mode_search_mask & (1 << comp_type)) &&
+          is_interinter_compound_used(comp_type, bsize) &&
+          enable_masked_type[comp_type - COMPOUND_WEDGE])
+        valid_comp_types[valid_type_count++] = comp_type;
+    }
+  }
+  return valid_type_count;
+}
+
+// Fills masked_type_cost[] with the bit cost of signaling each compound
+// type: the compound group index (only when masked compounds are usable)
+// plus the compound index bit (group 0) or the compound type bit (group 1).
+static INLINE void calc_masked_type_cost(MACROBLOCK *x, BLOCK_SIZE bsize,
+                                         int comp_group_idx_ctx,
+                                         int comp_index_ctx,
+                                         int masked_compound_used,
+                                         int *masked_type_cost) {
+  av1_zero_array(masked_type_cost, COMPOUND_TYPES);
+  if (masked_compound_used) {
+    // Average/distwtd belong to compound group 0; wedge/diffwtd to group 1.
+    const int group0_cost = x->comp_group_idx_cost[comp_group_idx_ctx][0];
+    const int group1_cost = x->comp_group_idx_cost[comp_group_idx_ctx][1];
+    masked_type_cost[COMPOUND_AVERAGE] = group0_cost;
+    masked_type_cost[COMPOUND_DISTWTD] = group0_cost;
+    masked_type_cost[COMPOUND_WEDGE] = group1_cost;
+    masked_type_cost[COMPOUND_DIFFWTD] = group1_cost;
+  }
+  // Add the cost of the compound index / compound type signal.
+  masked_type_cost[COMPOUND_AVERAGE] += x->comp_idx_cost[comp_index_ctx][1];
+  masked_type_cost[COMPOUND_DISTWTD] += x->comp_idx_cost[comp_index_ctx][0];
+  masked_type_cost[COMPOUND_WEDGE] += x->compound_type_cost[bsize][0];
+  masked_type_cost[COMPOUND_DIFFWTD] += x->compound_type_cost[bsize][1];
+}
+
+// Writes the compound type and its derived signaling fields into mbmi.
+static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi,
+                                                 COMPOUND_TYPE cur_type) {
+  // The masked types (wedge, diffwtd) use compound group index 1.
+  mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE) ? 1 : 0;
+  // Only distwtd signals compound_idx == 0.
+  mbmi->compound_idx = (cur_type == COMPOUND_DISTWTD) ? 0 : 1;
+  mbmi->interinter_comp.type = cur_type;
+}
+
+// When match is found, populate the compound type data
+// and calculate the rd cost using the stored stats and
+// update the mbmi appropriately.
+// Returns the interinter-compound signaling cost of the winner, or the
+// previously recorded best cost when no usable rate was stored.
+static INLINE int populate_reuse_comp_type_data(
+    const MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate,
+    int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd,
+    int match_index) {
+  const int winner_comp_type =
+      x->comp_rd_stats[match_index].interinter_comp.type;
+  // INT_MAX marks "no rate recorded" — leave mbmi and *rd untouched.
+  if (comp_rate[winner_comp_type] == INT_MAX)
+    return best_type_stats->best_compmode_interinter_cost;
+  update_mbmi_for_compound_type(mbmi, winner_comp_type);
+  mbmi->interinter_comp = x->comp_rd_stats[match_index].interinter_comp;
+  *rd = RDCOST(
+      x->rdmult,
+      comp_rs2[winner_comp_type] + *rate_mv + comp_rate[winner_comp_type],
+      comp_dist[winner_comp_type]);
+  mbmi->mv[0].as_int = cur_mv[0].as_int;
+  mbmi->mv[1].as_int = cur_mv[1].as_int;
+  return comp_rs2[winner_comp_type];
+}
+
+// Captures the current compound candidate as the new best: copies its mask
+// data, signaling cost and model rd into best_type_stats, and lowers *rd.
+static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd,
+                                    BEST_COMP_TYPE_STATS *best_type_stats,
+                                    int64_t best_rd_cur,
+                                    int64_t comp_model_rd_cur, int rs2) {
+  best_type_stats->best_compound_data = mbmi->interinter_comp;
+  best_type_stats->best_compmode_interinter_cost = rs2;
+  best_type_stats->comp_best_model_rd = comp_model_rd_cur;
+  *rd = best_rd_cur;
+}
+
+// Records the motion vectors of the winning masked compound type. Wedge
+// (and diffwtd, when its newmv search is enabled) may have refined the mvs,
+// so they are taken from mbmi together with the refined mv rate; otherwise
+// the original cur_mv pair is kept.
+static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi,
+                                       int_mv *best_mv, int_mv *cur_mv,
+                                       const COMPOUND_TYPE cur_type,
+                                       int *best_tmp_rate_mv, int tmp_rate_mv,
+                                       const SPEED_FEATURES *const sf) {
+  const int used_newmv_search =
+      (cur_type == COMPOUND_WEDGE) ||
+      (cur_type == COMPOUND_DIFFWTD &&
+       sf->inter_sf.enable_interinter_diffwtd_newmv_search);
+  const int_mv *src_mv = cur_mv;
+  if (used_newmv_search) {
+    *best_tmp_rate_mv = tmp_rate_mv;
+    src_mv = mbmi->mv;
+  }
+  best_mv[0].as_int = src_mv[0].as_int;
+  best_mv[1].as_int = src_mv[1].as_int;
+}
+
+// Chooses between COMPOUND_AVERAGE and COMPOUND_DISTWTD using the modeled
+// rd of each; ties go to COMPOUND_AVERAGE. Stores the winning rd in
+// *best_rd and returns the winning type.
+static int find_best_avg_distwtd_comp_type(MACROBLOCK *x, int *comp_model_rate,
+                                           int64_t *comp_model_dist,
+                                           int rate_mv, int64_t *best_rd) {
+  const int64_t avg_rd =
+      RDCOST(x->rdmult, comp_model_rate[COMPOUND_AVERAGE] + rate_mv,
+             comp_model_dist[COMPOUND_AVERAGE]);
+  const int64_t distwtd_rd =
+      RDCOST(x->rdmult, comp_model_rate[COMPOUND_DISTWTD] + rate_mv,
+             comp_model_dist[COMPOUND_DISTWTD]);
+  if (avg_rd <= distwtd_rd) {
+    *best_rd = avg_rd;
+    return COMPOUND_AVERAGE;
+  }
+  *best_rd = distwtd_rd;
+  return COMPOUND_DISTWTD;
+}
+
+// Records the current compound-mode search result (rates, distortions,
+// model estimates, mvs, reference frames and mask data) in the next free
+// slot of x->comp_rd_stats so later searches can reuse it. Silently drops
+// the record when the table is full.
+static INLINE void save_comp_rd_search_stat(
+    MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate,
+    const int64_t *comp_dist, const int32_t *comp_model_rate,
+    const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) {
+  const int offset = x->comp_rd_stats_idx;
+  if (offset < MAX_COMP_RD_STATS) {
+    COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset;
+    memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate));
+    memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist));
+    memcpy(rd_stats->model_rate, comp_model_rate, sizeof(rd_stats->model_rate));
+    memcpy(rd_stats->model_dist, comp_model_dist, sizeof(rd_stats->model_dist));
+    memcpy(rd_stats->comp_rs2, comp_rs2, sizeof(rd_stats->comp_rs2));
+    memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv));
+    memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames));
+    rd_stats->mode = mbmi->mode;
+    rd_stats->filter = mbmi->interp_filters;
+    rd_stats->ref_mv_idx = mbmi->ref_mv_idx;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    // Record whether each reference uses global motion.
+    for (int i = 0; i < 2; ++i) {
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mbmi->ref_frame[i]];
+      rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype);
+    }
+    memcpy(&rd_stats->interinter_comp, &mbmi->interinter_comp,
+           sizeof(rd_stats->interinter_comp));
+    ++x->comp_rd_stats_idx;
+  }
+}
+
+// Returns the rate of signaling the compound mask: a mask-type bit for
+// diffwtd; for wedge, a sign bit plus the wedge index cost when wedges are
+// usable at this block size, otherwise zero. Only called for COMPOUND_WEDGE
+// and COMPOUND_DIFFWTD.
+static INLINE int get_interinter_compound_mask_rate(
+    const MACROBLOCK *const x, const MB_MODE_INFO *const mbmi) {
+  if (mbmi->interinter_comp.type != COMPOUND_WEDGE) {
+    assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+    return av1_cost_literal(1);
+  }
+  if (!av1_is_wedge_used(mbmi->sb_type)) return 0;
+  return av1_cost_literal(1) +
+         x->wedge_idx_cost[mbmi->sb_type][mbmi->interinter_comp.wedge_index];
+}
+
+// Stores the rate/distortion results and model estimates of cur_type into
+// the per-type arrays so a later search with matching parameters can reuse
+// them instead of recomputing.
+static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate,
+                                int64_t *comp_dist, int32_t *comp_model_rate,
+                                int64_t *comp_model_dist, int rate_sum,
+                                int64_t dist_sum, RD_STATS *rd_stats,
+                                int *comp_rs2, int rs2) {
+  // Model-based estimates.
+  comp_model_rate[cur_type] = rate_sum;
+  comp_model_dist[cur_type] = dist_sum;
+  // Measured transform-search results and mode signaling cost.
+  comp_rate[cur_type] = rd_stats->rate;
+  comp_dist[cur_type] = rd_stats->dist;
+  comp_rs2[cur_type] = rs2;
+}
+
+static int64_t masked_compound_type_rd(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+    const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+    int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+    uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+    int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound,
+    int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate,
+    int64_t *comp_model_dist, const int64_t comp_best_model_rd,
+    int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  int64_t best_rd_cur = INT64_MAX;
+  int64_t rd = INT64_MAX;
+  const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+  // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
+  assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD);
+  int rate_sum, tmp_skip_txfm_sb;
+  int64_t dist_sum, tmp_skip_sse_sb;
+  pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge,
+                                                        pick_interinter_seg };
+
+  // TODO(any): Save pred and mask calculation as well into records. However
+  // this may increase memory requirements as compound segment mask needs to be
+  // stored in each record.
+  if (*calc_pred_masked_compound) {
+    get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1,
+                                         diff10, strides);
+    *calc_pred_masked_compound = 0;
+  }
+  if (cpi->sf.inter_sf.prune_wedge_pred_diff_based &&
+      compound_type == COMPOUND_WEDGE) {
+    unsigned int sse;
+    if (is_cur_buf_hbd(xd))
+      (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+                                  CONVERT_TO_BYTEPTR(*preds1), *strides, &sse);
+    else
+      (void)cpi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse);
+    const unsigned int mse =
+        ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
+    // If two predictors are very similar, skip wedge compound mode search
+    if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+  }
+  // Function pointer to pick the appropriate mask
+  // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge()
+  // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg()
+  uint64_t cur_sse = UINT64_MAX;
+  best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+      cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
+  *rs2 += get_interinter_compound_mask_rate(x, mbmi);
+  best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+  assert(cur_sse != UINT64_MAX);
+  int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
+
+  // Although the true rate_mv might be different after motion search, but it
+  // is unlikely to be the best mode considering the transform rd cost and other
+  // mode overhead cost
+  int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+  if (mode_rd > rd_thresh) {
+    *comp_model_rd_cur = INT64_MAX;
+    return INT64_MAX;
+  }
+
+  // Check if the mode is good enough based on skip rd
+  // TODO(nithya): Handle wedge_newmv_search if extending for lower speed
+  // setting
+  if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+    int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
+                                    cpi->sf.inter_sf.txfm_rd_gate_level, 1);
+    if (!eval_txfm) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+  }
+
+  // Compute cost if matching record not found, else, reuse data
+  if (comp_rate[compound_type] == INT_MAX) {
+    // Check whether new MV search for wedge is to be done
+    int wedge_newmv_search =
+        have_newmv_in_inter_mode(this_mode) &&
+        (compound_type == COMPOUND_WEDGE) &&
+        (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search);
+    int diffwtd_newmv_search =
+        cpi->sf.inter_sf.enable_interinter_diffwtd_newmv_search &&
+        compound_type == COMPOUND_DIFFWTD &&
+        have_newmv_in_inter_mode(this_mode);
+
+    // Search for new MV if needed and build predictor
+    if (wedge_newmv_search) {
+      *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                           bsize, this_mode);
+      const int mi_row = xd->mi_row;
+      const int mi_col = xd->mi_col;
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+    } else if (diffwtd_newmv_search) {
+      *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                           bsize, this_mode);
+      // we need to update the mask according to the new motion vector
+      CompoundTypeRdBuffers tmp_buf;
+      int64_t tmp_rd = INT64_MAX;
+      alloc_compound_type_rd_buffers_no_check(&tmp_buf);
+
+      uint8_t *tmp_preds0[1] = { tmp_buf.pred0 };
+      uint8_t *tmp_preds1[1] = { tmp_buf.pred1 };
+
+      get_inter_predictors_masked_compound(x, bsize, tmp_preds0, tmp_preds1,
+                                           tmp_buf.residual1, tmp_buf.diff10,
+                                           strides);
+
+      tmp_rd = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+          cpi, x, bsize, *tmp_preds0, *tmp_preds1, tmp_buf.residual1,
+          tmp_buf.diff10, &cur_sse);
+      // we can reuse rs2 here
+      tmp_rd += RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+
+      if (tmp_rd >= best_rd_cur) {
+        // restore the motion vector
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+        *out_rate_mv = rate_mv;
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                 strides, preds1, strides);
+      } else {
+        // build the final prediction using the updated mv
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, tmp_preds0,
+                                                 strides, tmp_preds1, strides);
+      }
+      av1_release_compound_type_rd_buffers(&tmp_buf);
+    } else {
+      *out_rate_mv = rate_mv;
+      av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+                                               preds1, strides);
+    }
+    // Get the RD cost from model RD
+    model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+        cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb,
+        &tmp_skip_sse_sb, NULL, NULL, NULL);
+    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+    *comp_model_rd_cur = rd;
+    // Override with best if current is worse than best for new MV
+    if (wedge_newmv_search) {
+      if (rd >= best_rd_cur) {
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+        *out_rate_mv = rate_mv;
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                 strides, preds1, strides);
+        *comp_model_rd_cur = best_rd_cur;
+      }
+    }
+    if (cpi->sf.inter_sf.prune_comp_type_by_model_rd &&
+        (*comp_model_rd_cur > comp_best_model_rd) &&
+        comp_best_model_rd != INT64_MAX) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+    // Compute RD cost for the current type
+    RD_STATS rd_stats;
+    const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+    const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd;
+    rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+    if (rd != INT64_MAX) {
+      rd =
+          RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
+      // Backup rate and distortion for future reuse
+      backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate,
+                   comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2,
+                   *rs2);
+    }
+  } else {
+    // Reuse data as matching record is found
+    assert(comp_dist[compound_type] != INT64_MAX);
+    // When disable_interinter_wedge_newmv_search is set, motion refinement is
+    // disabled. Hence rate and distortion can be reused in this case as well
+    assert(IMPLIES(have_newmv_in_inter_mode(this_mode),
+                   cpi->sf.inter_sf.disable_interinter_wedge_newmv_search));
+    assert(mbmi->mv[0].as_int == cur_mv[0].as_int);
+    assert(mbmi->mv[1].as_int == cur_mv[1].as_int);
+    *out_rate_mv = rate_mv;
+    // Calculate RD cost based on stored stats
+    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type],
+                comp_dist[compound_type]);
+    // Recalculate model rdcost with the updated rate
+    *comp_model_rd_cur =
+        RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type],
+               comp_model_dist[compound_type]);
+  }
+  return rd;
+}
+
+// scaling values to be used for gating wedge/compound segment based on best
+// approximate rd
+static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
+static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
+
+int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                         BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask,
+                         int masked_compound_used, const BUFFER_SET *orig_dst,
+                         const BUFFER_SET *tmp_dst,
+                         const CompoundTypeRdBuffers *buffers, int *rate_mv,
+                         int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd,
+                         int64_t ref_skip_rd, int *is_luma_interp_done,
+                         int64_t rd_thresh) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  const int bw = block_size_wide[bsize];
+  int rs2;
+  int_mv best_mv[2];
+  int best_tmp_rate_mv = *rate_mv;
+  BEST_COMP_TYPE_STATS best_type_stats;
+  // Initializing BEST_COMP_TYPE_STATS
+  best_type_stats.best_compound_data.type = COMPOUND_AVERAGE;
+  best_type_stats.best_compmode_interinter_cost = 0;
+  best_type_stats.comp_best_model_rd = INT64_MAX;
+
+  uint8_t *preds0[1] = { buffers->pred0 };
+  uint8_t *preds1[1] = { buffers->pred1 };
+  int strides[1] = { bw };
+  int tmp_rate_mv;
+  const int num_pix = 1 << num_pels_log2_lookup[bsize];
+  const int mask_len = 2 * num_pix * sizeof(uint8_t);
+  COMPOUND_TYPE cur_type;
+  // Local array to store the mask cost for different compound types
+  int masked_type_cost[COMPOUND_TYPES];
+
+  int calc_pred_masked_compound = 1;
+  int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+                                        INT64_MAX };
+  int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+  int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+  int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX,
+                                              INT_MAX };
+  int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+                                              INT64_MAX };
+  int match_index = 0;
+  const int match_found =
+      find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate,
+                            comp_model_dist, comp_rs2, &match_index);
+  best_mv[0].as_int = cur_mv[0].as_int;
+  best_mv[1].as_int = cur_mv[1].as_int;
+  *rd = INT64_MAX;
+  int rate_sum, tmp_skip_txfm_sb;
+  int64_t dist_sum, tmp_skip_sse_sb;
+
+  // Local array to store the valid compound types to be evaluated in the core
+  // loop
+  COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = {
+    COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD
+  };
+  int valid_type_count = 0;
+  int try_average_and_distwtd_comp = 0;
+  // compute_valid_comp_types() returns the number of valid compound types to be
+  // evaluated and populates the same in the local array valid_comp_types[].
+  // It also sets the flag 'try_average_and_distwtd_comp'
+  valid_type_count = compute_valid_comp_types(
+      x, cpi, &try_average_and_distwtd_comp, bsize, masked_compound_used,
+      mode_search_mask, valid_comp_types);
+
+  // The following context indices are independent of compound type
+  const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+  const int comp_index_ctx = get_comp_index_context(cm, xd);
+
+  // Populates masked_type_cost local array for the 4 compound types
+  calc_masked_type_cost(x, bsize, comp_group_idx_ctx, comp_index_ctx,
+                        masked_compound_used, masked_type_cost);
+
+  int64_t comp_model_rd_cur = INT64_MAX;
+  int64_t best_rd_cur = INT64_MAX;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // If the match is found, calculate the rd cost using the
+  // stored stats and update the mbmi appropriately.
+  if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) {
+    return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv,
+                                         comp_rate, comp_dist, comp_rs2,
+                                         rate_mv, rd, match_index);
+  }
+  // Special handling if both compound_average and compound_distwtd
+  // are to be searched. In this case, first estimate between the two
+  // modes and then call estimate_yrd_for_sb() only for the better of
+  // the two.
+  if (try_average_and_distwtd_comp) {
+    int est_rate[2];
+    int64_t est_dist[2], est_rd;
+    COMPOUND_TYPE best_type;
+    // Since modelled rate and dist are separately stored,
+    // compute better of COMPOUND_AVERAGE and COMPOUND_DISTWTD
+    // using the stored stats.
+    if ((comp_model_rate[COMPOUND_AVERAGE] != INT_MAX) &&
+        comp_model_rate[COMPOUND_DISTWTD] != INT_MAX) {
+      // Choose the better of the COMPOUND_AVERAGE,
+      // COMPOUND_DISTWTD on modeled cost.
+      best_type = find_best_avg_distwtd_comp_type(
+          x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd);
+      update_mbmi_for_compound_type(mbmi, best_type);
+      if (comp_rate[best_type] != INT_MAX)
+        best_rd_cur = RDCOST(
+            x->rdmult,
+            masked_type_cost[best_type] + *rate_mv + comp_rate[best_type],
+            comp_dist[best_type]);
+      comp_model_rd_cur = est_rd;
+      // Update stats for best compound type
+      if (best_rd_cur < *rd) {
+        update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
+                         comp_model_rd_cur, masked_type_cost[best_type]);
+      }
+      restore_dst_buf(xd, *tmp_dst, 1);
+    } else {
+      int64_t sse_y[COMPOUND_DISTWTD + 1];
+      // Calculate model_rd for COMPOUND_AVERAGE and COMPOUND_DISTWTD
+      for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
+           comp_type++) {
+        update_mbmi_for_compound_type(mbmi, comp_type);
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        model_rd_sb_fn[MODELRD_CURVFIT](
+            cpi, bsize, x, xd, 0, 0, &est_rate[comp_type], &est_dist[comp_type],
+            NULL, NULL, NULL, NULL, NULL);
+        est_rate[comp_type] += masked_type_cost[comp_type];
+        comp_model_rate[comp_type] = est_rate[comp_type];
+        comp_model_dist[comp_type] = est_dist[comp_type];
+        sse_y[comp_type] = x->pred_sse[xd->mi[0]->ref_frame[0]];
+        if (comp_type == COMPOUND_AVERAGE) {
+          *is_luma_interp_done = 1;
+          restore_dst_buf(xd, *tmp_dst, 1);
+        }
+      }
+      // Choose the better of the two based on modeled cost and call
+      // estimate_yrd_for_sb() for that one.
+      best_type = find_best_avg_distwtd_comp_type(
+          x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd);
+      update_mbmi_for_compound_type(mbmi, best_type);
+      if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *orig_dst, 1);
+      rs2 = masked_type_cost[best_type];
+      RD_STATS est_rd_stats;
+      const int64_t mode_rd = RDCOST(x->rdmult, rs2 + *rate_mv, 0);
+      const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
+      int64_t est_rd_ = INT64_MAX;
+      int eval_txfm = 1;
+      // Check if the mode is good enough based on skip rd
+      if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+        int64_t skip_rd =
+            RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y[best_type] << 4));
+        eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
+                                    cpi->sf.inter_sf.txfm_rd_gate_level, 1);
+      }
+      // Evaluate further if skip rd is low enough
+      if (eval_txfm) {
+        est_rd_ =
+            estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &est_rd_stats);
+      }
+
+      if (est_rd_ != INT64_MAX) {
+        best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+                             est_rd_stats.dist);
+        // Backup rate and distortion for future reuse
+        backup_stats(best_type, comp_rate, comp_dist, comp_model_rate,
+                     comp_model_dist, est_rate[best_type], est_dist[best_type],
+                     &est_rd_stats, comp_rs2, rs2);
+        comp_model_rd_cur = est_rd;
+      }
+      if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+      // Update stats for best compound type
+      if (best_rd_cur < *rd) {
+        update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
+                         comp_model_rd_cur, rs2);
+      }
+    }
+  }
+
+  // If COMPOUND_AVERAGE is not valid, use the spare buffer
+  if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+
+  // Loop over valid compound types
+  for (int i = 0; i < valid_type_count; i++) {
+    cur_type = valid_comp_types[i];
+    comp_model_rd_cur = INT64_MAX;
+    tmp_rate_mv = *rate_mv;
+    best_rd_cur = INT64_MAX;
+
+    // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD
+    if (cur_type < COMPOUND_WEDGE) {
+      update_mbmi_for_compound_type(mbmi, cur_type);
+      rs2 = masked_type_cost[cur_type];
+      const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+      if (mode_rd < ref_best_rd) {
+        // Reuse data if matching record is found
+        if (comp_rate[cur_type] == INT_MAX) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                        AOM_PLANE_Y, AOM_PLANE_Y);
+          if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+
+          // Compute RD cost for the current type
+          RD_STATS est_rd_stats;
+          const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
+          int64_t est_rd = INT64_MAX;
+          int eval_txfm = 1;
+          // Check if the mode is good enough based on skip rd
+          if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+            int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
+            int64_t skip_rd = RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y << 4));
+            eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
+                                        cpi->sf.inter_sf.txfm_rd_gate_level, 1);
+          }
+          // Evaluate further if skip rd is low enough
+          if (eval_txfm) {
+            est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh,
+                                         &est_rd_stats);
+          }
+
+          if (est_rd != INT64_MAX) {
+            best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+                                 est_rd_stats.dist);
+            model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+                cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+                &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+            comp_model_rd_cur =
+                RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+
+            // Backup rate and distortion for future reuse
+            backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate,
+                         comp_model_dist, rate_sum, dist_sum, &est_rd_stats,
+                         comp_rs2, rs2);
+          }
+        } else {
+          // Calculate RD cost based on stored stats
+          assert(comp_dist[cur_type] != INT64_MAX);
+          best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type],
+                               comp_dist[cur_type]);
+          // Recalculate model rdcost with the updated rate
+          comp_model_rd_cur =
+              RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type],
+                     comp_model_dist[cur_type]);
+        }
+      }
+      // use spare buffer for following compound type try
+      if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+    } else {
+      // Handle masked compound types
+      update_mbmi_for_compound_type(mbmi, cur_type);
+      rs2 = masked_type_cost[cur_type];
+      // Factors to control gating of compound type selection based on best
+      // approximate rd so far
+      const int max_comp_type_rd_threshold_mul =
+          comp_type_rd_threshold_mul[cpi->sf.inter_sf
+                                         .prune_comp_type_by_comp_avg];
+      const int max_comp_type_rd_threshold_div =
+          comp_type_rd_threshold_div[cpi->sf.inter_sf
+                                         .prune_comp_type_by_comp_avg];
+      // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD if approximated cost is
+      // within threshold
+      int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) *
+                           max_comp_type_rd_threshold_mul);
+
+      if (approx_rd < ref_best_rd) {
+        const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh);
+        best_rd_cur = masked_compound_type_rd(
+            cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+            &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+            strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound,
+            comp_rate, comp_dist, comp_model_rate, comp_model_dist,
+            best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2,
+            ref_skip_rd);
+      }
+    }
+    // Update stats for best compound type
+    if (best_rd_cur < *rd) {
+      update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
+                       comp_model_rd_cur, rs2);
+      if (masked_compound_used && cur_type >= COMPOUND_WEDGE) {
+        memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
+        if (have_newmv_in_inter_mode(this_mode))
+          update_mask_best_mv(mbmi, best_mv, cur_mv, cur_type,
+                              &best_tmp_rate_mv, tmp_rate_mv, &cpi->sf);
+      }
+    }
+    // reset to original mvs for next iteration
+    mbmi->mv[0].as_int = cur_mv[0].as_int;
+    mbmi->mv[1].as_int = cur_mv[1].as_int;
+  }
+  if (mbmi->interinter_comp.type != best_type_stats.best_compound_data.type) {
+    mbmi->comp_group_idx =
+        (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
+    mbmi->compound_idx =
+        !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD);
+    mbmi->interinter_comp = best_type_stats.best_compound_data;
+    memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
+  }
+  if (have_newmv_in_inter_mode(this_mode)) {
+    mbmi->mv[0].as_int = best_mv[0].as_int;
+    mbmi->mv[1].as_int = best_mv[1].as_int;
+    if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+      rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+      *rate_mv = best_tmp_rate_mv;
+    }
+  }
+  restore_dst_buf(xd, *orig_dst, 1);
+  if (!match_found)
+    save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate,
+                             comp_model_dist, cur_mv, comp_rs2);
+  return best_type_stats.best_compmode_interinter_cost;
+}
diff --git a/libaom/av1/encoder/compound_type.h b/libaom/av1/encoder/compound_type.h
new file mode 100644
index 0000000..f2bd857
--- /dev/null
+++ b/libaom/av1/encoder/compound_type.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_
+#define AOM_AV1_ENCODER_COMPOUND_TYPE_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Structure to store the compound type related stats for best compound type
+typedef struct {
+  INTERINTER_COMPOUND_DATA best_compound_data;
+  int64_t comp_best_model_rd;
+  int best_compmode_interinter_cost;
+} BEST_COMP_TYPE_STATS;
+
+int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                                BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+                                HandleInterModeArgs *args, int64_t ref_best_rd,
+                                int *rate_mv, int *tmp_rate2,
+                                const BUFFER_SET *orig_dst);
+
+int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                         BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask,
+                         int masked_compound_used, const BUFFER_SET *orig_dst,
+                         const BUFFER_SET *tmp_dst,
+                         const CompoundTypeRdBuffers *buffers, int *rate_mv,
+                         int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd,
+                         int64_t ref_skip_rd, int *is_luma_interp_done,
+                         int64_t rd_thresh);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_COMPOUND_TYPE_H_
diff --git a/libaom/av1/encoder/context_tree.c b/libaom/av1/encoder/context_tree.c
index 40df6c1..9b5b1cb 100644
--- a/libaom/av1/encoder/context_tree.c
+++ b/libaom/av1/encoder/context_tree.c
@@ -22,15 +22,18 @@
   tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
 } PC_TREE_SHARED_BUFFERS;
 
-static void alloc_mode_context(AV1_COMMON *cm, int num_pix,
-                               PICK_MODE_CONTEXT *ctx,
-                               PC_TREE_SHARED_BUFFERS *shared_bufs) {
+static AOM_INLINE void alloc_mode_context(AV1_COMMON *cm, int num_pix,
+                                          PICK_MODE_CONTEXT *ctx,
+                                          PC_TREE_SHARED_BUFFERS *shared_bufs) {
   const int num_planes = av1_num_planes(cm);
   int i;
   const int num_blk = num_pix / 16;
   ctx->num_4x4_blk = num_blk;
 
-  CHECK_MEM_ERROR(cm, ctx->blk_skip, aom_calloc(num_blk, sizeof(uint8_t)));
+  CHECK_MEM_ERROR(cm, ctx->blk_skip,
+                  aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
+  CHECK_MEM_ERROR(cm, ctx->tx_type_map,
+                  aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
   for (i = 0; i < num_planes; ++i) {
     ctx->coeff[i] = shared_bufs->coeff_buf[i];
     ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
@@ -51,10 +54,13 @@
   }
 }
 
-static void free_mode_context(PICK_MODE_CONTEXT *ctx, const int num_planes) {
+static AOM_INLINE void free_mode_context(PICK_MODE_CONTEXT *ctx,
+                                         const int num_planes) {
   int i;
   aom_free(ctx->blk_skip);
   ctx->blk_skip = 0;
+  aom_free(ctx->tx_type_map);
+  ctx->tx_type_map = 0;
   for (i = 0; i < num_planes; ++i) {
     ctx->coeff[i] = 0;
     ctx->qcoeff[i] = 0;
@@ -71,9 +77,9 @@
   }
 }
 
-static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix,
-                                int is_leaf,
-                                PC_TREE_SHARED_BUFFERS *shared_bufs) {
+static AOM_INLINE void alloc_tree_contexts(
+    AV1_COMMON *cm, PC_TREE *tree, int num_pix, int is_leaf,
+    PC_TREE_SHARED_BUFFERS *shared_bufs) {
   alloc_mode_context(cm, num_pix, &tree->none, shared_bufs);
 
   if (is_leaf) return;
@@ -106,7 +112,7 @@
   }
 }
 
-static void free_tree_contexts(PC_TREE *tree, const int num_planes) {
+static AOM_INLINE void free_tree_contexts(PC_TREE *tree, const int num_planes) {
   int i;
   for (i = 0; i < 3; i++) {
     free_mode_context(&tree->horizontala[i], num_planes);
@@ -125,16 +131,26 @@
   free_mode_context(&tree->vertical[1], num_planes);
 }
 
+// This function will compute the number of pc_tree nodes to be allocated
+// or freed as per the super block size of BLOCK_128X128 or BLOCK_64X64
+static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
+                                        int stat_generation_stage) {
+  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+  return tree_nodes;
+}
+
 // This function sets up a tree of contexts such that at each square
 // partition level. There are contexts for none, horizontal, vertical, and
 // split.  Along with a block_size value and a selected block_size which
 // represents the state of our search.
-void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
-  int i, j;
-  const int tree_nodes_inc = 1024;
-  const int leaf_factor = 4;
-  const int leaf_nodes = 256 * leaf_factor;
-  const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
+  AV1_COMMON *const cm = &cpi->common;
+  int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
   int pc_tree_index = 0;
   PC_TREE *this_pc;
   PC_TREE_SHARED_BUFFERS shared_bufs;
@@ -159,41 +175,54 @@
     shared_bufs.dqcoeff_buf[i] = td->tree_dqcoeff_buf[i];
   }
 
-  // Sets up all the leaf nodes in the tree.
-  for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
-    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-    tree->block_size = square[0];
-    alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
-  }
+  if (!stat_generation_stage) {
+    const int leaf_factor = is_sb_size_128 ? 4 : 1;
+    const int leaf_nodes = 256 * leaf_factor;
 
-  // Each node has 4 leaf nodes, fill each block_size level of the tree
-  // from leafs to the root.
-  for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
-    for (i = 0; i < nodes; ++i) {
+    // Sets up all the leaf nodes in the tree.
+    for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
       PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-      alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0, &shared_bufs);
-      tree->block_size = square[square_index];
-      for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
-      ++pc_tree_index;
+      tree->block_size = square[0];
+      alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
     }
-    ++square_index;
+
+    // Each node has 4 leaf nodes, fill each block_size level of the tree
+    // from leafs to the root.
+    for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+      for (i = 0; i < nodes; ++i) {
+        PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+        alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0,
+                            &shared_bufs);
+        tree->block_size = square[square_index];
+        for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+        ++pc_tree_index;
+      }
+      ++square_index;
+    }
+  } else {
+    // Allocation for firstpass/LAP stage
+    // TODO(Mufaddal): refactor square_index to use a common block_size macro
+    // from firstpass.c
+    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+    square_index = 2;
+    alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 1, &shared_bufs);
+    tree->block_size = square[square_index];
   }
 
-  // Set up the root node for the largest superblock size
-  i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
-  td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
-  td->pc_root[i]->none.best_mode_index = 2;
-  // Set up the root nodes for the rest of the possible superblock sizes
-  while (--i >= 0) {
-    td->pc_root[i] = td->pc_root[i + 1]->split[0];
-    td->pc_root[i]->none.best_mode_index = 2;
-  }
+  // Set up the root node for the applicable superblock size
+  td->pc_root = &td->pc_tree[tree_nodes - 1];
+#if CONFIG_INTERNAL_STATS
+  td->pc_root->none.best_mode_index = THR_INVALID;
+#endif  // CONFIG_INTERNAL_STATS
 }
 
-void av1_free_pc_tree(ThreadData *td, const int num_planes) {
+void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td,
+                      const int num_planes, BLOCK_SIZE sb_size) {
+  int stat_generation_stage = is_stat_generation_stage(cpi);
   if (td->pc_tree != NULL) {
-    const int tree_nodes_inc = 1024;
-    const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+    const int is_sb_size_128 = sb_size == BLOCK_128X128;
+    const int tree_nodes =
+        get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
     for (int i = 0; i < tree_nodes; ++i) {
       free_tree_contexts(&td->pc_tree[i], num_planes);
     }
@@ -213,27 +242,27 @@
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx) {
   dst_ctx->mic = src_ctx->mic;
-  dst_ctx->mbmi_ext = src_ctx->mbmi_ext;
+  dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best;
 
   dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
-  dst_ctx->skip = src_ctx->skip;
   dst_ctx->skippable = src_ctx->skippable;
+#if CONFIG_INTERNAL_STATS
   dst_ctx->best_mode_index = src_ctx->best_mode_index;
+#endif  // CONFIG_INTERNAL_STATS
 
   memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,
          sizeof(uint8_t) * src_ctx->num_4x4_blk);
+  av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map,
+                 src_ctx->num_4x4_blk);
 
   dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff;
   dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff;
   dst_ctx->single_pred_diff = src_ctx->single_pred_diff;
 
-  dst_ctx->rate = src_ctx->rate;
-  dst_ctx->dist = src_ctx->dist;
-  dst_ctx->rdcost = src_ctx->rdcost;
+  dst_ctx->rd_stats = src_ctx->rd_stats;
   dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
 
   memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES);
-  dst_ctx->pred_interp_filter = src_ctx->pred_interp_filter;
 
   dst_ctx->partition = src_ctx->partition;
 }
diff --git a/libaom/av1/encoder/context_tree.h b/libaom/av1/encoder/context_tree.h
index 205ac8a..a399794 100644
--- a/libaom/av1/encoder/context_tree.h
+++ b/libaom/av1/encoder/context_tree.h
@@ -12,6 +12,8 @@
 #ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
 #define AOM_AV1_ENCODER_CONTEXT_TREE_H_
 
+#include "config/aom_config.h"
+
 #include "av1/common/blockd.h"
 #include "av1/encoder/block.h"
 
@@ -23,23 +25,10 @@
 struct AV1Common;
 struct ThreadData;
 
-enum {
-  // Search all the partition types in this plane.
-  SEARCH_FULL_PLANE = 0,
-  // Only search none_partition coding block.
-  NONE_PARTITION_PLANE = 1,
-  // Search all the partition types in this plane except split.
-  SEARCH_SAME_PLANE = 2,
-  // Skip search partition on this plane. Go split directly.
-  SPLIT_PLANE = 3,
-} UENUM1BYTE(CB_TREE_SEARCH);
-
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MB_MODE_INFO mic;
-  MB_MODE_INFO_EXT mbmi_ext;
-  int64_t dist;
-  int64_t rdcost;
+  MB_MODE_INFO_EXT_FRAME mbmi_ext_best;
   uint8_t *color_index_map[2];
   uint8_t *blk_skip;
 
@@ -48,20 +37,20 @@
   tran_low_t *dqcoeff[MAX_MB_PLANE];
   uint16_t *eobs[MAX_MB_PLANE];
   uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+  uint8_t *tx_type_map;
 
   int num_4x4_blk;
-  int skip;
   // For current partition, only if all Y, U, and V transform blocks'
   // coefficients are quantized to 0, skippable is set to 1.
   int skippable;
-  int best_mode_index;
+#if CONFIG_INTERNAL_STATS
+  THR_MODES best_mode_index;
+#endif  // CONFIG_INTERNAL_STATS
   int hybrid_pred_diff;
   int comp_pred_diff;
   int single_pred_diff;
 
-  // TODO(jingning) Use RD_COST struct here instead. This involves a boarder
-  // scope of refactoring.
-  int rate;
+  RD_STATS rd_stats;
 
   int rd_mode_is_ready;  // Flag to indicate whether rd pick mode decision has
                          // been made.
@@ -69,20 +58,9 @@
   // motion vector cache for adaptive motion search control in partition
   // search loop
   MV pred_mv[REF_FRAMES];
-  InterpFilter pred_interp_filter;
   PARTITION_TYPE partition;
 } PICK_MODE_CONTEXT;
 
-typedef struct {
-  int64_t rdcost;
-  int64_t sub_block_rdcost[4];
-  int valid;
-  int split;
-  int sub_block_split[4];
-  int sub_block_skip[4];
-  int skip;
-} PC_TREE_STATS;
-
 typedef struct PC_TREE {
   PARTITION_TYPE partitioning;
   BLOCK_SIZE block_size;
@@ -96,14 +74,19 @@
   PICK_MODE_CONTEXT horizontal4[4];
   PICK_MODE_CONTEXT vertical4[4];
   struct PC_TREE *split[4];
-  PC_TREE_STATS pc_tree_stats;
-  CB_TREE_SEARCH cb_search_range;
   int index;
-  MV mv_ref_fulls[REF_FRAMES];
+
+  // Simple motion search_features
+  FULLPEL_MV start_mvs[REF_FRAMES];
+  unsigned int sms_none_feat[2];
+  unsigned int sms_rect_feat[8];
+  int sms_none_valid;
+  int sms_rect_valid;
 } PC_TREE;
 
-void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
-void av1_free_pc_tree(struct ThreadData *td, const int num_planes);
+void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td,
+                      const int num_planes, BLOCK_SIZE sb_size);
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx);
 
diff --git a/libaom/av1/encoder/corner_detect.c b/libaom/av1/encoder/corner_detect.c
index e4c59dd..597bb30 100644
--- a/libaom/av1/encoder/corner_detect.c
+++ b/libaom/av1/encoder/corner_detect.c
@@ -21,11 +21,11 @@
 
 // Fast_9 wrapper
 #define FAST_BARRIER 18
-int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
-                       int *points, int max_points) {
+int av1_fast_corner_detect(unsigned char *buf, int width, int height,
+                           int stride, int *points, int max_points) {
   int num_points;
-  xy *const frm_corners_xy = fast9_detect_nonmax(buf, width, height, stride,
-                                                 FAST_BARRIER, &num_points);
+  xy *const frm_corners_xy = aom_fast9_detect_nonmax(buf, width, height, stride,
+                                                     FAST_BARRIER, &num_points);
   num_points = (num_points <= max_points ? num_points : max_points);
   if (num_points > 0 && frm_corners_xy) {
     memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points);
diff --git a/libaom/av1/encoder/corner_detect.h b/libaom/av1/encoder/corner_detect.h
index cab59a7..15062f2 100644
--- a/libaom/av1/encoder/corner_detect.h
+++ b/libaom/av1/encoder/corner_detect.h
@@ -16,7 +16,7 @@
 #include <stdlib.h>
 #include <memory.h>
 
-int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
-                       int *points, int max_points);
+int av1_fast_corner_detect(unsigned char *buf, int width, int height,
+                           int stride, int *points, int max_points);
 
 #endif  // AOM_AV1_ENCODER_CORNER_DETECT_H_
diff --git a/libaom/av1/encoder/corner_match.c b/libaom/av1/encoder/corner_match.c
index 29e934d..12f633b 100644
--- a/libaom/av1/encoder/corner_match.c
+++ b/libaom/av1/encoder/corner_match.c
@@ -15,6 +15,7 @@
 
 #include "config/av1_rtcd.h"
 
+#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
 #define SEARCH_SZ 9
@@ -44,9 +45,9 @@
    correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
    of each image, centered at (x1, y1) and (x2, y2) respectively.
 */
-double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1,
-                                   int y1, unsigned char *im2, int stride2,
-                                   int x2, int y2) {
+double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1,
+                                       int y1, unsigned char *im2, int stride2,
+                                       int x2, int y2) {
   int v1, v2;
   int sum1 = 0;
   int sum2 = 0;
@@ -65,6 +66,7 @@
     }
   var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
   cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
 
@@ -99,7 +101,7 @@
                                   correspondences[i].rx + x,
                                   correspondences[i].ry + y, width, height))
           continue;
-        match_ncc = compute_cross_correlation(
+        match_ncc = av1_compute_cross_correlation(
             frm, frm_stride, correspondences[i].x, correspondences[i].y, ref,
             ref_stride, correspondences[i].rx + x, correspondences[i].ry + y);
         if (match_ncc > best_match_ncc) {
@@ -125,7 +127,7 @@
                 correspondences[i].x + x, correspondences[i].y + y,
                 correspondences[i].rx, correspondences[i].ry, width, height))
           continue;
-        match_ncc = compute_cross_correlation(
+        match_ncc = av1_compute_cross_correlation(
             ref, ref_stride, correspondences[i].rx, correspondences[i].ry, frm,
             frm_stride, correspondences[i].x + x, correspondences[i].y + y);
         if (match_ncc > best_match_ncc) {
@@ -139,11 +141,11 @@
   }
 }
 
-int determine_correspondence(unsigned char *frm, int *frm_corners,
-                             int num_frm_corners, unsigned char *ref,
-                             int *ref_corners, int num_ref_corners, int width,
-                             int height, int frm_stride, int ref_stride,
-                             int *correspondence_pts) {
+int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
+                                 int num_frm_corners, unsigned char *ref,
+                                 int *ref_corners, int num_ref_corners,
+                                 int width, int height, int frm_stride,
+                                 int ref_stride, int *correspondence_pts) {
   // TODO(sarahparker) Improve this to include 2-way match
   int i, j;
   Correspondence *correspondences = (Correspondence *)correspondence_pts;
@@ -164,7 +166,7 @@
                                 ref_corners[2 * j], ref_corners[2 * j + 1],
                                 width, height))
         continue;
-      match_ncc = compute_cross_correlation(
+      match_ncc = av1_compute_cross_correlation(
           frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref,
           ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
       if (match_ncc > best_match_ncc) {
@@ -173,7 +175,8 @@
       }
     }
     // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
-    // but need to account for the normalization in compute_cross_correlation.
+    // but need to account for the normalization in
+    // av1_compute_cross_correlation.
     template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
                                      frm_corners[2 * i + 1]);
     if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
diff --git a/libaom/av1/encoder/corner_match.h b/libaom/av1/encoder/corner_match.h
index 535d2fa..3cf6de1 100644
--- a/libaom/av1/encoder/corner_match.h
+++ b/libaom/av1/encoder/corner_match.h
@@ -24,10 +24,10 @@
   int rx, ry;
 } Correspondence;
 
-int determine_correspondence(unsigned char *frm, int *frm_corners,
-                             int num_frm_corners, unsigned char *ref,
-                             int *ref_corners, int num_ref_corners, int width,
-                             int height, int frm_stride, int ref_stride,
-                             int *correspondence_pts);
+int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
+                                 int num_frm_corners, unsigned char *ref,
+                                 int *ref_corners, int num_ref_corners,
+                                 int width, int height, int frm_stride,
+                                 int ref_stride, int *correspondence_pts);
 
 #endif  // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/libaom/av1/encoder/enc_enums.h b/libaom/av1/encoder/enc_enums.h
new file mode 100644
index 0000000..5a06514
--- /dev/null
+++ b/libaom/av1/encoder/enc_enums.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_
+#define AOM_AV1_ENCODER_ENC_ENUMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code.
+enum {
+  THR_NEARESTMV,
+  THR_NEARESTL2,
+  THR_NEARESTL3,
+  THR_NEARESTB,
+  THR_NEARESTA2,
+  THR_NEARESTA,
+  THR_NEARESTG,
+
+  THR_NEWMV,
+  THR_NEWL2,
+  THR_NEWL3,
+  THR_NEWB,
+  THR_NEWA2,
+  THR_NEWA,
+  THR_NEWG,
+
+  THR_NEARMV,
+  THR_NEARL2,
+  THR_NEARL3,
+  THR_NEARB,
+  THR_NEARA2,
+  THR_NEARA,
+  THR_NEARG,
+
+  THR_GLOBALMV,
+  THR_GLOBALL2,
+  THR_GLOBALL3,
+  THR_GLOBALB,
+  THR_GLOBALA2,
+  THR_GLOBALA,
+  THR_GLOBALG,
+
+  THR_COMP_NEAREST_NEARESTLA,
+  THR_COMP_NEAREST_NEARESTL2A,
+  THR_COMP_NEAREST_NEARESTL3A,
+  THR_COMP_NEAREST_NEARESTGA,
+  THR_COMP_NEAREST_NEARESTLB,
+  THR_COMP_NEAREST_NEARESTL2B,
+  THR_COMP_NEAREST_NEARESTL3B,
+  THR_COMP_NEAREST_NEARESTGB,
+  THR_COMP_NEAREST_NEARESTLA2,
+  THR_COMP_NEAREST_NEARESTL2A2,
+  THR_COMP_NEAREST_NEARESTL3A2,
+  THR_COMP_NEAREST_NEARESTGA2,
+  THR_COMP_NEAREST_NEARESTLL2,
+  THR_COMP_NEAREST_NEARESTLL3,
+  THR_COMP_NEAREST_NEARESTLG,
+  THR_COMP_NEAREST_NEARESTBA,
+
+  THR_COMP_NEAR_NEARLA,
+  THR_COMP_NEW_NEARESTLA,
+  THR_COMP_NEAREST_NEWLA,
+  THR_COMP_NEW_NEARLA,
+  THR_COMP_NEAR_NEWLA,
+  THR_COMP_NEW_NEWLA,
+  THR_COMP_GLOBAL_GLOBALLA,
+
+  THR_COMP_NEAR_NEARL2A,
+  THR_COMP_NEW_NEARESTL2A,
+  THR_COMP_NEAREST_NEWL2A,
+  THR_COMP_NEW_NEARL2A,
+  THR_COMP_NEAR_NEWL2A,
+  THR_COMP_NEW_NEWL2A,
+  THR_COMP_GLOBAL_GLOBALL2A,
+
+  THR_COMP_NEAR_NEARL3A,
+  THR_COMP_NEW_NEARESTL3A,
+  THR_COMP_NEAREST_NEWL3A,
+  THR_COMP_NEW_NEARL3A,
+  THR_COMP_NEAR_NEWL3A,
+  THR_COMP_NEW_NEWL3A,
+  THR_COMP_GLOBAL_GLOBALL3A,
+
+  THR_COMP_NEAR_NEARGA,
+  THR_COMP_NEW_NEARESTGA,
+  THR_COMP_NEAREST_NEWGA,
+  THR_COMP_NEW_NEARGA,
+  THR_COMP_NEAR_NEWGA,
+  THR_COMP_NEW_NEWGA,
+  THR_COMP_GLOBAL_GLOBALGA,
+
+  THR_COMP_NEAR_NEARLB,
+  THR_COMP_NEW_NEARESTLB,
+  THR_COMP_NEAREST_NEWLB,
+  THR_COMP_NEW_NEARLB,
+  THR_COMP_NEAR_NEWLB,
+  THR_COMP_NEW_NEWLB,
+  THR_COMP_GLOBAL_GLOBALLB,
+
+  THR_COMP_NEAR_NEARL2B,
+  THR_COMP_NEW_NEARESTL2B,
+  THR_COMP_NEAREST_NEWL2B,
+  THR_COMP_NEW_NEARL2B,
+  THR_COMP_NEAR_NEWL2B,
+  THR_COMP_NEW_NEWL2B,
+  THR_COMP_GLOBAL_GLOBALL2B,
+
+  THR_COMP_NEAR_NEARL3B,
+  THR_COMP_NEW_NEARESTL3B,
+  THR_COMP_NEAREST_NEWL3B,
+  THR_COMP_NEW_NEARL3B,
+  THR_COMP_NEAR_NEWL3B,
+  THR_COMP_NEW_NEWL3B,
+  THR_COMP_GLOBAL_GLOBALL3B,
+
+  THR_COMP_NEAR_NEARGB,
+  THR_COMP_NEW_NEARESTGB,
+  THR_COMP_NEAREST_NEWGB,
+  THR_COMP_NEW_NEARGB,
+  THR_COMP_NEAR_NEWGB,
+  THR_COMP_NEW_NEWGB,
+  THR_COMP_GLOBAL_GLOBALGB,
+
+  THR_COMP_NEAR_NEARLA2,
+  THR_COMP_NEW_NEARESTLA2,
+  THR_COMP_NEAREST_NEWLA2,
+  THR_COMP_NEW_NEARLA2,
+  THR_COMP_NEAR_NEWLA2,
+  THR_COMP_NEW_NEWLA2,
+  THR_COMP_GLOBAL_GLOBALLA2,
+
+  THR_COMP_NEAR_NEARL2A2,
+  THR_COMP_NEW_NEARESTL2A2,
+  THR_COMP_NEAREST_NEWL2A2,
+  THR_COMP_NEW_NEARL2A2,
+  THR_COMP_NEAR_NEWL2A2,
+  THR_COMP_NEW_NEWL2A2,
+  THR_COMP_GLOBAL_GLOBALL2A2,
+
+  THR_COMP_NEAR_NEARL3A2,
+  THR_COMP_NEW_NEARESTL3A2,
+  THR_COMP_NEAREST_NEWL3A2,
+  THR_COMP_NEW_NEARL3A2,
+  THR_COMP_NEAR_NEWL3A2,
+  THR_COMP_NEW_NEWL3A2,
+  THR_COMP_GLOBAL_GLOBALL3A2,
+
+  THR_COMP_NEAR_NEARGA2,
+  THR_COMP_NEW_NEARESTGA2,
+  THR_COMP_NEAREST_NEWGA2,
+  THR_COMP_NEW_NEARGA2,
+  THR_COMP_NEAR_NEWGA2,
+  THR_COMP_NEW_NEWGA2,
+  THR_COMP_GLOBAL_GLOBALGA2,
+
+  THR_COMP_NEAR_NEARLL2,
+  THR_COMP_NEW_NEARESTLL2,
+  THR_COMP_NEAREST_NEWLL2,
+  THR_COMP_NEW_NEARLL2,
+  THR_COMP_NEAR_NEWLL2,
+  THR_COMP_NEW_NEWLL2,
+  THR_COMP_GLOBAL_GLOBALLL2,
+
+  THR_COMP_NEAR_NEARLL3,
+  THR_COMP_NEW_NEARESTLL3,
+  THR_COMP_NEAREST_NEWLL3,
+  THR_COMP_NEW_NEARLL3,
+  THR_COMP_NEAR_NEWLL3,
+  THR_COMP_NEW_NEWLL3,
+  THR_COMP_GLOBAL_GLOBALLL3,
+
+  THR_COMP_NEAR_NEARLG,
+  THR_COMP_NEW_NEARESTLG,
+  THR_COMP_NEAREST_NEWLG,
+  THR_COMP_NEW_NEARLG,
+  THR_COMP_NEAR_NEWLG,
+  THR_COMP_NEW_NEWLG,
+  THR_COMP_GLOBAL_GLOBALLG,
+
+  THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEARESTBA,
+  THR_COMP_NEAREST_NEWBA,
+  THR_COMP_NEW_NEARBA,
+  THR_COMP_NEAR_NEWBA,
+  THR_COMP_NEW_NEWBA,
+  THR_COMP_GLOBAL_GLOBALBA,
+
+  THR_DC,
+  THR_PAETH,
+  THR_SMOOTH,
+  THR_SMOOTH_V,
+  THR_SMOOTH_H,
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D203_PRED,
+  THR_D157_PRED,
+  THR_D67_PRED,
+  THR_D113_PRED,
+  THR_D45_PRED,
+
+  MAX_MODES,
+  SINGLE_REF_MODE_START = THR_NEARESTMV,
+  SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA,
+  NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
+  THR_MODE_START = THR_NEARESTMV,
+  THR_MODE_END = MAX_MODES,
+  THR_INVALID = 255
+} UENUM1BYTE(THR_MODES);
+
+enum {
+  THR_LAST,
+  THR_LAST2,
+  THR_LAST3,
+  THR_BWDR,
+  THR_ALTR2,
+  THR_GOLD,
+  THR_ALTR,
+
+  THR_COMP_LA,
+  THR_COMP_L2A,
+  THR_COMP_L3A,
+  THR_COMP_GA,
+
+  THR_COMP_LB,
+  THR_COMP_L2B,
+  THR_COMP_L3B,
+  THR_COMP_GB,
+
+  THR_COMP_LA2,
+  THR_COMP_L2A2,
+  THR_COMP_L3A2,
+  THR_COMP_GA2,
+
+  THR_INTRA,
+
+  MAX_REFS
+} UENUM1BYTE(THR_MODES_SUB8X8);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENC_ENUMS_H_
diff --git a/libaom/av1/encoder/encode_strategy.c b/libaom/av1/encoder/encode_strategy.c
index e9d6ee7..326ecc0 100644
--- a/libaom/av1/encoder/encode_strategy.c
+++ b/libaom/av1/encoder/encode_strategy.c
@@ -23,15 +23,19 @@
 #include "aom_util/debug_util.h"
 #endif  // CONFIG_MISMATCH_DEBUG
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
 
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encodeframe.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/pass2_strategy.h"
 #include "av1/encoder/temporal_filter.h"
 #include "av1/encoder/tpl_model.h"
 
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
+
 void av1_configure_buffer_updates(AV1_COMP *const cpi,
                                   EncodeFrameParams *const frame_params,
                                   const FRAME_UPDATE_TYPE type,
@@ -39,105 +43,85 @@
   // NOTE(weitinglin): Should we define another function to take care of
   // cpi->rc.is_$Source_Type to make this function as it is in the comment?
 
+  const ExternalFlags *const ext_flags = &cpi->ext_flags;
   cpi->rc.is_src_frame_alt_ref = 0;
-  cpi->rc.is_src_frame_internal_arf = 0;
 
   switch (type) {
     case KF_UPDATE:
-      frame_params->refresh_last_frame = 1;
       frame_params->refresh_golden_frame = 1;
       frame_params->refresh_bwd_ref_frame = 1;
-      frame_params->refresh_alt2_ref_frame = 1;
       frame_params->refresh_alt_ref_frame = 1;
       break;
 
     case LF_UPDATE:
-      frame_params->refresh_last_frame = 1;
       frame_params->refresh_golden_frame = 0;
       frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt2_ref_frame = 0;
       frame_params->refresh_alt_ref_frame = 0;
       break;
 
     case GF_UPDATE:
-      // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
-      //               needed.
-      frame_params->refresh_last_frame = 1;
       frame_params->refresh_golden_frame = 1;
       frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt2_ref_frame = 0;
       frame_params->refresh_alt_ref_frame = 0;
       break;
 
     case OVERLAY_UPDATE:
-      frame_params->refresh_last_frame = 0;
       frame_params->refresh_golden_frame = 1;
       frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt2_ref_frame = 0;
       frame_params->refresh_alt_ref_frame = 0;
 
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
     case ARF_UPDATE:
-      frame_params->refresh_last_frame = 0;
       frame_params->refresh_golden_frame = 0;
       // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
       frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt2_ref_frame = 0;
       frame_params->refresh_alt_ref_frame = 1;
       break;
 
     case INTNL_OVERLAY_UPDATE:
-      frame_params->refresh_last_frame = 1;
       frame_params->refresh_golden_frame = 0;
       frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt2_ref_frame = 0;
       frame_params->refresh_alt_ref_frame = 0;
 
       cpi->rc.is_src_frame_alt_ref = 1;
-      cpi->rc.is_src_frame_internal_arf = 1;
       break;
 
     case INTNL_ARF_UPDATE:
-      frame_params->refresh_last_frame = 0;
       frame_params->refresh_golden_frame = 0;
-      if (cpi->oxcf.pass == 2) {
-        frame_params->refresh_bwd_ref_frame = 1;
-        frame_params->refresh_alt2_ref_frame = 0;
-      } else {
-        frame_params->refresh_bwd_ref_frame = 0;
-        frame_params->refresh_alt2_ref_frame = 1;
-      }
+      frame_params->refresh_bwd_ref_frame = 1;
       frame_params->refresh_alt_ref_frame = 0;
       break;
 
     default: assert(0); break;
   }
 
-  if (cpi->ext_refresh_frame_flags_pending &&
-      (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2)) {
-    frame_params->refresh_last_frame = cpi->ext_refresh_last_frame;
-    frame_params->refresh_golden_frame = cpi->ext_refresh_golden_frame;
-    frame_params->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
-    frame_params->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame;
-    frame_params->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame;
+  if (ext_flags->refresh_frame_flags_pending &&
+      (!is_stat_generation_stage(cpi))) {
+    frame_params->refresh_golden_frame = ext_flags->refresh_golden_frame;
+    frame_params->refresh_alt_ref_frame = ext_flags->refresh_alt_ref_frame;
+    frame_params->refresh_bwd_ref_frame = ext_flags->refresh_bwd_ref_frame;
   }
 
   if (force_refresh_all) {
-    frame_params->refresh_last_frame = 1;
     frame_params->refresh_golden_frame = 1;
     frame_params->refresh_bwd_ref_frame = 1;
-    frame_params->refresh_alt2_ref_frame = 1;
     frame_params->refresh_alt_ref_frame = 1;
   }
 }
 
 static void set_additional_frame_flags(const AV1_COMMON *const cm,
                                        unsigned int *const frame_flags) {
-  if (frame_is_intra_only(cm)) *frame_flags |= FRAMEFLAGS_INTRAONLY;
-  if (frame_is_sframe(cm)) *frame_flags |= FRAMEFLAGS_SWITCH;
-  if (cm->error_resilient_mode) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+  if (frame_is_intra_only(cm)) {
+    *frame_flags |= FRAMEFLAGS_INTRAONLY;
+  }
+  if (frame_is_sframe(cm)) {
+    *frame_flags |= FRAMEFLAGS_SWITCH;
+  }
+  if (cm->features.error_resilient_mode) {
+    *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+  }
 }
 
 static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
@@ -153,10 +137,19 @@
   }
 }
 
-static INLINE int is_frame_droppable(const AV1_COMP *const cpi) {
-  return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
-           cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame ||
-           cpi->refresh_last_frame);
+static INLINE int is_frame_droppable(const SVC *const svc,
+                                     const ExternalFlags *const ext_flags) {
+  // Droppable frame is only used by external refresh flags. VoD setting won't
+  // trigger its use case.
+  if (svc->external_ref_frame_config)
+    return svc->non_reference_frame;
+  else if (ext_flags->refresh_frame_flags_pending)
+    return !(ext_flags->refresh_alt_ref_frame ||
+             ext_flags->refresh_alt2_ref_frame ||
+             ext_flags->refresh_bwd_ref_frame ||
+             ext_flags->refresh_golden_frame || ext_flags->refresh_last_frame);
+  else
+    return 0;
 }
 
 static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
@@ -164,73 +157,50 @@
   // is a work-around to handle the condition when a frame is drop.
   // We should fix the cpi->common.show_frame flag
   // instead of checking the other condition to update the counter properly.
-  if (cpi->common.show_frame || is_frame_droppable(cpi)) {
+  if (cpi->common.show_frame ||
+      is_frame_droppable(&cpi->svc, &cpi->ext_flags)) {
     // Decrement count down till next gf
     if (cpi->rc.frames_till_gf_update_due > 0)
       cpi->rc.frames_till_gf_update_due--;
   }
 }
 
-static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+static INLINE void update_gf_group_index(AV1_COMP *cpi) {
   // Increment the gf group index ready for the next frame. If this is
   // a show_existing_frame with a source other than altref, or if it is not
   // a displayed forward keyframe, the index was incremented when it was
   // originally encoded.
   if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
       cpi->common.current_frame.frame_type == KEY_FRAME) {
-    ++cpi->twopass.gf_group.index;
+    ++cpi->gf_group.index;
   }
 }
 
 static void update_rc_counts(AV1_COMP *cpi) {
   update_keyframe_counters(cpi);
   update_frames_till_gf_update(cpi);
-  if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
+  update_gf_group_index(cpi);
 }
 
-static void check_show_existing_frame(AV1_COMP *const cpi,
-                                      EncodeFrameParams *const frame_params) {
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-  AV1_COMMON *const cm = &cpi->common;
-  const FRAME_UPDATE_TYPE frame_update_type =
-      gf_group->update_type[gf_group->index];
-  const int which_arf = (gf_group->arf_update_idx[gf_group->index] > 0);
-
-  if (cm->show_existing_frame == 1) {
-    frame_params->show_existing_frame = 0;
-  } else if (cpi->is_arf_filter_off[which_arf] &&
-             (frame_update_type == OVERLAY_UPDATE ||
-              frame_update_type == INTNL_OVERLAY_UPDATE)) {
-    // Other parameters related to OVERLAY_UPDATE will be taken care of
-    // in av1_get_second_pass_params(cpi)
-    frame_params->show_existing_frame = 1;
-    frame_params->existing_fb_idx_to_show =
-        (frame_update_type == OVERLAY_UPDATE)
-            ? get_ref_frame_map_idx(cm, ALTREF_FRAME)
-            : get_ref_frame_map_idx(cm, BWDREF_FRAME);
-  }
-}
-
-static void set_ext_overrides(AV1_COMP *const cpi,
-                              EncodeFrameParams *const frame_params) {
+static void set_ext_overrides(AV1_COMMON *const cm,
+                              EncodeFrameParams *const frame_params,
+                              ExternalFlags *const ext_flags) {
   // Overrides the defaults with the externally supplied values with
   // av1_update_reference() and av1_update_entropy() calls
   // Note: The overrides are valid only for the next frame passed
   // to av1_encode_lowlevel()
 
-  AV1_COMMON *const cm = &cpi->common;
-
-  if (cpi->ext_use_s_frame) {
+  if (ext_flags->use_s_frame) {
     frame_params->frame_type = S_FRAME;
   }
 
-  if (cpi->ext_refresh_frame_context_pending) {
-    cm->refresh_frame_context = cpi->ext_refresh_frame_context;
-    cpi->ext_refresh_frame_context_pending = 0;
+  if (ext_flags->refresh_frame_context_pending) {
+    cm->features.refresh_frame_context = ext_flags->refresh_frame_context;
+    ext_flags->refresh_frame_context_pending = 0;
   }
-  cm->allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
+  cm->features.allow_ref_frame_mvs = ext_flags->use_ref_frame_mvs;
 
-  frame_params->error_resilient_mode = cpi->ext_use_error_resilient;
+  frame_params->error_resilient_mode = ext_flags->use_error_resilient;
   // A keyframe is already error resilient and keyframes with
   // error_resilient_mode interferes with the use of show_existing_frame
   // when forward reference keyframes are enabled.
@@ -239,106 +209,25 @@
   frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
 }
 
-static int get_ref_frame_flags(const AV1_COMP *const cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-
-  const RefCntBuffer *last_buf = get_ref_frame_buf(cm, LAST_FRAME);
-  const RefCntBuffer *last2_buf = get_ref_frame_buf(cm, LAST2_FRAME);
-  const RefCntBuffer *last3_buf = get_ref_frame_buf(cm, LAST3_FRAME);
-  const RefCntBuffer *golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
-  const RefCntBuffer *bwd_buf = get_ref_frame_buf(cm, BWDREF_FRAME);
-  const RefCntBuffer *alt2_buf = get_ref_frame_buf(cm, ALTREF2_FRAME);
-  const RefCntBuffer *alt_buf = get_ref_frame_buf(cm, ALTREF_FRAME);
-
-  // No.1 Priority: LAST_FRAME
-  const int last2_is_last = (last2_buf == last_buf);
-  const int last3_is_last = (last3_buf == last_buf);
-  const int gld_is_last = (golden_buf == last_buf);
-  const int bwd_is_last = (bwd_buf == last_buf);
-  const int alt2_is_last = (alt2_buf == last_buf);
-  const int alt_is_last = (alt_buf == last_buf);
-
-  // No.2 Priority: ALTREF_FRAME
-  const int last2_is_alt = (last2_buf == alt_buf);
-  const int last3_is_alt = (last3_buf == alt_buf);
-  const int gld_is_alt = (golden_buf == alt_buf);
-  const int bwd_is_alt = (bwd_buf == alt_buf);
-  const int alt2_is_alt = (alt2_buf == alt_buf);
-
-  // No.3 Priority: LAST2_FRAME
-  const int last3_is_last2 = (last3_buf == last2_buf);
-  const int gld_is_last2 = (golden_buf == last2_buf);
-  const int bwd_is_last2 = (bwd_buf == last2_buf);
-  const int alt2_is_last2 = (alt2_buf == last2_buf);
-
-  // No.4 Priority: LAST3_FRAME
-  const int gld_is_last3 = (golden_buf == last3_buf);
-  const int bwd_is_last3 = (bwd_buf == last3_buf);
-  const int alt2_is_last3 = (alt2_buf == last3_buf);
-
-  // No.5 Priority: GOLDEN_FRAME
-  const int bwd_is_gld = (bwd_buf == golden_buf);
-  const int alt2_is_gld = (alt2_buf == golden_buf);
-
-  // No.6 Priority: BWDREF_FRAME
-  const int alt2_is_bwd = (alt2_buf == bwd_buf);
-
-  // No.7 Priority: ALTREF2_FRAME
-
-  // cpi->ext_ref_frame_flags allows certain reference types to be disabled
-  // by the external interface.  These are set by av1_apply_encoding_flags().
-  // Start with what the external interface allows, then suppress any reference
-  // types which we have found to be duplicates.
-
-  int flags = cpi->ext_ref_frame_flags;
-
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
-
-  if (alt_is_last) flags &= ~AOM_ALT_FLAG;
-
-  if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
-
-  if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG;
-
-  if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3)
-    flags &= ~AOM_GOLD_FLAG;
-
-  if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 || bwd_is_gld))
-    flags &= ~AOM_BWD_FLAG;
-
-  if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 ||
-       alt2_is_gld || alt2_is_bwd))
-    flags &= ~AOM_ALT2_FLAG;
-
-  return flags;
-}
-
 static int get_current_frame_ref_type(
     const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
   // We choose the reference "type" of this frame from the flags which indicate
-  // which reference frames will be refreshed by it.  More than one of these
-  // flags may be set, so the order here implies an order of precedence.
-  // This is just used to choose the primary_ref_frame (as the most recent
-  // reference buffer of the same reference-type as the current frame)
+  // which reference frames will be refreshed by it.  More than one  of these
+  // flags may be set, so the order here implies an order of precedence. This is
+  // just used to choose the primary_ref_frame (as the most recent reference
+  // buffer of the same reference-type as the current frame)
 
-  const int intra_only = frame_params->frame_type == KEY_FRAME ||
-                         frame_params->frame_type == INTRA_ONLY_FRAME;
-  if (intra_only || frame_params->error_resilient_mode ||
-      cpi->ext_use_primary_ref_none)
-    return REGULAR_FRAME;
-  else if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
-    return INTERNAL_ARF_FRAME;
-  else if (frame_params->refresh_alt_ref_frame)
-    return ARF_FRAME;
-  else if (cpi->rc.is_src_frame_alt_ref)
-    return OVERLAY_FRAME;
-  else if (frame_params->refresh_golden_frame)
-    return GLD_FRAME;
-  else if (frame_params->refresh_bwd_ref_frame)
-    return BRF_FRAME;
-  else
-    return REGULAR_FRAME;
+  (void)frame_params;
+  // TODO(jingning): This table should be a lot simpler with the new
+  // ARF system in place. Keep frame_params for the time being as we are
+  // still evaluating a few design options.
+  switch (cpi->gf_group.layer_depth[cpi->gf_group.index]) {
+    case 0: return 0;
+    case 1: return 1;
+    case MAX_ARF_LAYERS:
+    case MAX_ARF_LAYERS + 1: return 4;
+    default: return 7;
+  }
 }
 
 static int choose_primary_ref_frame(
@@ -347,15 +236,20 @@
 
   const int intra_only = frame_params->frame_type == KEY_FRAME ||
                          frame_params->frame_type == INTRA_ONLY_FRAME;
-  if (intra_only || frame_params->error_resilient_mode ||
-      cpi->ext_use_primary_ref_none) {
+  if (intra_only || frame_params->error_resilient_mode || cpi->use_svc ||
+      cpi->ext_flags.use_primary_ref_none) {
     return PRIMARY_REF_NONE;
   }
 
+  // In large scale case, always use Last frame's frame contexts.
+  // Note(yunqing): In other cases, primary_ref_frame is chosen based on
+  // cpi->gf_group.layer_depth[cpi->gf_group.index], which also controls
+  // frame bit allocation.
+  if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME);
+
   // Find the most recent reference frame with the same reference type as the
   // current frame
-  const FRAME_CONTEXT_INDEX current_ref_type =
-      get_current_frame_ref_type(cpi, frame_params);
+  const int current_ref_type = get_current_frame_ref_type(cpi, frame_params);
   int wanted_fb = cpi->fb_of_context_type[current_ref_type];
 
   int primary_ref_frame = PRIMARY_REF_NONE;
@@ -364,6 +258,7 @@
       primary_ref_frame = ref_frame - LAST_FRAME;
     }
   }
+
   return primary_ref_frame;
 }
 
@@ -371,13 +266,15 @@
     const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
     int *const fb_of_context_type) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int current_frame_ref_type =
+      get_current_frame_ref_type(cpi, frame_params);
 
-  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
-      cpi->ext_use_primary_ref_none) {
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
     for (int i = 0; i < REF_FRAMES; i++) {
       fb_of_context_type[i] = -1;
     }
-    fb_of_context_type[REGULAR_FRAME] =
+    fb_of_context_type[current_frame_ref_type] =
         cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
                        : get_ref_frame_map_idx(cm, ALTREF_FRAME);
   }
@@ -386,12 +283,11 @@
     // Refresh fb_of_context_type[]: see encoder.h for explanation
     if (cm->current_frame.frame_type == KEY_FRAME) {
       // All ref frames are refreshed, pick one that will live long enough
-      fb_of_context_type[REGULAR_FRAME] = 0;
+      fb_of_context_type[current_frame_ref_type] = 0;
     } else {
       // If more than one frame is refreshed, it doesn't matter which one we
       // pick so pick the first.  LST sometimes doesn't refresh any: this is ok
-      const int current_frame_ref_type =
-          get_current_frame_ref_type(cpi, frame_params);
+
       for (int i = 0; i < REF_FRAMES; i++) {
         if (cm->current_frame.refresh_frame_flags & (1 << i)) {
           fb_of_context_type[current_frame_ref_type] = i;
@@ -414,22 +310,28 @@
   return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset);
 }
 
-static void adjust_frame_rate(AV1_COMP *cpi,
-                              const struct lookahead_entry *source) {
+static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) {
+  TimeStamps *time_stamps = &cpi->time_stamps;
   int64_t this_duration;
   int step = 0;
 
   // Clear down mmx registers
   aom_clear_system_state();
 
-  if (source->ts_start == cpi->first_time_stamp_ever) {
-    this_duration = source->ts_end - source->ts_start;
+  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) {
+    cpi->framerate = cpi->svc.base_framerate;
+    av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
+    return;
+  }
+
+  if (ts_start == time_stamps->first_ever) {
+    this_duration = ts_end - ts_start;
     step = 1;
   } else {
     int64_t last_duration =
-        cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+        time_stamps->prev_end_seen - time_stamps->prev_start_seen;
 
-    this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+    this_duration = ts_end - time_stamps->prev_end_seen;
 
     // do a step update if the duration changes by 10%
     if (last_duration)
@@ -443,8 +345,8 @@
       // Average this frame's rate into the last second's average
       // frame rate. If we haven't seen 1 second yet, then average
       // over the whole interval seen.
-      const double interval = AOMMIN(
-          (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+      const double interval =
+          AOMMIN((double)(ts_end - time_stamps->first_ever), 10000000.0);
       double avg_duration = 10000000.0 / cpi->framerate;
       avg_duration *= (interval - avg_duration + this_duration);
       avg_duration /= interval;
@@ -452,76 +354,61 @@
       av1_new_framerate(cpi, 10000000.0 / avg_duration);
     }
   }
-  cpi->last_time_stamp_seen = source->ts_start;
-  cpi->last_end_time_stamp_seen = source->ts_end;
+  time_stamps->prev_start_seen = ts_start;
+  time_stamps->prev_end_seen = ts_end;
 }
 
 // If this is an alt-ref, returns the offset of the source frame used
 // as the arf midpoint. Otherwise, returns 0.
-static int get_arf_src_index(AV1_COMP *cpi) {
-  RATE_CONTROL *const rc = &cpi->rc;
+static int get_arf_src_index(GF_GROUP *gf_group, int pass) {
   int arf_src_index = 0;
-  if (cpi->oxcf.pass == 2) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
-      assert(is_altref_enabled(cpi));
-      arf_src_index = gf_group->arf_src_offset[gf_group->index];
-    }
-  } else if (rc->source_alt_ref_pending) {
-    arf_src_index = rc->frames_till_gf_update_due;
-  }
+  if (pass != 1) arf_src_index = gf_group->arf_src_offset[gf_group->index];
   return arf_src_index;
 }
 
-// If this is an internal alt-ref, returns the offset of the source frame used
-// as the internal arf midpoint. Otherwise, returns 0.
-static int get_internal_arf_src_index(AV1_COMP *cpi) {
-  int internal_arf_src_index = 0;
-  if (cpi->oxcf.pass == 2) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
-      assert(is_altref_enabled(cpi) && cpi->internal_altref_allowed);
-      internal_arf_src_index = gf_group->arf_src_offset[gf_group->index];
-    }
-  }
-  return internal_arf_src_index;
-}
-
 // Called if this frame is an ARF or ARF2. Also handles forward-keyframes
 // For an ARF set arf2=0, for ARF2 set arf2=1
 // temporal_filtered is set to 1 if we temporally filter the ARF frame, so that
 // the correct post-filter buffer can be used.
-static struct lookahead_entry *setup_arf_or_arf2(
-    AV1_COMP *const cpi, const int arf_src_index, const int arf2,
-    int *temporal_filtered, EncodeFrameParams *const frame_params) {
+static struct lookahead_entry *setup_arf_frame(
+    AV1_COMP *const cpi, const int arf_src_index, int *code_arf,
+    EncodeFrameParams *const frame_params, int *show_existing_alt_ref) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+#if !CONFIG_REALTIME_ONLY
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+#endif
 
   assert(arf_src_index <= rc->frames_to_key);
-  *temporal_filtered = 0;
+  *code_arf = 0;
 
   struct lookahead_entry *source =
-      av1_lookahead_peek(cpi->lookahead, arf_src_index);
+      av1_lookahead_peek(cpi->lookahead, arf_src_index, cpi->compressor_stage);
 
   if (source != NULL) {
     cm->showable_frame = 1;
-    cpi->alt_ref_source = source;
 
     // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
-    if (!arf2 && arf_src_index == rc->frames_to_key) {
+    if (arf_src_index == rc->frames_to_key) {
       // Skip temporal filtering and mark as intra_only if we have a fwd_kf
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      int which_arf = gf_group->arf_update_idx[gf_group->index];
-      cpi->is_arf_filter_off[which_arf] = 1;
       cpi->no_show_kf = 1;
     } else {
+#if !CONFIG_REALTIME_ONLY
       if (oxcf->arnr_max_frames > 0) {
         // Produce the filtered ARF frame.
-        av1_temporal_filter(cpi, arf_src_index);
-        aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
-        *temporal_filtered = 1;
+        cm->current_frame.frame_type = INTER_FRAME;
+        FRAME_UPDATE_TYPE frame_update_type =
+            get_frame_update_type(&cpi->gf_group);
+        av1_configure_buffer_updates(cpi, frame_params, frame_update_type, 0);
+        *code_arf =
+            av1_temporal_filter(cpi, arf_src_index, show_existing_alt_ref);
+        if (*code_arf) {
+          aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
+        }
       }
+#else
+      (void)show_existing_alt_ref;
+#endif
     }
     frame_params->show_frame = 0;
   }
@@ -530,79 +417,62 @@
 }
 
 // Determine whether there is a forced keyframe pending in the lookahead buffer
-static int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
-                                      const int up_to_index) {
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage) {
   for (int i = 0; i <= up_to_index; i++) {
-    const struct lookahead_entry *e = av1_lookahead_peek(lookahead, i);
+    const struct lookahead_entry *e =
+        av1_lookahead_peek(lookahead, i, compressor_stage);
     if (e == NULL) {
       // We have reached the end of the lookahead buffer and not early-returned
       // so there isn't a forced key-frame pending.
-      return 0;
+      return -1;
     } else if (e->flags == AOM_EFLAG_FORCE_KF) {
-      return 1;
+      return (i + 1);
     } else {
       continue;
     }
   }
-  return 0;  // Never reached
+  return -1;  // Never reached
 }
 
 // Check if we should encode an ARF or internal ARF.  If not, try a LAST
 // Do some setup associated with the chosen source
 // temporal_filtered, flush, and frame_update_type are outputs.
 // Return the frame source, or NULL if we couldn't find one
-struct lookahead_entry *choose_frame_source(
-    AV1_COMP *const cpi, int *const temporal_filtered, int *const flush,
-    struct lookahead_entry **last_source, FRAME_UPDATE_TYPE *frame_update_type,
-    EncodeFrameParams *const frame_params) {
+static struct lookahead_entry *choose_frame_source(
+    AV1_COMP *const cpi, int *const code_arf, int *const flush,
+    struct lookahead_entry **last_source, EncodeFrameParams *const frame_params,
+    int *show_existing_alt_ref) {
   AV1_COMMON *const cm = &cpi->common;
   struct lookahead_entry *source = NULL;
-  *temporal_filtered = 0;
+  *code_arf = 0;
 
   // Should we encode an alt-ref frame.
-  int arf_src_index = get_arf_src_index(cpi);
+  int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass);
+  // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q
   if (arf_src_index &&
-      is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) {
+      (is_forced_keyframe_pending(cpi->lookahead, arf_src_index,
+                                  cpi->compressor_stage) != -1) &&
+      cpi->oxcf.rc_mode != AOM_Q) {
     arf_src_index = 0;
     *flush = 1;
   }
 
-  if (arf_src_index) {
-    source = setup_arf_or_arf2(cpi, arf_src_index, 0, temporal_filtered,
-                               frame_params);
-    *frame_update_type = ARF_UPDATE;
-  }
-
-  // Should we encode an internal Alt-ref frame (mutually exclusive to ARF)
-  arf_src_index = get_internal_arf_src_index(cpi);
-  if (arf_src_index &&
-      is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) {
-    arf_src_index = 0;
-    *flush = 1;
-  }
-
-  if (arf_src_index) {
-    source = setup_arf_or_arf2(cpi, arf_src_index, 1, temporal_filtered,
-                               frame_params);
-    *frame_update_type = INTNL_ARF_UPDATE;
-  }
+  if (arf_src_index)
+    source = setup_arf_frame(cpi, arf_src_index, code_arf, frame_params,
+                             show_existing_alt_ref);
 
   if (!source) {
     // Get last frame source.
     if (cm->current_frame.frame_number > 0) {
-      *last_source = av1_lookahead_peek(cpi->lookahead, -1);
+      *last_source =
+          av1_lookahead_peek(cpi->lookahead, -1, cpi->compressor_stage);
     }
     // Read in the source frame.
-    source = av1_lookahead_pop(cpi->lookahead, *flush);
+    source = av1_lookahead_pop(cpi->lookahead, *flush, cpi->compressor_stage);
     if (source == NULL) return NULL;
-    *frame_update_type = LF_UPDATE;  // Default update type
     frame_params->show_frame = 1;
-
-    // Check to see if the frame should be encoded as an arf overlay.
-    if (cpi->alt_ref_source == source) {
-      *frame_update_type = OVERLAY_UPDATE;
-      cpi->alt_ref_source = NULL;
-    }
   }
   return source;
 }
@@ -615,7 +485,7 @@
   if (cpi->common.current_frame.frame_number == 0) return 0;
 
   const struct lookahead_entry *lookahead_src =
-      av1_lookahead_peek(cpi->lookahead, 0);
+      av1_lookahead_peek(cpi->lookahead, 0, cpi->compressor_stage);
   if (lookahead_src == NULL) return 1;
 
   const int is_error_resilient =
@@ -716,134 +586,149 @@
 }
 #endif  // DUMP_REF_FRAME_IMAGES == 1
 
-// Assign new_ref in the new mapping to point at the reference buffer pointed at
-// by old_ref in the old_map.  The new mapping is stored in *new_map, while the
-// old map comes from cm->remapped_ref_idx[].
-static void assign_new_map(AV1_COMMON *const cm, int *new_map, int new_ref,
-                           int old_ref) {
-  new_map[new_ref - LAST_FRAME] = cm->remapped_ref_idx[old_ref - LAST_FRAME];
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags) {
+  int ref_map_index = INVALID_IDX;
+
+  for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index)
+    if ((refresh_frame_flags >> ref_map_index) & 1) break;
+
+  return ref_map_index;
 }
 
-// Generate a new reference frame mapping.  This function updates
-// cm->remapped_ref_idx[] depending on the frame_update_type of this frame.
-// This determines which references (e.g. LAST_FRAME, ALTREF_FRAME) point at the
-// 8 underlying buffers and, together with get_refresh_frame_flags(), implements
-// our reference frame management strategy.
-static void update_ref_frame_map(AV1_COMP *cpi,
-                                 FRAME_UPDATE_TYPE frame_update_type) {
+static void update_arf_stack(int ref_map_index,
+                             RefBufferStack *ref_buffer_stack) {
+  if (ref_buffer_stack->arf_stack_size >= 0) {
+    if (ref_buffer_stack->arf_stack[0] == ref_map_index)
+      stack_pop(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size);
+  }
+
+  if (ref_buffer_stack->lst_stack_size) {
+    for (int i = ref_buffer_stack->lst_stack_size - 1; i >= 0; --i) {
+      if (ref_buffer_stack->lst_stack[i] == ref_map_index) {
+        for (int idx = i; idx < ref_buffer_stack->lst_stack_size - 1; ++idx)
+          ref_buffer_stack->lst_stack[idx] =
+              ref_buffer_stack->lst_stack[idx + 1];
+        ref_buffer_stack->lst_stack[ref_buffer_stack->lst_stack_size - 1] =
+            INVALID_IDX;
+        --ref_buffer_stack->lst_stack_size;
+      }
+    }
+  }
+
+  if (ref_buffer_stack->gld_stack_size) {
+    for (int i = ref_buffer_stack->gld_stack_size - 1; i >= 0; --i) {
+      if (ref_buffer_stack->gld_stack[i] == ref_map_index) {
+        for (int idx = i; idx < ref_buffer_stack->gld_stack_size - 1; ++idx)
+          ref_buffer_stack->gld_stack[idx] =
+              ref_buffer_stack->gld_stack[idx + 1];
+        ref_buffer_stack->gld_stack[ref_buffer_stack->gld_stack_size - 1] =
+            INVALID_IDX;
+        --ref_buffer_stack->gld_stack_size;
+      }
+    }
+  }
+}
+
+// Update reference frame stack info.
+void av1_update_ref_frame_map(AV1_COMP *cpi,
+                              FRAME_UPDATE_TYPE frame_update_type,
+                              int show_existing_frame, int ref_map_index,
+                              RefBufferStack *ref_buffer_stack) {
   AV1_COMMON *const cm = &cpi->common;
+  // TODO(jingning): Consider the S-frame same as key frame for the
+  // reference frame tracking purpose. The logic might be better
+  // expressed than converting the frame update type.
+  if (frame_is_sframe(cm)) frame_update_type = KEY_FRAME;
 
-  // If check_frame_refs_short_signaling() decided to set
-  // frame_refs_short_signaling=1 then we update remapped_ref_idx[] here.  Every
-  // reference will still map to the same RefCntBuffer (through ref_frame_map[])
-  // after this, but that does not necessarily mean that remapped_ref_idx[] is
-  // unchanged.
-  if (cm->current_frame.frame_refs_short_signaling) {
-    const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
-    const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
-    av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_map_idx, gld_map_idx);
+  if (is_frame_droppable(&cpi->svc, &cpi->ext_flags)) return;
+
+  switch (frame_update_type) {
+    case KEY_FRAME:
+      if (show_existing_frame)
+        ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
+                                  &ref_buffer_stack->arf_stack_size);
+      stack_reset(ref_buffer_stack->lst_stack,
+                  &ref_buffer_stack->lst_stack_size);
+      stack_reset(ref_buffer_stack->gld_stack,
+                  &ref_buffer_stack->gld_stack_size);
+      stack_reset(ref_buffer_stack->arf_stack,
+                  &ref_buffer_stack->arf_stack_size);
+      stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size,
+                 ref_map_index);
+      break;
+    case GF_UPDATE:
+      update_arf_stack(ref_map_index, ref_buffer_stack);
+      stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size,
+                 ref_map_index);
+      // For nonrd_mode: update LAST as well on GF_UPDATE frame.
+      if (cpi->sf.rt_sf.use_nonrd_pick_mode)
+        stack_push(ref_buffer_stack->lst_stack,
+                   &ref_buffer_stack->lst_stack_size, ref_map_index);
+      break;
+    case LF_UPDATE:
+      update_arf_stack(ref_map_index, ref_buffer_stack);
+      stack_push(ref_buffer_stack->lst_stack, &ref_buffer_stack->lst_stack_size,
+                 ref_map_index);
+      break;
+    case ARF_UPDATE:
+    case INTNL_ARF_UPDATE:
+      update_arf_stack(ref_map_index, ref_buffer_stack);
+      stack_push(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size,
+                 ref_map_index);
+      break;
+    case OVERLAY_UPDATE:
+      ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
+                                &ref_buffer_stack->arf_stack_size);
+      stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size,
+                 ref_map_index);
+      break;
+    case INTNL_OVERLAY_UPDATE:
+      ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
+                                &ref_buffer_stack->arf_stack_size);
+      stack_push(ref_buffer_stack->lst_stack, &ref_buffer_stack->lst_stack_size,
+                 ref_map_index);
+      break;
+    default: assert(0 && "unknown type");
   }
-
-  // For shown keyframes and S-frames all buffers are refreshed, but we don't
-  // change any of the mapping.
-  if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
-      frame_is_sframe(cm)) {
-    return;
-  }
-
-  // Initialize the new reference map as a copy of the old one.
-  int new_map[REF_FRAMES];
-  memcpy(new_map, cm->remapped_ref_idx, sizeof(new_map));
-
-  // The reference management strategy is currently as follows.  See
-  // gop_structure.c for more details of the structure and DOI
-  // 10.1109/DCC.2018.00045 for a higher-level explanation
-  //
-  // * ALTREF_FRAME and GOLDEN_FRAME are kept separate from the other
-  //   references.  When we code an ALTREF it refreshes the ALTREF buffer.  When
-  //   we code an OVERLAY the old GOLDEN becomes the new ALTREF and the old
-  //   ALTREF (possibly refreshed by the OVERLAY) becomes the new GOLDEN.
-  // * LAST_FRAME, LAST2_FRAME, and LAST3_FRAME work like a FIFO.  When we code
-  //   a frame which does a last-frame update we pick a buffer to refresh and
-  //   then point the LAST_FRAME reference at it.  The old LAST_FRAME becomes
-  //   LAST2_FRAME and the old LAST2_FRAME becomes LAST3_FRAME.  The old
-  //   LAST3_FRAME is re-used somewhere else.
-  // * BWDREF, ALTREF2, and EXTREF act like a stack structure, so we can
-  //   "push" and "pop" internal alt-ref frames through the three references.
-  // * When we code a BRF or internal-ARF (they work the same in this
-  //   structure) we push it onto the bwdref stack.  Because we have a finite
-  //   number of buffers, we actually refresh EXTREF, the bottom of the stack,
-  //   and rotate the three references to make EXTREF the top.
-  // * When we code an INTNL_OVERLAY we refresh BWDREF, then pop it off of the
-  //   bwdref stack and push it into the last-frame FIFO.  The old LAST3
-  //   buffer gets pushed out of the last-frame FIFO and becomes the new
-  //   EXTREF, bottom of the bwdref stack.
-  // * LAST_BIPRED just acts like a LAST_FRAME.  The BWDREF will have an
-  //   INTNL_OVERLAY and so can do its own ref map update.
-  //
-  // Note that this function runs *after* a frame has been coded, so it does not
-  // affect reference assignment of the current frame, it only affects future
-  // frames.  This is why we refresh buffers using the old reference map before
-  // remapping them.
-  //
-  // show_existing_frames don't refresh any buffers or send the reference map to
-  // the decoder, but we can still update our reference map if we want to: the
-  // decoder will update its map next time we code a non-show-existing frame.
-
-  if (frame_update_type == OVERLAY_UPDATE) {
-    // We want the old golden-frame to become our new ARF so swap the
-    // references.  If cpi->preserve_arf_as_gld == 0 then we will refresh the
-    // old ARF before it becomes our new GF
-    assign_new_map(cm, new_map, ALTREF_FRAME, GOLDEN_FRAME);
-    assign_new_map(cm, new_map, GOLDEN_FRAME, ALTREF_FRAME);
-  } else if (frame_update_type == INTNL_OVERLAY_UPDATE &&
-             encode_show_existing_frame(cm)) {
-    // Note that because encode_show_existing_frame(cm) we don't refresh any
-    // buffers.
-    // Pop BWDREF (shown as current frame) from the bwdref stack and make it
-    // the new LAST_FRAME.
-    assign_new_map(cm, new_map, LAST_FRAME, BWDREF_FRAME);
-
-    // Progress the last-frame FIFO and the bwdref stack
-    assign_new_map(cm, new_map, LAST2_FRAME, LAST_FRAME);
-    assign_new_map(cm, new_map, LAST3_FRAME, LAST2_FRAME);
-    assign_new_map(cm, new_map, BWDREF_FRAME, ALTREF2_FRAME);
-    assign_new_map(cm, new_map, ALTREF2_FRAME, EXTREF_FRAME);
-    assign_new_map(cm, new_map, EXTREF_FRAME, LAST3_FRAME);
-  } else if (frame_update_type == INTNL_ARF_UPDATE &&
-             !cm->show_existing_frame) {
-    // We want to push the current frame onto the bwdref stack.  We refresh
-    // EXTREF (the old bottom of the stack) and rotate the references so it
-    // becomes BWDREF, the top of the stack.
-    assign_new_map(cm, new_map, BWDREF_FRAME, EXTREF_FRAME);
-    assign_new_map(cm, new_map, ALTREF2_FRAME, BWDREF_FRAME);
-    assign_new_map(cm, new_map, EXTREF_FRAME, ALTREF2_FRAME);
-  }
-
-  if ((frame_update_type == LF_UPDATE || frame_update_type == GF_UPDATE ||
-       frame_update_type == INTNL_OVERLAY_UPDATE) &&
-      !encode_show_existing_frame(cm) &&
-      (!cm->show_existing_frame || frame_update_type == INTNL_OVERLAY_UPDATE)) {
-    // A standard last-frame: we refresh the LAST3_FRAME buffer and then push it
-    // into the last-frame FIFO.
-    assign_new_map(cm, new_map, LAST3_FRAME, LAST2_FRAME);
-    assign_new_map(cm, new_map, LAST2_FRAME, LAST_FRAME);
-    assign_new_map(cm, new_map, LAST_FRAME, LAST3_FRAME);
-  }
-
-  memcpy(cm->remapped_ref_idx, new_map, sizeof(new_map));
-
-#if DUMP_REF_FRAME_IMAGES == 1
-  // Dump out all reference frame images.
-  dump_ref_frame_images(cpi);
-#endif  // DUMP_REF_FRAME_IMAGES
+  return;
 }
 
-static int get_refresh_frame_flags(const AV1_COMP *const cpi,
-                                   const EncodeFrameParams *const frame_params,
-                                   FRAME_UPDATE_TYPE frame_update_type) {
-  const AV1_COMMON *const cm = &cpi->common;
+static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) {
+  for (int idx = 0; idx < REF_FRAMES; ++idx) {
+    int is_free = 1;
+    for (int i = 0; i < ref_buffer_stack->arf_stack_size; ++i) {
+      if (ref_buffer_stack->arf_stack[i] == idx) {
+        is_free = 0;
+        break;
+      }
+    }
 
+    for (int i = 0; i < ref_buffer_stack->lst_stack_size; ++i) {
+      if (ref_buffer_stack->lst_stack[i] == idx) {
+        is_free = 0;
+        break;
+      }
+    }
+
+    for (int i = 0; i < ref_buffer_stack->gld_stack_size; ++i) {
+      if (ref_buffer_stack->gld_stack[i] == idx) {
+        is_free = 0;
+        break;
+      }
+    }
+
+    if (is_free) return idx;
+  }
+  return INVALID_IDX;
+}
+
+int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
+                                const EncodeFrameParams *const frame_params,
+                                FRAME_UPDATE_TYPE frame_update_type,
+                                const RefBufferStack *const ref_buffer_stack) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const ExternalFlags *const ext_flags = &cpi->ext_flags;
+  const SVC *const svc = &cpi->svc;
   // Switch frames and shown key-frames overwrite all reference slots
   if ((frame_params->frame_type == KEY_FRAME && frame_params->show_frame) ||
       frame_params->frame_type == S_FRAME)
@@ -857,99 +742,302 @@
     return 0;
   }
 
+  if (is_frame_droppable(svc, ext_flags)) return 0;
+
   int refresh_mask = 0;
 
-  if (cpi->ext_refresh_frame_flags_pending) {
+  if (ext_flags->refresh_frame_flags_pending) {
+    if (svc->external_ref_frame_config) {
+      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+        int ref_frame_map_idx = svc->ref_idx[i];
+        refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx;
+      }
+      return refresh_mask;
+    }
     // Unfortunately the encoder interface reflects the old refresh_*_frame
     // flags so we have to replicate the old refresh_frame_flags logic here in
     // order to preserve the behaviour of the flag overrides.
-    refresh_mask |= cpi->ext_refresh_last_frame
-                    << get_ref_frame_map_idx(cm, LAST3_FRAME);
-    refresh_mask |= cpi->ext_refresh_bwd_ref_frame
-                    << get_ref_frame_map_idx(cm, EXTREF_FRAME);
-    refresh_mask |= cpi->ext_refresh_alt2_ref_frame
-                    << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+    int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+    if (ref_frame_map_idx != INVALID_IDX)
+      refresh_mask |= ext_flags->refresh_last_frame << ref_frame_map_idx;
+
+    ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME);
+    if (ref_frame_map_idx != INVALID_IDX)
+      refresh_mask |= ext_flags->refresh_bwd_ref_frame << ref_frame_map_idx;
+
+    ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+    if (ref_frame_map_idx != INVALID_IDX)
+      refresh_mask |= ext_flags->refresh_alt2_ref_frame << ref_frame_map_idx;
+
     if (frame_update_type == OVERLAY_UPDATE) {
-      if (!cpi->preserve_arf_as_gld) {
-        refresh_mask |= cpi->ext_refresh_golden_frame
-                        << get_ref_frame_map_idx(cm, ALTREF_FRAME);
-      }
+      ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
+      if (ref_frame_map_idx != INVALID_IDX)
+        refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx;
     } else {
-      refresh_mask |= cpi->ext_refresh_golden_frame
-                      << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
-      refresh_mask |= cpi->ext_refresh_alt_ref_frame
-                      << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+      ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+      if (ref_frame_map_idx != INVALID_IDX)
+        refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx;
+
+      ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
+      if (ref_frame_map_idx != INVALID_IDX)
+        refresh_mask |= ext_flags->refresh_alt_ref_frame << ref_frame_map_idx;
     }
     return refresh_mask;
   }
 
-  // See update_ref_frame_map() for a thorough description of the reference
-  // buffer management strategy currently in use.  This function just decides
-  // which buffers should be refreshed.
-
+  // Search for the open slot to store the current frame.
+  int free_fb_index = get_free_ref_map_index(ref_buffer_stack);
   switch (frame_update_type) {
     case KF_UPDATE:
-      // Note that a real shown key-frame or S-frame refreshes every buffer,
-      // handled in a special case above.  This case is for frames which aren't
-      // really a shown key-frame or S-frame but want to refresh all the
-      // important buffers.
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, EXTREF_FRAME);
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+    case GF_UPDATE:
+      if (free_fb_index != INVALID_IDX) {
+        refresh_mask = 1 << free_fb_index;
+      } else {
+        if (ref_buffer_stack->gld_stack_size)
+          refresh_mask =
+              1 << ref_buffer_stack
+                       ->gld_stack[ref_buffer_stack->gld_stack_size - 1];
+        else
+          refresh_mask =
+              1 << ref_buffer_stack
+                       ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
+      }
       break;
     case LF_UPDATE:
-      // Refresh LAST3, which becomes the new LAST while LAST becomes LAST2
-      // and LAST2 becomes the new LAST3 (like a FIFO but circular)
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
-      break;
-    case GF_UPDATE:
-      // In addition to refreshing the GF buffer, we refresh LAST3 and push it
-      // into the last-frame FIFO.
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
-      break;
-    case OVERLAY_UPDATE:
-      if (!cpi->preserve_arf_as_gld) {
-        // The result of our OVERLAY should become the GOLDEN_FRAME but we'd
-        // like to keep the old GOLDEN as our new ALTREF.  So we refresh the
-        // ALTREF and swap around the ALTREF and GOLDEN references.
-        refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+      if (free_fb_index != INVALID_IDX) {
+        refresh_mask = 1 << free_fb_index;
+      } else {
+        if (ref_buffer_stack->lst_stack_size >= 2)
+          refresh_mask =
+              1 << ref_buffer_stack
+                       ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
+        else if (ref_buffer_stack->gld_stack_size >= 2)
+          refresh_mask =
+              1 << ref_buffer_stack
+                       ->gld_stack[ref_buffer_stack->gld_stack_size - 1];
+        else
+          assert(0 && "No ref map index found");
       }
       break;
     case ARF_UPDATE:
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
-      break;
-    case INTNL_OVERLAY_UPDATE:
-      // INTNL_OVERLAY may be a show_existing_frame in which case we don't
-      // refresh anything and the BWDREF or ALTREF2 being shown becomes the new
-      // LAST_FRAME.  But, if it's not a show_existing_frame, then we update as
-      // though it's a normal LF_UPDATE: we refresh LAST3 and
-      // update_ref_frame_map() makes that the new LAST_FRAME.
-      refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
-      break;
-    case INTNL_ARF_UPDATE:
-      if (cpi->oxcf.pass == 2) {
-        // Push the new ARF2 onto the bwdref stack.  We refresh EXTREF which is
-        // at the bottom of the stack then move it to the top.
-        refresh_mask |= 1 << get_ref_frame_map_idx(cm, EXTREF_FRAME);
+      if (free_fb_index != INVALID_IDX) {
+        refresh_mask = 1 << free_fb_index;
       } else {
-        // ARF2 just gets stored in the ARF2 slot, no reference map change.
-        refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+        if (ref_buffer_stack->gld_stack_size >= 3)
+          refresh_mask =
+              1 << ref_buffer_stack
+                       ->gld_stack[ref_buffer_stack->gld_stack_size - 1];
+        else if (ref_buffer_stack->lst_stack_size >= 2)
+          refresh_mask =
+              1 << ref_buffer_stack
+                       ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
+        else
+          assert(0 && "No ref map index found");
       }
       break;
+    case INTNL_ARF_UPDATE:
+      if (free_fb_index != INVALID_IDX) {
+        refresh_mask = 1 << free_fb_index;
+      } else {
+        refresh_mask =
+            1 << ref_buffer_stack
+                     ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
+      }
+      break;
+    case OVERLAY_UPDATE: break;
+    case INTNL_OVERLAY_UPDATE: break;
     default: assert(0); break;
   }
+
   return refresh_mask;
 }
 
+#if !CONFIG_REALTIME_ONLY
+void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params.sb_size);
+
+  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
+                         cm->seq_params.subsampling_y, num_planes);
+
+  set_mi_offsets(&cm->mi_params, xd, 0, 0);
+}
+
+// Apply temporal filtering to key frames and encode the filtered frame.
+// If the current frame is not key frame, this function is identical to
+// av1_encode().
+static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
+                              EncodeFrameInput *const frame_input,
+                              EncodeFrameParams *const frame_params,
+                              EncodeFrameResults *const frame_results) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Decide whether to apply temporal filtering to the source frame.
+  int apply_filtering =
+      frame_params->frame_type == KEY_FRAME &&
+      oxcf->enable_keyframe_filtering && !is_stat_generation_stage(cpi) &&
+      !frame_params->show_existing_frame &&
+      cpi->rc.frames_to_key > TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME &&
+      !is_lossless_requested(oxcf) && oxcf->arnr_max_frames > 0;
+  if (apply_filtering) {
+    const double y_noise_level = av1_estimate_noise_from_single_plane(
+        frame_input->source, 0, cm->seq_params.bit_depth);
+    apply_filtering = y_noise_level > 0;
+  }
+
+  // Save the pointer to the original source image.
+  YV12_BUFFER_CONFIG *source_kf_buffer = frame_input->source;
+
+  // Apply filtering to key frame.
+  if (apply_filtering) {
+    // Initialization for frame motion estimation.
+    MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+    av1_init_mi_buffers(&cm->mi_params);
+    setup_mi(cpi, frame_input->source);
+    av1_init_macroblockd(cm, xd, NULL);
+    memset(
+        cpi->mbmi_ext_info.frame_base, 0,
+        cpi->mbmi_ext_info.alloc_size * sizeof(*cpi->mbmi_ext_info.frame_base));
+
+    av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+    av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+    av1_set_rd_speed_thresholds(cpi);
+    av1_setup_frame_buf_refs(cm);
+    av1_setup_frame_sign_bias(cm);
+    av1_frame_init_quantizer(cpi);
+    av1_setup_past_independence(cm);
+
+    if (!frame_params->show_frame) {
+      int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass);
+      av1_temporal_filter(cpi, -1 * arf_src_index, NULL);
+    } else {
+      av1_temporal_filter(cpi, -1, NULL);
+    }
+    aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
+    // Use the filtered frame for encoding.
+    frame_input->source = &cpi->alt_ref_buffer;
+    // Copy metadata info to alt-ref buffer.
+    aom_remove_metadata_from_frame_buffer(frame_input->source);
+    aom_copy_metadata_to_frame_buffer(frame_input->source,
+                                      source_kf_buffer->metadata);
+
+    if (oxcf->enable_tpl_model && oxcf->lag_in_frames > 0 &&
+        frame_params->show_frame) {
+      av1_tpl_setup_stats(cpi, 0, frame_params, frame_input);
+    }
+  }
+
+  if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
+      AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+
+  // Set frame_input source to true source for psnr calculation.
+  if (apply_filtering) {
+    cpi->source = source_kf_buffer;
+    cpi->unscaled_source = source_kf_buffer;
+  }
+
+  return AOM_CODEC_OK;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE int find_unused_ref_frame(const int *used_ref_frames,
+                                        const int *stack, int stack_size) {
+  for (int i = 0; i < stack_size; ++i) {
+    const int this_ref = stack[i];
+    int ref_idx = 0;
+    for (ref_idx = 0; ref_idx <= ALTREF_FRAME - LAST_FRAME; ++ref_idx) {
+      if (this_ref == used_ref_frames[ref_idx]) break;
+    }
+
+    // not in use
+    if (ref_idx > ALTREF_FRAME - LAST_FRAME) return this_ref;
+  }
+
+  return INVALID_IDX;
+}
+
+void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) {
+  AV1_COMMON *cm = &cpi->common;
+  int *const remapped_ref_idx = cm->remapped_ref_idx;
+  int *const arf_stack = ref_buffer_stack->arf_stack;
+  int *const lst_stack = ref_buffer_stack->lst_stack;
+  int *const gld_stack = ref_buffer_stack->gld_stack;
+  const int arf_stack_size = ref_buffer_stack->arf_stack_size;
+  const int lst_stack_size = ref_buffer_stack->lst_stack_size;
+  const int gld_stack_size = ref_buffer_stack->gld_stack_size;
+
+  // Initialization
+  for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX;
+
+  if (arf_stack_size) {
+    remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] = arf_stack[arf_stack_size - 1];
+
+    if (arf_stack_size > 1)
+      remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = arf_stack[0];
+
+    if (arf_stack_size > 2)
+      remapped_ref_idx[ALTREF2_FRAME - LAST_FRAME] = arf_stack[1];
+  }
+
+  if (lst_stack_size) {
+    remapped_ref_idx[LAST_FRAME - LAST_FRAME] = lst_stack[0];
+
+    if (lst_stack_size > 1)
+      remapped_ref_idx[LAST2_FRAME - LAST_FRAME] = lst_stack[1];
+  }
+
+  if (gld_stack_size) {
+    remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] = gld_stack[0];
+
+    if (gld_stack_size > 1) {
+      if (arf_stack_size <= 1)
+        remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = gld_stack[1];
+      else
+        remapped_ref_idx[LAST3_FRAME - LAST_FRAME] = gld_stack[1];
+    }
+  }
+
+  for (int idx = ALTREF_FRAME - LAST_FRAME; idx >= 0; --idx) {
+    int ref_map_index = remapped_ref_idx[idx];
+
+    if (ref_map_index != INVALID_IDX) continue;
+
+    ref_map_index =
+        find_unused_ref_frame(remapped_ref_idx, arf_stack, arf_stack_size);
+
+    if (ref_map_index == INVALID_IDX) {
+      ref_map_index =
+          find_unused_ref_frame(remapped_ref_idx, gld_stack, gld_stack_size);
+    }
+
+    if (ref_map_index == INVALID_IDX) {
+      ref_map_index =
+          find_unused_ref_frame(remapped_ref_idx, lst_stack, lst_stack_size);
+    }
+
+    if (ref_map_index != INVALID_IDX)
+      remapped_ref_idx[idx] = ref_map_index;
+    else
+      remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0];
+  }
+}
+
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
                         uint8_t *const dest, unsigned int *frame_flags,
                         int64_t *const time_stamp, int64_t *const time_end,
-                        const aom_rational_t *const timebase, int flush) {
+                        const aom_rational64_t *const timestamp_ratio,
+                        int flush) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->gf_group;
+  ExternalFlags *const ext_flags = &cpi->ext_flags;
 
   EncodeFrameInput frame_input;
   EncodeFrameParams frame_params;
@@ -958,66 +1046,82 @@
   memset(&frame_params, 0, sizeof(frame_params));
   memset(&frame_results, 0, sizeof(frame_results));
 
-  if (oxcf->pass == 0 || oxcf->pass == 2) {
-    check_show_existing_frame(cpi, &frame_params);
+  // TODO(sarahparker) finish bit allocation for one pass pyramid
+  if (has_no_stats_stage(cpi) && oxcf->rc_mode != AOM_Q) {
+    cpi->oxcf.gf_max_pyr_height =
+        AOMMIN(cpi->oxcf.gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS);
+    cpi->oxcf.gf_min_pyr_height =
+        AOMMIN(cpi->oxcf.gf_min_pyr_height, cpi->oxcf.gf_max_pyr_height);
+  }
+
+  if (!is_stat_generation_stage(cpi)) {
+    // If this is a forward keyframe, mark as a show_existing_frame
+    if (cpi->oxcf.fwd_kf_enabled && (gf_group->index == gf_group->size) &&
+        gf_group->update_type[1] == ARF_UPDATE && cpi->rc.frames_to_key == 0) {
+      frame_params.show_existing_frame = 1;
+    } else {
+      frame_params.show_existing_frame =
+          ((oxcf->enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames ||
+            cpi->show_existing_alt_ref) &&
+           gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) ||
+          gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
+    }
     frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags);
+
+    // Reset show_existing_alt_ref decision to 0 after it is used.
+    if (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) {
+      cpi->show_existing_alt_ref = 0;
+    }
   } else {
     frame_params.show_existing_frame = 0;
   }
 
-  int temporal_filtered = 0;
+  int code_arf = 0;
   struct lookahead_entry *source = NULL;
   struct lookahead_entry *last_source = NULL;
-  FRAME_UPDATE_TYPE frame_update_type;
   if (frame_params.show_existing_frame) {
-    source = av1_lookahead_pop(cpi->lookahead, flush);
-    frame_update_type = LF_UPDATE;
+    source = av1_lookahead_pop(cpi->lookahead, flush, cpi->compressor_stage);
+    frame_params.show_frame = 1;
   } else {
-    source = choose_frame_source(cpi, &temporal_filtered, &flush, &last_source,
-                                 &frame_update_type, &frame_params);
-  }
-
-  // In pass 2 we get the frame_update_type from gf_group
-  if (oxcf->pass == 2) {
-    frame_update_type =
-        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+    int show_existing_alt_ref = 0;
+    source = choose_frame_source(cpi, &code_arf, &flush, &last_source,
+                                 &frame_params, &show_existing_alt_ref);
+    if (gf_group->update_type[gf_group->index] == ARF_UPDATE)
+      cpi->show_existing_alt_ref = show_existing_alt_ref;
   }
 
   if (source == NULL) {  // If no source was found, we can't encode a frame.
+#if !CONFIG_REALTIME_ONLY
     if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
       av1_end_first_pass(cpi); /* get last stats packet */
       cpi->twopass.first_pass_done = 1;
     }
+#endif
     return -1;
   }
 
-  frame_input.source = temporal_filtered ? &cpi->alt_ref_buffer : &source->img;
+  frame_input.source = code_arf ? &cpi->alt_ref_buffer : &source->img;
   frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
   frame_input.ts_duration = source->ts_end - source->ts_start;
+  // Save unfiltered source. It is used in av1_get_second_pass_params().
+  cpi->unfiltered_source = frame_input.source;
 
   *time_stamp = source->ts_start;
   *time_end = source->ts_end;
-  if (source->ts_start < cpi->first_time_stamp_ever) {
-    cpi->first_time_stamp_ever = source->ts_start;
-    cpi->last_end_time_stamp_seen = source->ts_start;
+  if (source->ts_start < cpi->time_stamps.first_ever) {
+    cpi->time_stamps.first_ever = source->ts_start;
+    cpi->time_stamps.prev_end_seen = source->ts_start;
   }
 
   av1_apply_encoding_flags(cpi, source->flags);
   if (!frame_params.show_existing_frame)
     *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
-  const int is_overlay = frame_params.show_existing_frame &&
-                         (frame_update_type == OVERLAY_UPDATE ||
-                          frame_update_type == INTNL_OVERLAY_UPDATE);
-  if (frame_params.show_frame || is_overlay) {
-    // Shown frames and arf-overlay frames need frame-rate considering
-    adjust_frame_rate(cpi, source);
-  }
+  // Shown frames and arf-overlay frames need frame-rate considering
+  if (frame_params.show_frame)
+    adjust_frame_rate(cpi, source->ts_start, source->ts_end);
 
-  if (frame_params.show_existing_frame) {
-    // show_existing_frame implies this frame is shown!
-    frame_params.show_frame = 1;
-  } else {
+  if (!frame_params.show_existing_frame) {
     if (cpi->film_grain_table) {
       cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup(
           cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
@@ -1027,17 +1131,21 @@
           cm->seq_params.film_grain_params_present;
     }
     // only one operating point supported now
-    const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
+    const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp);
     if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
-    cpi->common.frame_presentation_time = (uint32_t)pts64;
+    cm->frame_presentation_time = (uint32_t)pts64;
   }
 
-  if (oxcf->pass == 2 && (!frame_params.show_existing_frame || is_overlay)) {
-    // GF_GROUP needs updating for arf overlays as well as non-show-existing
-    av1_get_second_pass_params(cpi, &frame_params, *frame_flags);
-    frame_update_type =
-        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
-  }
+#if CONFIG_REALTIME_ONLY
+  av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags);
+#else
+  if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME &&
+      oxcf->lag_in_frames == 0)
+    av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags);
+  else if (!is_stat_generation_stage(cpi))
+    av1_get_second_pass_params(cpi, &frame_params, &frame_input, *frame_flags);
+#endif
+  FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group);
 
   if (frame_params.show_existing_frame &&
       frame_params.frame_type != KEY_FRAME) {
@@ -1056,27 +1164,10 @@
   // parameter should be used with caution.
   frame_params.speed = oxcf->speed;
 
-  if (!frame_params.show_existing_frame) {
-    cm->using_qmatrix = cpi->oxcf.using_qm;
-    cm->min_qmlevel = cpi->oxcf.qm_minlevel;
-    cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
-    if (cpi->twopass.gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
-      av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0);
-      av1_set_frame_size(cpi, cm->width, cm->height);
-      av1_tpl_setup_stats(cpi, &frame_input);
-    }
-  }
-
   // Work out some encoding parameters specific to the pass:
-  if (oxcf->pass == 0) {
-    if (cpi->oxcf.rc_mode == AOM_CBR) {
-      av1_rc_get_one_pass_cbr_params(cpi, &frame_update_type, &frame_params,
-                                     *frame_flags);
-    } else {
-      av1_rc_get_one_pass_vbr_params(cpi, &frame_update_type, &frame_params,
-                                     *frame_flags);
-    }
-  } else if (oxcf->pass == 1) {
+  if (has_no_stats_stage(cpi) && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    av1_cyclic_refresh_update_parameters(cpi);
+  } else if (is_stat_generation_stage(cpi)) {
     cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf);
     const int kf_requested = (cm->current_frame.frame_number == 0 ||
                               (*frame_flags & FRAMEFLAGS_KEY));
@@ -1086,7 +1177,7 @@
     } else {
       frame_params.frame_type = INTER_FRAME;
     }
-  } else if (oxcf->pass == 2) {
+  } else if (is_stat_consumption_stage(cpi)) {
 #if CONFIG_MISMATCH_DEBUG
     mismatch_move_frame_idx_w();
 #endif
@@ -1096,7 +1187,8 @@
 #endif
   }
 
-  if (oxcf->pass == 0 || oxcf->pass == 2) set_ext_overrides(cpi, &frame_params);
+  if (!is_stat_generation_stage(cpi))
+    set_ext_overrides(cm, &frame_params, ext_flags);
 
   // Shown keyframes and S frames refresh all reference buffers
   const int force_refresh_all =
@@ -1107,17 +1199,39 @@
   av1_configure_buffer_updates(cpi, &frame_params, frame_update_type,
                                force_refresh_all);
 
-  if (oxcf->pass == 0 || oxcf->pass == 2) {
+  if (!is_stat_generation_stage(cpi)) {
+    const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME];
+    const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME];
+
+    if (!ext_flags->refresh_frame_flags_pending) {
+      av1_get_ref_frames(cpi, &cpi->ref_buffer_stack);
+    } else if (cpi->svc.external_ref_frame_config) {
+      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
+        cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i];
+    }
+
+    // Get the reference frames
+    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+      ref_frames[i] = get_ref_frame_buf(cm, ref_frame_priority_order[i]);
+      ref_frame_buf[i] = ref_frames[i] != NULL ? &ref_frames[i]->buf : NULL;
+    }
     // Work out which reference frame slots may be used.
-    frame_params.ref_frame_flags = get_ref_frame_flags(cpi);
+    frame_params.ref_frame_flags = get_ref_frame_flags(
+        &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags);
 
     frame_params.primary_ref_frame =
         choose_primary_ref_frame(cpi, &frame_params);
-    frame_params.order_offset =
-        get_order_offset(&cpi->twopass.gf_group, &frame_params);
+    frame_params.order_offset = get_order_offset(&cpi->gf_group, &frame_params);
 
-    frame_params.refresh_frame_flags =
-        get_refresh_frame_flags(cpi, &frame_params, frame_update_type);
+    frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
+        cpi, &frame_params, frame_update_type, &cpi->ref_buffer_stack);
+
+    frame_params.existing_fb_idx_to_show =
+        frame_params.show_existing_frame
+            ? (frame_update_type == INTNL_OVERLAY_UPDATE
+                   ? get_ref_frame_map_idx(cm, BWDREF_FRAME)
+                   : get_ref_frame_map_idx(cm, ALTREF_FRAME))
+            : INVALID_IDX;
   }
 
   // The way frame_params->remapped_ref_idx is setup is a placeholder.
@@ -1131,19 +1245,50 @@
   memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
          REF_FRAMES * sizeof(*cm->remapped_ref_idx));
 
+  cpi->td.mb.e_mbd.delta_qindex = 0;
+
+  if (!frame_params.show_existing_frame) {
+    cm->quant_params.using_qmatrix = cpi->oxcf.using_qm;
+#if !CONFIG_REALTIME_ONLY
+    if (oxcf->lag_in_frames > 0 && !is_stat_generation_stage(cpi)) {
+      if (cpi->gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
+        av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0);
+        av1_set_frame_size(cpi, cm->width, cm->height);
+        av1_tpl_setup_stats(cpi, 0, &frame_params, &frame_input);
+        assert(cpi->num_gf_group_show_frames == 1);
+      }
+    }
+#endif
+  }
+
+#if CONFIG_REALTIME_ONLY
   if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
       AOM_CODEC_OK) {
     return AOM_CODEC_ERROR;
   }
+#else
+  if (denoise_and_encode(cpi, dest, &frame_input, &frame_params,
+                         &frame_results) != AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+#endif  // CONFIG_REALTIME_ONLY
+  if (!is_stat_generation_stage(cpi))
+    cpi->num_gf_group_show_frames += frame_params.show_frame;
 
-  if (oxcf->pass == 0 || oxcf->pass == 2) {
+  if (!is_stat_generation_stage(cpi)) {
     // First pass doesn't modify reference buffer assignment or produce frame
     // flags
     update_frame_flags(cpi, frame_flags);
-    update_ref_frame_map(cpi, frame_update_type);
+    if (!ext_flags->refresh_frame_flags_pending) {
+      int ref_map_index =
+          av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags);
+      av1_update_ref_frame_map(cpi, frame_update_type, cm->show_existing_frame,
+                               ref_map_index, &cpi->ref_buffer_stack);
+    }
   }
 
-  if (oxcf->pass == 2) {
+#if !CONFIG_REALTIME_ONLY
+  if (!is_stat_generation_stage(cpi)) {
 #if TXCOEFF_COST_TIMER
     cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
     fprintf(stderr,
@@ -1154,8 +1299,9 @@
 #endif
     av1_twopass_postencode_update(cpi);
   }
+#endif  // !CONFIG_REALTIME_ONLY
 
-  if (oxcf->pass == 0 || oxcf->pass == 2) {
+  if (!is_stat_generation_stage(cpi)) {
     update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type);
     set_additional_frame_flags(cm, frame_flags);
     update_rc_counts(cpi);
@@ -1166,8 +1312,10 @@
 
   // Leave a signal for a higher level caller about if this frame is droppable
   if (*size > 0) {
-    cpi->droppable = is_frame_droppable(cpi);
+    cpi->droppable = is_frame_droppable(&cpi->svc, ext_flags);
   }
 
+  if (cpi->use_svc) av1_save_layer_context(cpi);
+
   return AOM_CODEC_OK;
 }
diff --git a/libaom/av1/encoder/encode_strategy.h b/libaom/av1/encoder/encode_strategy.h
index 6830e44..b05224b 100644
--- a/libaom/av1/encoder/encode_strategy.h
+++ b/libaom/av1/encoder/encode_strategy.h
@@ -29,7 +29,8 @@
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
                         uint8_t *const dest, unsigned int *frame_flags,
                         int64_t *const time_stamp, int64_t *const time_end,
-                        const aom_rational_t *const timebase, int flush);
+                        const aom_rational64_t *const timestamp_ratio,
+                        int flush);
 
 // Set individual buffer update flags based on frame reference type.
 // force_refresh_all is used when we have a KEY_FRAME or S_FRAME.  It forces all
@@ -39,6 +40,23 @@
                                   const FRAME_UPDATE_TYPE type,
                                   int force_refresh_all);
 
+int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
+                                const EncodeFrameParams *const frame_params,
+                                FRAME_UPDATE_TYPE frame_update_type,
+                                const RefBufferStack *const ref_buffer_stack);
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
+
+void av1_update_ref_frame_map(AV1_COMP *cpi,
+                              FRAME_UPDATE_TYPE frame_update_type,
+                              int show_existing_frame, int ref_map_index,
+                              RefBufferStack *ref_buffer_stack);
+
+void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack);
+
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/encodeframe.c b/libaom/av1/encoder/encodeframe.c
index 2952184..53b47d4 100644
--- a/libaom/av1/encoder/encodeframe.c
+++ b/libaom/av1/encoder/encodeframe.c
@@ -47,6 +47,7 @@
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
+#include "av1/encoder/corner_detect.h"
 #include "av1/encoder/global_motion.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemb.h"
@@ -55,23 +56,27 @@
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/ml.h"
+#include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
+#if !CONFIG_REALTIME_ONLY
 #include "av1/encoder/partition_model_weights.h"
+#endif
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
 #include "av1/encoder/var_based_part.h"
 
-static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
-                              ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize,
-                              int *rate);
-static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                               const MACROBLOCK *const x,
-                               const RD_STATS *const rd_stats,
-                               unsigned int pb_source_variance);
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
+                                         TileDataEnc *tile_data, ThreadData *td,
+                                         TOKENEXTRA **t, RUN_TYPE dry_run,
+                                         BLOCK_SIZE bsize, int *rate);
 
 // This is used as a reference when computing the source variance for the
 //  purposes of activity masking.
@@ -142,6 +147,43 @@
   128 * 16, 128 * 16
 };
 
+typedef struct {
+  ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+  ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
+  PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+  PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+  TXFM_CONTEXT *p_ta;
+  TXFM_CONTEXT *p_tl;
+  TXFM_CONTEXT ta[MAX_MIB_SIZE];
+  TXFM_CONTEXT tl[MAX_MIB_SIZE];
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
+
+enum {
+  SB_SINGLE_PASS,  // Single pass encoding: all ctxs get updated normally
+  SB_DRY_PASS,     // First pass of multi-pass: does not update the ctxs
+  SB_WET_PASS      // Second pass of multi-pass: finalize and update the ctx
+} UENUM1BYTE(SB_MULTI_PASS_MODE);
+
+// This struct is used to store the statistics used by sb-level multi-pass
+// encoding. Currently, this is only used to make a copy of the state before we
+// perform the first pass
+typedef struct SB_FIRST_PASS_STATS {
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_COUNTS rd_count;
+
+  int split_count;
+  FRAME_COUNTS fc;
+  InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+  int current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+#endif  // CONFIG_INTERNAL_STATS
+} SB_FIRST_PASS_STATS;
+
 unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
                                            const struct buf_2d *ref,
                                            BLOCK_SIZE bs) {
@@ -155,24 +197,14 @@
                                                 const struct buf_2d *ref,
                                                 BLOCK_SIZE bs, int bd) {
   unsigned int var, sse;
-  switch (bd) {
-    case 10:
-      var =
-          cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                             CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10), 0, &sse);
-      break;
-    case 12:
-      var =
-          cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                             CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12), 0, &sse);
-      break;
-    case 8:
-    default:
-      var =
-          cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                             CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8), 0, &sse);
-      break;
-  }
+  assert(bd == 8 || bd == 10 || bd == 12);
+  const int off_index = (bd - 8) >> 1;
+  const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8,
+                                       AV1_HIGH_VAR_OFFS_10,
+                                       AV1_HIGH_VAR_OFFS_12 };
+  var =
+      cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                         CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
@@ -206,20 +238,167 @@
     return BLOCK_8X8;
 }
 
-static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
-                                           const TileInfo *const tile,
-                                           MACROBLOCK *const x, int mi_row,
-                                           int mi_col, BLOCK_SIZE bsize) {
+static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+  return av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex +
+                                      quant_params->y_dc_delta_q);
+}
+
+static AOM_INLINE void set_ssim_rdmult(const AV1_COMP *const cpi,
+                                       MACROBLOCK *const x,
+                                       const BLOCK_SIZE bsize, const int mi_row,
+                                       const int mi_col, int *const rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  int row, col;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 0.0;
+
+  assert(cpi->oxcf.tuning == AOM_TUNE_SSIM);
+
+  aom_clear_system_state();
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col / num_mi_h;
+         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->ssim_rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+  *rdmult = AOMMAX(*rdmult, 0);
+  set_error_per_bit(x, *rdmult);
+  aom_clear_system_state();
+}
+
+static int get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               const BLOCK_SIZE bsize, const int mi_row,
+                               const int mi_col, int orig_rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int tpl_idx = cpi->gf_group.index;
+  const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int deltaq_rdmult = set_deltaq_rdmult(cpi, xd);
+  if (tpl_frame->is_valid == 0) return deltaq_rdmult;
+  if (!is_frame_tpl_eligible((AV1_COMP *)cpi)) return deltaq_rdmult;
+  if (tpl_idx >= MAX_LAG_BUFFERS) return deltaq_rdmult;
+  if (cpi->superres_mode != SUPERRES_NONE) return deltaq_rdmult;
+  if (cpi->oxcf.aq_mode != NO_AQ) return deltaq_rdmult;
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  int row, col;
+  double base_block_count = 0.0;
+  double geom_mean_of_scale = 0.0;
+  aom_clear_system_state();
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col / num_mi_h;
+         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]);
+      base_block_count += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count);
+  int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
+  rdmult = AOMMAX(rdmult, 0);
+  set_error_per_bit(x, rdmult);
+  aom_clear_system_state();
+  if (bsize == cm->seq_params.sb_size) {
+    const int rdmult_sb = set_deltaq_rdmult(cpi, xd);
+    assert(rdmult_sb == rdmult);
+    (void)rdmult_sb;
+  }
+  return rdmult;
+}
+
+static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              int8_t segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  av1_init_plane_quantizers(cpi, x, segment_id);
+  aom_clear_system_state();
+  const int segment_qindex =
+      av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+  return av1_compute_rd_mult(cpi,
+                             segment_qindex + cm->quant_params.y_dc_delta_q);
+}
+
+static AOM_INLINE void setup_block_rdmult(const AV1_COMP *const cpi,
+                                          MACROBLOCK *const x, int mi_row,
+                                          int mi_col, BLOCK_SIZE bsize,
+                                          AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
+  x->rdmult = cpi->rd.RDMULT;
+
+  if (aq_mode != NO_AQ) {
+    assert(mbmi != NULL);
+    if (aq_mode == VARIANCE_AQ) {
+      if (cpi->vaq_refresh) {
+        const int energy = bsize <= BLOCK_16X16
+                               ? x->mb_energy
+                               : av1_log_block_var(cpi, x, bsize);
+        mbmi->segment_id = energy;
+      }
+      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+    } else if (aq_mode == COMPLEXITY_AQ) {
+      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+    } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+      // If segment is boosted, use rdmult for that segment.
+      if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+        x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+    }
+  }
+
+  const AV1_COMMON *const cm = &cpi->common;
+  if (cm->delta_q_info.delta_q_present_flag &&
+      !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    x->rdmult = get_hier_tpl_rdmult(cpi, x, bsize, mi_row, mi_col, x->rdmult);
+  }
+
+  if (cpi->oxcf.tuning == AOM_TUNE_SSIM) {
+    set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+  }
+#if CONFIG_TUNE_VMAF
+  if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+      cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+    av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+  }
+#endif
+}
+
+static AOM_INLINE void set_offsets_without_segment_id(
+    const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x,
+    int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
+  assert(bsize < BLOCK_SIZES_ALL);
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
 
-  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
 
-  set_skip_context(xd, mi_row, mi_col, num_planes);
-  xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col;
+  set_entropy_context(xd, mi_row, mi_col, num_planes);
+  xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
@@ -229,35 +408,27 @@
 
   // Set up limit values for MV components.
   // Mv beyond the range do not produce new/different prediction block.
-  x->mv_limits.row_min =
-      -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
-  x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
-  x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
-  x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+  av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+                    mi_width, cpi->oxcf.border_in_pixels);
 
   set_plane_n4(xd, mi_width, mi_height, num_planes);
 
   // Set up distance of MB to edge of frame in 1/8th pel units.
   assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
-  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
-                 cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
 
   // Set up source buffers.
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
 
-  // R/D setup.
-  x->rdmult = cpi->rd.RDMULT;
-
   // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
   xd->tile = *tile;
-
-  xd->cfl.mi_row = mi_row;
-  xd->cfl.mi_col = mi_col;
 }
 
-static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
-                        MACROBLOCK *const x, int mi_row, int mi_col,
-                        BLOCK_SIZE bsize) {
+static AOM_INLINE void set_offsets(const AV1_COMP *const cpi,
+                                   const TileInfo *const tile,
+                                   MACROBLOCK *const x, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
   const struct segmentation *const seg = &cm->seg;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -271,33 +442,40 @@
   if (seg->enabled) {
     if (seg->enabled && !cpi->vaq_refresh) {
       const uint8_t *const map =
-          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
       mbmi->segment_id =
-          map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0;
+          map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0;
     }
     av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
   }
 }
 
-static void update_filter_type_count(uint8_t allow_update_cdf,
-                                     FRAME_COUNTS *counts,
-                                     const MACROBLOCKD *xd,
-                                     const MB_MODE_INFO *mbmi) {
+static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
+                                                const MACROBLOCKD *xd,
+                                                const MB_MODE_INFO *mbmi) {
   int dir;
   for (dir = 0; dir < 2; ++dir) {
     const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
     InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
     ++counts->switchable_interp[ctx][filter];
-    if (allow_update_cdf) {
-      update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
-                 SWITCHABLE_FILTERS);
-    }
   }
 }
 
-static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize,
-                                      const MB_MODE_INFO *mbmi,
-                                      RD_COUNTS *rdc) {
+static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
+                                              const MB_MODE_INFO *mbmi) {
+  int dir;
+  for (dir = 0; dir < 2; ++dir) {
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+    update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
+               SWITCHABLE_FILTERS);
+  }
+}
+
+static AOM_INLINE void update_global_motion_used(PREDICTION_MODE mode,
+                                                 BLOCK_SIZE bsize,
+                                                 const MB_MODE_INFO *mbmi,
+                                                 RD_COUNTS *rdc) {
   if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) {
     const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize];
     int ref;
@@ -307,8 +485,8 @@
   }
 }
 
-static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
-                          const TX_MODE tx_mode) {
+static AOM_INLINE void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
+                                     const TX_MODE tx_mode) {
   MACROBLOCKD *const xd = &x->e_mbd;
   if (xd->lossless[mbmi->segment_id]) {
     mbmi->tx_size = TX_4X4;
@@ -322,17 +500,38 @@
   if (is_inter_block(mbmi)) {
     memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
   }
-  memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
+  const int stride = xd->tx_type_map_stride;
+  const int bw = mi_size_wide[mbmi->sb_type];
+  for (int row = 0; row < mi_size_high[mbmi->sb_type]; ++row) {
+    memset(xd->tx_type_map + row * stride, DCT_DCT,
+           bw * sizeof(xd->tx_type_map[0]));
+  }
   av1_zero(x->blk_skip);
-  x->skip = 0;
+  x->force_skip = 0;
 }
 
-static void update_state(const AV1_COMP *const cpi,
-                         const TileDataEnc *const tile_data, ThreadData *td,
-                         const PICK_MODE_CONTEXT *const ctx, int mi_row,
-                         int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+// This function will copy the best reference mode information from
+// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT.
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+    MB_MODE_INFO_EXT *mbmi_ext,
+    const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) {
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+         sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+         sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+         sizeof(mbmi_ext->global_mvs));
+}
+
+static AOM_INLINE void update_state(const AV1_COMP *const cpi, ThreadData *td,
+                                    const PICK_MODE_CONTEXT *const ctx,
+                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                    RUN_TYPE dry_run) {
   int i, x_idx, y;
   const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int num_planes = av1_num_planes(cm);
   RD_COUNTS *const rdc = &td->rd_counts;
   MACROBLOCK *const x = &td->mb;
@@ -344,34 +543,52 @@
   const struct segmentation *const seg = &cm->seg;
   const int bw = mi_size_wide[mi->sb_type];
   const int bh = mi_size_high[mi->sb_type];
-  const int mis = cm->mi_stride;
+  const int mis = mi_params->mi_stride;
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
 
   assert(mi->sb_type == bsize);
 
   *mi_addr = *mi;
-  *x->mbmi_ext = ctx->mbmi_ext;
+  copy_mbmi_ext_frame_to_mbmi_ext(x->mbmi_ext, &ctx->mbmi_ext_best,
+                                  av1_ref_frame_type(ctx->mic.ref_frame));
 
   memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
 
-  x->skip = ctx->skip;
+  x->force_skip = ctx->rd_stats.skip;
+
+  xd->tx_type_map = ctx->tx_type_map;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+  // If not dry_run, copy the transform type data into the frame level buffer.
+  // Encoder will fetch tx types when writing bitstream.
+  if (!dry_run) {
+    const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+    uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx;
+    const int mi_stride = mi_params->mi_stride;
+    for (int blk_row = 0; blk_row < bh; ++blk_row) {
+      av1_copy_array(tx_type_map + blk_row * mi_stride,
+                     xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw);
+    }
+    xd->tx_type_map = tx_type_map;
+    xd->tx_type_map_stride = mi_stride;
+  }
 
   // If segmentation in use
   if (seg->enabled) {
     // For in frame complexity AQ copy the segment id from the segment map.
     if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
       const uint8_t *const map =
-          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
       mi_addr->segment_id =
-          map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0;
-      reset_tx_size(x, mi_addr, cm->tx_mode);
+          map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
+      reset_tx_size(x, mi_addr, x->tx_mode_search_type);
     }
     // Else for cyclic refresh mode update the segment map, set the segment id
     // and then update the quantizer.
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
-                                        ctx->rate, ctx->dist, x->skip);
+                                        ctx->rd_stats.rate, ctx->rd_stats.dist,
+                                        x->force_skip);
     }
     if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
       mi_addr->uv_mode = UV_DC_PRED;
@@ -387,12 +604,14 @@
   for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
-  for (y = 0; y < mi_height; y++)
-    for (x_idx = 0; x_idx < mi_width; x_idx++)
+  for (y = 0; y < mi_height; y++) {
+    for (x_idx = 0; x_idx < mi_width; x_idx++) {
       if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
           (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
         xd->mi[x_idx + y * mis] = mi_addr;
       }
+    }
+  }
 
   if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
 
@@ -432,11 +651,10 @@
       update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
     }
 
-    if (cm->interp_filter == SWITCHABLE &&
+    if (cm->features.interp_filter == SWITCHABLE &&
         mi_addr->motion_mode != WARPED_CAUSAL &&
         !is_nontrans_global_motion(xd, xd->mi[0])) {
-      update_filter_type_count(tile_data->allow_update_cdf, td->counts, xd,
-                               mi_addr);
+      update_filter_type_count(td->counts, xd, mi_addr);
     }
 
     rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
@@ -444,9 +662,10 @@
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
   }
 
-  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
-  av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+  const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
+  if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
+    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
 }
 
 void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
@@ -466,22 +685,6 @@
   }
 }
 
-static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              int8_t segment_id) {
-  const AV1_COMMON *const cm = &cpi->common;
-  av1_init_plane_quantizers(cpi, x, segment_id);
-  aom_clear_system_state();
-  int segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-  return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
-}
-
-static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
-  const AV1_COMMON *const cm = &cpi->common;
-
-  return av1_compute_rd_mult(
-      cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q);
-}
-
 static EdgeInfo edge_info(const struct buf_2d *ref, const BLOCK_SIZE bsize,
                           const bool high_bd, const int bd) {
   const int width = block_size_wide[bsize];
@@ -503,57 +706,70 @@
   return 0 && !frame_is_intra_only(&cpi->common);
 }
 
-static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
-                          MACROBLOCK *const x, int mi_row, int mi_col,
-                          RD_STATS *rd_cost, PARTITION_TYPE partition,
-                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                          int64_t best_rd, int use_nonrd_pick_mode) {
+static void hybrid_intra_mode_search(AV1_COMP *cpi, MACROBLOCK *const x,
+                                     RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                     PICK_MODE_CONTEXT *ctx) {
+  // TODO(jianj): Investigate the failure of ScalabilityTest in AOM_Q mode,
+  // which sets base_qindex to 0 on keyframe.
+  if (cpi->oxcf.rc_mode != AOM_CBR || !cpi->sf.rt_sf.hybrid_intra_pickmode ||
+      bsize < BLOCK_16X16)
+    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+  else
+    av1_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+static AOM_INLINE void pick_sb_modes(AV1_COMP *const cpi,
+                                     TileDataEnc *tile_data,
+                                     MACROBLOCK *const x, int mi_row,
+                                     int mi_col, RD_STATS *rd_cost,
+                                     PARTITION_TYPE partition, BLOCK_SIZE bsize,
+                                     PICK_MODE_CONTEXT *ctx, RD_STATS best_rd,
+                                     int pick_mode_type) {
+  if (best_rd.rdcost < 0) {
+    ctx->rd_stats.rdcost = INT64_MAX;
+    ctx->rd_stats.skip = 0;
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
+
+  set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+
+  if (ctx->rd_mode_is_ready) {
+    assert(ctx->mic.sb_type == bsize);
+    assert(ctx->mic.partition == partition);
+    rd_cost->rate = ctx->rd_stats.rate;
+    rd_cost->dist = ctx->rd_stats.dist;
+    rd_cost->rdcost = ctx->rd_stats.rdcost;
+    return;
+  }
+
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
-  MB_MODE_INFO *ctx_mbmi = &ctx->mic;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
   const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
-  const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
-  int i, orig_rdmult;
+  int i;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, rd_pick_sb_modes_time);
 #endif
 
-  if (best_rd < 0) {
-    ctx->rdcost = INT64_MAX;
-    ctx->skip = 0;
-    av1_invalid_rd_stats(rd_cost);
-    return;
-  }
-
   aom_clear_system_state();
 
-  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-
   mbmi = xd->mi[0];
-
-  if (ctx->rd_mode_is_ready) {
-    assert(ctx_mbmi->sb_type == bsize);
-    assert(ctx_mbmi->partition == partition);
-    *mbmi = *ctx_mbmi;
-    rd_cost->rate = ctx->rate;
-    rd_cost->dist = ctx->dist;
-    rd_cost->rdcost = ctx->rdcost;
-  } else {
-    mbmi->sb_type = bsize;
-    mbmi->partition = partition;
-  }
+  mbmi->sb_type = bsize;
+  mbmi->partition = partition;
 
 #if CONFIG_RD_DEBUG
   mbmi->mi_row = mi_row;
   mbmi->mi_col = mi_col;
 #endif
 
+  xd->tx_type_map = x->tx_type_map;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+
   for (i = 0; i < num_planes; ++i) {
     p[i].coeff = ctx->coeff[i];
     p[i].qcoeff = ctx->qcoeff[i];
@@ -564,25 +780,11 @@
 
   for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
 
-  if (!ctx->rd_mode_is_ready) {
-    ctx->skippable = 0;
-
-    // Set to zero to make sure we do not use the previous encoded frame stats
-    mbmi->skip = 0;
-
-    // Reset skip mode flag.
-    mbmi->skip_mode = 0;
-  }
-
-  x->skip_chroma_rd =
-      !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                           xd->plane[1].subsampling_y);
-
-  if (ctx->rd_mode_is_ready) {
-    x->skip = ctx->skip;
-    *x->mbmi_ext = ctx->mbmi_ext;
-    return;
-  }
+  ctx->skippable = 0;
+  // Set to zero to make sure we do not use the previous encoded frame stats
+  mbmi->skip = 0;
+  // Reset skip mode flag.
+  mbmi->skip_mode = 0;
 
   if (is_cur_buf_hbd(xd)) {
     x->source_variance = av1_high_get_sby_perpixel_variance(
@@ -592,15 +794,15 @@
         av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
   }
   if (use_pb_simple_motion_pred_sse(cpi)) {
-    const MV ref_mv_full = { .row = 0, .col = 0 };
+    const FULLPEL_MV start_mv = kZeroFullMv;
     unsigned int var = 0;
-    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full, 0,
+    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0,
                               &x->simple_motion_pred_sse, &var);
   }
 
   // If the threshold for disabling wedge search is zero, it means the feature
   // should not be used. Use a value that will always succeed in the check.
-  if (cpi->sf.disable_wedge_search_edge_thresh == 0) {
+  if (cpi->sf.inter_sf.disable_wedge_search_edge_thresh == 0) {
     x->edge_strength = UINT16_MAX;
     x->edge_strength_x = UINT16_MAX;
     x->edge_strength_y = UINT16_MAX;
@@ -611,28 +813,16 @@
     x->edge_strength_x = ei.x;
     x->edge_strength_y = ei.y;
   }
+
+  // Initialize default mode evaluation params
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
   // Save rdmult before it might be changed, so it can be restored later.
-  orig_rdmult = x->rdmult;
-
-  if (aq_mode == VARIANCE_AQ) {
-    if (cpi->vaq_refresh) {
-      const int energy = bsize <= BLOCK_16X16
-                             ? x->mb_energy
-                             : av1_log_block_var(cpi, x, bsize);
-      mbmi->segment_id = energy;
-    }
-    x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
-  } else if (aq_mode == COMPLEXITY_AQ) {
-    x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
-  } else if (aq_mode == CYCLIC_REFRESH_AQ) {
-    // If segment is boosted, use rdmult for that segment.
-    if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
-      x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
-  } else if (cpi->oxcf.enable_tpl_model) {
-    x->rdmult = x->cb_rdmult;
-  }
-
-  if (deltaq_mode > 0) x->rdmult = set_deltaq_rdmult(cpi, xd);
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Set error per bit for current rdmult
+  set_error_per_bit(x, x->rdmult);
+  av1_rd_cost_update(x->rdmult, &best_rd);
 
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
@@ -640,8 +830,15 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
 #endif
-    av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx,
-                              best_rd);
+    switch (pick_mode_type) {
+      case PICK_MODE_RD:
+        av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost);
+        break;
+      case PICK_MODE_NONRD:
+        hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+        break;
+      default: assert(0 && "Unknown pick mode type.");
+    }
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
 #endif
@@ -651,16 +848,19 @@
 #endif
     if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
-                                         rd_cost, bsize, ctx, best_rd);
+                                         rd_cost, bsize, ctx, best_rd.rdcost);
     } else {
-      // TODO(kyslov): do the same for pick_intra_mode and
-      //               pick_inter_mode_sb_seg_skip
-      if (use_nonrd_pick_mode) {
-        av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
-                                     bsize, ctx, best_rd);
-      } else {
-        av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
-                                  bsize, ctx, best_rd);
+      // TODO(kyslov): do the same for pick_inter_mode_sb_seg_skip
+      switch (pick_mode_type) {
+        case PICK_MODE_RD:
+          av1_rd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx,
+                                    best_rd.rdcost);
+          break;
+        case PICK_MODE_NONRD:
+          av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx,
+                                       best_rd.rdcost);
+          break;
+        default: assert(0 && "Unknown pick mode type.");
       }
     }
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -669,11 +869,8 @@
   }
 
   // Examine the resulting rate and for AQ mode 2 make a segment choice.
-  if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) &&
-      (bsize >= BLOCK_16X16) &&
-      (cm->current_frame.frame_type == KEY_FRAME ||
-       cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
-       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+  if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ &&
+      bsize >= BLOCK_16X16) {
     av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
   }
 
@@ -683,18 +880,19 @@
   // refactored to provide proper exit/return handle.
   if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
 
-  ctx->rate = rd_cost->rate;
-  ctx->dist = rd_cost->dist;
-  ctx->rdcost = rd_cost->rdcost;
+  ctx->rd_stats.rate = rd_cost->rate;
+  ctx->rd_stats.dist = rd_cost->dist;
+  ctx->rd_stats.rdcost = rd_cost->rdcost;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, rd_pick_sb_modes_time);
 #endif
 }
 
-static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
-                                    PREDICTION_MODE mode, int16_t mode_context,
-                                    uint8_t allow_update_cdf) {
+static AOM_INLINE void update_inter_mode_stats(FRAME_CONTEXT *fc,
+                                               FRAME_COUNTS *counts,
+                                               PREDICTION_MODE mode,
+                                               int16_t mode_context) {
   (void)counts;
 
   int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
@@ -702,38 +900,39 @@
 #if CONFIG_ENTROPY_STATS
     ++counts->newmv_mode[mode_ctx][0];
 #endif
-    if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
+    update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
     return;
-  } else {
-#if CONFIG_ENTROPY_STATS
-    ++counts->newmv_mode[mode_ctx][1];
-#endif
-    if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
-
-    mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
-    if (mode == GLOBALMV) {
-#if CONFIG_ENTROPY_STATS
-      ++counts->zeromv_mode[mode_ctx][0];
-#endif
-      if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
-      return;
-    } else {
-#if CONFIG_ENTROPY_STATS
-      ++counts->zeromv_mode[mode_ctx][1];
-#endif
-      if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
-      mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-#if CONFIG_ENTROPY_STATS
-      ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
-#endif
-      if (allow_update_cdf)
-        update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
-    }
   }
+
+#if CONFIG_ENTROPY_STATS
+  ++counts->newmv_mode[mode_ctx][1];
+#endif
+  update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
+
+  mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (mode == GLOBALMV) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->zeromv_mode[mode_ctx][0];
+#endif
+    update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
+    return;
+  }
+
+#if CONFIG_ENTROPY_STATS
+  ++counts->zeromv_mode[mode_ctx][1];
+#endif
+  update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
+
+  mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+  ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+#endif
+  update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
 }
 
-static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
-                               FRAME_COUNTS *counts, uint8_t allow_update_cdf) {
+static AOM_INLINE void update_palette_cdf(MACROBLOCKD *xd,
+                                          const MB_MODE_INFO *const mbmi,
+                                          FRAME_COUNTS *counts) {
   FRAME_CONTEXT *fc = xd->tile_ctx;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
@@ -748,17 +947,14 @@
 #if CONFIG_ENTROPY_STATS
     ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
 #endif
-    if (allow_update_cdf)
-      update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
-                 n > 0, 2);
+    update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
+               n > 0, 2);
     if (n > 0) {
 #if CONFIG_ENTROPY_STATS
       ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
 #endif
-      if (allow_update_cdf) {
-        update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
-                   n - PALETTE_MIN_SIZE, PALETTE_SIZES);
-      }
+      update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
+                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
     }
   }
 
@@ -769,30 +965,26 @@
 #if CONFIG_ENTROPY_STATS
     ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
 #endif
-    if (allow_update_cdf)
-      update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
+    update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
 
     if (n > 0) {
 #if CONFIG_ENTROPY_STATS
       ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
 #endif
-      if (allow_update_cdf) {
-        update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
-                   n - PALETTE_MIN_SIZE, PALETTE_SIZES);
-      }
+      update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
+                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
     }
   }
 }
 
-static void sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
-                            MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
-                            const MB_MODE_INFO *above_mi,
-                            const MB_MODE_INFO *left_mi, const int intraonly,
-                            const int mi_row, const int mi_col,
-                            uint8_t allow_update_cdf) {
+static AOM_INLINE void sum_intra_stats(const AV1_COMMON *const cm,
+                                       FRAME_COUNTS *counts, MACROBLOCKD *xd,
+                                       const MB_MODE_INFO *const mbmi,
+                                       const MB_MODE_INFO *above_mi,
+                                       const MB_MODE_INFO *left_mi,
+                                       const int intraonly) {
   FRAME_CONTEXT *fc = xd->tile_ctx;
   const PREDICTION_MODE y_mode = mbmi->mode;
-  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
   (void)counts;
   const BLOCK_SIZE bsize = mbmi->sb_type;
 
@@ -804,14 +996,12 @@
     const int left_ctx = intra_mode_context[left];
     ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
 #endif  // CONFIG_ENTROPY_STATS
-    if (allow_update_cdf)
-      update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+    update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
   } else {
 #if CONFIG_ENTROPY_STATS
     ++counts->y_mode[size_group_lookup[bsize]][y_mode];
 #endif  // CONFIG_ENTROPY_STATS
-    if (allow_update_cdf)
-      update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+    update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
   }
 
   if (av1_filter_intra_allowed(cm, mbmi)) {
@@ -824,14 +1014,11 @@
             ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
     }
 #endif  // CONFIG_ENTROPY_STATS
-    if (allow_update_cdf) {
-      update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode,
-                 2);
-      if (use_filter_intra_mode) {
-        update_cdf(fc->filter_intra_mode_cdf,
-                   mbmi->filter_intra_mode_info.filter_intra_mode,
-                   FILTER_INTRA_MODES);
-      }
+    update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode, 2);
+    if (use_filter_intra_mode) {
+      update_cdf(fc->filter_intra_mode_cdf,
+                 mbmi->filter_intra_mode_info.filter_intra_mode,
+                 FILTER_INTRA_MODES);
     }
   }
   if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
@@ -839,43 +1026,35 @@
     ++counts->angle_delta[mbmi->mode - V_PRED]
                          [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
 #endif
-    if (allow_update_cdf) {
-      update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
-                 mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
-                 2 * MAX_ANGLE_DELTA + 1);
-    }
+    update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+               mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+               2 * MAX_ANGLE_DELTA + 1);
   }
 
-  if (!is_chroma_reference(mi_row, mi_col, bsize,
-                           xd->plane[AOM_PLANE_U].subsampling_x,
-                           xd->plane[AOM_PLANE_U].subsampling_y))
-    return;
+  if (!xd->is_chroma_ref) return;
 
+  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+  const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
 #if CONFIG_ENTROPY_STATS
-  ++counts->uv_mode[is_cfl_allowed(xd)][y_mode][uv_mode];
+  ++counts->uv_mode[cfl_allowed][y_mode][uv_mode];
 #endif  // CONFIG_ENTROPY_STATS
-  if (allow_update_cdf) {
-    const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
-    update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
-               UV_INTRA_MODES - !cfl_allowed);
-  }
+  update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+             UV_INTRA_MODES - !cfl_allowed);
   if (uv_mode == UV_CFL_PRED) {
-    const int joint_sign = mbmi->cfl_alpha_signs;
-    const int idx = mbmi->cfl_alpha_idx;
+    const int8_t joint_sign = mbmi->cfl_alpha_signs;
+    const uint8_t idx = mbmi->cfl_alpha_idx;
 
 #if CONFIG_ENTROPY_STATS
     ++counts->cfl_sign[joint_sign];
 #endif
-    if (allow_update_cdf)
-      update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+    update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
     if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
       aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
 
 #if CONFIG_ENTROPY_STATS
       ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
 #endif
-      if (allow_update_cdf)
-        update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+      update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
     }
     if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
       aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
@@ -883,8 +1062,7 @@
 #if CONFIG_ENTROPY_STATS
       ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
 #endif
-      if (allow_update_cdf)
-        update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+      update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
     }
   }
   if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
@@ -893,18 +1071,17 @@
     ++counts->angle_delta[uv_mode - UV_V_PRED]
                          [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
 #endif
-    if (allow_update_cdf) {
-      update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
-                 mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
-                 2 * MAX_ANGLE_DELTA + 1);
-    }
+    update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
+               mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+               2 * MAX_ANGLE_DELTA + 1);
   }
-  if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-    update_palette_cdf(xd, mbmi, counts, allow_update_cdf);
+  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+    update_palette_cdf(xd, mbmi, counts);
+  }
 }
 
-static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data,
-                         ThreadData *td, int mi_row, int mi_col) {
+static AOM_INLINE void update_stats(const AV1_COMMON *const cm,
+                                    ThreadData *td) {
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -912,13 +1089,6 @@
   const CurrentFrame *const current_frame = &cm->current_frame;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   FRAME_CONTEXT *fc = xd->tile_ctx;
-  const uint8_t allow_update_cdf = tile_data->allow_update_cdf;
-
-  // delta quant applies to both intra and inter
-  const int super_block_upper_left =
-      ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-      ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
-
   const int seg_ref_active =
       segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
 
@@ -928,25 +1098,26 @@
 #if CONFIG_ENTROPY_STATS
     td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
 #endif
-    if (allow_update_cdf)
-      update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+    update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
   }
 
-  if (!mbmi->skip_mode) {
-    if (!seg_ref_active) {
-      const int skip_ctx = av1_get_skip_context(xd);
+  if (!mbmi->skip_mode && !seg_ref_active) {
+    const int skip_ctx = av1_get_skip_context(xd);
 #if CONFIG_ENTROPY_STATS
-      td->counts->skip[skip_ctx][mbmi->skip]++;
+    td->counts->skip[skip_ctx][mbmi->skip]++;
 #endif
-      if (allow_update_cdf) update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2);
-    }
+    update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2);
   }
 
+#if CONFIG_ENTROPY_STATS
+  // delta quant applies to both intra and inter
+  const int super_block_upper_left =
+      ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+      ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
   if (delta_q_info->delta_q_present_flag &&
       (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
       super_block_upper_left) {
-#if CONFIG_ENTROPY_STATS
     const int dq =
         (mbmi->current_qindex - xd->current_qindex) / delta_q_info->delta_q_res;
     const int absdq = abs(dq);
@@ -954,14 +1125,11 @@
       td->counts->delta_q[i][1]++;
     }
     if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
-#endif
-    xd->current_qindex = mbmi->current_qindex;
     if (delta_q_info->delta_lf_present_flag) {
       if (delta_q_info->delta_lf_multi) {
         const int frame_lf_count =
             av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
         for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-#if CONFIG_ENTROPY_STATS
           const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
                                delta_q_info->delta_lf_res;
           const int abs_delta_lf = abs(delta_lf);
@@ -970,11 +1138,8 @@
           }
           if (abs_delta_lf < DELTA_LF_SMALL)
             td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
-#endif
-          xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
         }
       } else {
-#if CONFIG_ENTROPY_STATS
         const int delta_lf =
             (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
             delta_q_info->delta_lf_res;
@@ -984,440 +1149,371 @@
         }
         if (abs_delta_lf < DELTA_LF_SMALL)
           td->counts->delta_lf[abs_delta_lf][0]++;
-#endif
-        xd->delta_lf_from_base = mbmi->delta_lf_from_base;
       }
     }
   }
+#endif
 
   if (!is_inter_block(mbmi)) {
     sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
-                    frame_is_intra_only(cm), mi_row, mi_col,
-                    tile_data->allow_update_cdf);
+                    frame_is_intra_only(cm));
   }
 
   if (av1_allow_intrabc(cm)) {
-    if (allow_update_cdf)
-      update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2);
+    update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2);
 #if CONFIG_ENTROPY_STATS
     ++td->counts->intrabc[is_intrabc_block(mbmi)];
 #endif  // CONFIG_ENTROPY_STATS
   }
 
-  if (!frame_is_intra_only(cm)) {
-    RD_COUNTS *rdc = &td->rd_counts;
+  if (frame_is_intra_only(cm) || mbmi->skip_mode) return;
 
-    FRAME_COUNTS *const counts = td->counts;
+  FRAME_COUNTS *const counts = td->counts;
+  const int inter_block = is_inter_block(mbmi);
 
-    if (mbmi->skip_mode) {
-      rdc->skip_mode_used_flag = 1;
-      if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
-        assert(has_second_ref(mbmi));
-        rdc->compound_ref_used_flag = 1;
-      }
-      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-      return;
-    }
-
-    const int inter_block = is_inter_block(mbmi);
-
-    if (!seg_ref_active) {
+  if (!seg_ref_active) {
 #if CONFIG_ENTROPY_STATS
-      counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+    counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
 #endif
-      if (allow_update_cdf) {
-        update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
-                   inter_block, 2);
-      }
-      // If the segment reference feature is enabled we have only a single
-      // reference frame allowed for the segment so exclude it from
-      // the reference frame counts used to work out probabilities.
-      if (inter_block) {
-        const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-        const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
-
-        av1_collect_neighbors_ref_counts(xd);
-
-        if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
-          if (has_second_ref(mbmi))
-            // This flag is also updated for 4x4 blocks
-            rdc->compound_ref_used_flag = 1;
-          if (is_comp_ref_allowed(bsize)) {
+    update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+               inter_block, 2);
+    // If the segment reference feature is enabled we have only a single
+    // reference frame allowed for the segment so exclude it from
+    // the reference frame counts used to work out probabilities.
+    if (inter_block) {
+      const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+      if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+        if (is_comp_ref_allowed(bsize)) {
 #if CONFIG_ENTROPY_STATS
-            counts->comp_inter[av1_get_reference_mode_context(xd)]
-                              [has_second_ref(mbmi)]++;
+          counts->comp_inter[av1_get_reference_mode_context(xd)]
+                            [has_second_ref(mbmi)]++;
 #endif  // CONFIG_ENTROPY_STATS
-            if (allow_update_cdf) {
-              update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi),
-                         2);
-            }
-          }
+          update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2);
         }
+      }
 
-        if (has_second_ref(mbmi)) {
-          const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
-                                                        ? UNIDIR_COMP_REFERENCE
-                                                        : BIDIR_COMP_REFERENCE;
-          if (allow_update_cdf) {
-            update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
-                       COMP_REFERENCE_TYPES);
-          }
+      if (has_second_ref(mbmi)) {
+        const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+                                                      ? UNIDIR_COMP_REFERENCE
+                                                      : BIDIR_COMP_REFERENCE;
+        update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+                   COMP_REFERENCE_TYPES);
 #if CONFIG_ENTROPY_STATS
-          counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
-                               [comp_ref_type]++;
+        counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+                             [comp_ref_type]++;
 #endif  // CONFIG_ENTROPY_STATS
 
-          if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
-            const int bit = (ref0 == BWDREF_FRAME);
-            if (allow_update_cdf)
-              update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+        if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+          const int bit = (ref0 == BWDREF_FRAME);
+          update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
 #if CONFIG_ENTROPY_STATS
-            counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0]
-                                [bit]++;
+          counts
+              ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++;
 #endif  // CONFIG_ENTROPY_STATS
-            if (!bit) {
-              const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
-              if (allow_update_cdf)
-                update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+          if (!bit) {
+            const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+            update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
 #if CONFIG_ENTROPY_STATS
-              counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
-                                  [bit1]++;
+            counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+                                [bit1]++;
 #endif  // CONFIG_ENTROPY_STATS
-              if (bit1) {
-                if (allow_update_cdf) {
-                  update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
-                             ref1 == GOLDEN_FRAME, 2);
-                }
+            if (bit1) {
+              update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+                         ref1 == GOLDEN_FRAME, 2);
 #if CONFIG_ENTROPY_STATS
-                counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)]
-                                    [2][ref1 == GOLDEN_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-              }
-            }
-          } else {
-            const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
-            if (allow_update_cdf)
-              update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
-#if CONFIG_ENTROPY_STATS
-            counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
-#endif  // CONFIG_ENTROPY_STATS
-            if (!bit) {
-              if (allow_update_cdf) {
-                update_cdf(av1_get_pred_cdf_comp_ref_p1(xd),
-                           ref0 == LAST2_FRAME, 2);
-              }
-#if CONFIG_ENTROPY_STATS
-              counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
-                              [ref0 == LAST2_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-            } else {
-              if (allow_update_cdf) {
-                update_cdf(av1_get_pred_cdf_comp_ref_p2(xd),
-                           ref0 == GOLDEN_FRAME, 2);
-              }
-#if CONFIG_ENTROPY_STATS
-              counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
-                              [ref0 == GOLDEN_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-            }
-            if (allow_update_cdf) {
-              update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd),
-                         ref1 == ALTREF_FRAME, 2);
-            }
-#if CONFIG_ENTROPY_STATS
-            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
-                               [ref1 == ALTREF_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-            if (ref1 != ALTREF_FRAME) {
-              if (allow_update_cdf) {
-                update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
-                           ref1 == ALTREF2_FRAME, 2);
-              }
-#if CONFIG_ENTROPY_STATS
-              counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
-                                 [ref1 == ALTREF2_FRAME]++;
+              counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2]
+                                  [ref1 == GOLDEN_FRAME]++;
 #endif  // CONFIG_ENTROPY_STATS
             }
           }
         } else {
-          const int bit = (ref0 >= BWDREF_FRAME);
-          if (allow_update_cdf)
-            update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+          const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+          update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
 #if CONFIG_ENTROPY_STATS
-          counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+          counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
 #endif  // CONFIG_ENTROPY_STATS
-          if (bit) {
-            assert(ref0 <= ALTREF_FRAME);
-            if (allow_update_cdf) {
-              update_cdf(av1_get_pred_cdf_single_ref_p2(xd),
-                         ref0 == ALTREF_FRAME, 2);
-            }
+          if (!bit) {
+            update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME,
+                       2);
 #if CONFIG_ENTROPY_STATS
-            counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
-                              [ref0 == ALTREF_FRAME]++;
+            counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+                            [ref0 == LAST2_FRAME]++;
 #endif  // CONFIG_ENTROPY_STATS
-            if (ref0 != ALTREF_FRAME) {
-              if (allow_update_cdf) {
-                update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
-                           ref0 == ALTREF2_FRAME, 2);
-              }
-#if CONFIG_ENTROPY_STATS
-              counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
-                                [ref0 == ALTREF2_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-            }
           } else {
-            const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
-            if (allow_update_cdf)
-              update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+            update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME,
+                       2);
 #if CONFIG_ENTROPY_STATS
-            counts
-                ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+            counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
+                            [ref0 == GOLDEN_FRAME]++;
 #endif  // CONFIG_ENTROPY_STATS
-            if (!bit1) {
-              if (allow_update_cdf) {
-                update_cdf(av1_get_pred_cdf_single_ref_p4(xd),
-                           ref0 != LAST_FRAME, 2);
-              }
+          }
+          update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME,
+                     2);
 #if CONFIG_ENTROPY_STATS
-              counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
-                                [ref0 != LAST_FRAME]++;
+          counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
+                             [ref1 == ALTREF_FRAME]++;
 #endif  // CONFIG_ENTROPY_STATS
-            } else {
-              if (allow_update_cdf) {
-                update_cdf(av1_get_pred_cdf_single_ref_p5(xd),
-                           ref0 != LAST3_FRAME, 2);
-              }
+          if (ref1 != ALTREF_FRAME) {
+            update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+                       ref1 == ALTREF2_FRAME, 2);
 #if CONFIG_ENTROPY_STATS
-              counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
-                                [ref0 != LAST3_FRAME]++;
+            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+                               [ref1 == ALTREF2_FRAME]++;
 #endif  // CONFIG_ENTROPY_STATS
-            }
           }
         }
-
-        if (cm->seq_params.enable_interintra_compound &&
-            is_interintra_allowed(mbmi)) {
-          const int bsize_group = size_group_lookup[bsize];
-          if (mbmi->ref_frame[1] == INTRA_FRAME) {
+      } else {
+        const int bit = (ref0 >= BWDREF_FRAME);
+        update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
 #if CONFIG_ENTROPY_STATS
-            counts->interintra[bsize_group][1]++;
-#endif
-            if (allow_update_cdf)
-              update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+        counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+        if (bit) {
+          assert(ref0 <= ALTREF_FRAME);
+          update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME,
+                     2);
 #if CONFIG_ENTROPY_STATS
-            counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
-#endif
-            if (allow_update_cdf) {
-              update_cdf(fc->interintra_mode_cdf[bsize_group],
-                         mbmi->interintra_mode, INTERINTRA_MODES);
-            }
-            if (is_interintra_wedge_used(bsize)) {
+          counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+                            [ref0 == ALTREF_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (ref0 != ALTREF_FRAME) {
+            update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+                       ref0 == ALTREF2_FRAME, 2);
 #if CONFIG_ENTROPY_STATS
-              counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
-#endif
-              if (allow_update_cdf) {
-                update_cdf(fc->wedge_interintra_cdf[bsize],
-                           mbmi->use_wedge_interintra, 2);
-              }
-              if (mbmi->use_wedge_interintra) {
+            counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+                              [ref0 == ALTREF2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+        } else {
+          const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+          update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
 #if CONFIG_ENTROPY_STATS
-                counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
-#endif
-                if (allow_update_cdf) {
-                  update_cdf(fc->wedge_idx_cdf[bsize],
-                             mbmi->interintra_wedge_index, 16);
-                }
-              }
-            }
+          counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (!bit1) {
+            update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+                              [ref0 != LAST_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
           } else {
+            update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME,
+                       2);
 #if CONFIG_ENTROPY_STATS
-            counts->interintra[bsize_group][0]++;
-#endif
-            if (allow_update_cdf)
-              update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+            counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+                              [ref0 != LAST3_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
           }
         }
+      }
 
-        set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-        const MOTION_MODE motion_allowed =
-            cm->switchable_motion_mode
-                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
-                                      cm->allow_warped_motion)
-                : SIMPLE_TRANSLATION;
-        if (mbmi->ref_frame[1] != INTRA_FRAME) {
-          if (motion_allowed == WARPED_CAUSAL) {
+      if (cm->seq_params.enable_interintra_compound &&
+          is_interintra_allowed(mbmi)) {
+        const int bsize_group = size_group_lookup[bsize];
+        if (mbmi->ref_frame[1] == INTRA_FRAME) {
 #if CONFIG_ENTROPY_STATS
-            counts->motion_mode[bsize][mbmi->motion_mode]++;
+          counts->interintra[bsize_group][1]++;
 #endif
-            if (allow_update_cdf) {
-              update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
-                         MOTION_MODES);
-            }
-          } else if (motion_allowed == OBMC_CAUSAL) {
+          update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
 #if CONFIG_ENTROPY_STATS
-            counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+          counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
 #endif
-            if (allow_update_cdf) {
-              update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL,
-                         2);
+          update_cdf(fc->interintra_mode_cdf[bsize_group],
+                     mbmi->interintra_mode, INTERINTRA_MODES);
+          if (av1_is_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
+            counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#endif
+            update_cdf(fc->wedge_interintra_cdf[bsize],
+                       mbmi->use_wedge_interintra, 2);
+            if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+              counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif
+              update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index,
+                         16);
             }
           }
+        } else {
+#if CONFIG_ENTROPY_STATS
+          counts->interintra[bsize_group][0]++;
+#endif
+          update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+        }
+      }
+
+      const MOTION_MODE motion_allowed =
+          cm->features.switchable_motion_mode
+              ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                    cm->features.allow_warped_motion)
+              : SIMPLE_TRANSLATION;
+      if (mbmi->ref_frame[1] != INTRA_FRAME) {
+        if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+          counts->motion_mode[bsize][mbmi->motion_mode]++;
+#endif
+          update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
+                     MOTION_MODES);
+        } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+          counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+#endif
+          update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2);
+        }
+      }
+
+      if (has_second_ref(mbmi)) {
+        assert(current_frame->reference_mode != SINGLE_REFERENCE &&
+               is_inter_compound_mode(mbmi->mode) &&
+               mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+        const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                         cm->seq_params.enable_masked_compound;
+        if (masked_compound_used) {
+          const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+          ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+#endif
+          update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+                     mbmi->comp_group_idx, 2);
         }
 
-        if (has_second_ref(mbmi)) {
-          assert(current_frame->reference_mode != SINGLE_REFERENCE &&
-                 is_inter_compound_mode(mbmi->mode) &&
-                 mbmi->motion_mode == SIMPLE_TRANSLATION);
-
-          const int masked_compound_used =
-              is_any_masked_compound_used(bsize) &&
-              cm->seq_params.enable_masked_compound;
-          if (masked_compound_used) {
-            const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+        if (mbmi->comp_group_idx == 0) {
+          const int comp_index_ctx = get_comp_index_context(cm, xd);
 #if CONFIG_ENTROPY_STATS
-            ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+          ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
 #endif
-            if (allow_update_cdf) {
-              update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
-                         mbmi->comp_group_idx, 2);
-            }
-          }
-
-          if (mbmi->comp_group_idx == 0) {
-            const int comp_index_ctx = get_comp_index_context(cm, xd);
-#if CONFIG_ENTROPY_STATS
-            ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
-#endif
-            if (allow_update_cdf) {
-              update_cdf(fc->compound_index_cdf[comp_index_ctx],
-                         mbmi->compound_idx, 2);
-            }
-          } else {
-            assert(masked_compound_used);
-            if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
-#if CONFIG_ENTROPY_STATS
-              ++counts->compound_type[bsize][mbmi->interinter_comp.type -
-                                             COMPOUND_WEDGE];
-#endif
-              if (allow_update_cdf) {
-                update_cdf(fc->compound_type_cdf[bsize],
-                           mbmi->interinter_comp.type - COMPOUND_WEDGE,
-                           MASKED_COMPOUND_TYPES);
-              }
-            }
-          }
-        }
-        if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+          update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx,
+                     2);
+        } else {
+          assert(masked_compound_used);
           if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
 #if CONFIG_ENTROPY_STATS
-            counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+            ++counts->compound_type[bsize][mbmi->interinter_comp.type -
+                                           COMPOUND_WEDGE];
 #endif
-            if (allow_update_cdf) {
-              update_cdf(fc->wedge_idx_cdf[bsize],
-                         mbmi->interinter_comp.wedge_index, 16);
-            }
+            update_cdf(fc->compound_type_cdf[bsize],
+                       mbmi->interinter_comp.type - COMPOUND_WEDGE,
+                       MASKED_COMPOUND_TYPES);
           }
         }
       }
+      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+        if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+          counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+#endif
+          update_cdf(fc->wedge_idx_cdf[bsize],
+                     mbmi->interinter_comp.wedge_index, 16);
+        }
+      }
+    }
+  }
+
+  if (inter_block && cm->features.interp_filter == SWITCHABLE &&
+      mbmi->motion_mode != WARPED_CAUSAL &&
+      !is_nontrans_global_motion(xd, mbmi)) {
+    update_filter_type_cdf(xd, mbmi);
+  }
+  if (inter_block &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    const PREDICTION_MODE mode = mbmi->mode;
+    const int16_t mode_ctx =
+        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+    if (has_second_ref(mbmi)) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#endif
+      update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+                 INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
+    } else {
+      update_inter_mode_stats(fc, counts, mode, mode_ctx);
+    }
+
+    const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+    if (new_mv) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      for (int idx = 0; idx < 2; ++idx) {
+        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+          const uint8_t drl_ctx =
+              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2);
+#if CONFIG_ENTROPY_STATS
+          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif
+          if (mbmi->ref_mv_idx == idx) break;
+        }
+      }
     }
 
-    if (inter_block &&
-        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      int16_t mode_ctx;
-      const PREDICTION_MODE mode = mbmi->mode;
-
-      mode_ctx =
-          av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
-      if (has_second_ref(mbmi)) {
+    if (have_nearmv_in_inter_mode(mbmi->mode)) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      for (int idx = 1; idx < 3; ++idx) {
+        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+          const uint8_t drl_ctx =
+              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2);
 #if CONFIG_ENTROPY_STATS
-        ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
 #endif
-        if (allow_update_cdf)
-          update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
-                     INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
-      } else {
-        update_inter_mode_stats(fc, counts, mode, mode_ctx, allow_update_cdf);
-      }
-
-      int mode_allowed = (mbmi->mode == NEWMV);
-      mode_allowed |= (mbmi->mode == NEW_NEWMV);
-      if (mode_allowed) {
-        uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-        int idx;
-
-        for (idx = 0; idx < 2; ++idx) {
-          if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-#if CONFIG_ENTROPY_STATS
-            uint8_t drl_ctx =
-                av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
-            ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
-#endif
-
-            if (mbmi->ref_mv_idx == idx) break;
-          }
+          if (mbmi->ref_mv_idx == idx - 1) break;
         }
       }
-
-      if (have_nearmv_in_inter_mode(mbmi->mode)) {
-        uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-        int idx;
-
-        for (idx = 1; idx < 3; ++idx) {
-          if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-#if CONFIG_ENTROPY_STATS
-            uint8_t drl_ctx =
-                av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
-            ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
-#endif
-
-            if (mbmi->ref_mv_idx == idx - 1) break;
-          }
+    }
+    if (have_newmv_in_inter_mode(mbmi->mode)) {
+      const int allow_hp = cm->features.cur_frame_force_integer_mv
+                               ? MV_SUBPEL_NONE
+                               : cm->features.allow_high_precision_mv;
+      if (new_mv) {
+        for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+          const int_mv ref_mv = av1_get_ref_mv(x, ref);
+          av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                              allow_hp);
         }
+      } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) {
+        const int ref = 1;
+        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                            allow_hp);
+      } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) {
+        const int ref = 0;
+        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                            allow_hp);
       }
     }
   }
 }
 
-typedef struct {
-  ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
-  ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
-  PARTITION_CONTEXT sa[MAX_MIB_SIZE];
-  PARTITION_CONTEXT sl[MAX_MIB_SIZE];
-  TXFM_CONTEXT *p_ta;
-  TXFM_CONTEXT *p_tl;
-  TXFM_CONTEXT ta[MAX_MIB_SIZE];
-  TXFM_CONTEXT tl[MAX_MIB_SIZE];
-} RD_SEARCH_MACROBLOCK_CONTEXT;
-
-static void restore_context(MACROBLOCK *x,
-                            const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
-                            int mi_col, BLOCK_SIZE bsize,
-                            const int num_planes) {
+static AOM_INLINE void restore_context(MACROBLOCK *x,
+                                       const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                       const int num_planes) {
   MACROBLOCKD *xd = &x->e_mbd;
   int p;
-  const int num_4x4_blocks_wide =
-      block_size_wide[bsize] >> tx_size_wide_log2[0];
-  const int num_4x4_blocks_high =
-      block_size_high[bsize] >> tx_size_high_log2[0];
+  const int num_4x4_blocks_wide = mi_size_wide[bsize];
+  const int num_4x4_blocks_high = mi_size_high[bsize];
   int mi_width = mi_size_wide[bsize];
   int mi_height = mi_size_high[bsize];
   for (p = 0; p < num_planes; p++) {
     int tx_col = mi_col;
     int tx_row = mi_row & MAX_MIB_MASK;
-    memcpy(xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x),
-           ctx->a + num_4x4_blocks_wide * p,
-           (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
-               xd->plane[p].subsampling_x);
-    memcpy(xd->left_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+    memcpy(
+        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+        ctx->a + num_4x4_blocks_wide * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+            xd->plane[p].subsampling_x);
+    memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
            ctx->l + num_4x4_blocks_high * p,
            (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
                xd->plane[p].subsampling_y);
   }
-  memcpy(xd->above_seg_context + mi_col, ctx->sa,
-         sizeof(*xd->above_seg_context) * mi_width);
-  memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl,
-         sizeof(xd->left_seg_context[0]) * mi_height);
+  memcpy(xd->above_partition_context + mi_col, ctx->sa,
+         sizeof(*xd->above_partition_context) * mi_width);
+  memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl,
+         sizeof(xd->left_partition_context[0]) * mi_height);
   xd->above_txfm_context = ctx->p_ta;
   xd->left_txfm_context = ctx->p_tl;
   memcpy(xd->above_txfm_context, ctx->ta,
@@ -1426,15 +1522,12 @@
          sizeof(*xd->left_txfm_context) * mi_height);
 }
 
-static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
-                         int mi_row, int mi_col, BLOCK_SIZE bsize,
-                         const int num_planes) {
+static AOM_INLINE void save_context(const MACROBLOCK *x,
+                                    RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                    const int num_planes) {
   const MACROBLOCKD *xd = &x->e_mbd;
   int p;
-  const int num_4x4_blocks_wide =
-      block_size_wide[bsize] >> tx_size_wide_log2[0];
-  const int num_4x4_blocks_high =
-      block_size_high[bsize] >> tx_size_high_log2[0];
   int mi_width = mi_size_wide[bsize];
   int mi_height = mi_size_high[bsize];
 
@@ -1442,19 +1535,18 @@
   for (p = 0; p < num_planes; ++p) {
     int tx_col = mi_col;
     int tx_row = mi_row & MAX_MIB_MASK;
-    memcpy(ctx->a + num_4x4_blocks_wide * p,
-           xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x),
-           (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
-               xd->plane[p].subsampling_x);
-    memcpy(ctx->l + num_4x4_blocks_high * p,
-           xd->left_context[p] + (tx_row >> xd->plane[p].subsampling_y),
-           (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
-               xd->plane[p].subsampling_y);
+    memcpy(
+        ctx->a + mi_width * p,
+        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+        (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x);
+    memcpy(ctx->l + mi_height * p,
+           xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+           (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y);
   }
-  memcpy(ctx->sa, xd->above_seg_context + mi_col,
-         sizeof(*xd->above_seg_context) * mi_width);
-  memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
-         sizeof(xd->left_seg_context[0]) * mi_height);
+  memcpy(ctx->sa, xd->above_partition_context + mi_col,
+         sizeof(*xd->above_partition_context) * mi_width);
+  memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK),
+         sizeof(xd->left_partition_context[0]) * mi_height);
   memcpy(ctx->ta, xd->above_txfm_context,
          sizeof(*xd->above_txfm_context) * mi_width);
   memcpy(ctx->tl, xd->left_txfm_context,
@@ -1463,36 +1555,38 @@
   ctx->p_tl = xd->left_txfm_context;
 }
 
-static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
-                     ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
-                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                     PARTITION_TYPE partition,
-                     const PICK_MODE_CONTEXT *const ctx, int *rate) {
+static AOM_INLINE void encode_b(const AV1_COMP *const cpi,
+                                TileDataEnc *tile_data, ThreadData *td,
+                                TOKENEXTRA **tp, int mi_row, int mi_col,
+                                RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                                PARTITION_TYPE partition,
+                                PICK_MODE_CONTEXT *const ctx, int *rate) {
   TileInfo *const tile = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
 
-  set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  const int origin_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
   MB_MODE_INFO *mbmi = xd->mi[0];
   mbmi->partition = partition;
-  update_state(cpi, tile_data, td, ctx, mi_row, mi_col, bsize, dry_run);
-  if (cpi->oxcf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ &&
-      cpi->oxcf.deltaq_mode == 0) {
-    x->rdmult = x->cb_rdmult;
-  }
-
-  if (!dry_run) av1_set_coeff_buffer(cpi, x, mi_row, mi_col);
-
-  encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize,
-                    rate);
+  update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
 
   if (!dry_run) {
+    x->mbmi_ext_frame->cb_offset = x->cb_offset;
+    assert(x->cb_offset <
+           (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
+  }
+
+  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+
+  if (!dry_run) {
+    const AV1_COMMON *const cm = &cpi->common;
     x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
     if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 &&
-        cpi->common.delta_q_info.delta_lf_present_flag) {
-      const int frame_lf_count = av1_num_planes(&cpi->common) > 1
-                                     ? FRAME_LF_COUNT
-                                     : FRAME_LF_COUNT - 2;
+        cm->delta_q_info.delta_lf_present_flag) {
+      const int frame_lf_count =
+          av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
       for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
         mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
       mbmi->delta_lf_from_base = xd->delta_lf_from_base;
@@ -1504,17 +1598,107 @@
       else
         mbmi->comp_group_idx = 1;
     }
-    update_stats(&cpi->common, tile_data, td, mi_row, mi_col);
+
+    // delta quant applies to both intra and inter
+    const int super_block_upper_left =
+        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+    const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+    if (delta_q_info->delta_q_present_flag &&
+        (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
+        super_block_upper_left) {
+      xd->current_qindex = mbmi->current_qindex;
+      if (delta_q_info->delta_lf_present_flag) {
+        if (delta_q_info->delta_lf_multi) {
+          const int frame_lf_count =
+              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+          }
+        } else {
+          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+        }
+      }
+    }
+
+    RD_COUNTS *rdc = &td->rd_counts;
+    if (mbmi->skip_mode) {
+      assert(!frame_is_intra_only(cm));
+      rdc->skip_mode_used_flag = 1;
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+        assert(has_second_ref(mbmi));
+        rdc->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    } else {
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active) {
+        // If the segment reference feature is enabled we have only a single
+        // reference frame allowed for the segment so exclude it from
+        // the reference frame counts used to work out probabilities.
+        if (is_inter_block(mbmi)) {
+          av1_collect_neighbors_ref_counts(xd);
+          if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+            if (has_second_ref(mbmi)) {
+              // This flag is also updated for 4x4 blocks
+              rdc->compound_ref_used_flag = 1;
+            }
+          }
+          set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        }
+      }
+    }
+
+    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+
+    // Gather obmc and warped motion count to update the probability.
+    if ((!cpi->sf.inter_sf.disable_obmc &&
+         cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) ||
+        (cm->features.allow_warped_motion &&
+         cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) {
+      const int inter_block = is_inter_block(mbmi);
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active && inter_block) {
+        const MOTION_MODE motion_allowed =
+            cm->features.switchable_motion_mode
+                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                      cm->features.allow_warped_motion)
+                : SIMPLE_TRANSLATION;
+
+        if (mbmi->ref_frame[1] != INTRA_FRAME) {
+          if (motion_allowed >= OBMC_CAUSAL) {
+            td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+          }
+          if (motion_allowed == WARPED_CAUSAL) {
+            td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++;
+          }
+        }
+      }
+    }
   }
+  // TODO(Ravi/Remya): Move this copy function to a better logical place
+  // This function will copy the best mode information from block
+  // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+  // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+  // bitstream preparation.
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  x->rdmult = origin_mult;
 }
 
-static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
-                      TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row,
-                      int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                      PC_TREE *pc_tree, int *rate) {
+static AOM_INLINE void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+                                 TileDataEnc *tile_data, TOKENEXTRA **tp,
+                                 int mi_row, int mi_col, RUN_TYPE dry_run,
+                                 BLOCK_SIZE bsize, PC_TREE *pc_tree,
+                                 int *rate) {
+  assert(bsize < BLOCK_SIZES_ALL);
   const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  assert(bsize < BLOCK_SIZES_ALL);
   const int hbs = mi_size_wide[bsize] / 2;
   const int is_partition_root = bsize >= BLOCK_8X8;
   const int ctx = is_partition_root
@@ -1526,11 +1710,11 @@
   int i;
   BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
   if (!dry_run && ctx >= 0) {
-    const int has_rows = (mi_row + hbs) < cm->mi_rows;
-    const int has_cols = (mi_col + hbs) < cm->mi_cols;
+    const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
+    const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
 
     if (has_rows && has_cols) {
 #if CONFIG_ENTROPY_STATS
@@ -1553,7 +1737,7 @@
     case PARTITION_VERT:
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
                partition, &pc_tree->vertical[0], rate);
-      if (mi_col + hbs < cm->mi_cols) {
+      if (mi_col + hbs < mi_params->mi_cols) {
         encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
                  partition, &pc_tree->vertical[1], rate);
       }
@@ -1561,7 +1745,7 @@
     case PARTITION_HORZ:
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
                partition, &pc_tree->horizontal[0], rate);
-      if (mi_row + hbs < cm->mi_rows) {
+      if (mi_row + hbs < mi_params->mi_rows) {
         encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
                  partition, &pc_tree->horizontal[1], rate);
       }
@@ -1613,7 +1797,7 @@
     case PARTITION_HORZ_4:
       for (i = 0; i < 4; ++i) {
         int this_mi_row = mi_row + i * quarter_step;
-        if (i > 0 && this_mi_row >= cm->mi_rows) break;
+        if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
 
         encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
                  partition, &pc_tree->horizontal4[i], rate);
@@ -1622,8 +1806,7 @@
     case PARTITION_VERT_4:
       for (i = 0; i < 4; ++i) {
         int this_mi_col = mi_col + i * quarter_step;
-        if (i > 0 && this_mi_col >= cm->mi_cols) break;
-
+        if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
         encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
                  partition, &pc_tree->vertical4[i], rate);
       }
@@ -1634,19 +1817,19 @@
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
-static void set_partial_sb_partition(const AV1_COMMON *const cm,
-                                     MB_MODE_INFO *mi, int bh_in, int bw_in,
-                                     int mi_rows_remaining,
-                                     int mi_cols_remaining, BLOCK_SIZE bsize,
-                                     MB_MODE_INFO **mib) {
+static AOM_INLINE void set_partial_sb_partition(
+    const AV1_COMMON *const cm, MB_MODE_INFO *mi, int bh_in, int bw_in,
+    int mi_rows_remaining, int mi_cols_remaining, BLOCK_SIZE bsize,
+    MB_MODE_INFO **mib) {
   int bh = bh_in;
   int r, c;
   for (r = 0; r < cm->seq_params.mib_size; r += bh) {
     int bw = bw_in;
     for (c = 0; c < cm->seq_params.mib_size; c += bw) {
-      const int index = r * cm->mi_stride + c;
-      mib[index] = mi + index;
-      mib[index]->sb_type = find_partition_size(
+      const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
+      const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
+      mib[grid_index] = mi + mi_index;
+      mib[grid_index]->sb_type = find_partition_size(
           bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
     }
   }
@@ -1657,28 +1840,34 @@
 // However, at the bottom and right borders of the image the requested size
 // may not be allowed in which case this code attempts to choose the largest
 // allowable partition.
-static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
-                                   MB_MODE_INFO **mib, int mi_row, int mi_col,
-                                   BLOCK_SIZE bsize) {
+static AOM_INLINE void set_fixed_partitioning(AV1_COMP *cpi,
+                                              const TileInfo *const tile,
+                                              MB_MODE_INFO **mib, int mi_row,
+                                              int mi_col, BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int mi_rows_remaining = tile->mi_row_end - mi_row;
   const int mi_cols_remaining = tile->mi_col_end - mi_col;
-  int block_row, block_col;
-  MB_MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col;
+  MB_MODE_INFO *const mi_upper_left =
+      mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col);
   int bh = mi_size_high[bsize];
   int bw = mi_size_wide[bsize];
 
+  assert(bsize >= mi_params->mi_alloc_bsize &&
+         "Attempted to use bsize < mi_params->mi_alloc_bsize");
   assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
 
   // Apply the requested partition size to the SB if it is all "in image"
   if ((mi_cols_remaining >= cm->seq_params.mib_size) &&
       (mi_rows_remaining >= cm->seq_params.mib_size)) {
-    for (block_row = 0; block_row < cm->seq_params.mib_size; block_row += bh) {
-      for (block_col = 0; block_col < cm->seq_params.mib_size;
+    for (int block_row = 0; block_row < cm->seq_params.mib_size;
+         block_row += bh) {
+      for (int block_col = 0; block_col < cm->seq_params.mib_size;
            block_col += bw) {
-        int index = block_row * cm->mi_stride + block_col;
-        mib[index] = mi_upper_left + index;
-        mib[index]->sb_type = bsize;
+        const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col);
+        const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col);
+        mib[grid_index] = mi_upper_left + mi_index;
+        mib[grid_index]->sb_type = bsize;
       }
     }
   } else {
@@ -1688,12 +1877,12 @@
   }
 }
 
-static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
-                             TileDataEnc *tile_data, MB_MODE_INFO **mib,
-                             TOKENEXTRA **tp, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                             int do_recon, PC_TREE *pc_tree) {
+static AOM_INLINE void rd_use_partition(
+    AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib,
+    TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate,
+    int64_t *dist, int do_recon, PC_TREE *pc_tree) {
   AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
@@ -1709,24 +1898,25 @@
                            : PARTITION_NONE;
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
   RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  RD_STATS last_part_rdc, none_rdc, chosen_rdc;
+  RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mib[0]->sb_type;
-  int do_partition_search = 1;
   PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
 
   av1_invalid_rd_stats(&last_part_rdc);
   av1_invalid_rd_stats(&none_rdc);
   av1_invalid_rd_stats(&chosen_rdc);
+  av1_invalid_rd_stats(&invalid_rdc);
 
   pc_tree->partitioning = partition;
 
-  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
   save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
@@ -1736,16 +1926,22 @@
     x->mb_energy = av1_log_block_var(cpi, x, bsize);
   }
 
-  if (do_partition_search &&
-      cpi->sf.partition_search_type == SEARCH_PARTITION &&
-      cpi->sf.adjust_partitioning_from_last_frame) {
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+      (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 2 ||
+       (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
+        cm->quant_params.base_qindex > 190 && bsize <= BLOCK_32X32 &&
+        !frame_is_intra_only(cm)))) {
     // Check if any of the sub blocks are further split.
     if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
       sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
       splits_below = 1;
       for (i = 0; i < 4; i++) {
         int jj = i >> 1, ii = i & 0x01;
-        MB_MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
+        MB_MODE_INFO *this_mi = mib[jj * hbs * mi_params->mi_stride + ii * hbs];
         if (this_mi && this_mi->sb_type >= sub_subsize) {
           splits_below = 0;
         }
@@ -1755,10 +1951,11 @@
     // If partition is not none try none unless each of the 4 splits are split
     // even further..
     if (partition != PARTITION_NONE && !splits_below &&
-        mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+        mi_row + hbs < mi_params->mi_rows &&
+        mi_col + hbs < mi_params->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0);
+                    PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD);
 
       if (none_rdc.rate < INT_MAX) {
         none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
@@ -1774,23 +1971,23 @@
   switch (partition) {
     case PARTITION_NONE:
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0);
+                    PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD);
       break;
     case PARTITION_HORZ:
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
-                    0);
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
+                    invalid_rdc, PICK_MODE_RD);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
-          mi_row + hbs < cm->mi_rows) {
+          mi_row + hbs < mi_params->mi_rows) {
         RD_STATS tmp_rdc;
         const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
         av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
-                          mi_col, subsize, NULL);
+        update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+                          NULL);
         pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
                       PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                      INT64_MAX, 0);
+                      invalid_rdc, PICK_MODE_RD);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1802,19 +1999,20 @@
       break;
     case PARTITION_VERT:
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
-                    0);
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rdc,
+                    PICK_MODE_RD);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
-          mi_col + hbs < cm->mi_cols) {
+          mi_col + hbs < mi_params->mi_cols) {
         RD_STATS tmp_rdc;
         const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
         av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
-                          mi_col, subsize, NULL);
+        update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+                          NULL);
         pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
                       PARTITION_VERT, subsize,
-                      &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 0);
+                      &pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc,
+                      PICK_MODE_RD);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1825,6 +2023,11 @@
       }
       break;
     case PARTITION_SPLIT:
+      if (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
+          none_rdc.rate < INT_MAX && none_rdc.skip == 1) {
+        av1_invalid_rd_stats(&last_part_rdc);
+        break;
+      }
       last_part_rdc.rate = 0;
       last_part_rdc.dist = 0;
       last_part_rdc.rdcost = 0;
@@ -1833,12 +2036,13 @@
         int y_idx = (i >> 1) * hbs;
         int jj = i >> 1, ii = i & 0x01;
         RD_STATS tmp_rdc;
-        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+        if ((mi_row + y_idx >= mi_params->mi_rows) ||
+            (mi_col + x_idx >= mi_params->mi_cols))
           continue;
 
         av1_init_rd_stats(&tmp_rdc);
         rd_use_partition(cpi, td, tile_data,
-                         mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+                         mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
                          &tmp_rdc.dist, i != 3, pc_tree->split[i]);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1865,11 +2069,13 @@
         RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
   }
 
-  if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame &&
-      cpi->sf.partition_search_type == SEARCH_PARTITION &&
+  if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+       cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
       partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
-      (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) &&
-      (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) {
+      (mi_row + bs < mi_params->mi_rows ||
+       mi_row + hbs == mi_params->mi_rows) &&
+      (mi_col + bs < mi_params->mi_cols ||
+       mi_col + hbs == mi_params->mi_cols)) {
     BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
     chosen_rdc.rate = 0;
     chosen_rdc.dist = 0;
@@ -1883,14 +2089,15 @@
       int y_idx = (i >> 1) * hbs;
       RD_STATS tmp_rdc;
 
-      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+      if ((mi_row + y_idx >= mi_params->mi_rows) ||
+          (mi_col + x_idx >= mi_params->mi_cols))
         continue;
 
       save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
       pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
                     PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none,
-                    INT64_MAX, 0);
+                    invalid_rdc, PICK_MODE_RD);
 
       restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
       if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1949,130 +2156,256 @@
 
   *rate = chosen_rdc.rate;
   *dist = chosen_rdc.dist;
+  x->rdmult = orig_rdmult;
 }
 
-// TODO(kyslov): now this is very similar to rd_use_partition (except that
-// doesn't do extra search arounf suggested partitioning)
-//               consider passing a flag to select non-rd path (similar to
-//               encode_sb_row)
-static void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
-                                TileDataEnc *tile_data, MB_MODE_INFO **mib,
-                                TOKENEXTRA **tp, int mi_row, int mi_col,
-                                BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                                int do_recon, PC_TREE *pc_tree) {
+static int is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  assert(bsize >= BLOCK_8X8);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  for (int i = 0; i < 4; i++) {
+    int x_idx = (i & 1) * hbs;
+    int y_idx = (i >> 1) * hbs;
+    if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
+        (mi_col + x_idx >= cm->mi_params.mi_cols))
+      return 0;
+    if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
+            PARTITION_NONE &&
+        subsize != BLOCK_8X8)
+      return 0;
+  }
+  return 1;
+}
+
+static AOM_INLINE int do_slipt_check(BLOCK_SIZE bsize) {
+  return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
+}
+
+static AOM_INLINE void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           MB_MODE_INFO **mib, TOKENEXTRA **tp,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize, PC_TREE *pc_tree) {
   AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  // Only square blocks from 8x8 to 128x128 are supported
+  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
   const int bs = mi_size_wide[bsize];
   const int hbs = bs / 2;
-  int i;
-  const int pl = (bsize >= BLOCK_8X8)
-                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                     : 0;
   const PARTITION_TYPE partition =
       (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
                            : PARTITION_NONE;
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  RD_STATS last_part_rdc;
-  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  assert(subsize <= BLOCK_LARGEST);
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  RD_STATS dummy_cost;
+  av1_invalid_rd_stats(&dummy_cost);
+  RD_STATS invalid_rd;
+  av1_invalid_rd_stats(&invalid_rd);
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
 
-  av1_invalid_rd_stats(&last_part_rdc);
-
   pc_tree->partitioning = partition;
 
-  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-    x->mb_energy = av1_log_block_var(cpi, x, bsize);
-  }
 
   switch (partition) {
     case PARTITION_NONE:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 1);
-      break;
-    case PARTITION_HORZ:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
-                    1);
-      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
-          mi_row + hbs < cm->mi_rows) {
-        RD_STATS tmp_rdc;
-        const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
-        av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
-                          mi_col, subsize, NULL);
-        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
-                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                      INT64_MAX, 1);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
+      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_slipt_check(bsize) &&
+          !frame_is_intra_only(cm)) {
+        RD_STATS split_rdc, none_rdc, block_rdc;
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+
+        av1_init_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+
+        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
+                      PICK_MODE_NONRD);
+        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        for (int i = 0; i < 4; i++) {
+          av1_invalid_rd_stats(&block_rdc);
+          const int x_idx = (i & 1) * hbs;
+          const int y_idx = (i >> 1) * hbs;
+          if (mi_row + y_idx >= mi_params->mi_rows ||
+              mi_col + x_idx >= mi_params->mi_cols)
+            continue;
+          xd->above_txfm_context =
+              cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+          xd->left_txfm_context =
+              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+          pc_tree->split[i]->partitioning = PARTITION_NONE;
+          pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                        &block_rdc, PARTITION_NONE, subsize,
+                        &pc_tree->split[i]->none, invalid_rd, PICK_MODE_NONRD);
+          split_rdc.rate += block_rdc.rate;
+          split_rdc.dist += block_rdc.dist;
+
+          encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+                   subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
         }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
-        last_part_rdc.rdcost += tmp_rdc.rdcost;
+        split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
+        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          mib[0]->sb_type = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                   &pc_tree->none, NULL);
+        } else {
+          mib[0]->sb_type = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < 4; i++) {
+            const int x_idx = (i & 1) * hbs;
+            const int y_idx = (i >> 1) * hbs;
+            if (mi_row + y_idx >= mi_params->mi_rows ||
+                mi_col + x_idx >= mi_params->mi_cols)
+              continue;
+
+            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+        }
+
+      } else {
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
+                      PICK_MODE_NONRD);
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                 &pc_tree->none, NULL);
       }
       break;
     case PARTITION_VERT:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
-                    1);
-      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
-          mi_col + hbs < cm->mi_cols) {
-        RD_STATS tmp_rdc;
-        const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
-        av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
-                          mi_col, subsize, NULL);
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
-                      PARTITION_VERT, subsize,
-                      &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 1);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
-        }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
-        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rd,
+                    PICK_MODE_NONRD);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+               PARTITION_VERT, &pc_tree->vertical[0], NULL);
+      if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &dummy_cost,
+                      PARTITION_VERT, subsize, &pc_tree->vertical[1],
+                      invalid_rd, PICK_MODE_NONRD);
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
+                 PARTITION_VERT, &pc_tree->vertical[1], NULL);
+      }
+      break;
+    case PARTITION_HORZ:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
+                    invalid_rd, PICK_MODE_NONRD);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+               PARTITION_HORZ, &pc_tree->horizontal[0], NULL);
+
+      if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) {
+        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &dummy_cost,
+                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                      invalid_rd, PICK_MODE_NONRD);
+        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
+                 PARTITION_HORZ, &pc_tree->horizontal[1], NULL);
       }
       break;
     case PARTITION_SPLIT:
-      last_part_rdc.rate = 0;
-      last_part_rdc.dist = 0;
-      last_part_rdc.rdcost = 0;
-      for (i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * hbs;
-        int y_idx = (i >> 1) * hbs;
-        int jj = i >> 1, ii = i & 0x01;
-        RD_STATS tmp_rdc;
-        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
-          continue;
+      if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
+          is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
+          !frame_is_intra_only(cm) && bsize <= BLOCK_32X32) {
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+        RD_STATS split_rdc, none_rdc;
+        av1_invalid_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        xd->above_txfm_context =
+            cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+        xd->left_txfm_context =
+            xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+        pc_tree->partitioning = PARTITION_NONE;
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
+                      PICK_MODE_NONRD);
+        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode != 2 ||
+            none_rdc.skip != 1 || pc_tree->none.mic.mode == NEWMV) {
+          av1_init_rd_stats(&split_rdc);
+          for (int i = 0; i < 4; i++) {
+            RD_STATS block_rdc;
+            av1_invalid_rd_stats(&block_rdc);
+            int x_idx = (i & 1) * hbs;
+            int y_idx = (i >> 1) * hbs;
+            if ((mi_row + y_idx >= mi_params->mi_rows) ||
+                (mi_col + x_idx >= mi_params->mi_cols))
+              continue;
+            xd->above_txfm_context =
+                cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+            xd->left_txfm_context = xd->left_txfm_context_buffer +
+                                    ((mi_row + y_idx) & MAX_MIB_MASK);
+            pc_tree->split[i]->partitioning = PARTITION_NONE;
+            pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                          &block_rdc, PARTITION_NONE, subsize,
+                          &pc_tree->split[i]->none, invalid_rd,
+                          PICK_MODE_NONRD);
+            split_rdc.rate += block_rdc.rate;
+            split_rdc.dist += block_rdc.dist;
 
-        av1_init_rd_stats(&tmp_rdc);
-        nonrd_use_partition(
-            cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
-            mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
-            &tmp_rdc.dist, i != 3, pc_tree->split[i]);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
+            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+          restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+          split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
+          split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
         }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          mib[0]->sb_type = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                   &pc_tree->none, NULL);
+        } else {
+          mib[0]->sb_type = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < 4; i++) {
+            int x_idx = (i & 1) * hbs;
+            int y_idx = (i >> 1) * hbs;
+            if ((mi_row + y_idx >= mi_params->mi_rows) ||
+                (mi_col + x_idx >= mi_params->mi_cols))
+              continue;
+
+            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+        }
+      } else {
+        for (int i = 0; i < 4; i++) {
+          int x_idx = (i & 1) * hbs;
+          int y_idx = (i >> 1) * hbs;
+          int jj = i >> 1, ii = i & 0x01;
+          if ((mi_row + y_idx >= mi_params->mi_rows) ||
+              (mi_col + x_idx >= mi_params->mi_cols))
+            continue;
+          nonrd_use_partition(cpi, td, tile_data,
+                              mib + jj * hbs * mi_params->mi_stride + ii * hbs,
+                              tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                              pc_tree->split[i]);
+        }
       }
       break;
     case PARTITION_VERT_A:
@@ -2084,56 +2417,38 @@
       assert(0 && "Cannot handle extended partition types");
     default: assert(0); break;
   }
-
-  if (last_part_rdc.rate < INT_MAX) {
-    last_part_rdc.rate += x->partition_cost[pl][partition];
-    last_part_rdc.rdcost =
-        RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
-  }
-
-  restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-  // We must have chosen a partitioning and encoding or we'll fail later on.
-  // No other opportunities for success.
-  if (bsize == cm->seq_params.sb_size)
-    assert(last_part_rdc.rate < INT_MAX && last_part_rdc.dist < INT64_MAX);
-
-  if (do_recon) {
-    if (bsize == cm->seq_params.sb_size) {
-      // NOTE: To get estimate for rate due to the tokens, use:
-      // int rate_coeffs = 0;
-      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
-      //           bsize, pc_tree, &rate_coeffs);
-      x->cb_offset = 0;
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
-                pc_tree, NULL);
-    } else {
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-                pc_tree, NULL);
-    }
-  }
-
-  *rate = last_part_rdc.rate;
-  *dist = last_part_rdc.dist;
 }
 
+#if !CONFIG_REALTIME_ONLY
+static const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, int frm) {
+  assert(frm >= 0);
+  if (frm < 0 ||
+      p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
+    return NULL;
+  }
+
+  return &p->stats_buf_ctx->stats_in_start[frm];
+}
 // Checks to see if a super block is on a horizontal image edge.
 // In most cases this is the "real" edge unless there are formatting
 // bars embedded in the stream.
 static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
   int top_edge = 0;
-  int bottom_edge = cpi->common.mi_rows;
+  int bottom_edge = cpi->common.mi_params.mi_rows;
   int is_active_h_edge = 0;
 
   // For two pass account for any formatting bars detected.
-  if (cpi->oxcf.pass == 2) {
-    const TWO_PASS *const twopass = &cpi->twopass;
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
 
     // The inactive region is specified in MBs not mi units.
     // The image edge is in the following MB row.
-    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
 
-    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
     bottom_edge = AOMMAX(top_edge, bottom_edge);
   }
 
@@ -2149,18 +2464,21 @@
 // bars embedded in the stream.
 static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
   int left_edge = 0;
-  int right_edge = cpi->common.mi_cols;
+  int right_edge = cpi->common.mi_params.mi_cols;
   int is_active_v_edge = 0;
 
   // For two pass account for any formatting bars detected.
-  if (cpi->oxcf.pass == 2) {
-    const TWO_PASS *const twopass = &cpi->twopass;
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
 
     // The inactive region is specified in MBs not mi units.
     // The image edge is in the following MB row.
-    left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+    left_edge += (int)(this_frame_stats->inactive_zone_cols * 4);
 
-    right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+    right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4);
     right_edge = AOMMAX(left_edge, right_edge);
   }
 
@@ -2170,6 +2488,7 @@
   }
   return is_active_v_edge;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
@@ -2180,54 +2499,53 @@
   memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
 }
 
+#if !CONFIG_REALTIME_ONLY
 // Try searching for an encoding for the given subblock. Returns zero if the
 // rdcost is already too high (to tell the caller not to bother searching for
 // encodings of further subblocks)
 static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
                            TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last,
                            int mi_row, int mi_col, BLOCK_SIZE subsize,
-                           RD_STATS *best_rdc, RD_STATS *sum_rdc,
-                           RD_STATS *this_rdc, PARTITION_TYPE partition,
+                           RD_STATS best_rdcost, RD_STATS *sum_rdc,
+                           PARTITION_TYPE partition,
                            PICK_MODE_CONTEXT *prev_ctx,
                            PICK_MODE_CONTEXT *this_ctx) {
-#define RTS_X_RATE_NOCOEF_ARG
-#define RTS_MAX_RDCOST best_rdc->rdcost
-
   MACROBLOCK *const x = &td->mb;
+  const int orig_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL);
 
-  if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
+  av1_rd_cost_update(x->rdmult, &best_rdcost);
+  if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
 
-  const int64_t rdcost_remaining = best_rdc->rdcost == INT64_MAX
-                                       ? INT64_MAX
-                                       : (best_rdc->rdcost - sum_rdc->rdcost);
+  RD_STATS rdcost_remaining;
+  av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining);
+  RD_STATS this_rdc;
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition,
+                subsize, this_ctx, rdcost_remaining, PICK_MODE_RD);
 
-  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
-                RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
-                rdcost_remaining, 0);
-
-  if (this_rdc->rate == INT_MAX) {
+  if (this_rdc.rate == INT_MAX) {
     sum_rdc->rdcost = INT64_MAX;
   } else {
-    sum_rdc->rate += this_rdc->rate;
-    sum_rdc->dist += this_rdc->dist;
-    sum_rdc->rdcost += this_rdc->rdcost;
+    sum_rdc->rate += this_rdc.rate;
+    sum_rdc->dist += this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, sum_rdc);
   }
 
-  if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0;
+  if (sum_rdc->rdcost >= best_rdcost.rdcost) {
+    x->rdmult = orig_mult;
+    return 0;
+  }
 
   if (!is_last) {
-    update_state(cpi, tile_data, td, this_ctx, mi_row, mi_col, subsize, 1);
-    encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
-                      subsize, NULL);
+    update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
+    encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
   }
 
+  x->rdmult = orig_mult;
   return 1;
-
-#undef RTS_X_RATE_NOCOEF_ARG
-#undef RTS_MAX_RDCOST
 }
 
-static void rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
+static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
                                TileDataEnc *tile_data, TOKENEXTRA **tp,
                                PC_TREE *pc_tree, RD_STATS *best_rdc,
                                PICK_MODE_CONTEXT ctxs[3],
@@ -2236,63 +2554,38 @@
                                int mi_row0, int mi_col0, BLOCK_SIZE subsize0,
                                int mi_row1, int mi_col1, BLOCK_SIZE subsize1,
                                int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  RD_STATS sum_rdc, this_rdc;
-#define RTP_STX_TRY_ARGS
-  int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const MACROBLOCK *const x = &td->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  RD_STATS sum_rdc;
   av1_init_rd_stats(&sum_rdc);
   sum_rdc.rate = x->partition_cost[pl][partition];
   sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
   if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0,
-                       best_rdc, &sum_rdc, &this_rdc,
-                       RTP_STX_TRY_ARGS partition, ctx, &ctxs[0]))
-    return;
+                       *best_rdc, &sum_rdc, partition, ctx, &ctxs[0]))
+    return false;
 
   if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1,
-                       best_rdc, &sum_rdc, &this_rdc,
-                       RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1]))
-    return;
+                       *best_rdc, &sum_rdc, partition, &ctxs[0], &ctxs[1]))
+    return false;
 
-  // With the new layout of mixed partitions for PARTITION_HORZ_B and
-  // PARTITION_VERT_B, the last subblock might start past halfway through the
-  // main block, so we might signal it even though the subblock lies strictly
-  // outside the image. In that case, we won't spend any bits coding it and the
-  // difference (obviously) doesn't contribute to the error.
-  const int try_block2 = 1;
-  if (try_block2 &&
-      !rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
-                       best_rdc, &sum_rdc, &this_rdc,
-                       RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2]))
-    return;
+  if (!rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
+                       *best_rdc, &sum_rdc, partition, &ctxs[1], &ctxs[2]))
+    return false;
 
-  if (sum_rdc.rdcost >= best_rdc->rdcost) return;
-
+  av1_rd_cost_update(x->rdmult, &sum_rdc);
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
   sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-
-  if (sum_rdc.rdcost >= best_rdc->rdcost) return;
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
 
   *best_rdc = sum_rdc;
   pc_tree->partitioning = partition;
-
-#undef RTP_STX_TRY_ARGS
+  return true;
 }
 
-static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+static AOM_INLINE void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
   pc_tree->partitioning = PARTITION_NONE;
-  pc_tree->cb_search_range = SEARCH_FULL_PLANE;
-  pc_tree->none.skip = 0;
-
-  pc_tree->pc_tree_stats.valid = 0;
-  pc_tree->pc_tree_stats.split = 0;
-  pc_tree->pc_tree_stats.skip = 0;
-  pc_tree->pc_tree_stats.rdcost = INT64_MAX;
-
-  for (int i = 0; i < 4; i++) {
-    pc_tree->pc_tree_stats.sub_block_split[i] = 0;
-    pc_tree->pc_tree_stats.sub_block_skip[i] = 0;
-    pc_tree->pc_tree_stats.sub_block_rdcost[i] = INT64_MAX;
-  }
+  pc_tree->none.rd_stats.skip = 0;
 
   if (bsize >= BLOCK_8X8) {
     BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
@@ -2301,793 +2594,12 @@
   }
 }
 
-static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
-                                  TileDataEnc *tile_data, TOKENEXTRA **tp,
-                                  int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                  RD_STATS *rd_cost, int64_t best_rd,
-                                  PC_TREE *pc_tree, int64_t *none_rd) {
-  const AV1_COMMON *const cm = &cpi->common;
-  TileInfo *const tile_info = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int mi_step = mi_size_wide[bsize] / 2;
-  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  const TOKENEXTRA *const tp_orig = *tp;
-  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
-  int tmp_partition_cost[PARTITION_TYPES];
-  BLOCK_SIZE subsize;
-  RD_STATS this_rdc, sum_rdc, best_rdc, pn_rdc;
-  const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
-  int do_square_split = bsize_at_least_8x8;
-  const int pl = bsize_at_least_8x8
-                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                     : 0;
-  const int *partition_cost =
-      pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
-  const int num_planes = av1_num_planes(cm);
-
-  int64_t split_rd[4] = { 0, 0, 0, 0 };
-
-  // Override skipping rectangular partition operations for edge blocks
-  const int has_rows = (mi_row + mi_step < cm->mi_rows);
-  const int has_cols = (mi_col + mi_step < cm->mi_cols);
-
-  if (none_rd) *none_rd = 0;
-
-  int partition_none_allowed = has_rows && has_cols;
-
-  (void)*tp_orig;
-  (void)split_rd;
-
-  if (best_rd < 0) {
-    pc_tree->none.rdcost = INT64_MAX;
-    pc_tree->none.skip = 0;
-    av1_invalid_rd_stats(rd_cost);
-    return;
-  }
-  pc_tree->pc_tree_stats.valid = 1;
-
-  // Override partition costs at the edges of the frame in the same
-  // way as in read_partition (see decodeframe.c)
-  if (!(has_rows && has_cols)) {
-    assert(bsize_at_least_8x8 && pl >= 0);
-    const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
-    for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX;
-    if (has_cols) {
-      // At the bottom, the two possibilities are HORZ and SPLIT
-      aom_cdf_prob bot_cdf[2];
-      partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
-      static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
-      av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
-    } else if (has_rows) {
-      // At the right, the two possibilities are VERT and SPLIT
-      aom_cdf_prob rhs_cdf[2];
-      partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
-      static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
-      av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
-    } else {
-      // At the bottom right, we always split
-      tmp_partition_cost[PARTITION_SPLIT] = 0;
-    }
-
-    partition_cost = tmp_partition_cost;
-  }
-
-#ifndef NDEBUG
-  // Nothing should rely on the default value of this array (which is just
-  // leftover from encoding the previous block. Setting it to fixed pattern
-  // when debugging.
-  // bit 0, 1, 2 are blk_skip of each plane
-  // bit 4, 5, 6 are initialization checking of each plane
-  memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
-#endif  // NDEBUG
-
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  av1_init_rd_stats(&this_rdc);
-  av1_init_rd_stats(&sum_rdc);
-  av1_invalid_rd_stats(&best_rdc);
-  best_rdc.rdcost = best_rd;
-
-  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-
-  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
-    x->mb_energy = av1_log_block_var(cpi, x, bsize);
-
-  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8) {
-    if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
-      do_square_split = 0;
-  }
-#endif
-
-  // PARTITION_NONE
-  if (partition_none_allowed) {
-    int pt_cost = 0;
-    if (bsize_at_least_8x8) {
-      pc_tree->partitioning = PARTITION_NONE;
-      pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
-                    ? partition_cost[PARTITION_NONE]
-                    : 0;
-    }
-    const int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
-    const int64_t best_remain_rdcost =
-        best_rdc.rdcost == INT64_MAX ? INT64_MAX
-                                     : (best_rdc.rdcost - partition_rd_cost);
-    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
-                  bsize, ctx_none, best_remain_rdcost, 0);
-
-    pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
-    pc_tree->pc_tree_stats.skip = ctx_none->skip;
-
-    if (none_rd) *none_rd = this_rdc.rdcost;
-    if (this_rdc.rate != INT_MAX) {
-      if (bsize_at_least_8x8) {
-        this_rdc.rate += pt_cost;
-        this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-      }
-
-      if (this_rdc.rdcost < best_rdc.rdcost) {
-        // Adjust dist breakout threshold according to the partition size.
-        const int64_t dist_breakout_thr =
-            cpi->sf.partition_search_breakout_dist_thr >>
-            ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
-             (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
-        const int rate_breakout_thr =
-            cpi->sf.partition_search_breakout_rate_thr *
-            num_pels_log2_lookup[bsize];
-
-        best_rdc = this_rdc;
-        if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
-
-        pc_tree->cb_search_range = SEARCH_FULL_PLANE;
-
-        if (!x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
-          const int use_ml_based_breakout =
-              bsize <= cpi->sf.use_square_partition_only_threshold &&
-              bsize > BLOCK_4X4 && xd->bd == 8;
-
-          // TODO(anyone): Currently this is using the same model and threshold
-          // values as in rd_pick_partition. Retraining the model and tuning the
-          // threshold values might be helpful to improve the speed.
-          if (use_ml_based_breakout) {
-            if (ml_predict_breakout(cpi, bsize, x, &this_rdc,
-                                    x->source_variance)) {
-              do_square_split = 0;
-            }
-          }
-
-          // If all y, u, v transform blocks in this partition are skippable,
-          // and the dist & rate are within the thresholds, the partition search
-          // is terminated for current branch of the partition search tree. The
-          // dist & rate thresholds are set to 0 at speed 0 to disable the early
-          // termination at that speed.
-          if (best_rdc.dist < dist_breakout_thr &&
-              best_rdc.rate < rate_breakout_thr) {
-            do_square_split = 0;
-          }
-        }
-
-        if (cpi->sf.firstpass_simple_motion_search_early_term &&
-            cm->show_frame && bsize <= BLOCK_32X32 && bsize >= BLOCK_8X8 &&
-            !frame_is_intra_only(cm) && mi_row + mi_step < cm->mi_rows &&
-            mi_col + mi_step < cm->mi_cols && this_rdc.rdcost < INT64_MAX &&
-            this_rdc.rdcost >= 0 && this_rdc.rate < INT_MAX &&
-            this_rdc.rate >= 0 && do_square_split) {
-          av1_firstpass_simple_motion_search_early_term(
-              cpi, x, pc_tree, mi_row, mi_col, bsize, &this_rdc,
-              &do_square_split);
-        }
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // store estimated motion vector
-  if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
-
-  int64_t temp_best_rdcost = best_rdc.rdcost;
-  pn_rdc = best_rdc;
-
-  // PARTITION_SPLIT
-  if (do_square_split) {
-    int reached_last_index = 0;
-    subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    int idx;
-
-    sum_rdc.rate = partition_cost[PARTITION_SPLIT];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-
-    for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) {
-      const int x_idx = (idx & 1) * mi_step;
-      const int y_idx = (idx >> 1) * mi_step;
-
-      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
-        continue;
-
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-
-      pc_tree->split[idx]->index = idx;
-      int64_t *p_split_rd = &split_rd[idx];
-      const int64_t best_remain_rdcost =
-          (temp_best_rdcost == INT64_MAX) ? INT64_MAX
-                                          : (temp_best_rdcost - sum_rdc.rdcost);
-      rd_pick_sqr_partition(
-          cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
-          &this_rdc, best_remain_rdcost, pc_tree->split[idx], p_split_rd);
-
-      pc_tree->pc_tree_stats.sub_block_rdcost[idx] = this_rdc.rdcost;
-      pc_tree->pc_tree_stats.sub_block_skip[idx] =
-          pc_tree->split[idx]->none.skip;
-
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
-        break;
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
-      }
-    }
-    reached_last_index = (idx == 4);
-
-    if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        pc_tree->partitioning = PARTITION_SPLIT;
-      }
-    }
-
-    int has_split = 0;
-    if (pc_tree->partitioning == PARTITION_SPLIT) {
-      for (int cb_idx = 0; cb_idx <= AOMMIN(idx, 3); ++cb_idx) {
-        if (pc_tree->split[cb_idx]->partitioning == PARTITION_SPLIT)
-          ++has_split;
-      }
-
-      if (has_split >= 3 || sum_rdc.rdcost < (pn_rdc.rdcost >> 1)) {
-        pc_tree->cb_search_range = SPLIT_PLANE;
-      }
-    }
-
-    if (pc_tree->partitioning == PARTITION_NONE) {
-      pc_tree->cb_search_range = SEARCH_SAME_PLANE;
-      if (pn_rdc.dist <= sum_rdc.dist)
-        pc_tree->cb_search_range = NONE_PARTITION_PLANE;
-    }
-
-    if (pn_rdc.rate == INT_MAX) pc_tree->cb_search_range = NONE_PARTITION_PLANE;
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }  // if (do_split)
-
-  pc_tree->pc_tree_stats.split = pc_tree->partitioning == PARTITION_SPLIT;
-  if (do_square_split) {
-    for (int i = 0; i < 4; ++i) {
-      pc_tree->pc_tree_stats.sub_block_split[i] =
-          pc_tree->split[i]->partitioning == PARTITION_SPLIT;
-    }
-  }
-
-  // TODO(jbb): This code added so that we avoid static analysis
-  // warning related to the fact that best_rd isn't used after this
-  // point.  This code should be refactored so that the duplicate
-  // checks occur in some sub function and thus are used...
-  (void)best_rd;
-  *rd_cost = best_rdc;
-
-  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
-      pc_tree->index != 3) {
-    if (bsize == cm->seq_params.sb_size) {
-      restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-    } else {
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-                pc_tree, NULL);
-    }
-  }
-
-  if (bsize == cm->seq_params.sb_size) {
-    assert(best_rdc.rate < INT_MAX);
-    assert(best_rdc.dist < INT64_MAX);
-  } else {
-    assert(tp_orig == *tp);
-  }
-}
-
-// split_score indicates confidence of picking split partition;
-// none_score indicates confidence of picking none partition;
-#define FEATURE_SIZE 19
-static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
-                                          BLOCK_SIZE bsize, int *split_score,
-                                          int *none_score) {
-  if (!pc_tree_stats->valid) return 0;
-  const float *split_weights = NULL;
-  const float *none_weights = NULL;
-  switch (bsize) {
-    case BLOCK_4X4: break;
-    case BLOCK_8X8:
-      split_weights = av1_2pass_split_partition_weights_8;
-      none_weights = av1_2pass_none_partition_weights_8;
-      break;
-    case BLOCK_16X16:
-      split_weights = av1_2pass_split_partition_weights_16;
-      none_weights = av1_2pass_none_partition_weights_16;
-      break;
-    case BLOCK_32X32:
-      split_weights = av1_2pass_split_partition_weights_32;
-      none_weights = av1_2pass_none_partition_weights_32;
-      break;
-    case BLOCK_64X64:
-      split_weights = av1_2pass_split_partition_weights_64;
-      none_weights = av1_2pass_none_partition_weights_64;
-      break;
-    case BLOCK_128X128:
-      split_weights = av1_2pass_split_partition_weights_128;
-      none_weights = av1_2pass_none_partition_weights_128;
-      break;
-    default: assert(0 && "Unexpected bsize.");
-  }
-  if (!split_weights || !none_weights) return 0;
-
-  aom_clear_system_state();
-
-  float features[FEATURE_SIZE];
-  int feature_index = 0;
-  features[feature_index++] = (float)pc_tree_stats->split;
-  features[feature_index++] = (float)pc_tree_stats->skip;
-  const int rdcost = (int)AOMMIN(INT_MAX, pc_tree_stats->rdcost);
-  const int rd_valid = rdcost > 0 && rdcost < 1000000000;
-  features[feature_index++] = (float)rd_valid;
-  for (int i = 0; i < 4; ++i) {
-    features[feature_index++] = (float)pc_tree_stats->sub_block_split[i];
-    features[feature_index++] = (float)pc_tree_stats->sub_block_skip[i];
-    const int sub_rdcost =
-        (int)AOMMIN(INT_MAX, pc_tree_stats->sub_block_rdcost[i]);
-    const int sub_rd_valid = sub_rdcost > 0 && sub_rdcost < 1000000000;
-    features[feature_index++] = (float)sub_rd_valid;
-    // Ratio between the sub-block RD and the whole-block RD.
-    float rd_ratio = 1.0f;
-    if (rd_valid && sub_rd_valid && sub_rdcost < rdcost)
-      rd_ratio = (float)sub_rdcost / (float)rdcost;
-    features[feature_index++] = rd_ratio;
-  }
-  assert(feature_index == FEATURE_SIZE);
-
-  float score_1 = split_weights[FEATURE_SIZE];
-  float score_2 = none_weights[FEATURE_SIZE];
-  for (int i = 0; i < FEATURE_SIZE; ++i) {
-    score_1 += features[i] * split_weights[i];
-    score_2 += features[i] * none_weights[i];
-  }
-  *split_score = (int)(score_1 * 100);
-  *none_score = (int)(score_2 * 100);
-  return 1;
-}
-#undef FEATURE_SIZE
-
-static void ml_prune_rect_partition(const AV1_COMP *const cpi,
-                                    const MACROBLOCK *const x, BLOCK_SIZE bsize,
-                                    int64_t best_rd, int64_t none_rd,
-                                    int64_t *split_rd,
-                                    int *const dst_prune_horz,
-                                    int *const dst_prune_vert) {
-  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
-  best_rd = AOMMAX(best_rd, 1);
-  const NN_CONFIG *nn_config = NULL;
-  const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f };
-  float cur_thresh = 0.0f;
-  switch (bsize) {
-    case BLOCK_8X8:
-      nn_config = &av1_rect_partition_nnconfig_8;
-      cur_thresh = prob_thresholds[0];
-      break;
-    case BLOCK_16X16:
-      nn_config = &av1_rect_partition_nnconfig_16;
-      cur_thresh = prob_thresholds[1];
-      break;
-    case BLOCK_32X32:
-      nn_config = &av1_rect_partition_nnconfig_32;
-      cur_thresh = prob_thresholds[2];
-      break;
-    case BLOCK_64X64:
-      nn_config = &av1_rect_partition_nnconfig_64;
-      cur_thresh = prob_thresholds[3];
-      break;
-    case BLOCK_128X128:
-      nn_config = &av1_rect_partition_nnconfig_128;
-      cur_thresh = prob_thresholds[4];
-      break;
-    default: assert(0 && "Unexpected bsize.");
-  }
-  if (!nn_config) return;
-  aom_clear_system_state();
-
-  // 1. Compute input features
-  float features[9];
-
-  // RD cost ratios
-  for (int i = 0; i < 5; i++) features[i] = 1.0f;
-  if (none_rd > 0 && none_rd < 1000000000)
-    features[0] = (float)none_rd / (float)best_rd;
-  for (int i = 0; i < 4; i++) {
-    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
-      features[1 + i] = (float)split_rd[i] / (float)best_rd;
-  }
-
-  // Variance ratios
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  int whole_block_variance;
-  if (is_cur_buf_hbd(xd)) {
-    whole_block_variance = av1_high_get_sby_perpixel_variance(
-        cpi, &x->plane[0].src, bsize, xd->bd);
-  } else {
-    whole_block_variance =
-        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-  }
-  whole_block_variance = AOMMAX(whole_block_variance, 1);
-
-  int split_variance[4];
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-  struct buf_2d buf;
-  buf.stride = x->plane[0].src.stride;
-  const int bw = block_size_wide[bsize];
-  for (int i = 0; i < 4; ++i) {
-    const int x_idx = (i & 1) * bw / 2;
-    const int y_idx = (i >> 1) * bw / 2;
-    buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
-    if (is_cur_buf_hbd(xd)) {
-      split_variance[i] =
-          av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
-    } else {
-      split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize);
-    }
-  }
-
-  for (int i = 0; i < 4; i++)
-    features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
-
-  // 2. Do the prediction and prune 0-2 partitions based on their probabilities
-  float raw_scores[3] = { 0.0f };
-  av1_nn_predict(features, nn_config, raw_scores);
-  aom_clear_system_state();
-  float probs[3] = { 0.0f };
-  av1_nn_softmax(raw_scores, probs, 3);
-
-  // probs[0] is the probability of the fact that both rectangular partitions
-  // are worse than current best_rd
-  if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1;
-  if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1;
-}
-
-// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
-// considered.
-static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
-                                  int64_t best_rd, int64_t horz_rd[2],
-                                  int64_t vert_rd[2], int64_t split_rd[4],
-                                  int *const horza_partition_allowed,
-                                  int *const horzb_partition_allowed,
-                                  int *const verta_partition_allowed,
-                                  int *const vertb_partition_allowed) {
-  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
-  const NN_CONFIG *nn_config = NULL;
-  switch (bsize) {
-    case BLOCK_8X8: nn_config = NULL; break;
-    case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break;
-    case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break;
-    case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break;
-    case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break;
-    default: assert(0 && "Unexpected bsize.");
-  }
-  if (!nn_config) return;
-
-  aom_clear_system_state();
-
-  // Generate features.
-  float features[10];
-  int feature_index = 0;
-  features[feature_index++] = (float)part_ctx;
-  features[feature_index++] = (float)var_ctx;
-  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
-  int sub_block_rdcost[8] = { 0 };
-  int rd_index = 0;
-  for (int i = 0; i < 2; ++i) {
-    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
-      sub_block_rdcost[rd_index] = (int)horz_rd[i];
-    ++rd_index;
-  }
-  for (int i = 0; i < 2; ++i) {
-    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
-      sub_block_rdcost[rd_index] = (int)vert_rd[i];
-    ++rd_index;
-  }
-  for (int i = 0; i < 4; ++i) {
-    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
-      sub_block_rdcost[rd_index] = (int)split_rd[i];
-    ++rd_index;
-  }
-  for (int i = 0; i < 8; ++i) {
-    // Ratio between the sub-block RD and the whole-block RD.
-    float rd_ratio = 1.0f;
-    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
-      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
-    features[feature_index++] = rd_ratio;
-  }
-  assert(feature_index == 10);
-
-  // Calculate scores using the NN model.
-  float score[16] = { 0.0f };
-  av1_nn_predict(features, nn_config, score);
-  aom_clear_system_state();
-  int int_score[16];
-  int max_score = -1000;
-  for (int i = 0; i < 16; ++i) {
-    int_score[i] = (int)(100 * score[i]);
-    max_score = AOMMAX(int_score[i], max_score);
-  }
-
-  // Make decisions based on the model scores.
-  int thresh = max_score;
-  switch (bsize) {
-    case BLOCK_16X16: thresh -= 150; break;
-    case BLOCK_32X32: thresh -= 100; break;
-    default: break;
-  }
-  *horza_partition_allowed = 0;
-  *horzb_partition_allowed = 0;
-  *verta_partition_allowed = 0;
-  *vertb_partition_allowed = 0;
-  for (int i = 0; i < 16; ++i) {
-    if (int_score[i] >= thresh) {
-      if ((i >> 0) & 1) *horza_partition_allowed = 1;
-      if ((i >> 1) & 1) *horzb_partition_allowed = 1;
-      if ((i >> 2) & 1) *verta_partition_allowed = 1;
-      if ((i >> 3) & 1) *vertb_partition_allowed = 1;
-    }
-  }
-}
-
-#define FEATURES 18
-#define LABELS 4
-// Use a ML model to predict if horz4 and vert4 should be considered.
-static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                                 BLOCK_SIZE bsize, int part_ctx,
-                                 int64_t best_rd, int64_t horz_rd[2],
-                                 int64_t vert_rd[2], int64_t split_rd[4],
-                                 int *const partition_horz4_allowed,
-                                 int *const partition_vert4_allowed,
-                                 unsigned int pb_source_variance, int mi_row,
-                                 int mi_col) {
-  if (best_rd >= 1000000000) return;
-  const NN_CONFIG *nn_config = NULL;
-  switch (bsize) {
-    case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
-    case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
-    case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break;
-    default: assert(0 && "Unexpected bsize.");
-  }
-  if (!nn_config) return;
-
-  aom_clear_system_state();
-
-  // Generate features.
-  float features[FEATURES];
-  int feature_index = 0;
-  features[feature_index++] = (float)part_ctx;
-  features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
-
-  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
-  int sub_block_rdcost[8] = { 0 };
-  int rd_index = 0;
-  for (int i = 0; i < 2; ++i) {
-    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
-      sub_block_rdcost[rd_index] = (int)horz_rd[i];
-    ++rd_index;
-  }
-  for (int i = 0; i < 2; ++i) {
-    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
-      sub_block_rdcost[rd_index] = (int)vert_rd[i];
-    ++rd_index;
-  }
-  for (int i = 0; i < 4; ++i) {
-    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
-      sub_block_rdcost[rd_index] = (int)split_rd[i];
-    ++rd_index;
-  }
-  for (int i = 0; i < 8; ++i) {
-    // Ratio between the sub-block RD and the whole-block RD.
-    float rd_ratio = 1.0f;
-    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
-      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
-    features[feature_index++] = rd_ratio;
-  }
-
-  // Get variance of the 1:4 and 4:1 sub-blocks.
-  unsigned int horz_4_source_var[4] = { 0 };
-  unsigned int vert_4_source_var[4] = { 0 };
-  {
-    BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
-    BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
-    av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
-                         av1_num_planes(&cpi->common), bsize);
-    const int src_stride = x->plane[0].src.stride;
-    const uint8_t *src = x->plane[0].src.buf;
-    const MACROBLOCKD *const xd = &x->e_mbd;
-    for (int i = 0; i < 4; ++i) {
-      const uint8_t *horz_src =
-          src + i * block_size_high[horz_4_bs] * src_stride;
-      const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs];
-      unsigned int horz_var, vert_var, sse;
-      if (is_cur_buf_hbd(xd)) {
-        switch (xd->bd) {
-          case 10:
-            horz_var = cpi->fn_ptr[horz_4_bs].vf(
-                horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10),
-                0, &sse);
-            vert_var = cpi->fn_ptr[vert_4_bs].vf(
-                vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10),
-                0, &sse);
-            break;
-          case 12:
-            horz_var = cpi->fn_ptr[horz_4_bs].vf(
-                horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12),
-                0, &sse);
-            vert_var = cpi->fn_ptr[vert_4_bs].vf(
-                vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12),
-                0, &sse);
-            break;
-          case 8:
-          default:
-            horz_var = cpi->fn_ptr[horz_4_bs].vf(
-                horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8),
-                0, &sse);
-            vert_var = cpi->fn_ptr[vert_4_bs].vf(
-                vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8),
-                0, &sse);
-            break;
-        }
-        horz_4_source_var[i] =
-            ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
-        vert_4_source_var[i] =
-            ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
-      } else {
-        horz_var = cpi->fn_ptr[horz_4_bs].vf(horz_src, src_stride, AV1_VAR_OFFS,
-                                             0, &sse);
-        vert_var = cpi->fn_ptr[vert_4_bs].vf(vert_src, src_stride, AV1_VAR_OFFS,
-                                             0, &sse);
-        horz_4_source_var[i] =
-            ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
-        vert_4_source_var[i] =
-            ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
-      }
-    }
-  }
-
-  const float denom = (float)(pb_source_variance + 1);
-  const float low_b = 0.1f;
-  const float high_b = 10.0f;
-  for (int i = 0; i < 4; ++i) {
-    // Ratio between the 4:1 sub-block variance and the whole-block variance.
-    float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
-    if (var_ratio < low_b) var_ratio = low_b;
-    if (var_ratio > high_b) var_ratio = high_b;
-    features[feature_index++] = var_ratio;
-  }
-  for (int i = 0; i < 4; ++i) {
-    // Ratio between the 1:4 sub-block RD and the whole-block RD.
-    float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
-    if (var_ratio < low_b) var_ratio = low_b;
-    if (var_ratio > high_b) var_ratio = high_b;
-    features[feature_index++] = var_ratio;
-  }
-  assert(feature_index == FEATURES);
-
-  // Calculate scores using the NN model.
-  float score[LABELS] = { 0.0f };
-  av1_nn_predict(features, nn_config, score);
-  aom_clear_system_state();
-  int int_score[LABELS];
-  int max_score = -1000;
-  for (int i = 0; i < LABELS; ++i) {
-    int_score[i] = (int)(100 * score[i]);
-    max_score = AOMMAX(int_score[i], max_score);
-  }
-
-  // Make decisions based on the model scores.
-  int thresh = max_score;
-  switch (bsize) {
-    case BLOCK_16X16: thresh -= 500; break;
-    case BLOCK_32X32: thresh -= 500; break;
-    case BLOCK_64X64: thresh -= 200; break;
-    default: break;
-  }
-  *partition_horz4_allowed = 0;
-  *partition_vert4_allowed = 0;
-  for (int i = 0; i < LABELS; ++i) {
-    if (int_score[i] >= thresh) {
-      if ((i >> 0) & 1) *partition_horz4_allowed = 1;
-      if ((i >> 1) & 1) *partition_vert4_allowed = 1;
-    }
-  }
-}
-#undef FEATURES
-#undef LABELS
-
-#define FEATURES 4
-// ML-based partition search breakout.
-static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                               const MACROBLOCK *const x,
-                               const RD_STATS *const rd_stats,
-                               unsigned int pb_source_variance) {
-  const NN_CONFIG *nn_config = NULL;
-  int thresh = 0;
-  switch (bsize) {
-    case BLOCK_8X8:
-      nn_config = &av1_partition_breakout_nnconfig_8;
-      thresh = cpi->sf.ml_partition_search_breakout_thresh[0];
-      break;
-    case BLOCK_16X16:
-      nn_config = &av1_partition_breakout_nnconfig_16;
-      thresh = cpi->sf.ml_partition_search_breakout_thresh[1];
-      break;
-    case BLOCK_32X32:
-      nn_config = &av1_partition_breakout_nnconfig_32;
-      thresh = cpi->sf.ml_partition_search_breakout_thresh[2];
-      break;
-    case BLOCK_64X64:
-      nn_config = &av1_partition_breakout_nnconfig_64;
-      thresh = cpi->sf.ml_partition_search_breakout_thresh[3];
-      break;
-    case BLOCK_128X128:
-      nn_config = &av1_partition_breakout_nnconfig_128;
-      thresh = cpi->sf.ml_partition_search_breakout_thresh[4];
-      break;
-    default: assert(0 && "Unexpected bsize.");
-  }
-  if (!nn_config || thresh < 0) return 0;
-
-  // Generate feature values.
-  float features[FEATURES];
-  int feature_index = 0;
-  aom_clear_system_state();
-
-  const int num_pels_log2 = num_pels_log2_lookup[bsize];
-  float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
-  rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
-           rate_f;
-  features[feature_index++] = rate_f;
-
-  const float dist_f =
-      (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
-  features[feature_index++] = dist_f;
-
-  features[feature_index++] = (float)pb_source_variance;
-
-  const int dc_q = (int)x->plane[0].dequant_QTX[0];
-  features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
-  assert(feature_index == FEATURES);
-
-  // Calculate score using the NN model.
-  float score = 0.0f;
-  av1_nn_predict(features, nn_config, &score);
-  aom_clear_system_state();
-
-  // Make decision.
-  return (int)(score * 100) >= thresh;
-}
-#undef FEATURES
-
 // Record the ref frames that have been selected by square partition blocks.
-static void update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
-                                          BLOCK_SIZE bsize, int mib_size,
-                                          int mi_row, int mi_col) {
+static AOM_INLINE void update_picked_ref_frames_mask(MACROBLOCK *const x,
+                                                     int ref_type,
+                                                     BLOCK_SIZE bsize,
+                                                     int mib_size, int mi_row,
+                                                     int mi_col) {
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
   const int sb_size_mask = mib_size - 1;
   const int mi_row_in_sb = mi_row & sb_size_mask;
@@ -3100,19 +2612,75 @@
   }
 }
 
-// TODO(jinging,jimbankoski,rbultje): properly skip partition types that are
-// unlikely to be selected depending on previous rate-distortion optimization
-// results, for encoding speed-up.
-// TODO(chiyotsai@google.com): Move these ml related varables to a seprate file
-// to separate low level ml logic from partition logic
-#define NUM_SIMPLE_MOTION_FEATURES 28
-static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+// Structure to keep win flags for HORZ and VERT partition evaluations
+typedef struct {
+  bool horz_win;
+  bool vert_win;
+} RD_RECT_PART_WIN_INFO;
+
+// Decide whether to evaluate the AB partition specified by part_type based on
+// split and HORZ/VERT info
+int evaluate_ab_partition_based_on_split(
+    PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+    RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+    int split_idx2) {
+  int num_win = 0;
+  // Threshold for number of winners
+  // Conservative pruning for high quantizers
+  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
+  bool sub_part_win = (rect_part_win_info == NULL)
+                          ? (pc_tree->partitioning == rect_part)
+                          : (rect_part == PARTITION_HORZ)
+                                ? rect_part_win_info->horz_win
+                                : rect_part_win_info->vert_win;
+  num_win += (sub_part_win) ? 1 : 0;
+  num_win +=
+      (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
+  num_win +=
+      (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
+  if (num_win < num_win_thresh) {
+    return 0;
+  }
+  return 1;
+}
+
+// Searches for the best partition pattern for a block based on the
+// rate-distortion cost, and returns a bool value to indicate whether a valid
+// partition pattern is found. The partition can recursively go down to
+// the smallest block size.
+//
+// Inputs:
+//     cpi: the global compressor setting
+//     td: thread data
+//     tile_data: tile data
+//     tp: the pointer to the start token
+//     mi_row: row coordinate of the block in a step size of MI_SIZE
+//     mi_col: column coordinate of the block in a step size of MI_SIZE
+//     bsize: block size
+//     max_sq_part: the largest square block size for prediction blocks
+//     min_sq_part: the smallest square block size for prediction blocks
+//     rd_cost: the pointer to the final rd cost of the current block
+//     best_rdc: the upper bound of rd cost for a valid partition
+//     pc_tree: the pointer to the PC_TREE node storing the picked partitions
+//              and mode info for the current block
+//     none_rd: the pointer to the rd cost in the case of not splitting the
+//              current block
+//     multi_pass_mode: SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
+//     rect_part_win_info: the pointer to a struct storing whether horz/vert
+//                         partition outperforms previously tested partitions
+//
+// Output:
+//     a bool value indicating whether a valid partition is found
+static bool rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
                               TileDataEnc *tile_data, TOKENEXTRA **tp,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
                               BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part,
-                              RD_STATS *rd_cost, int64_t best_rd,
-                              PC_TREE *pc_tree, int64_t *none_rd) {
+                              RD_STATS *rd_cost, RD_STATS best_rdc,
+                              PC_TREE *pc_tree, int64_t *none_rd,
+                              SB_MULTI_PASS_MODE multi_pass_mode,
+                              RD_RECT_PART_WIN_INFO *rect_part_win_info) {
   const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
@@ -3123,14 +2691,13 @@
   PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
   int tmp_partition_cost[PARTITION_TYPES];
   BLOCK_SIZE subsize;
-  RD_STATS this_rdc, sum_rdc, best_rdc;
+  RD_STATS this_rdc, sum_rdc;
   const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
   int do_square_split = bsize_at_least_8x8;
   const int pl = bsize_at_least_8x8
                      ? partition_plane_context(xd, mi_row, mi_col, bsize)
                      : 0;
-  const int *partition_cost =
-      pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
+  const int *partition_cost = x->partition_cost[pl];
 
   int do_rectangular_split = cpi->oxcf.enable_rect_partitions;
   int64_t cur_none_rd = 0;
@@ -3145,27 +2712,40 @@
   int horz_ctx_is_ready = 0;
   int vert_ctx_is_ready = 0;
   BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+  // Initialise HORZ and VERT win flags as true for all split partitions
+  RD_RECT_PART_WIN_INFO split_part_rect_win[4] = {
+    { true, true }, { true, true }, { true, true }, { true, true }
+  };
 
-  if (best_rd < 0) {
-    pc_tree->none.rdcost = INT64_MAX;
-    pc_tree->none.skip = 0;
+  bool found_best_partition = false;
+  if (best_rdc.rdcost < 0) {
     av1_invalid_rd_stats(rd_cost);
-    return;
+    return found_best_partition;
   }
+
+  if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+    x->quad_tree_idx = 0;
+    x->cnn_output_valid = 0;
+  }
+
   if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
 
   // Override skipping rectangular partition operations for edge blocks
-  const int has_rows = (mi_row + mi_step < cm->mi_rows);
-  const int has_cols = (mi_col + mi_step < cm->mi_cols);
+  const int has_rows = (mi_row + mi_step < mi_params->mi_rows);
+  const int has_cols = (mi_col + mi_step < mi_params->mi_cols);
   const int xss = x->e_mbd.plane[1].subsampling_x;
   const int yss = x->e_mbd.plane[1].subsampling_y;
 
   if (none_rd) *none_rd = 0;
   int partition_none_allowed = has_rows && has_cols;
-  int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
-                               cpi->oxcf.enable_rect_partitions;
-  int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
-                               cpi->oxcf.enable_rect_partitions;
+  int partition_horz_allowed =
+      has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss,
+                           yss) != BLOCK_INVALID;
+  int partition_vert_allowed =
+      has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss,
+                           yss) != BLOCK_INVALID;
 
   (void)*tp_orig;
 
@@ -3185,7 +2765,8 @@
   if (!(has_rows && has_cols)) {
     assert(bsize_at_least_8x8 && pl >= 0);
     const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
-    for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX;
+    const int max_cost = av1_cost_symbol(0);
+    for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = max_cost;
     if (has_cols) {
       // At the bottom, the two possibilities are HORZ and SPLIT
       aom_cdf_prob bot_cdf[2];
@@ -3204,7 +2785,6 @@
     }
 
     partition_cost = tmp_partition_cost;
-    do_square_split &= partition_cost[PARTITION_SPLIT] != INT_MAX;
   }
 
 #ifndef NDEBUG
@@ -3219,107 +2799,73 @@
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
 
   av1_init_rd_stats(&this_rdc);
-  av1_invalid_rd_stats(&best_rdc);
-  best_rdc.rdcost = best_rd;
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
 
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  av1_rd_cost_update(x->rdmult, &best_rdc);
+
   if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
     x->mb_energy = av1_log_block_var(cpi, x, bsize);
 
-  if (bsize > cpi->sf.use_square_partition_only_threshold) {
+  if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
     partition_horz_allowed &= !has_rows;
     partition_vert_allowed &= !has_cols;
   }
 
-  if (bsize > BLOCK_4X4 && x->use_cb_search_range) {
-    int split_score = 0;
-    int none_score = 0;
-    const int score_valid = ml_prune_2pass_split_partition(
-        &pc_tree->pc_tree_stats, bsize, &split_score, &none_score);
-    if (score_valid) {
-      {
-        const int only_split_thresh = 300;
-        const int no_none_thresh = 250;
-        const int no_split_thresh = 0;
-        if (split_score > only_split_thresh) {
-          partition_none_allowed = 0;
-          partition_horz_allowed = 0;
-          partition_vert_allowed = 0;
-        } else if (split_score > no_none_thresh) {
-          partition_none_allowed = 0;
-        }
-        if (split_score < no_split_thresh) do_square_split = 0;
-      }
-      {
-        const int no_split_thresh = 120;
-        const int no_none_thresh = -120;
-        if (none_score > no_split_thresh && partition_none_allowed)
-          do_square_split = 0;
-        if (none_score < no_none_thresh) partition_none_allowed = 0;
-      }
-    } else {
-      if (pc_tree->cb_search_range == SPLIT_PLANE) {
-        partition_none_allowed = 0;
-        partition_horz_allowed = 0;
-        partition_vert_allowed = 0;
-      }
-      if (pc_tree->cb_search_range == SEARCH_SAME_PLANE) do_square_split = 0;
-      if (pc_tree->cb_search_range == NONE_PARTITION_PLANE) {
-        do_square_split = 0;
-        partition_horz_allowed = 0;
-        partition_vert_allowed = 0;
-      }
-    }
-
-    // Fall back to default values in case all partition modes are rejected.
-    if (partition_none_allowed == 0 && do_square_split == 0 &&
-        partition_horz_allowed == 0 && partition_vert_allowed == 0) {
-      do_square_split = bsize_at_least_8x8;
-      partition_none_allowed = has_rows && has_cols;
-      partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
-                               cpi->oxcf.enable_rect_partitions;
-      partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
-                               cpi->oxcf.enable_rect_partitions;
-    }
-  }
-
-  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
   save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
+  const int try_intra_cnn_split =
+      !cpi->is_screen_content_type && frame_is_intra_only(cm) &&
+      cpi->sf.part_sf.intra_cnn_split &&
+      cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+      bsize >= BLOCK_8X8 &&
+      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+
+  if (try_intra_cnn_split) {
+    av1_intra_mode_cnn_partition(
+        &cpi->common, x, bsize, x->quad_tree_idx, &partition_none_allowed,
+        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
+        &do_square_split);
+  }
+
   // Use simple_motion_search to prune partitions. This must be done prior to
   // PARTITION_SPLIT to propagate the initial mvs to a smaller blocksize.
   const int try_split_only =
-      cpi->sf.simple_motion_search_split_only && bsize >= BLOCK_8X8 &&
-      do_square_split && mi_row + mi_size_high[bsize] <= cm->mi_rows &&
-      mi_col + mi_size_wide[bsize] <= cm->mi_cols && !frame_is_intra_only(cm) &&
-      !av1_superres_scaled(cm);
+      !cpi->is_screen_content_type &&
+      cpi->sf.part_sf.simple_motion_search_split && do_square_split &&
+      bsize >= BLOCK_8X8 &&
+      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols &&
+      !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
 
   if (try_split_only) {
     av1_simple_motion_search_based_split(
-        cpi, x, mi_row, mi_col, bsize, &partition_none_allowed,
+        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed,
         &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
         &do_square_split);
   }
 
   const int try_prune_rect =
-      cpi->sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) &&
-      do_rectangular_split &&
+      !cpi->is_screen_content_type &&
+      cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) && do_rectangular_split &&
       (do_square_split || partition_none_allowed ||
        (prune_horz && prune_vert)) &&
       (partition_horz_allowed || partition_vert_allowed) && bsize >= BLOCK_8X8;
 
-  float simple_motion_features[NUM_SIMPLE_MOTION_FEATURES] = { 0.0f };
-  int simple_motion_features_are_valid = 0;
-
   if (try_prune_rect) {
-    av1_simple_motion_search_prune_part(
-        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed,
-        &partition_horz_allowed, &partition_vert_allowed, &do_square_split,
-        &do_rectangular_split, &prune_horz, &prune_vert, simple_motion_features,
-        &simple_motion_features_are_valid);
+    av1_simple_motion_search_prune_rect(
+        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_horz_allowed,
+        &partition_vert_allowed, &prune_horz, &prune_vert);
   }
 
   // Max and min square partition levels are defined as the partition nodes that
@@ -3352,17 +2898,22 @@
     if (has_rows && has_cols) do_square_split = 0;
     partition_none_allowed = !do_square_split;
   }
-  do_square_split &= partition_cost[PARTITION_SPLIT] != INT_MAX;
 
 BEGIN_PARTITION_SEARCH:
   if (x->must_find_valid_partition) {
-    do_square_split =
-        bsize_at_least_8x8 && partition_cost[PARTITION_SPLIT] != INT_MAX;
-    partition_none_allowed = has_rows && has_cols;
-    partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
-                             cpi->oxcf.enable_rect_partitions;
-    partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
-                             cpi->oxcf.enable_rect_partitions;
+    do_square_split = bsize_at_least_8x8 && (blksize > min_partition_size);
+    partition_none_allowed =
+        has_rows && has_cols && (blksize >= min_partition_size);
+    partition_horz_allowed =
+        has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
+        (blksize > min_partition_size) &&
+        get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss,
+                             yss) != BLOCK_INVALID;
+    partition_vert_allowed =
+        has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
+        (blksize > min_partition_size) &&
+        get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss,
+                             yss) != BLOCK_INVALID;
     terminate_partition_search = 0;
   }
 
@@ -3374,29 +2925,26 @@
   unsigned int pb_simple_motion_pred_sse = UINT_MAX;
   (void)pb_simple_motion_pred_sse;
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8) {
-    if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
-    if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0;
-    if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
-      do_square_split = 0;
-  }
-#endif
-
   // PARTITION_NONE
   if (is_le_min_sq_part && has_rows && has_cols) partition_none_allowed = 1;
-  if (!terminate_partition_search && partition_none_allowed &&
-      !is_gt_max_sq_part) {
+  assert(terminate_partition_search == 0);
+  int64_t part_none_rd = INT64_MAX;
+  if (cpi->is_screen_content_type)
+    partition_none_allowed = has_rows && has_cols;
+  if (partition_none_allowed && !is_gt_max_sq_part) {
     int pt_cost = 0;
     if (bsize_at_least_8x8) {
       pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
                     ? partition_cost[PARTITION_NONE]
                     : 0;
     }
-    const int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
-    const int64_t best_remain_rdcost =
-        (best_rdc.rdcost == INT64_MAX) ? INT64_MAX
-                                       : (best_rdc.rdcost - partition_rd_cost);
+    RD_STATS partition_rdcost;
+    av1_init_rd_stats(&partition_rdcost);
+    partition_rdcost.rate = pt_cost;
+    av1_rd_cost_update(x->rdmult, &partition_rdcost);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &partition_rdcost,
+                             &best_remain_rdcost);
 #if CONFIG_COLLECT_PARTITION_STATS
     if (best_remain_rdcost >= 0) {
       partition_attempts[PARTITION_NONE] += 1;
@@ -3405,7 +2953,8 @@
     }
 #endif
     pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
-                  bsize, ctx_none, best_remain_rdcost, 0);
+                  bsize, ctx_none, best_remain_rdcost, PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
 #if CONFIG_COLLECT_PARTITION_STATS
     if (partition_timer_on) {
       aom_usec_timer_mark(&partition_timer);
@@ -3419,7 +2968,7 @@
     if (none_rd) *none_rd = this_rdc.rdcost;
     cur_none_rd = this_rdc.rdcost;
     if (this_rdc.rate != INT_MAX) {
-      if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+      if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
         const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
         update_picked_ref_frames_mask(x, ref_type, bsize,
                                       cm->seq_params.mib_size, mi_row, mi_col);
@@ -3429,27 +2978,30 @@
         this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
       }
 
+      part_none_rd = this_rdc.rdcost;
       if (this_rdc.rdcost < best_rdc.rdcost) {
         // Adjust dist breakout threshold according to the partition size.
         const int64_t dist_breakout_thr =
-            cpi->sf.partition_search_breakout_dist_thr >>
+            cpi->sf.part_sf.partition_search_breakout_dist_thr >>
             ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
              (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
         const int rate_breakout_thr =
-            cpi->sf.partition_search_breakout_rate_thr *
+            cpi->sf.part_sf.partition_search_breakout_rate_thr *
             num_pels_log2_lookup[bsize];
 
         best_rdc = this_rdc;
+        found_best_partition = true;
         if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
 
-        if ((do_square_split || do_rectangular_split) &&
+        if (!frame_is_intra_only(cm) &&
+            (do_square_split || do_rectangular_split) &&
             !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
           const int use_ml_based_breakout =
-              bsize <= cpi->sf.use_square_partition_only_threshold &&
+              bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
               bsize > BLOCK_4X4 && xd->bd == 8;
           if (use_ml_based_breakout) {
-            if (ml_predict_breakout(cpi, bsize, x, &this_rdc,
-                                    pb_source_variance)) {
+            if (av1_ml_predict_breakout(cpi, bsize, x, &this_rdc,
+                                        pb_source_variance)) {
               do_square_split = 0;
               do_rectangular_split = 0;
             }
@@ -3467,16 +3019,16 @@
           }
         }
 
-        if (cpi->sf.simple_motion_search_early_term_none && cm->show_frame &&
-            !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
-            mi_row + mi_step < cm->mi_rows && mi_col + mi_step < cm->mi_cols &&
+        if (cpi->sf.part_sf.simple_motion_search_early_term_none &&
+            cm->show_frame && !frame_is_intra_only(cm) &&
+            bsize >= BLOCK_16X16 && mi_row + mi_step < mi_params->mi_rows &&
+            mi_col + mi_step < mi_params->mi_cols &&
             this_rdc.rdcost < INT64_MAX && this_rdc.rdcost >= 0 &&
             this_rdc.rate < INT_MAX && this_rdc.rate >= 0 &&
             (do_square_split || do_rectangular_split)) {
-          av1_simple_motion_search_early_term_none(
-              cpi, x, pc_tree, mi_row, mi_col, bsize, &this_rdc,
-              &terminate_partition_search, simple_motion_features,
-              &simple_motion_features_are_valid);
+          av1_simple_motion_search_early_term_none(cpi, x, pc_tree, mi_row,
+                                                   mi_col, bsize, &this_rdc,
+                                                   &terminate_partition_search);
         }
       }
     }
@@ -3485,9 +3037,10 @@
   }
 
   // store estimated motion vector
-  if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
+  if (cpi->sf.mv_sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
 
   // PARTITION_SPLIT
+  int64_t part_split_rd = INT64_MAX;
   if ((!terminate_partition_search && do_square_split) || is_gt_max_sq_part) {
     av1_init_rd_stats(&sum_rdc);
     subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
@@ -3506,35 +3059,46 @@
       const int x_idx = (idx & 1) * mi_step;
       const int y_idx = (idx >> 1) * mi_step;
 
-      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+      if (mi_row + y_idx >= mi_params->mi_rows ||
+          mi_col + x_idx >= mi_params->mi_cols)
         continue;
 
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
       pc_tree->split[idx]->index = idx;
       int64_t *p_split_rd = &split_rd[idx];
-      const int64_t best_remain_rdcost =
-          best_rdc.rdcost == INT64_MAX ? INT64_MAX
-                                       : (best_rdc.rdcost - sum_rdc.rdcost);
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
-                        subsize, max_sq_part, min_sq_part, &this_rdc,
-                        best_remain_rdcost, pc_tree->split[idx], p_split_rd);
 
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
+      RD_STATS best_remain_rdcost;
+      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                               &best_remain_rdcost);
+
+      int curr_quad_tree_idx = 0;
+      if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+        curr_quad_tree_idx = x->quad_tree_idx;
+        x->quad_tree_idx = 4 * curr_quad_tree_idx + idx + 1;
+      }
+      if (!rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+                             mi_col + x_idx, subsize, max_sq_part, min_sq_part,
+                             &this_rdc, best_remain_rdcost, pc_tree->split[idx],
+                             p_split_rd, multi_pass_mode,
+                             &split_part_rect_win[idx])) {
+        av1_invalid_rd_stats(&sum_rdc);
         break;
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
-        if (idx <= 1 && (bsize <= BLOCK_8X8 ||
-                         pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
-          const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
-          const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-          // Neither palette mode nor cfl predicted
-          if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
-            if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
-          }
+      }
+      if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+        x->quad_tree_idx = curr_quad_tree_idx;
+      }
+
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      av1_rd_cost_update(x->rdmult, &sum_rdc);
+      if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+                       pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+        const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
+        const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+        // Neither palette mode nor cfl predicted
+        if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+          if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
         }
       }
     }
@@ -3548,29 +3112,44 @@
 #endif
     const int reached_last_index = (idx == 4);
 
+    part_split_rd = sum_rdc.rdcost;
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
+        found_best_partition = true;
         pc_tree->partitioning = PARTITION_SPLIT;
       }
-    } else if (cpi->sf.less_rectangular_check_level > 0) {
-      // skip rectangular partition test when larger block size
-      // gives better rd cost
-      if (cpi->sf.less_rectangular_check_level == 2 || idx <= 2)
-        do_rectangular_split &= !partition_none_allowed;
+    } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
+      // Skip rectangular partition test when partition type none gives better
+      // rd than partition type split.
+      if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+        const int partition_none_valid = cur_none_rd > 0;
+        const int partition_none_better = cur_none_rd < sum_rdc.rdcost;
+        do_rectangular_split &=
+            !(partition_none_valid && partition_none_better);
+      }
     }
 
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }  // if (do_split)
 
-  if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+  if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      !frame_is_intra_only(cm) && !terminate_partition_search &&
+      do_rectangular_split &&
+      (partition_horz_allowed || partition_vert_allowed)) {
+    av1_ml_early_term_after_split(cpi, x, pc_tree, bsize, best_rdc.rdcost,
+                                  part_none_rd, part_split_rd, split_rd, mi_row,
+                                  mi_col, &terminate_partition_search);
+  }
+
+  if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      cpi->sf.part_sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
       (partition_horz_allowed || partition_vert_allowed) &&
-      !(prune_horz || prune_vert)) {
+      !(prune_horz || prune_vert) && !terminate_partition_search) {
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-    ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
-                            split_rd, &prune_horz, &prune_vert);
+    av1_ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
+                                split_rd, &prune_horz, &prune_vert);
   }
 
   // PARTITION_HORZ
@@ -3580,17 +3159,12 @@
       !is_gt_max_sq_part) {
     av1_init_rd_stats(&sum_rdc);
     subsize = get_partition_subsize(bsize, PARTITION_HORZ);
-    if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-        partition_none_allowed) {
-      pc_tree->horizontal[0].pred_interp_filter =
-          av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-    }
+    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
     sum_rdc.rate = partition_cost[PARTITION_HORZ];
     sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-    const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
-                                           ? INT64_MAX
-                                           : (best_rdc.rdcost - sum_rdc.rdcost);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                             &best_remain_rdcost);
 #if CONFIG_COLLECT_PARTITION_STATS
     if (best_remain_rdcost >= 0) {
       partition_attempts[PARTITION_HORZ] += 1;
@@ -3599,14 +3173,16 @@
     }
 #endif
     pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ,
-                  subsize, &pc_tree->horizontal[0], best_remain_rdcost, 0);
+                  subsize, &pc_tree->horizontal[0], best_remain_rdcost,
+                  PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
 
     if (this_rdc.rate == INT_MAX) {
       sum_rdc.rdcost = INT64_MAX;
     } else {
       sum_rdc.rate += this_rdc.rate;
       sum_rdc.dist += this_rdc.dist;
-      sum_rdc.rdcost += this_rdc.rdcost;
+      av1_rd_cost_update(x->rdmult, &sum_rdc);
     }
     horz_rd[0] = this_rdc.rdcost;
 
@@ -3618,20 +3194,18 @@
       if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
         if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
       }
-      update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
-      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
-                        subsize, NULL);
+      update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
 
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
+      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
 
-      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-          partition_none_allowed) {
-        pc_tree->horizontal[1].pred_interp_filter =
-            av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
-      }
+      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                               &best_remain_rdcost);
+
       pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
                     PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                    best_rdc.rdcost - sum_rdc.rdcost, 0);
+                    best_remain_rdcost, PICK_MODE_RD);
+      av1_rd_cost_update(x->rdmult, &this_rdc);
       horz_rd[1] = this_rdc.rdcost;
 
       if (this_rdc.rate == INT_MAX) {
@@ -3639,7 +3213,7 @@
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
+        av1_rd_cost_update(x->rdmult, &sum_rdc);
       }
     }
 #if CONFIG_COLLECT_PARTITION_STATS
@@ -3655,8 +3229,14 @@
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
+        found_best_partition = true;
         pc_tree->partitioning = PARTITION_HORZ;
       }
+    } else {
+      // Update HORZ win flag
+      if (rect_part_win_info != NULL) {
+        rect_part_win_info->horz_win = false;
+      }
     }
 
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
@@ -3670,18 +3250,13 @@
     av1_init_rd_stats(&sum_rdc);
     subsize = get_partition_subsize(bsize, PARTITION_VERT);
 
-    if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
-    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-        partition_none_allowed) {
-      pc_tree->vertical[0].pred_interp_filter =
-          av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-    }
     sum_rdc.rate = partition_cost[PARTITION_VERT];
     sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-    const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
-                                           ? INT64_MAX
-                                           : (best_rdc.rdcost - sum_rdc.rdcost);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                             &best_remain_rdcost);
 #if CONFIG_COLLECT_PARTITION_STATS
     if (best_remain_rdcost >= 0) {
       partition_attempts[PARTITION_VERT] += 1;
@@ -3690,14 +3265,16 @@
     }
 #endif
     pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT,
-                  subsize, &pc_tree->vertical[0], best_remain_rdcost, 0);
+                  subsize, &pc_tree->vertical[0], best_remain_rdcost,
+                  PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
 
     if (this_rdc.rate == INT_MAX) {
       sum_rdc.rdcost = INT64_MAX;
     } else {
       sum_rdc.rate += this_rdc.rate;
       sum_rdc.dist += this_rdc.dist;
-      sum_rdc.rdcost += this_rdc.rdcost;
+      av1_rd_cost_update(x->rdmult, &sum_rdc);
     }
     vert_rd[0] = this_rdc.rdcost;
     if (sum_rdc.rdcost < best_rdc.rdcost && has_cols) {
@@ -3707,21 +3284,17 @@
       if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
         if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
       }
-      update_state(cpi, tile_data, td, &pc_tree->vertical[0], mi_row, mi_col,
-                   subsize, 1);
-      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
-                        subsize, NULL);
+      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
 
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
-      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-          partition_none_allowed) {
-        pc_tree->vertical[1].pred_interp_filter =
-            av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-      }
+      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                               &best_remain_rdcost);
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
                     PARTITION_VERT, subsize, &pc_tree->vertical[1],
-                    best_rdc.rdcost - sum_rdc.rdcost, 0);
+                    best_remain_rdcost, PICK_MODE_RD);
+      av1_rd_cost_update(x->rdmult, &this_rdc);
       vert_rd[1] = this_rdc.rdcost;
 
       if (this_rdc.rate == INT_MAX) {
@@ -3729,7 +3302,7 @@
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
+        av1_rd_cost_update(x->rdmult, &sum_rdc);
       }
     }
 #if CONFIG_COLLECT_PARTITION_STATS
@@ -3741,11 +3314,15 @@
     }
 #endif
 
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        pc_tree->partitioning = PARTITION_VERT;
+      best_rdc = sum_rdc;
+      found_best_partition = true;
+      pc_tree->partitioning = PARTITION_VERT;
+    } else {
+      // Update VERT win flag
+      if (rect_part_win_info != NULL) {
+        rect_part_win_info->vert_win = false;
       }
     }
 
@@ -3765,17 +3342,18 @@
 
   if (use_pb_simple_motion_pred_sse(cpi) &&
       pb_simple_motion_pred_sse == UINT_MAX) {
-    const MV ref_mv_full = { .row = 0, .col = 0 };
+    const FULLPEL_MV start_mv = kZeroFullMv;
     unsigned int var = 0;
 
-    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full, 0,
+    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0,
                               &pb_simple_motion_pred_sse, &var);
   }
 
   assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !do_rectangular_split));
 
   const int ext_partition_allowed =
-      do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
+      do_rectangular_split &&
+      bsize > cpi->sf.part_sf.ext_partition_eval_thresh && has_rows && has_cols;
 
   // The standard AB partitions are allowed whenever ext-partition-types are
   // allowed
@@ -3784,17 +3362,8 @@
   int vertab_partition_allowed =
       ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8) {
-    if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) {
-      horzab_partition_allowed = 0;
-      vertab_partition_allowed = 0;
-    }
-  }
-#endif
-
-  if (cpi->sf.prune_ext_partition_types_search_level) {
-    if (cpi->sf.prune_ext_partition_types_search_level == 1) {
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+    if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) {
       // TODO(debargha,huisu@google.com): may need to tune the threshold for
       // pb_source_variance.
       horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
@@ -3822,10 +3391,10 @@
   }
   int horza_partition_allowed = horzab_partition_allowed;
   int horzb_partition_allowed = horzab_partition_allowed;
-  if (cpi->sf.prune_ext_partition_types_search_level) {
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
     const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
     const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
-    switch (cpi->sf.prune_ext_partition_types_search_level) {
+    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
       case 1:
         horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost);
         horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost);
@@ -3840,10 +3409,10 @@
 
   int verta_partition_allowed = vertab_partition_allowed;
   int vertb_partition_allowed = vertab_partition_allowed;
-  if (cpi->sf.prune_ext_partition_types_search_level) {
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
     const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
     const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
-    switch (cpi->sf.prune_ext_partition_types_search_level) {
+    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
       case 1:
         verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost);
         vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost);
@@ -3856,16 +3425,16 @@
     }
   }
 
-  if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
+  if (cpi->sf.part_sf.ml_prune_ab_partition && ext_partition_allowed &&
       partition_horz_allowed && partition_vert_allowed) {
     // TODO(huisu@google.com): x->source_variance may not be the current
     // block's variance. The correct one to use is pb_source_variance. Need to
     // re-train the model to fix it.
-    ml_prune_ab_partition(bsize, pc_tree->partitioning,
-                          get_unsigned_bits(x->source_variance),
-                          best_rdc.rdcost, horz_rd, vert_rd, split_rd,
-                          &horza_partition_allowed, &horzb_partition_allowed,
-                          &verta_partition_allowed, &vertb_partition_allowed);
+    av1_ml_prune_ab_partition(
+        bsize, pc_tree->partitioning, get_unsigned_bits(x->source_variance),
+        best_rdc.rdcost, horz_rd, vert_rd, split_rd, &horza_partition_allowed,
+        &horzb_partition_allowed, &verta_partition_allowed,
+        &vertb_partition_allowed);
   }
 
   horza_partition_allowed &= cpi->oxcf.enable_ab_partitions;
@@ -3873,6 +3442,12 @@
   verta_partition_allowed &= cpi->oxcf.enable_ab_partitions;
   vertb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
 
+  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
+      horza_partition_allowed) {
+    horza_partition_allowed &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1);
+  }
+
   // PARTITION_HORZ_A
   if (!terminate_partition_search && partition_horz_allowed &&
       horza_partition_allowed && !is_gt_max_sq_part) {
@@ -3904,11 +3479,11 @@
       }
     }
 #endif
-    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
-                       pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
-                       PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
-                       mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
-                       subsize);
+    found_best_partition |= rd_test_partition3(
+        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala,
+        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col,
+        bsize2, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
+        subsize);
 #if CONFIG_COLLECT_PARTITION_STATS
     if (partition_timer_on) {
       aom_usec_timer_mark(&partition_timer);
@@ -3919,6 +3494,13 @@
 #endif
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
+
+  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
+      horzb_partition_allowed) {
+    horzb_partition_allowed &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3);
+  }
+
   // PARTITION_HORZ_B
   if (!terminate_partition_search && partition_horz_allowed &&
       horzb_partition_allowed && !is_gt_max_sq_part) {
@@ -3944,11 +3526,11 @@
       }
     }
 #endif
-    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
-                       pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
-                       PARTITION_HORZ_B, mi_row, mi_col, subsize,
-                       mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
-                       mi_col + mi_step, bsize2);
+    found_best_partition |= rd_test_partition3(
+        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb,
+        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col,
+        subsize, mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
+        mi_col + mi_step, bsize2);
 
 #if CONFIG_COLLECT_PARTITION_STATS
     if (partition_timer_on) {
@@ -3961,6 +3543,12 @@
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
+  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
+      verta_partition_allowed) {
+    verta_partition_allowed &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2);
+  }
+
   // PARTITION_VERT_A
   if (!terminate_partition_search && partition_vert_allowed &&
       verta_partition_allowed && !is_gt_max_sq_part) {
@@ -3986,11 +3574,11 @@
       }
     }
 #endif
-    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
-                       pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
-                       PARTITION_VERT_A, mi_row, mi_col, bsize2,
-                       mi_row + mi_step, mi_col, bsize2, mi_row,
-                       mi_col + mi_step, subsize);
+    found_best_partition |= rd_test_partition3(
+        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala,
+        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col,
+        bsize2, mi_row + mi_step, mi_col, bsize2, mi_row, mi_col + mi_step,
+        subsize);
 #if CONFIG_COLLECT_PARTITION_STATS
     if (partition_timer_on) {
       aom_usec_timer_mark(&partition_timer);
@@ -4001,6 +3589,13 @@
 #endif
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
+
+  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
+      vertb_partition_allowed) {
+    vertb_partition_allowed &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
+  }
+
   // PARTITION_VERT_B
   if (!terminate_partition_search && partition_vert_allowed &&
       vertb_partition_allowed && !is_gt_max_sq_part) {
@@ -4027,11 +3622,11 @@
       }
     }
 #endif
-    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
-                       pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
-                       PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
-                       mi_col + mi_step, bsize2, mi_row + mi_step,
-                       mi_col + mi_step, bsize2);
+    found_best_partition |= rd_test_partition3(
+        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb,
+        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col,
+        subsize, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step,
+        mi_col + mi_step, bsize2);
 #if CONFIG_COLLECT_PARTITION_STATS
     if (partition_timer_on) {
       aom_usec_timer_mark(&partition_timer);
@@ -4051,9 +3646,15 @@
                                  ext_partition_allowed &&
                                  bsize != BLOCK_128X128;
 
-  int partition_horz4_allowed = partition4_allowed && partition_horz_allowed;
-  int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
-  if (cpi->sf.prune_ext_partition_types_search_level == 2) {
+  int partition_horz4_allowed =
+      partition4_allowed && partition_horz_allowed &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4), xss,
+                           yss) != BLOCK_INVALID;
+  int partition_vert4_allowed =
+      partition4_allowed && partition_vert_allowed &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4), xss,
+                           yss) != BLOCK_INVALID;
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
     partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
                                 pc_tree->partitioning == PARTITION_HORZ_A ||
                                 pc_tree->partitioning == PARTITION_HORZ_B ||
@@ -4065,28 +3666,40 @@
                                 pc_tree->partitioning == PARTITION_SPLIT ||
                                 pc_tree->partitioning == PARTITION_NONE);
   }
-  if (cpi->sf.ml_prune_4_partition && partition4_allowed &&
+  if (cpi->sf.part_sf.ml_prune_4_partition && partition4_allowed &&
       partition_horz_allowed && partition_vert_allowed) {
-    ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost,
-                         horz_rd, vert_rd, split_rd, &partition_horz4_allowed,
-                         &partition_vert4_allowed, pb_source_variance, mi_row,
-                         mi_col);
+    av1_ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning,
+                             best_rdc.rdcost, horz_rd, vert_rd, split_rd,
+                             &partition_horz4_allowed, &partition_vert4_allowed,
+                             pb_source_variance, mi_row, mi_col);
   }
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8) {
-    if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) {
-      partition_horz4_allowed = 0;
-      partition_vert4_allowed = 0;
-    }
-  }
-#endif
-
   if (blksize < (min_partition_size << 2)) {
     partition_horz4_allowed = 0;
     partition_vert4_allowed = 0;
   }
 
+  if (cpi->sf.part_sf.prune_4_partition_using_split_info &&
+      (partition_horz4_allowed || partition_vert4_allowed)) {
+    // Count of child blocks in which HORZ or VERT partition has won
+    int num_child_horz_win = 0, num_child_vert_win = 0;
+    for (int idx = 0; idx < 4; idx++) {
+      num_child_horz_win += (split_part_rect_win[idx].horz_win) ? 1 : 0;
+      num_child_vert_win += (split_part_rect_win[idx].vert_win) ? 1 : 0;
+    }
+
+    // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of
+    // split partiitons.
+    // Conservative pruning for high quantizers
+    const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
+    if (num_child_horz_win < num_win_thresh) {
+      partition_horz4_allowed = 0;
+    }
+    if (num_child_vert_win < num_win_thresh) {
+      partition_vert4_allowed = 0;
+    }
+  }
+
   // PARTITION_HORZ_4
   assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed));
   if (!terminate_partition_search && partition_horz4_allowed && has_rows &&
@@ -4110,25 +3723,26 @@
     for (int i = 0; i < 4; ++i) {
       const int this_mi_row = mi_row + i * quarter_step;
 
-      if (i > 0 && this_mi_row >= cm->mi_rows) break;
+      if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
 
       PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
 
       ctx_this->rd_mode_is_ready = 0;
       if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
-                           mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
-                           PARTITION_HORZ_4, ctx_prev, ctx_this))
+                           mi_col, subsize, best_rdc, &sum_rdc,
+                           PARTITION_HORZ_4, ctx_prev, ctx_this)) {
+        av1_invalid_rd_stats(&sum_rdc);
         break;
+      }
 
       ctx_prev = ctx_this;
     }
 
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        pc_tree->partitioning = PARTITION_HORZ_4;
-      }
+      best_rdc = sum_rdc;
+      found_best_partition = true;
+      pc_tree->partitioning = PARTITION_HORZ_4;
     }
 
 #if CONFIG_COLLECT_PARTITION_STATS
@@ -4165,25 +3779,26 @@
     for (int i = 0; i < 4; ++i) {
       const int this_mi_col = mi_col + i * quarter_step;
 
-      if (i > 0 && this_mi_col >= cm->mi_cols) break;
+      if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
 
       PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
 
       ctx_this->rd_mode_is_ready = 0;
       if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
-                           this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
-                           PARTITION_VERT_4, ctx_prev, ctx_this))
+                           this_mi_col, subsize, best_rdc, &sum_rdc,
+                           PARTITION_VERT_4, ctx_prev, ctx_this)) {
+        av1_invalid_rd_stats(&sum_rdc);
         break;
+      }
 
       ctx_prev = ctx_this;
     }
 
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        pc_tree->partitioning = PARTITION_VERT_4;
-      }
+      best_rdc = sum_rdc;
+      found_best_partition = true;
+      pc_tree->partitioning = PARTITION_VERT_4;
     }
 #if CONFIG_COLLECT_PARTITION_STATS
     if (partition_timer_on) {
@@ -4196,7 +3811,7 @@
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
-  if (bsize == cm->seq_params.sb_size && best_rdc.rate == INT_MAX) {
+  if (bsize == cm->seq_params.sb_size && !found_best_partition) {
     // Did not find a valid partition, go back and search again, with less
     // constraint on which partition types to search.
     x->must_find_valid_partition = 1;
@@ -4206,11 +3821,6 @@
     goto BEGIN_PARTITION_SEARCH;
   }
 
-  // TODO(jbb): This code added so that we avoid static analysis
-  // warning related to the fact that best_rd isn't used after this
-  // point.  This code should be refactored so that the duplicate
-  // checks occur in some sub function and thus are used...
-  (void)best_rd;
   *rd_cost = best_rdc;
 
 #if CONFIG_COLLECT_PARTITION_STATS
@@ -4251,11 +3861,13 @@
   }
 #endif
 
-  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
-      pc_tree->index != 3) {
+  if (found_best_partition && pc_tree->index != 3) {
     if (bsize == cm->seq_params.sb_size) {
+      const int emit_output = multi_pass_mode != SB_DRY_PASS;
+      const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
+
       x->cb_offset = 0;
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
                 pc_tree, NULL);
     } else {
       encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
@@ -4269,214 +3881,332 @@
   } else {
     assert(tp_orig == *tp);
   }
+
+  x->rdmult = orig_rdmult;
+  return found_best_partition;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 #undef NUM_SIMPLE_MOTION_FEATURES
 
-// Set all the counters as max.
-static void init_first_partition_pass_stats_tables(
-    AV1_COMP *cpi, FIRST_PARTITION_PASS_STATS *stats) {
-  for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
-    memset(stats[i].ref0_counts, 0xff, sizeof(stats[i].ref0_counts));
-    memset(stats[i].ref1_counts, 0xff, sizeof(stats[i].ref1_counts));
-    stats[i].sample_counts = INT_MAX;
-    if (cpi->sf.use_first_partition_pass_interintra_stats)
-      memset(stats[i].interintra_motion_mode_count, 0xff,
-             sizeof(stats[i].interintra_motion_mode_count));
-  }
-}
+#if !CONFIG_REALTIME_ONLY
 
-// Minimum number of samples to trigger the mode pruning in
-// two_pass_partition_search feature.
-#define FIRST_PARTITION_PASS_MIN_SAMPLES 16
-
-static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
-                            int mi_col, int orig_rdmult) {
-  TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index];
+static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int analysis_type,
+                            int mi_row, int mi_col, int orig_rdmult) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int tpl_idx = cpi->gf_group.index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
   TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
   int tpl_stride = tpl_frame->stride;
   int64_t intra_cost = 0;
   int64_t mc_dep_cost = 0;
-  int mi_wide = mi_size_wide[bsize];
-  int mi_high = mi_size_high[bsize];
-  int row, col;
-
-  int dr = 0;
-  double r0, rk, beta;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
 
   if (tpl_frame->is_valid == 0) return orig_rdmult;
 
-  if (cpi->common.show_frame) return orig_rdmult;
+  if (!is_frame_tpl_eligible(cpi)) return orig_rdmult;
 
-  if (cpi->twopass.gf_group.index >= MAX_LAG_BUFFERS) return orig_rdmult;
+  if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return orig_rdmult;
 
-  for (row = mi_row; row < mi_row + mi_high; ++row) {
-    for (col = mi_col; col < mi_col + mi_wide; ++col) {
-      TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
-
-      if (row >= cpi->common.mi_rows || col >= cpi->common.mi_cols) continue;
-
-      intra_cost += this_stats->intra_cost;
-      mc_dep_cost += this_stats->mc_dep_cost;
+  int64_t mc_count = 0, mc_saved = 0;
+  int mi_count = 0;
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int step = 1 << block_mis_log2;
+  for (int row = mi_row; row < mi_row + mi_high; row += step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+      mc_count += this_stats->mc_count;
+      mc_saved += this_stats->mc_saved;
+      mi_count++;
     }
   }
 
   aom_clear_system_state();
 
-  r0 = cpi->rd.r0;
-  rk = (double)intra_cost / mc_dep_cost;
-  beta = r0 / rk;
-  dr = av1_get_adaptive_rdmult(cpi, beta);
+  double beta = 1.0;
+  if (analysis_type == 0) {
+    if (mc_dep_cost > 0 && intra_cost > 0) {
+      const double r0 = cpi->rd.r0;
+      const double rk = (double)intra_cost / mc_dep_cost;
+      beta = (r0 / rk);
+    }
+  } else if (analysis_type == 1) {
+    const double mc_count_base = (mi_count * cpi->rd.mc_count_base);
+    beta = (mc_count + 1.0) / (mc_count_base + 1.0);
+    beta = pow(beta, 0.5);
+  } else if (analysis_type == 2) {
+    const double mc_saved_base = (mi_count * cpi->rd.mc_saved_base);
+    beta = (mc_saved + 1.0) / (mc_saved_base + 1.0);
+    beta = pow(beta, 0.5);
+  }
 
-  dr = AOMMIN(dr, orig_rdmult * 3 / 2);
-  dr = AOMMAX(dr, orig_rdmult * 1 / 2);
+  int rdmult = av1_get_adaptive_rdmult(cpi, beta);
 
-  dr = AOMMAX(1, dr);
+  aom_clear_system_state();
 
-  return dr;
+  rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
+  rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
+
+  rdmult = AOMMAX(1, rdmult);
+
+  return rdmult;
 }
 
-static void setup_delta_q(AV1_COMP *const cpi, MACROBLOCK *const x,
-                          const TileInfo *const tile_info, int mi_row,
-                          int mi_col, int num_planes) {
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-  const int mib_size = cm->seq_params.mib_size;
+static int get_tpl_stats_b(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                           int mi_col, int64_t *intra_cost_b,
+                           int64_t *inter_cost_b,
+                           int_mv mv_b[][INTER_REFS_PER_FRAME], int *stride) {
+  if (!cpi->oxcf.enable_tpl_model) return 0;
+  if (cpi->superres_mode != SUPERRES_NONE) return 0;
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) return 0;
+  const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+  if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
+    return 0;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
 
+  AV1_COMMON *const cm = &cpi->common;
+  const int gf_group_index = cpi->gf_group.index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  int tpl_stride = tpl_frame->stride;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  if (tpl_frame->is_valid == 0) return 0;
+  if (gf_group_index >= MAX_LAG_BUFFERS) return 0;
+
+  int mi_count = 0;
+  int count = 0;
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  // mi_cols_sr is mi_cols at superres case.
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+  // TPL store unit size is not the same as the motion estimation unit size.
+  // Here always use motion estimation size to avoid getting repetitive inter/
+  // intra cost.
+  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  const int step = mi_size_wide[tpl_bsize];
+  assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
+
+  // Stride is only based on SB size, and we fill in values for every 16x16
+  // block in a SB.
+  *stride = (mi_col_end_sr - mi_col_sr) / step;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
+      // Handle partial SB, so that no invalid values are used later.
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) {
+        inter_cost_b[count] = INT64_MAX;
+        intra_cost_b[count] = INT64_MAX;
+        for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+          mv_b[count][i].as_int = INVALID_MV;
+        }
+        count++;
+        continue;
+      }
+
+      TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+      inter_cost_b[count] = this_stats->inter_cost;
+      intra_cost_b[count] = this_stats->intra_cost;
+      memcpy(mv_b[count], this_stats->mv, sizeof(this_stats->mv));
+      mi_count++;
+      count++;
+    }
+  }
+
+  return mi_count;
+}
+
+// analysis_type 0: Use mc_dep_cost and intra_cost
+// analysis_type 1: Use count of best inter predictor chosen
+// analysis_type 2: Use cost reduction from intra to inter for best inter
+//                  predictor chosen
+static int get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                      int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int tpl_idx = cpi->gf_group.index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  int tpl_stride = tpl_frame->stride;
+  int64_t intra_cost = 0;
+  int64_t mc_dep_cost = 0;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+  const int base_qindex = cm->quant_params.base_qindex;
+
+  if (tpl_frame->is_valid == 0) return base_qindex;
+
+  if (!is_frame_tpl_eligible(cpi)) return base_qindex;
+
+  if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return base_qindex;
+
+  int64_t mc_count = 0, mc_saved = 0;
+  int mi_count = 0;
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int step = 1 << block_mis_log2;
+  for (int row = mi_row; row < mi_row + mi_high; row += step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+      mc_count += this_stats->mc_count;
+      mc_saved += this_stats->mc_saved;
+      mi_count++;
+    }
+  }
+
+  aom_clear_system_state();
+
+  int offset = 0;
+  double beta = 1.0;
+  if (mc_dep_cost > 0 && intra_cost > 0) {
+    const double r0 = cpi->rd.r0;
+    const double rk = (double)intra_cost / mc_dep_cost;
+    beta = (r0 / rk);
+    assert(beta > 0.0);
+  }
+  offset = av1_get_deltaq_offset(cpi, base_qindex, beta);
+  aom_clear_system_state();
+
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+
+  return qindex;
+}
+
+static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
+                                     MACROBLOCK *const x,
+                                     const TileInfo *const tile_info,
+                                     int mi_row, int mi_col, int num_planes) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  assert(delta_q_info->delta_q_present_flag);
+
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
   // Delta-q modulation based on variance
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
 
-  int offset_qindex;
-  if (DELTAQ_MODULATION == 1) {
-    const int block_wavelet_energy_level =
-        av1_block_wavelet_energy_level(cpi, x, sb_size);
-    x->sb_energy_level = block_wavelet_energy_level;
-    offset_qindex =
-        av1_compute_deltaq_from_energy_level(cpi, block_wavelet_energy_level);
-  } else {
-    const int block_var_level = av1_log_block_var(cpi, x, sb_size);
-    x->sb_energy_level = block_var_level;
-    offset_qindex = av1_compute_deltaq_from_energy_level(cpi, block_var_level);
+  int current_qindex = cm->quant_params.base_qindex;
+  if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL) {
+    if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+      const int block_wavelet_energy_level =
+          av1_block_wavelet_energy_level(cpi, x, sb_size);
+      x->sb_energy_level = block_wavelet_energy_level;
+      current_qindex = av1_compute_q_from_energy_level_deltaq_mode(
+          cpi, block_wavelet_energy_level);
+    } else {
+      const int block_var_level = av1_log_block_var(cpi, x, sb_size);
+      x->sb_energy_level = block_var_level;
+      current_qindex =
+          av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level);
+    }
+  } else if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE &&
+             cpi->oxcf.enable_tpl_model) {
+    // Setup deltaq based on tpl stats
+    current_qindex = get_q_for_deltaq_objective(cpi, sb_size, mi_row, mi_col);
   }
-  const int qmask = ~(delta_q_info->delta_q_res - 1);
-  int current_qindex =
-      clamp(cm->base_qindex + offset_qindex, delta_q_info->delta_q_res,
-            256 - delta_q_info->delta_q_res);
-  current_qindex =
-      ((current_qindex - cm->base_qindex + delta_q_info->delta_q_res / 2) &
-       qmask) +
-      cm->base_qindex;
+
+  const int delta_q_res = delta_q_info->delta_q_res;
+  // Right now aq only works with tpl model. So if tpl is disabled, we set the
+  // current_qindex to base_qindex.
+  if (cpi->oxcf.enable_tpl_model && cpi->oxcf.deltaq_mode != NO_DELTA_Q) {
+    current_qindex =
+        clamp(current_qindex, delta_q_res, 256 - delta_q_info->delta_q_res);
+  } else {
+    current_qindex = cm->quant_params.base_qindex;
+  }
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int sign_deltaq_index =
+      current_qindex - xd->current_qindex >= 0 ? 1 : -1;
+  const int deltaq_deadzone = delta_q_res / 4;
+  const int qmask = ~(delta_q_res - 1);
+  int abs_deltaq_index = abs(current_qindex - xd->current_qindex);
+  abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask;
+  current_qindex = xd->current_qindex + sign_deltaq_index * abs_deltaq_index;
+  current_qindex = AOMMAX(current_qindex, MINQ + 1);
   assert(current_qindex > 0);
 
-  xd->delta_qindex = current_qindex - cm->base_qindex;
+  xd->delta_qindex = current_qindex - cm->quant_params.base_qindex;
   set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
   xd->mi[0]->current_qindex = current_qindex;
   av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
-  if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
-    const int lfmask = ~(delta_q_info->delta_lf_res - 1);
+
+  // Keep track of any non-zero delta-q used.
+  td->deltaq_used |= (xd->delta_qindex != 0);
+
+  if (cpi->oxcf.deltalf_mode) {
+    const int delta_lf_res = delta_q_info->delta_lf_res;
+    const int lfmask = ~(delta_lf_res - 1);
     const int delta_lf_from_base =
-        ((offset_qindex / 2 + delta_q_info->delta_lf_res / 2) & lfmask);
+        ((xd->delta_qindex / 2 + delta_lf_res / 2) & lfmask);
+    const int8_t delta_lf =
+        (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+    const int frame_lf_count =
+        av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+    const int mib_size = cm->seq_params.mib_size;
 
     // pre-set the delta lf for loop filter. Note that this value is set
     // before mi is assigned for each block in current superblock
-    for (int j = 0; j < AOMMIN(mib_size, cm->mi_rows - mi_row); j++) {
-      for (int k = 0; k < AOMMIN(mib_size, cm->mi_cols - mi_col); k++) {
-        cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)].delta_lf_from_base =
-            clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
-        const int frame_lf_count =
-            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+    for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
+      for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
+        const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
+        mi_params->mi_grid_base[grid_idx]->delta_lf_from_base = delta_lf;
         for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-          cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)].delta_lf[lf_id] =
-              clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+          mi_params->mi_grid_base[grid_idx]->delta_lf[lf_id] = delta_lf;
         }
       }
     }
   }
 }
-
-// First pass of partition search only considers square partition block sizes.
-// The results will be used in the second partition search pass to prune
-// unlikely partition candidates.
-static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td,
-                                        TileDataEnc *tile_data, int mi_row,
-                                        int mi_col, TOKENEXTRA **tp) {
-  MACROBLOCK *const x = &td->mb;
-  x->cb_partition_scan = 1;
-
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  // Reset the stats tables.
-  av1_zero(x->first_partition_pass_stats);
-
-  AV1_COMMON *const cm = &cpi->common;
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-  const int mib_size_log2 = cm->seq_params.mib_size_log2;
-  PC_TREE *const pc_root = td->pc_root[mib_size_log2 - MIN_MIB_SIZE_LOG2];
-  RD_STATS dummy_rdc;
-  rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        &dummy_rdc, INT64_MAX, pc_root, NULL);
-  x->cb_partition_scan = 0;
-
-  x->source_variance = UINT_MAX;
-  x->simple_motion_pred_sse = UINT_MAX;
-  if (sf->adaptive_pred_interp_filter) {
-    const int leaf_nodes = 256;
-    for (int i = 0; i < leaf_nodes; ++i) {
-      td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
-      td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
-      td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
-      td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
-    }
-  }
-
-  x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
-  av1_zero(x->txb_rd_record_8X8);
-  av1_zero(x->txb_rd_record_16X16);
-  av1_zero(x->txb_rd_record_32X32);
-  av1_zero(x->txb_rd_record_64X64);
-  av1_zero(x->txb_rd_record_intra);
-  av1_zero(x->pred_mv);
-  pc_root->index = 0;
-
-  for (int idy = 0; idy < mi_size_high[sb_size]; ++idy) {
-    for (int idx = 0; idx < mi_size_wide[sb_size]; ++idx) {
-      const int offset = cm->mi_stride * (mi_row + idy) + (mi_col + idx);
-      cm->mi_grid_visible[offset] = 0;
-    }
-  }
-
-  x->use_cb_search_range = 1;
-
-  for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
-    FIRST_PARTITION_PASS_STATS *const stat = &x->first_partition_pass_stats[i];
-    if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
-      // If there are not enough samples collected, make all available.
-      memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
-      memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
-      if (cpi->sf.use_first_partition_pass_interintra_stats)
-        memset(stat->interintra_motion_mode_count, 0xff,
-               sizeof(stat->interintra_motion_mode_count));
-    } else if (sf->selective_ref_frame < 3) {
-      // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
-      // initial partition scan, so we don't eliminate them.
-      stat->ref0_counts[ALTREF2_FRAME] = 0xff;
-      stat->ref1_counts[ALTREF2_FRAME] = 0xff;
-      stat->ref0_counts[BWDREF_FRAME] = 0xff;
-      stat->ref1_counts[BWDREF_FRAME] = 0xff;
-      if (cpi->sf.use_first_partition_pass_interintra_stats) {
-        stat->interintra_motion_mode_count[ALTREF2_FRAME] = 0xff;
-        stat->interintra_motion_mode_count[BWDREF_FRAME] = 0xff;
-      }
-    }
-  }
-}
+#endif  // !CONFIG_REALTIME_ONLY
 
 #define AVG_CDF_WEIGHT_LEFT 3
 #define AVG_CDF_WEIGHT_TOP_RIGHT 1
 
-static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr,
-                           int num_cdfs, int cdf_stride, int nsymbs,
-                           int wt_left, int wt_tr) {
+static AOM_INLINE void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left,
+                                      aom_cdf_prob *cdf_ptr_tr, int num_cdfs,
+                                      int cdf_stride, int nsymbs, int wt_left,
+                                      int wt_tr) {
   for (int i = 0; i < num_cdfs; i++) {
     for (int j = 0; j <= nsymbs; j++) {
       cdf_ptr_left[i * cdf_stride + j] =
@@ -4503,8 +4233,8 @@
                    wt_left, wt_tr);                                        \
   } while (0)
 
-static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left,
-                    int wt_tr) {
+static AOM_INLINE void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr,
+                               int wt_left, int wt_tr) {
   AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
   for (int i = 0; i < 2; i++) {
     AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
@@ -4527,8 +4257,9 @@
 // the left SB's CDFs and use the same for current SB's encoding to
 // improve the performance. This function facilitates the averaging
 // of CDF and used only when row-mt is enabled in encoder.
-static void avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
-                            int wt_left, int wt_tr) {
+static AOM_INLINE void avg_cdf_symbols(FRAME_CONTEXT *ctx_left,
+                                       FRAME_CONTEXT *ctx_tr, int wt_left,
+                                       int wt_tr) {
   AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
   AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
   AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
@@ -4647,20 +4378,517 @@
               CFL_ALPHABET_SIZE);
 }
 
-static void encode_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
-                          int mi_row, TOKENEXTRA **tp, int use_nonrd_mode) {
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
+                                               int mi_row, int mi_col) {
+  const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
+  const int orig_rdmult = cpi->rd.RDMULT;
+
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int gf_group_index = cpi->gf_group.index;
+  if (cpi->oxcf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ &&
+      cpi->oxcf.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
+      cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
+    const int dr =
+        get_rdmult_delta(cpi, sb_size, 0, mi_row, mi_col, orig_rdmult);
+    x->rdmult = dr;
+  }
+}
+#endif
+
+static void source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int shift) {
+  unsigned int tmp_sse;
+  unsigned int tmp_variance;
+  const BLOCK_SIZE bsize = BLOCK_64X64;
+  uint8_t *src_y = cpi->source->y_buffer;
+  int src_ystride = cpi->source->y_stride;
+  uint8_t *last_src_y = cpi->last_source->y_buffer;
+  int last_src_ystride = cpi->last_source->y_stride;
+  uint64_t avg_source_sse_threshold = 100000;        // ~5*5*(64*64)
+  uint64_t avg_source_sse_threshold_high = 1000000;  // ~15*15*(64*64)
+  uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / (64*64)) ~1.5
+#if CONFIG_AV1_HIGHBITDEPTH
+  MACROBLOCKD *xd = &x->e_mbd;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
+#endif
+  src_y += shift;
+  last_src_y += shift;
+  tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+                                       last_src_ystride, &tmp_sse);
+  // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
+  // Detect large lighting change.
+  if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh)
+    x->content_state_sb = kLowVarHighSumdiff;
+  else if (tmp_sse < avg_source_sse_threshold)
+    x->content_state_sb = kLowSad;
+  else if (tmp_sse > avg_source_sse_threshold_high)
+    x->content_state_sb = kHighSad;
+}
+
+static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
+                                       TileDataEnc *tile_data,
+                                       PC_TREE *const pc_root, TOKENEXTRA **tp,
+                                       const int mi_row, const int mi_col,
+                                       const int seg_skip) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+                      get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  if (sf->rt_sf.source_metrics_sb_nonrd && sb_size == BLOCK_64X64 &&
+      cpi->svc.number_spatial_layers <= 1 &&
+      cm->current_frame.frame_type != KEY_FRAME) {
+    int shift = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2);
+    source_content_sb(cpi, x, shift);
+  }
+  if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    const BLOCK_SIZE bsize =
+        seg_skip ? sb_size : sf->part_sf.always_this_block_size;
+    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+  } else if (cpi->partition_search_skippable_frame) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    const BLOCK_SIZE bsize =
+        get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+  } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+    set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+  }
+  assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+         cpi->partition_search_skippable_frame ||
+         sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
+  td->mb.cb_offset = 0;
+  nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                      pc_root);
+}
+
+// Memset the mbmis at the current superblock to 0
+static INLINE void reset_mbmi(CommonModeInfoParams *const mi_params,
+                              BLOCK_SIZE sb_size, int mi_row, int mi_col) {
+  // size of sb in unit of mi (BLOCK_4X4)
+  const int sb_size_mi = mi_size_wide[sb_size];
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  // size of sb in unit of allocated mi size
+  const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d;
+  assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 &&
+         "mi is not allocated as a multiple of sb!");
+  assert(mi_params->mi_stride % sb_size_mi == 0 &&
+         "mi_grid_base is not allocated as a multiple of sb!");
+
+  const int mi_rows = mi_size_high[sb_size];
+  for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) {
+    assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) <
+           mi_params->mi_stride);
+    const int mi_grid_idx =
+        get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col);
+    const int alloc_mi_idx =
+        get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col);
+    memset(&mi_params->mi_grid_base[mi_grid_idx], 0,
+           sb_size_mi * sizeof(*mi_params->mi_grid_base));
+    memset(&mi_params->tx_type_map[mi_grid_idx], 0,
+           sb_size_mi * sizeof(*mi_params->tx_type_map));
+    if (cur_mi_row % mi_alloc_size_1d == 0) {
+      memset(&mi_params->mi_alloc[alloc_mi_idx], 0,
+             sb_size_alloc_mi * sizeof(*mi_params->mi_alloc));
+    }
+  }
+}
+
+static INLINE void backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats,
+                                   const AV1_COMP *cpi, ThreadData *td,
+                                   const TileDataEnc *tile_data, int mi_row,
+                                   int mi_col) {
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const TileInfo *tile_info = &tile_data->tile_info;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
+
+  sb_fp_stats->rd_count = cpi->td.rd_counts;
+  sb_fp_stats->split_count = cpi->td.mb.txb_split_count;
+
+  sb_fp_stats->fc = *td->counts;
+
+  memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
+         sizeof(sb_fp_stats->inter_mode_rd_models));
+
+  memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  sb_fp_stats->current_qindex =
+      cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+static INLINE void restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats,
+                                    AV1_COMP *cpi, ThreadData *td,
+                                    TileDataEnc *tile_data, int mi_row,
+                                    int mi_col) {
+  MACROBLOCK *x = &td->mb;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
+  restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
+
+  cpi->td.rd_counts = sb_fp_stats->rd_count;
+  cpi->td.mb.txb_split_count = sb_fp_stats->split_count;
+
+  *td->counts = sb_fp_stats->fc;
+
+  memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
+         sizeof(sb_fp_stats->inter_mode_rd_models));
+  memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+      sb_fp_stats->current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
+                                 int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MACROBLOCK *x = &td->mb;
+  const int frame_idx = cpi->gf_group.index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+  av1_zero(x->search_ref_frame);
+
+  if (tpl_frame->is_valid == 0) return;
+  if (!is_frame_tpl_eligible(cpi)) return;
+  if (frame_idx >= MAX_LAG_BUFFERS) return;
+  if (cpi->superres_mode != SUPERRES_NONE) return;
+  if (cpi->oxcf.aq_mode != NO_AQ) return;
+
+  const int is_overlay = cpi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
+  if (is_overlay) {
+    memset(x->search_ref_frame, 1, sizeof(x->search_ref_frame));
+    return;
+  }
+
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  const int tpl_stride = tpl_frame->stride;
+  int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 };
+  const int step = 1 << block_mis_log2;
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const int mi_row_end =
+      AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows);
+  const int mi_col_end =
+      AOMMIN(mi_size_wide[sb_size] + mi_col, mi_params->mi_cols);
+
+  for (int row = mi_row; row < mi_row_end; row += step) {
+    for (int col = mi_col; col < mi_col_end; col += step) {
+      const TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+      int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 };
+      // Find the winner ref frame idx for the current block
+      int64_t best_inter_cost = this_stats->pred_error[0];
+      int best_rf_idx = 0;
+      for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) {
+        if ((this_stats->pred_error[idx] < best_inter_cost) &&
+            (this_stats->pred_error[idx] != 0)) {
+          best_inter_cost = this_stats->pred_error[idx];
+          best_rf_idx = idx;
+        }
+      }
+      // tpl_pred_error is the pred_error reduction of best_ref w.r.t.
+      // LAST_FRAME.
+      tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] -
+                                    this_stats->pred_error[LAST_FRAME - 1];
+
+      for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx)
+        inter_cost[rf_idx] += tpl_pred_error[rf_idx];
+    }
+  }
+
+  int rank_index[INTER_REFS_PER_FRAME - 1];
+  for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
+    rank_index[idx] = idx + 1;
+    for (int i = idx; i > 0; --i) {
+      if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) {
+        const int tmp = rank_index[i - 1];
+        rank_index[i - 1] = rank_index[i];
+        rank_index[i] = tmp;
+      }
+    }
+  }
+
+  x->search_ref_frame[INTRA_FRAME] = 1;
+  x->search_ref_frame[LAST_FRAME] = 1;
+
+  int cutoff_ref = 0;
+  for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
+    x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 1;
+    if (idx > 2) {
+      if (!cutoff_ref) {
+        // If the predictive coding gains are smaller than the previous more
+        // relevant frame over certain amount, discard this frame and all the
+        // frames afterwards.
+        if (llabs(inter_cost[rank_index[idx]]) <
+                llabs(inter_cost[rank_index[idx - 1]]) / 8 ||
+            inter_cost[rank_index[idx]] == 0)
+          cutoff_ref = 1;
+      }
+
+      if (cutoff_ref) x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 0;
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// This function initializes the stats for encode_rd_sb.
+static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
+                                     const TileDataEnc *tile_data,
+                                     PC_TREE *pc_root, RD_STATS *rd_cost,
+                                     int mi_row, int mi_col,
+                                     int gather_tpl_data) {
+  const AV1_COMMON *cm = &cpi->common;
+  const TileInfo *tile_info = &tile_data->tile_info;
+  MACROBLOCK *x = &td->mb;
+
+  const SPEED_FEATURES *sf = &cpi->sf;
+  const int use_simple_motion_search =
+      (sf->part_sf.simple_motion_search_split ||
+       sf->part_sf.simple_motion_search_prune_rect ||
+       sf->part_sf.simple_motion_search_early_term_none ||
+       sf->part_sf.ml_early_term_after_part_split_level) &&
+      !frame_is_intra_only(cm);
+  if (use_simple_motion_search) {
+    init_simple_motion_search_mvs(pc_root);
+  }
+
+#if !CONFIG_REALTIME_ONLY
+  init_ref_frame_space(cpi, td, mi_row, mi_col);
+  x->sb_energy_level = 0;
+  x->cnn_output_valid = 0;
+  if (gather_tpl_data) {
+    if (cm->delta_q_info.delta_q_present_flag) {
+      const int num_planes = av1_num_planes(cm);
+      const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+      setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
+      av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
+    }
+    if (cpi->oxcf.enable_tpl_model) {
+      adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col);
+    }
+  }
+#else
+  (void)tile_info;
+  (void)mi_row;
+  (void)mi_col;
+  (void)gather_tpl_data;
+#endif
+
+  // Reset hash state for transform/mode rd hash information
+  reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash);
+  av1_zero(x->picked_ref_frames_mask);
+  av1_zero(x->pred_mv);
+  av1_invalid_rd_stats(rd_cost);
+}
+
+static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
+                                    TileDataEnc *tile_data,
+                                    PC_TREE *const pc_root, TOKENEXTRA **tp,
+                                    const int mi_row, const int mi_col,
+                                    const int seg_skip) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+                      get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  int dummy_rate;
+  int64_t dummy_dist;
+  RD_STATS dummy_rdc;
+
+#if CONFIG_REALTIME_ONLY
+  (void)seg_skip;
+#endif  // CONFIG_REALTIME_ONLY
+
+  init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col, 1);
+
+  if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+    set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                     &dummy_rate, &dummy_dist, 1, pc_root);
+  }
+#if !CONFIG_REALTIME_ONLY
+  else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    const BLOCK_SIZE bsize =
+        seg_skip ? sb_size : sf->part_sf.always_this_block_size;
+    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                     &dummy_rate, &dummy_dist, 1, pc_root);
+  } else if (cpi->partition_search_skippable_frame) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    const BLOCK_SIZE bsize =
+        get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                     &dummy_rate, &dummy_dist, 1, pc_root);
+  } else {
+    // No stats for overlay frames. Exclude key frame.
+    x->valid_cost_b =
+        get_tpl_stats_b(cpi, sb_size, mi_row, mi_col, x->intra_cost_b,
+                        x->inter_cost_b, x->mv_b, &x->cost_stride);
+
+    reset_partition(pc_root, sb_size);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, rd_pick_partition_time);
+#endif
+    BLOCK_SIZE max_sq_size = x->max_partition_size;
+    BLOCK_SIZE min_sq_size = x->min_partition_size;
+
+    if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+      float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+      av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+      max_sq_size = AOMMAX(
+          AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size),
+          min_sq_size);
+    }
+
+    const int num_passes = cpi->oxcf.sb_multipass_unit_test ? 2 : 1;
+
+    if (num_passes == 1) {
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
+                        pc_root, NULL, SB_SINGLE_PASS, NULL);
+    } else {
+      // First pass
+      SB_FIRST_PASS_STATS sb_fp_stats;
+      backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
+                        pc_root, NULL, SB_DRY_PASS, NULL);
+
+      // Second pass
+      init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col,
+                        0);
+      reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+      reset_partition(pc_root, sb_size);
+
+      restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
+
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
+                        pc_root, NULL, SB_WET_PASS, NULL);
+    }
+    // Reset to 0 so that it won't be mistakenly used elsewhere.
+    x->valid_cost_b = 0;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, rd_pick_partition_time);
+#endif
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
+  // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
+  if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+      cm->tiles.cols == 1 && cm->tiles.rows == 1) {
+    av1_inter_mode_data_fit(tile_data, x->rdmult);
+  }
+}
+
+static AOM_INLINE void set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+                                         const TileInfo *const tile_info,
+                                         const int mi_row, const int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  switch (cpi->oxcf.coeff_cost_upd_freq) {
+    case COST_UPD_TILE:  // Tile level
+      if (mi_row != tile_info->mi_row_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SBROW:  // SB row level in tile
+      if (mi_col != tile_info->mi_col_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SB:  // SB level
+      if (cpi->sf.inter_sf.disable_sb_level_coeff_cost_upd &&
+          mi_col != tile_info->mi_col_start)
+        break;
+      av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
+      break;
+    default: assert(0);
+  }
+
+  switch (cpi->oxcf.mode_cost_upd_freq) {
+    case COST_UPD_TILE:  // Tile level
+      if (mi_row != tile_info->mi_row_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SBROW:  // SB row level in tile
+      if (mi_col != tile_info->mi_col_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SB:  // SB level
+      av1_fill_mode_rates(cm, x, xd->tile_ctx);
+      break;
+    default: assert(0);
+  }
+  switch (cpi->oxcf.mv_cost_upd_freq) {
+    case COST_UPD_OFF: break;
+    case COST_UPD_TILE:  // Tile level
+      if (mi_row != tile_info->mi_row_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SBROW:  // SB row level in tile
+      if (mi_col != tile_info->mi_col_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SB:  // SB level
+      if (cpi->sf.inter_sf.disable_sb_level_mv_cost_upd &&
+          mi_col != tile_info->mi_col_start)
+        break;
+      av1_fill_mv_costs(xd->tile_ctx, cm->features.cur_frame_force_integer_mv,
+                        cm->features.allow_high_precision_mv, x);
+      break;
+    default: assert(0);
+  }
+}
+
+static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
+                                     TileDataEnc *tile_data, int mi_row,
+                                     TOKENEXTRA **tp) {
+  AV1_COMMON *const cm = &cpi->common;
   const TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const int leaf_nodes = 256;
   const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
   const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
   const int mib_size = cm->seq_params.mib_size;
   const int mib_size_log2 = cm->seq_params.mib_size_log2;
   const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+  const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, encode_sb_time);
@@ -4670,13 +4898,14 @@
   av1_zero_left_context(xd);
 
   // Reset delta for every tile
-  if (mi_row == tile_info->mi_row_start) {
+  if (mi_row == tile_info->mi_row_start || cpi->row_mt) {
     if (cm->delta_q_info.delta_q_present_flag)
-      xd->current_qindex = cm->base_qindex;
+      xd->current_qindex = cm->quant_params.base_qindex;
     if (cm->delta_q_info.delta_lf_present_flag) {
       av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
     }
   }
+  reset_thresh_freq_fact(x);
 
   // Code each SB in the row
   for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
@@ -4700,185 +4929,38 @@
       }
     }
 
-    switch (cpi->oxcf.coeff_cost_upd_freq) {
-      case COST_UPD_TILE:  // Tile level
-        if (mi_row != tile_info->mi_row_start) break;
-        AOM_FALLTHROUGH_INTENDED;
-      case COST_UPD_SBROW:  // SB row level in tile
-        if (mi_col != tile_info->mi_col_start) break;
-        AOM_FALLTHROUGH_INTENDED;
-      case COST_UPD_SB:  // SB level
-        av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
-        break;
-      default: assert(0);
-    }
+    set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
 
-    switch (cpi->oxcf.mode_cost_upd_freq) {
-      case COST_UPD_TILE:  // Tile level
-        if (mi_row != tile_info->mi_row_start) break;
-        AOM_FALLTHROUGH_INTENDED;
-      case COST_UPD_SBROW:  // SB row level in tile
-        if (mi_col != tile_info->mi_col_start) break;
-        AOM_FALLTHROUGH_INTENDED;
-      case COST_UPD_SB:  // SB level
-        av1_fill_mode_rates(cm, x, xd->tile_ctx);
-        break;
-      default: assert(0);
-    }
+    x->color_sensitivity[0] = 0;
+    x->color_sensitivity[1] = 0;
+    x->content_state_sb = 0;
 
-    if (sf->adaptive_pred_interp_filter) {
-      for (int i = 0; i < leaf_nodes; ++i) {
-        td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
-        td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
-        td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
-        td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
-      }
-    }
-
-    x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
-
-    if (!use_nonrd_mode) {
-      av1_zero(x->txb_rd_record_8X8);
-      av1_zero(x->txb_rd_record_16X16);
-      av1_zero(x->txb_rd_record_32X32);
-      av1_zero(x->txb_rd_record_64X64);
-      av1_zero(x->txb_rd_record_intra);
-    }
-
-    av1_zero(x->picked_ref_frames_mask);
-
-    av1_zero(x->pred_mv);
-    PC_TREE *const pc_root = td->pc_root[mib_size_log2 - MIN_MIB_SIZE_LOG2];
+    PC_TREE *const pc_root = td->pc_root;
     pc_root->index = 0;
 
-    if ((sf->simple_motion_search_prune_rect ||
-         sf->simple_motion_search_early_term_none ||
-         sf->firstpass_simple_motion_search_early_term) &&
-        !frame_is_intra_only(cm)) {
-      init_simple_motion_search_mvs(pc_root);
-    }
+    xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv;
+    td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+    x->source_variance = UINT_MAX;
+    x->simple_motion_pred_sse = UINT_MAX;
 
     const struct segmentation *const seg = &cm->seg;
     int seg_skip = 0;
     if (seg->enabled) {
       const uint8_t *const map =
-          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
       const int segment_id =
-          map ? get_segment_id(cm, map, sb_size, mi_row, mi_col) : 0;
+          map ? get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col)
+              : 0;
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
-    xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
 
-    x->sb_energy_level = 0;
-    if (cm->delta_q_info.delta_q_present_flag)
-      setup_delta_q(cpi, x, tile_info, mi_row, mi_col, num_planes);
-
-    int dummy_rate;
-    int64_t dummy_dist;
-    RD_STATS dummy_rdc;
-    const int idx_str = cm->mi_stride * mi_row + mi_col;
-    MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
-    x->source_variance = UINT_MAX;
-    x->simple_motion_pred_sse = UINT_MAX;
-    if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-      const BLOCK_SIZE bsize = seg_skip ? sb_size : sf->always_this_block_size;
-      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                       &dummy_rate, &dummy_dist, 1, pc_root);
-    } else if (cpi->partition_search_skippable_frame) {
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-      const BLOCK_SIZE bsize =
-          get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                       &dummy_rate, &dummy_dist, 1, pc_root);
-    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
-               use_nonrd_mode) {
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-      av1_choose_var_based_partitioning(cpi, tile_info, x, mi_row, mi_col);
-      nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                          &dummy_rate, &dummy_dist, 1, pc_root);
-
+    if (use_nonrd_mode) {
+      encode_nonrd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col,
+                      seg_skip);
     } else {
-      const int orig_rdmult = cpi->rd.RDMULT;
-      x->cb_rdmult = orig_rdmult;
-      if (cpi->twopass.gf_group.index > 0 && cpi->oxcf.enable_tpl_model &&
-          cpi->oxcf.aq_mode == NO_AQ && cpi->oxcf.deltaq_mode == 0) {
-        const int dr =
-            get_rdmult_delta(cpi, BLOCK_128X128, mi_row, mi_col, orig_rdmult);
-
-        x->cb_rdmult = dr;
-        x->rdmult = x->cb_rdmult;
-      }
-
-      reset_partition(pc_root, sb_size);
-      x->use_cb_search_range = 0;
-#if CONFIG_COLLECT_COMPONENT_TIMING
-      start_timing(cpi, first_partition_search_pass_time);
-#endif
-      init_first_partition_pass_stats_tables(cpi,
-                                             x->first_partition_pass_stats);
-      // Do the first pass if we need two pass partition search
-      if (cpi->two_pass_partition_search &&
-          cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 &&
-          mi_row + mi_size_high[sb_size] <= cm->mi_rows &&
-          mi_col + mi_size_wide[sb_size] <= cm->mi_cols &&
-          cm->current_frame.frame_type != KEY_FRAME) {
-        first_partition_search_pass(cpi, td, tile_data, mi_row, mi_col, tp);
-      }
-#if CONFIG_COLLECT_COMPONENT_TIMING
-      end_timing(cpi, first_partition_search_pass_time);
-#endif
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-      start_timing(cpi, rd_pick_partition_time);
-#endif
-      BLOCK_SIZE max_sq_size = BLOCK_128X128;
-      switch (cpi->oxcf.max_partition_size) {
-        case 4: max_sq_size = BLOCK_4X4; break;
-        case 8: max_sq_size = BLOCK_8X8; break;
-        case 16: max_sq_size = BLOCK_16X16; break;
-        case 32: max_sq_size = BLOCK_32X32; break;
-        case 64: max_sq_size = BLOCK_64X64; break;
-        case 128: max_sq_size = BLOCK_128X128; break;
-        default: assert(0); break;
-      }
-      max_sq_size = AOMMIN(max_sq_size, sb_size);
-
-      BLOCK_SIZE min_sq_size = BLOCK_4X4;
-      switch (cpi->oxcf.min_partition_size) {
-        case 4: min_sq_size = BLOCK_4X4; break;
-        case 8: min_sq_size = BLOCK_8X8; break;
-        case 16: min_sq_size = BLOCK_16X16; break;
-        case 32: min_sq_size = BLOCK_32X32; break;
-        case 64: min_sq_size = BLOCK_64X64; break;
-        case 128: min_sq_size = BLOCK_128X128; break;
-        default: assert(0); break;
-      }
-
-      if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
-        float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
-
-        av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
-        max_sq_size =
-            AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size);
-      }
-
-      min_sq_size = AOMMIN(min_sq_size, max_sq_size);
-
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        max_sq_size, min_sq_size, &dummy_rdc, INT64_MAX,
-                        pc_root, NULL);
-#if CONFIG_COLLECT_COMPONENT_TIMING
-      end_timing(cpi, rd_pick_partition_time);
-#endif
+      encode_rd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col, seg_skip);
     }
-    // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
-    if (cpi->sf.inter_mode_rd_model_estimation == 1 && cm->tile_cols == 1 &&
-        cm->tile_rows == 1) {
-      av1_inter_mode_data_fit(tile_data, x->rdmult);
-    }
+
     if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
         (tile_info->mi_row_end > (mi_row + mib_size))) {
       if (sb_cols_in_tile == 1)
@@ -4895,7 +4977,7 @@
 #endif
 }
 
-static void init_encode_frame_mb_context(AV1_COMP *cpi) {
+static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &cpi->td.mb;
@@ -4909,62 +4991,24 @@
                          cm->seq_params.subsampling_y, num_planes);
 }
 
-static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
-  if (frame_is_intra_only(&cpi->common)) {
-    return INTRA_FRAME;
-  } else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
-             cpi->rc.is_src_frame_internal_arf) {
-    // We will not update the golden frame with an internal overlay frame
-    return ALTREF_FRAME;
-  } else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
-             cpi->refresh_alt_ref_frame) {
-    return GOLDEN_FRAME;
-  } else {
-    return LAST_FRAME;
-  }
-}
-
-static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
-  if (cpi->common.coded_lossless) return ONLY_4X4;
-  if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
-    return TX_MODE_LARGEST;
-  else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
-           cpi->sf.tx_size_search_method == USE_FAST_RD)
-    return TX_MODE_SELECT;
-  else
-    return cpi->common.tx_mode;
-}
-
 void av1_alloc_tile_data(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
-  int tile_col, tile_row;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
 
   if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
   CHECK_MEM_ERROR(
       cm, cpi->tile_data,
       aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
-  cpi->allocated_tiles = tile_cols * tile_rows;
 
-  for (tile_row = 0; tile_row < tile_rows; ++tile_row)
-    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileDataEnc *const tile_data =
-          &cpi->tile_data[tile_row * tile_cols + tile_col];
-      int i, j;
-      for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
-        for (j = 0; j < MAX_MODES; ++j) {
-          tile_data->thresh_freq_fact[i][j] = 32;
-        }
-      }
-    }
+  cpi->allocated_tiles = tile_cols * tile_rows;
 }
 
 void av1_init_tile_data(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   int tile_col, tile_row;
   TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
   TOKENLIST *tplist = cpi->tplist[0][0];
@@ -4985,9 +5029,9 @@
       cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
       tplist = cpi->tplist[tile_row][tile_col];
       tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
-      tile_data->allow_update_cdf = !cm->large_scale_tile;
+      tile_data->allow_update_cdf = !cm->tiles.large_scale;
       tile_data->allow_update_cdf =
-          tile_data->allow_update_cdf && !cm->disable_cdf_update;
+          tile_data->allow_update_cdf && !cm->features.disable_cdf_update;
       tile_data->tctx = *cm->fc;
     }
   }
@@ -4997,7 +5041,7 @@
                        int tile_col, int mi_row) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  const int tile_cols = cm->tile_cols;
+  const int tile_cols = cm->tiles.cols;
   TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
   const TileInfo *const tile_info = &this_tile->tile_info;
   TOKENEXTRA *tok = NULL;
@@ -5012,7 +5056,7 @@
                 cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
 
-  encode_sb_row(cpi, td, this_tile, mi_row, &tok, cpi->sf.use_nonrd_pick_mode);
+  encode_sb_row(cpi, td, this_tile, mi_row, &tok);
 
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
@@ -5033,36 +5077,30 @@
                      int tile_col) {
   AV1_COMMON *const cm = &cpi->common;
   TileDataEnc *const this_tile =
-      &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
+      &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
   const TileInfo *const tile_info = &this_tile->tile_info;
-  int mi_row;
 
-  av1_inter_mode_data_init(this_tile);
+  if (!cpi->sf.rt_sf.use_nonrd_pick_mode) av1_inter_mode_data_init(this_tile);
 
   av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
                          tile_info->mi_col_end, tile_row);
-  av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
+  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+                         &td->mb.e_mbd);
 
-  // Set up pointers to per thread motion search counters.
-  this_tile->m_search_count = 0;   // Count of motion search hits.
-  this_tile->ex_search_count = 0;  // Exhaustive mesh search hits.
-  td->mb.m_search_count_ptr = &this_tile->m_search_count;
-  td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
-
-  cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
+  if (cpi->oxcf.enable_cfl_intra) cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
 
   av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
 
-  for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+  for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += cm->seq_params.mib_size) {
     av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
   }
 }
 
-static void encode_tiles(AV1_COMP *cpi) {
+static AOM_INLINE void encode_tiles(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   int tile_col, tile_row;
 
   if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
@@ -5073,12 +5111,14 @@
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
       TileDataEnc *const this_tile =
-          &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
+          &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
       cpi->td.intrabc_used = 0;
+      cpi->td.deltaq_used = 0;
       cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
       cpi->td.mb.tile_pb_ctx = &this_tile->tctx;
       av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
       cpi->intrabc_used |= cpi->td.intrabc_used;
+      cpi->deltaq_used |= cpi->td.deltaq_used;
     }
   }
 }
@@ -5133,11 +5173,9 @@
   return (params_cost << AV1_PROB_COST_SHIFT);
 }
 
-static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm,
-                              int frame) {
-  (void)num_refs_using_gm;
+static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) {
   (void)frame;
-  switch (sf->gm_search_type) {
+  switch (sf->gm_sf.gm_search_type) {
     case GM_FULL_SEARCH: return 1;
     case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
       return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
@@ -5150,57 +5188,37 @@
   return 1;
 }
 
-static int get_max_allowed_ref_frames(const AV1_COMP *cpi) {
-  const unsigned int max_allowed_refs_for_given_speed =
-      (cpi->sf.selective_ref_frame >= 3) ? INTER_REFS_PER_FRAME - 1
-                                         : INTER_REFS_PER_FRAME;
-  return AOMMIN(max_allowed_refs_for_given_speed,
-                cpi->oxcf.max_reference_frames);
-}
-
-// Enforce the number of references for each arbitrary frame based on user
-// options and speed.
-static void enforce_max_ref_frames(AV1_COMP *cpi) {
+// Set the relative distance of a reference frame w.r.t. current frame
+static AOM_INLINE void set_rel_frame_dist(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
   MV_REFERENCE_FRAME ref_frame;
-  int total_valid_refs = 0;
+  int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX;
+  cpi->nearest_past_ref = NONE_FRAME;
+  cpi->nearest_future_ref = NONE_FRAME;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    cpi->ref_relative_dist[ref_frame - LAST_FRAME] = 0;
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
-      total_valid_refs++;
+      int dist = av1_encoder_get_relative_dist(
+          order_hint_info,
+          cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME],
+          cm->current_frame.display_order_hint);
+      cpi->ref_relative_dist[ref_frame - LAST_FRAME] = dist;
+      // Get the nearest ref_frame in the past
+      if (abs(dist) < min_past_dist && dist < 0) {
+        cpi->nearest_past_ref = ref_frame;
+        min_past_dist = abs(dist);
+      }
+      // Get the nearest ref_frame in the future
+      if (dist < min_future_dist && dist > 0) {
+        cpi->nearest_future_ref = ref_frame;
+        min_future_dist = dist;
+      }
     }
   }
-
-  const int max_allowed_refs = get_max_allowed_ref_frames(cpi);
-
-  // When more than 'max_allowed_refs' are available, we reduce the number of
-  // reference frames one at a time based on this order.
-  const MV_REFERENCE_FRAME disable_order[] = {
-    LAST3_FRAME,
-    LAST2_FRAME,
-    ALTREF2_FRAME,
-    GOLDEN_FRAME,
-  };
-
-  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
-    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
-
-    if (!(cpi->ref_frame_flags &
-          av1_ref_frame_flag_list[ref_frame_to_disable])) {
-      continue;
-    }
-
-    switch (ref_frame_to_disable) {
-      case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break;
-      case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break;
-      case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break;
-      case GOLDEN_FRAME: cpi->ref_frame_flags &= ~AOM_GOLD_FLAG; break;
-      default: assert(0);
-    }
-    --total_valid_refs;
-  }
-  assert(total_valid_refs <= max_allowed_refs);
 }
 
-static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) {
+static INLINE int refs_are_one_sided(const AV1_COMMON *cm) {
   assert(!frame_is_intra_only(cm));
 
   int one_sided_refs = 1;
@@ -5208,9 +5226,10 @@
     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
     if (buf == NULL) continue;
 
-    const int ref_order_hint = buf->order_hint;
-    if (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint,
-                          (int)cm->current_frame.order_hint) > 0) {
+    const int ref_display_order_hint = buf->display_order_hint;
+    if (av1_encoder_get_relative_dist(
+            &cm->seq_params.order_hint_info, ref_display_order_hint,
+            (int)cm->current_frame.display_order_hint) > 0) {
       one_sided_refs = 0;  // bwd reference
       break;
     }
@@ -5286,48 +5305,373 @@
   return 0;
 }
 
-static void set_default_interp_skip_flags(AV1_COMP *cpi) {
-  const int num_planes = av1_num_planes(&cpi->common);
-  cpi->default_interp_skip_flags = (num_planes == 1)
-                                       ? DEFAULT_LUMA_INTERP_SKIP_FLAG
-                                       : DEFAULT_INTERP_SKIP_FLAG;
+static AOM_INLINE void set_default_interp_skip_flags(
+    const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) {
+  const int num_planes = av1_num_planes(cm);
+  interp_search_flags->default_interp_skip_flags =
+      (num_planes == 1) ? INTERP_SKIP_LUMA_EVAL_CHROMA
+                        : INTERP_SKIP_LUMA_SKIP_CHROMA;
 }
 
-static void encode_frame_internal(AV1_COMP *cpi) {
+// TODO(Remya): Can include erroradv_prod_tr[] for threshold calculation
+static INLINE int64_t calc_erroradv_threshold(AV1_COMP *cpi,
+                                              int64_t ref_frame_error) {
+  if (!cpi->sf.gm_sf.disable_adaptive_warp_error_thresh)
+    return (int64_t)(
+        ref_frame_error * erroradv_tr[cpi->sf.gm_sf.gm_erroradv_type] + 0.5);
+  else
+    return INT64_MAX;
+}
+
+static void compute_global_motion_for_ref_frame(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map,
+    const int segment_map_w, const int segment_map_h,
+    const WarpedMotionParams *ref_params) {
   ThreadData *const td = &cpi->td;
   MACROBLOCK *const x = &td->mb;
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
+  int i;
+  // clang-format off
+  static const double kIdentityParams[MAX_PARAMDIM - 1] = {
+     0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
+  };
+  // clang-format on
+  WarpedMotionParams tmp_wm_params;
+  const double *params_this_motion;
+  int inliers_by_motion[RANSAC_NUM_MOTIONS];
+  assert(ref_buf[frame] != NULL);
+  if (*num_frm_corners < 0) {
+    // compute interest points using FAST features
+    *num_frm_corners = av1_fast_corner_detect(
+        frm_buffer, cpi->source->y_width, cpi->source->y_height,
+        cpi->source->y_stride, frm_corners, MAX_CORNERS);
+  }
+  TransformationType model;
+
+  aom_clear_system_state();
+
+  // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
+  const int do_adaptive_gm_estimation = 0;
+
+  const int ref_frame_dist = get_relative_dist(
+      &cm->seq_params.order_hint_info, cm->current_frame.order_hint,
+      cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
+  const GlobalMotionEstimationType gm_estimation_type =
+      cm->seq_params.order_hint_info.enable_order_hint &&
+              abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
+          ? GLOBAL_MOTION_DISFLOW_BASED
+          : GLOBAL_MOTION_FEATURE_BASED;
+  for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
+    int64_t best_warp_error = INT64_MAX;
+    // Initially set all params to identity.
+    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+      memcpy(params_by_motion[i].params, kIdentityParams,
+             (MAX_PARAMDIM - 1) * sizeof(*(params_by_motion[i].params)));
+      params_by_motion[i].num_inliers = 0;
+    }
+
+    av1_compute_global_motion(
+        model, frm_buffer, cpi->source->y_width, cpi->source->y_height,
+        cpi->source->y_stride, frm_corners, *num_frm_corners, ref_buf[frame],
+        cpi->common.seq_params.bit_depth, gm_estimation_type, inliers_by_motion,
+        params_by_motion, RANSAC_NUM_MOTIONS);
+    int64_t ref_frame_error = 0;
+    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+      if (inliers_by_motion[i] == 0) continue;
+
+      params_this_motion = params_by_motion[i].params;
+      av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
+
+      if (tmp_wm_params.wmtype != IDENTITY) {
+        av1_compute_feature_segmentation_map(
+            segment_map, segment_map_w, segment_map_h,
+            params_by_motion[i].inliers, params_by_motion[i].num_inliers);
+
+        ref_frame_error = av1_segmented_frame_error(
+            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+            ref_buf[frame]->y_stride, cpi->source->y_buffer,
+            cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride,
+            segment_map, segment_map_w);
+
+        int64_t erroradv_threshold =
+            calc_erroradv_threshold(cpi, ref_frame_error);
+
+        const int64_t warp_error = av1_refine_integerized_param(
+            &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
+            ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
+            ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
+            cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height,
+            cpi->source->y_stride, GM_REFINEMENT_COUNT, best_warp_error,
+            segment_map, segment_map_w, erroradv_threshold);
+
+        if (warp_error < best_warp_error) {
+          best_warp_error = warp_error;
+          // Save the wm_params modified by
+          // av1_refine_integerized_param() rather than motion index to
+          // avoid rerunning refine() below.
+          memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+                 sizeof(WarpedMotionParams));
+        }
+      }
+    }
+    if (cm->global_motion[frame].wmtype <= AFFINE)
+      if (!av1_get_shear_params(&cm->global_motion[frame]))
+        cm->global_motion[frame] = default_warp_params;
+
+    if (cm->global_motion[frame].wmtype == TRANSLATION) {
+      cm->global_motion[frame].wmmat[0] =
+          convert_to_trans_prec(cm->features.allow_high_precision_mv,
+                                cm->global_motion[frame].wmmat[0]) *
+          GM_TRANS_ONLY_DECODE_FACTOR;
+      cm->global_motion[frame].wmmat[1] =
+          convert_to_trans_prec(cm->features.allow_high_precision_mv,
+                                cm->global_motion[frame].wmmat[1]) *
+          GM_TRANS_ONLY_DECODE_FACTOR;
+    }
+
+    if (cm->global_motion[frame].wmtype == IDENTITY) continue;
+
+    if (ref_frame_error == 0) continue;
+
+    // If the best error advantage found doesn't meet the threshold for
+    // this motion type, revert to IDENTITY.
+    if (!av1_is_enough_erroradvantage(
+            (double)best_warp_error / ref_frame_error,
+            gm_get_params_cost(&cm->global_motion[frame], ref_params,
+                               cm->features.allow_high_precision_mv),
+            cpi->sf.gm_sf.gm_erroradv_type)) {
+      cm->global_motion[frame] = default_warp_params;
+    }
+
+    if (cm->global_motion[frame].wmtype != IDENTITY) break;
+  }
+
+  aom_clear_system_state();
+}
+
+typedef struct {
+  int distance;
+  MV_REFERENCE_FRAME frame;
+} FrameDistPair;
+
+static INLINE void update_valid_ref_frames_for_gm(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+    FrameDistPair *past_ref_frame, FrameDistPair *future_ref_frame,
+    int *num_past_ref_frames, int *num_future_ref_frames) {
+  AV1_COMMON *const cm = &cpi->common;
+  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+  for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
+    const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME };
+    RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
+    const int ref_disabled =
+        !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
+    ref_buf[frame] = NULL;
+    cm->global_motion[frame] = default_warp_params;
+    // Skip global motion estimation for invalid ref frames
+    if (buf == NULL ||
+        (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) {
+      cpi->gm_info.params_cost[frame] = 0;
+      continue;
+    } else {
+      ref_buf[frame] = &buf->buf;
+    }
+
+    if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
+        ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
+        do_gm_search_logic(&cpi->sf, frame) &&
+        !prune_ref_by_selective_ref_frame(
+            cpi, NULL, ref_frame, cm->cur_frame->ref_display_order_hint) &&
+        !(cpi->sf.gm_sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
+      assert(ref_buf[frame] != NULL);
+      int relative_frame_dist = av1_encoder_get_relative_dist(
+          order_hint_info, buf->display_order_hint,
+          cm->cur_frame->display_order_hint);
+      // Populate past and future ref frames
+      if (relative_frame_dist <= 0) {
+        past_ref_frame[*num_past_ref_frames].distance =
+            abs(relative_frame_dist);
+        past_ref_frame[*num_past_ref_frames].frame = frame;
+        (*num_past_ref_frames)++;
+      } else {
+        future_ref_frame[*num_future_ref_frames].distance =
+            abs(relative_frame_dist);
+        future_ref_frame[*num_future_ref_frames].frame = frame;
+        (*num_future_ref_frames)++;
+      }
+    }
+  }
+}
+
+static INLINE void compute_gm_for_valid_ref_frames(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map,
+    const int segment_map_w, const int segment_map_h) {
+  AV1_COMMON *const cm = &cpi->common;
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  const WarpedMotionParams *ref_params =
+      cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+                     : &default_warp_params;
+
+  compute_global_motion_for_ref_frame(
+      cpi, ref_buf, frame, num_frm_corners, frm_corners, frm_buffer,
+      params_by_motion, segment_map, segment_map_w, segment_map_h, ref_params);
+
+  gm_info->params_cost[frame] =
+      gm_get_params_cost(&cm->global_motion[frame], ref_params,
+                         cm->features.allow_high_precision_mv) +
+      gm_info->type_cost[cm->global_motion[frame].wmtype] -
+      gm_info->type_cost[IDENTITY];
+}
+
+static int compare_distance(const void *a, const void *b) {
+  const int diff =
+      ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance;
+  if (diff > 0)
+    return 1;
+  else if (diff < 0)
+    return -1;
+  return 0;
+}
+
+static INLINE void compute_global_motion_for_references(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+    FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames,
+    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map,
+    const int segment_map_w, const int segment_map_h) {
+  AV1_COMMON *const cm = &cpi->common;
+  // Compute global motion w.r.t. reference frames starting from the nearest ref
+  // frame in a given direction
+  for (int frame = 0; frame < num_ref_frames; frame++) {
+    int ref_frame = reference_frame[frame].frame;
+    compute_gm_for_valid_ref_frames(cpi, ref_buf, ref_frame, num_frm_corners,
+                                    frm_corners, frm_buffer, params_by_motion,
+                                    segment_map, segment_map_w, segment_map_h);
+    // If global motion w.r.t. current ref frame is
+    // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
+    // the remaining ref frames in that direction. The below exit is disabled
+    // when ref frame distance w.r.t. current frame is zero. E.g.:
+    // source_alt_ref_frame w.r.t. ARF frames
+    if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+        reference_frame[frame].distance != 0 &&
+        cm->global_motion[ref_frame].wmtype != ROTZOOM)
+      break;
+  }
+}
+
+// Populates cpi->prune_ref_frame_mask with one bit per compound reference
+// index (REF_FRAMES..MODE_CTX_REF_FRAMES-1) that should be skipped at frame
+// level. Active only when nonrd pick mode is off and selective_ref_frame >= 2.
+// Two pruning rules: (1) when references are not all one-sided, prune compound
+// pairs whose two refs lie on the same temporal side of the current frame;
+// (2) at selective_ref_frame >= 4, prune ALTREF2 pairs when BWDREF is an
+// equally close or closer future reference.
+static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) {
+  if (!cpi->sf.rt_sf.use_nonrd_pick_mode &&
+      cpi->sf.inter_sf.selective_ref_frame >= 2) {
+    AV1_COMMON *const cm = &cpi->common;
+    const OrderHintInfo *const order_hint_info =
+        &cm->seq_params.order_hint_info;
+    const int cur_frame_display_order_hint =
+        cm->current_frame.display_order_hint;
+    unsigned int *ref_display_order_hint =
+        cm->cur_frame->ref_display_order_hint;
+    // Signed display-order distances of ALTREF2 / BWDREF relative to the
+    // current frame; a positive value means the ref is a future reference.
+    const int arf2_dist = av1_encoder_get_relative_dist(
+        order_hint_info, ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME],
+        cur_frame_display_order_hint);
+    const int bwd_dist = av1_encoder_get_relative_dist(
+        order_hint_info, ref_display_order_hint[BWDREF_FRAME - LAST_FRAME],
+        cur_frame_display_order_hint);
+
+    for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) {
+      MV_REFERENCE_FRAME rf[2];
+      av1_set_ref_frame(rf, ref_idx);
+      // Skip pairs where either constituent ref is not enabled for this frame.
+      if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) ||
+          !(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) {
+        continue;
+      }
+
+      if (!cpi->all_one_sided_refs) {
+        int ref_dist[2];
+        for (int i = 0; i < 2; ++i) {
+          ref_dist[i] = av1_encoder_get_relative_dist(
+              order_hint_info, ref_display_order_hint[rf[i] - LAST_FRAME],
+              cur_frame_display_order_hint);
+        }
+
+        // One-sided compound is used only when all reference frames are
+        // one-sided.
+        if ((ref_dist[0] > 0) == (ref_dist[1] > 0)) {
+          cpi->prune_ref_frame_mask |= 1 << ref_idx;
+        }
+      }
+
+      if (cpi->sf.inter_sf.selective_ref_frame >= 4 &&
+          (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME) &&
+          (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) {
+        // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references.
+        if (arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist) {
+          // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer
+          // reference to the current frame than ALTREF2_FRAME
+          cpi->prune_ref_frame_mask |= 1 << ref_idx;
+        }
+      }
+    }
+  }
+}
+
+#define CHECK_PRECOMPUTED_REF_FRAME_MAP 0
+
+static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
+  ThreadData *const td = &cpi->td;
+  MACROBLOCK *const x = &td->mb;
+  AV1_COMMON *const cm = &cpi->common;
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  FeatureFlags *const features = &cm->features;
+  MACROBLOCKD *const xd = &x->e_mbd;
   RD_COUNTS *const rdc = &cpi->td.rd_counts;
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  FrameProbInfo *const frame_probs = &cpi->frame_probs;
+  IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info;
   int i;
 
-  x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
-  x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
-#if CONFIG_DIST_8X8
-  x->using_dist_8x8 = cpi->oxcf.using_dist_8x8;
-  x->tune_metric = cpi->oxcf.tuning;
-#endif
-  cm->setup_mi(cm);
+  if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    mi_params->setup_mi(mi_params);
+  }
 
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
+  set_mi_offsets(mi_params, xd, 0, 0);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  x->fwd_txfm4x4 = aom_fdct4x4;
+#else
+  x->fwd_txfm4x4 = aom_fdct4x4_lp;
+#endif
 
   av1_zero(*td->counts);
   av1_zero(rdc->comp_pred_diff);
-  // Two pass partition search can be enabled/disabled for different frames.
-  // Reset this data at frame level to avoid any incorrect usage.
-  init_first_partition_pass_stats_tables(cpi, x->first_partition_pass_stats);
+  av1_zero(rdc->tx_type_used);
+  av1_zero(rdc->obmc_used);
+  av1_zero(rdc->warped_used);
 
   // Reset the flag.
   cpi->intrabc_used = 0;
   // Need to disable intrabc when superres is selected
   if (av1_superres_scaled(cm)) {
-    cm->allow_intrabc = 0;
+    features->allow_intrabc = 0;
   }
 
-  cm->allow_intrabc &= (cpi->oxcf.enable_intrabc);
+  features->allow_intrabc &= (cpi->oxcf.enable_intrabc);
 
-  if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) {
+  if (features->allow_warped_motion &&
+      cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    if (frame_probs->warped_probs[update_type] <
+        cpi->sf.inter_sf.prune_warped_prob_thresh)
+      features->allow_warped_motion = 0;
+  }
+
+  int hash_table_created = 0;
+  if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
+      !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    // TODO(any): move this outside of the recoding loop to avoid recalculating
+    // the hash table.
     // add to hash table
     const int pic_width = cpi->source->y_crop_width;
     const int pic_height = cpi->source->y_crop_height;
@@ -5347,46 +5691,28 @@
       }
     }
 
-    av1_hash_table_create(&cm->cur_frame->hash_table);
-    av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0],
-                                      is_block_same[0], &cpi->td.mb);
-    av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0],
-                                  block_hash_values[1], is_block_same[0],
-                                  is_block_same[1], &cpi->td.mb);
-    av1_add_to_hash_map_by_row_with_precal_data(
-        &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
-        pic_width, pic_height, 4);
-    av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1],
-                                  block_hash_values[0], is_block_same[1],
-                                  is_block_same[0], &cpi->td.mb);
-    av1_add_to_hash_map_by_row_with_precal_data(
-        &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
-        pic_width, pic_height, 8);
-    av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0],
-                                  block_hash_values[1], is_block_same[0],
-                                  is_block_same[1], &cpi->td.mb);
-    av1_add_to_hash_map_by_row_with_precal_data(
-        &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
-        pic_width, pic_height, 16);
-    av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1],
-                                  block_hash_values[0], is_block_same[1],
-                                  is_block_same[0], &cpi->td.mb);
-    av1_add_to_hash_map_by_row_with_precal_data(
-        &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
-        pic_width, pic_height, 32);
-    av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0],
-                                  block_hash_values[1], is_block_same[0],
-                                  is_block_same[1], &cpi->td.mb);
-    av1_add_to_hash_map_by_row_with_precal_data(
-        &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
-        pic_width, pic_height, 64);
-
-    av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1],
-                                  block_hash_values[0], is_block_same[1],
-                                  is_block_same[0], &cpi->td.mb);
-    av1_add_to_hash_map_by_row_with_precal_data(
-        &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
-        pic_width, pic_height, 128);
+    av1_hash_table_init(intrabc_hash_info);
+    av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table);
+    hash_table_created = 1;
+    av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source,
+                                      block_hash_values[0], is_block_same[0]);
+    // Hash data generated for screen contents is used for intraBC ME
+    const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize];
+    const int max_sb_size =
+        (1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2));
+    int src_idx = 0;
+    for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) {
+      const int dst_idx = !src_idx;
+      av1_generate_block_hash_value(
+          intrabc_hash_info, cpi->source, size, block_hash_values[src_idx],
+          block_hash_values[dst_idx], is_block_same[src_idx],
+          is_block_same[dst_idx]);
+      if (size >= min_alloc_size) {
+        av1_add_to_hash_map_by_row_with_precal_data(
+            &intrabc_hash_info->intrabc_hash_table, block_hash_values[dst_idx],
+            is_block_same[dst_idx][2], pic_width, pic_height, size);
+      }
+    }
 
     for (k = 0; k < 2; k++) {
       for (j = 0; j < 2; j++) {
@@ -5399,74 +5725,69 @@
     }
   }
 
+  const CommonQuantParams *quant_params = &cm->quant_params;
   for (i = 0; i < MAX_SEGMENTS; ++i) {
-    const int qindex = cm->seg.enabled
-                           ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
-                           : cm->base_qindex;
-    xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
-                      cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
-                      cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
-    if (xd->lossless[i]) cpi->has_lossless_segment = 1;
+    const int qindex =
+        cm->seg.enabled ? av1_get_qindex(&cm->seg, i, quant_params->base_qindex)
+                        : quant_params->base_qindex;
+    xd->lossless[i] =
+        qindex == 0 && quant_params->y_dc_delta_q == 0 &&
+        quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 &&
+        quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0;
+    if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1;
     xd->qindex[i] = qindex;
     if (xd->lossless[i]) {
-      cpi->optimize_seg_arr[i] = 0;
+      cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT;
     } else {
-      cpi->optimize_seg_arr[i] = cpi->sf.optimize_coefficients;
+      cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients;
     }
   }
-  cm->coded_lossless = is_coded_lossless(cm, xd);
-  cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
-
-  cm->tx_mode = select_tx_mode(cpi);
+  features->coded_lossless = is_coded_lossless(cm, xd);
+  features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
 
   // Fix delta q resolution for the moment
-  cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES;
+  cm->delta_q_info.delta_q_res = 0;
+  if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE)
+    cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
+  else if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL)
+    cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
   // Set delta_q_present_flag before it is used for the first time
   cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
   cm->delta_q_info.delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
-  cm->delta_q_info.delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
-  cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
-  // update delta_q_present_flag and delta_lf_present_flag based on
-  // base_qindex
-  cm->delta_q_info.delta_q_present_flag &= cm->base_qindex > 0;
-  cm->delta_q_info.delta_lf_present_flag &= cm->base_qindex > 0;
 
-  if (cpi->twopass.gf_group.index &&
-      cpi->twopass.gf_group.index < MAX_LAG_BUFFERS &&
-      cpi->oxcf.enable_tpl_model) {
-    TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index];
-    TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-
-    int tpl_stride = tpl_frame->stride;
-    int64_t intra_cost_base = 0;
-    int64_t mc_dep_cost_base = 0;
-    int row, col;
-
-    for (row = 0; row < cm->mi_rows; ++row) {
-      for (col = 0; col < cm->mi_cols; ++col) {
-        TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
-        intra_cost_base += this_stats->intra_cost;
-        mc_dep_cost_base += this_stats->mc_dep_cost;
-      }
-    }
-
-    aom_clear_system_state();
-
-    if (tpl_frame->is_valid)
-      cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
+  // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q is used
+  // for ineligible frames. That effectively will turn off row_mt usage.
+  // Note objective delta_q and tpl eligible frames are only altref frames
+  // currently.
+  if (cm->delta_q_info.delta_q_present_flag) {
+    if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE &&
+        !is_frame_tpl_eligible(cpi))
+      cm->delta_q_info.delta_q_present_flag = 0;
   }
 
-  av1_frame_init_quantizer(cpi);
+  // Reset delta_q_used flag
+  cpi->deltaq_used = 0;
 
+  cm->delta_q_info.delta_lf_present_flag =
+      cm->delta_q_info.delta_q_present_flag && cpi->oxcf.deltalf_mode;
+  cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+
+  // update delta_q_present_flag and delta_lf_present_flag based on
+  // base_qindex
+  cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
+  cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
+
+  av1_frame_init_quantizer(cpi);
   av1_initialize_rd_consts(cpi);
-  av1_initialize_me_consts(cpi, x, cm->base_qindex);
+  av1_initialize_me_consts(cpi, x, quant_params->base_qindex);
+
   init_encode_frame_mb_context(cpi);
-  set_default_interp_skip_flags(cpi);
-  if (cm->prev_frame)
+  set_default_interp_skip_flags(cm, &cpi->interp_search_flags);
+  if (cm->prev_frame && cm->prev_frame->seg.enabled)
     cm->last_frame_seg_map = cm->prev_frame->seg_map;
   else
     cm->last_frame_seg_map = NULL;
-  if (cm->allow_intrabc || cm->coded_lossless) {
+  if (features->allow_intrabc || features->coded_lossless) {
     av1_set_default_ref_deltas(cm->lf.ref_deltas);
     av1_set_default_mode_deltas(cm->lf.mode_deltas);
   } else if (cm->prev_frame) {
@@ -5476,9 +5797,12 @@
   memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
   memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
 
-  // Special case: set prev_mi to NULL when the previous mode info
-  // context cannot be used.
-  cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL;
+  cpi->all_one_sided_refs =
+      frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm);
+
+  cpi->prune_ref_frame_mask = 0;
+  // Figure out which ref frames can be skipped at frame level.
+  setup_prune_ref_frame_mask(cpi);
 
   x->txb_split_count = 0;
 #if CONFIG_SPEED_STATS
@@ -5489,146 +5813,80 @@
   start_timing(cpi, av1_compute_global_motion_time);
 #endif
   av1_zero(rdc->global_motion_used);
-  av1_zero(cpi->gmparams_cost);
+  av1_zero(gm_info->params_cost);
   if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
-      cpi->oxcf.enable_global_motion && !cpi->global_motion_search_done) {
+      cpi->oxcf.enable_global_motion && !gm_info->search_done) {
     YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
-    int frame;
-    double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)];
-    const double *params_this_motion;
-    int inliers_by_motion[RANSAC_NUM_MOTIONS];
-    WarpedMotionParams tmp_wm_params;
-    // clang-format off
-    static const double kIdentityParams[MAX_PARAMDIM - 1] = {
-      0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
+    MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
+    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+      memset(&params_by_motion[m], 0, sizeof(params_by_motion[m]));
+      params_by_motion[m].inliers =
+          aom_malloc(sizeof(*(params_by_motion[m].inliers)) * 2 * MAX_CORNERS);
+    }
+
+    int num_frm_corners = -1;
+    int frm_corners[2 * MAX_CORNERS];
+    unsigned char *frm_buffer = cpi->source->y_buffer;
+    if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+      // The frame buffer is 16-bit, so we need to convert to 8 bits for the
+      // following code. We cache the result until the frame is released.
+      frm_buffer =
+          av1_downconvert_frame(cpi->source, cpi->common.seq_params.bit_depth);
+    }
+    const int segment_map_w =
+        (cpi->source->y_width + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
+    const int segment_map_h =
+        (cpi->source->y_height + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
+
+    uint8_t *segment_map =
+        aom_malloc(sizeof(*segment_map) * segment_map_w * segment_map_h);
+    memset(segment_map, 0,
+           sizeof(*segment_map) * segment_map_w * segment_map_h);
+
+    FrameDistPair future_ref_frame[REF_FRAMES - 1] = {
+      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
+      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
+      { -1, NONE_FRAME }
     };
-    // clang-format on
-    int num_refs_using_gm = 0;
+    FrameDistPair past_ref_frame[REF_FRAMES - 1] = {
+      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
+      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
+      { -1, NONE_FRAME }
+    };
+    int num_past_ref_frames = 0;
+    int num_future_ref_frames = 0;
+    // Populate ref_buf for valid ref frames in global motion
+    update_valid_ref_frames_for_gm(cpi, ref_buf, past_ref_frame,
+                                   future_ref_frame, &num_past_ref_frames,
+                                   &num_future_ref_frames);
 
-    for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
-      ref_buf[frame] = NULL;
-      RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
-      if (buf != NULL) ref_buf[frame] = &buf->buf;
-      int pframe;
-      cm->global_motion[frame] = default_warp_params;
-      const WarpedMotionParams *ref_params =
-          cm->prev_frame ? &cm->prev_frame->global_motion[frame]
-                         : &default_warp_params;
-      // check for duplicate buffer
-      for (pframe = ALTREF_FRAME; pframe > frame; --pframe) {
-        if (ref_buf[frame] == ref_buf[pframe]) break;
-      }
-      if (pframe > frame) {
-        memcpy(&cm->global_motion[frame], &cm->global_motion[pframe],
-               sizeof(WarpedMotionParams));
-      } else if (ref_buf[frame] &&
-                 ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
-                 ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
-                 do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) &&
-                 !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
-        TransformationType model;
-        const int64_t ref_frame_error = av1_frame_error(
-            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
-            ref_buf[frame]->y_stride, cpi->source->y_buffer,
-            cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride);
+    // Sort the ref frames in the ascending order of their distance from the
+    // current frame
+    qsort(past_ref_frame, num_past_ref_frames, sizeof(past_ref_frame[0]),
+          compare_distance);
+    qsort(future_ref_frame, num_future_ref_frames, sizeof(future_ref_frame[0]),
+          compare_distance);
 
-        if (ref_frame_error == 0) continue;
+    // Compute global motion w.r.t. past reference frames
+    if (num_past_ref_frames > 0)
+      compute_global_motion_for_references(
+          cpi, ref_buf, past_ref_frame, num_past_ref_frames, &num_frm_corners,
+          frm_corners, frm_buffer, params_by_motion, segment_map, segment_map_w,
+          segment_map_h);
 
-        aom_clear_system_state();
+    // Compute global motion w.r.t. future reference frames
+    if (num_future_ref_frames > 0)
+      compute_global_motion_for_references(
+          cpi, ref_buf, future_ref_frame, num_future_ref_frames,
+          &num_frm_corners, frm_corners, frm_buffer, params_by_motion,
+          segment_map, segment_map_w, segment_map_h);
 
-        // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
-        const int do_adaptive_gm_estimation = 0;
+    aom_free(segment_map);
 
-        const int ref_frame_dist = get_relative_dist(
-            &cm->seq_params.order_hint_info, cm->current_frame.order_hint,
-            cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
-        const GlobalMotionEstimationType gm_estimation_type =
-            cm->seq_params.order_hint_info.enable_order_hint &&
-                    abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
-                ? GLOBAL_MOTION_DISFLOW_BASED
-                : GLOBAL_MOTION_FEATURE_BASED;
-        for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
-          int64_t best_warp_error = INT64_MAX;
-          // Initially set all params to identity.
-          for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
-            memcpy(params_by_motion + (MAX_PARAMDIM - 1) * i, kIdentityParams,
-                   (MAX_PARAMDIM - 1) * sizeof(*params_by_motion));
-          }
-
-          av1_compute_global_motion(model, cpi->source, ref_buf[frame],
-                                    cpi->common.seq_params.bit_depth,
-                                    gm_estimation_type, inliers_by_motion,
-                                    params_by_motion, RANSAC_NUM_MOTIONS);
-
-          for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
-            if (inliers_by_motion[i] == 0) continue;
-
-            params_this_motion = params_by_motion + (MAX_PARAMDIM - 1) * i;
-            av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
-
-            if (tmp_wm_params.wmtype != IDENTITY) {
-              const int64_t warp_error = av1_refine_integerized_param(
-                  &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd),
-                  xd->bd, ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
-                  ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
-                  cpi->source->y_buffer, cpi->source->y_width,
-                  cpi->source->y_height, cpi->source->y_stride, 5,
-                  best_warp_error);
-              if (warp_error < best_warp_error) {
-                best_warp_error = warp_error;
-                // Save the wm_params modified by
-                // av1_refine_integerized_param() rather than motion index to
-                // avoid rerunning refine() below.
-                memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
-                       sizeof(WarpedMotionParams));
-              }
-            }
-          }
-          if (cm->global_motion[frame].wmtype <= AFFINE)
-            if (!get_shear_params(&cm->global_motion[frame]))
-              cm->global_motion[frame] = default_warp_params;
-
-          if (cm->global_motion[frame].wmtype == TRANSLATION) {
-            cm->global_motion[frame].wmmat[0] =
-                convert_to_trans_prec(cm->allow_high_precision_mv,
-                                      cm->global_motion[frame].wmmat[0]) *
-                GM_TRANS_ONLY_DECODE_FACTOR;
-            cm->global_motion[frame].wmmat[1] =
-                convert_to_trans_prec(cm->allow_high_precision_mv,
-                                      cm->global_motion[frame].wmmat[1]) *
-                GM_TRANS_ONLY_DECODE_FACTOR;
-          }
-
-          // If the best error advantage found doesn't meet the threshold for
-          // this motion type, revert to IDENTITY.
-          if (!av1_is_enough_erroradvantage(
-                  (double)best_warp_error / ref_frame_error,
-                  gm_get_params_cost(&cm->global_motion[frame], ref_params,
-                                     cm->allow_high_precision_mv),
-                  cpi->sf.gm_erroradv_type)) {
-            cm->global_motion[frame] = default_warp_params;
-          }
-          if (cm->global_motion[frame].wmtype != IDENTITY) break;
-        }
-        aom_clear_system_state();
-      }
-      if (cm->global_motion[frame].wmtype != IDENTITY) num_refs_using_gm++;
-      cpi->gmparams_cost[frame] =
-          gm_get_params_cost(&cm->global_motion[frame], ref_params,
-                             cm->allow_high_precision_mv) +
-          cpi->gmtype_cost[cm->global_motion[frame].wmtype] -
-          cpi->gmtype_cost[IDENTITY];
+    gm_info->search_done = 1;
+    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+      aom_free(params_by_motion[m].inliers);
     }
-    // clear disabled ref_frames
-    for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
-      const int ref_disabled =
-          !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
-      if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) {
-        cpi->gmparams_cost[frame] = 0;
-        cm->global_motion[frame] = default_warp_params;
-      }
-    }
-    cpi->global_motion_search_done = 1;
   }
   memcpy(cm->cur_frame->global_motion, cm->global_motion,
          REF_FRAMES * sizeof(WarpedMotionParams));
@@ -5639,53 +5897,151 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_setup_motion_field_time);
 #endif
-  av1_setup_motion_field(cm);
+  if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_setup_motion_field_time);
 #endif
 
-  cpi->all_one_sided_refs =
-      frame_is_intra_only(cm) ? 0 : av1_refs_are_one_sided(cm);
-
   cm->current_frame.skip_mode_info.skip_mode_flag =
       check_skip_mode_enabled(cpi);
 
-  {
-    cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy;
-    cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy;
-    cpi->row_mt = 0;
-    if (cpi->oxcf.row_mt && (cpi->oxcf.max_threads > 1)) {
-      cpi->row_mt = 1;
-      cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read;
-      cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write;
-      av1_encode_tiles_row_mt(cpi);
-    } else {
-      if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1)
-        av1_encode_tiles_mt(cpi);
-      else
-        encode_tiles(cpi);
-    }
+  cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy;
+  cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy;
+  cpi->row_mt = 0;
+
+  if (cpi->oxcf.row_mt && (cpi->oxcf.max_threads > 1)) {
+    cpi->row_mt = 1;
+    cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read;
+    cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write;
+    av1_encode_tiles_row_mt(cpi);
+  } else {
+    if (AOMMIN(cpi->oxcf.max_threads, cm->tiles.cols * cm->tiles.rows) > 1)
+      av1_encode_tiles_mt(cpi);
+    else
+      encode_tiles(cpi);
   }
 
   // If intrabc is allowed but never selected, reset the allow_intrabc flag.
-  if (cm->allow_intrabc && !cpi->intrabc_used) cm->allow_intrabc = 0;
-  if (cm->allow_intrabc) cm->delta_q_info.delta_lf_present_flag = 0;
+  if (features->allow_intrabc && !cpi->intrabc_used) {
+    features->allow_intrabc = 0;
+  }
+  if (features->allow_intrabc) {
+    cm->delta_q_info.delta_lf_present_flag = 0;
+  }
+
+  if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) {
+    cm->delta_q_info.delta_q_present_flag = 0;
+  }
+
+  // Set the transform size appropriately before bitstream creation
+  const MODE_EVAL_TYPE eval_type =
+      cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch
+          ? WINNER_MODE_EVAL
+          : DEFAULT_EVAL;
+  const TX_SIZE_SEARCH_METHOD tx_search_type =
+      cpi->winner_mode_params.tx_size_search_methods[eval_type];
+  assert(cpi->oxcf.enable_tx64 || tx_search_type != USE_LARGESTALL);
+  features->tx_mode = select_tx_mode(cm, tx_search_type);
+
+  if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+
+    for (i = 0; i < TX_SIZES_ALL; i++) {
+      int sum = 0;
+      int j;
+      int left = 1024;
+
+      for (j = 0; j < TX_TYPES; j++)
+        sum += cpi->td.rd_counts.tx_type_used[i][j];
+
+      for (j = TX_TYPES - 1; j >= 0; j--) {
+        const int new_prob =
+            sum ? 1024 * cpi->td.rd_counts.tx_type_used[i][j] / sum
+                : (j ? 0 : 1024);
+        int prob =
+            (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+        left -= prob;
+        if (j == 0) prob += left;
+        frame_probs->tx_type_probs[update_type][i][j] = prob;
+      }
+    }
+  }
+
+  if (!cpi->sf.inter_sf.disable_obmc &&
+      cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+
+    for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+      int sum = 0;
+      for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j];
+
+      const int new_prob =
+          sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0;
+      frame_probs->obmc_probs[update_type][i] =
+          (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+    }
+  }
+
+  if (features->allow_warped_motion &&
+      cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    int sum = 0;
+    for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i];
+    const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0;
+    frame_probs->warped_probs[update_type] =
+        (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+  }
+
+  if (cm->current_frame.frame_type != KEY_FRAME &&
+      cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+      features->interp_filter == SWITCHABLE) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+      int sum = 0;
+      int j;
+      int left = 1536;
+
+      for (j = 0; j < SWITCHABLE_FILTERS; j++) {
+        sum += cpi->td.counts->switchable_interp[i][j];
+      }
+
+      for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+        const int new_prob =
+            sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum
+                : (j ? 0 : 1536);
+        int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+                    new_prob) >>
+                   1;
+        left -= prob;
+        if (j == 0) prob += left;
+        frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+      }
+    }
+  }
+
+  if ((!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
+       !cpi->sf.rt_sf.use_nonrd_pick_mode) ||
+      hash_table_created) {
+    av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
+  }
 }
 
 void av1_encode_frame(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
+  FeatureFlags *const features = &cm->features;
   const int num_planes = av1_num_planes(cm);
   // Indicates whether or not to use a default reduced set for ext-tx
   // rather than the potential full set of 16 transforms
-  cm->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;
+  features->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;
 
   // Make sure segment_id is no larger than last_active_segid.
   if (cm->seg.enabled && cm->seg.update_map) {
-    const int mi_rows = cm->mi_rows;
-    const int mi_cols = cm->mi_cols;
+    const int mi_rows = cm->mi_params.mi_rows;
+    const int mi_cols = cm->mi_params.mi_cols;
     const int last_active_segid = cm->seg.last_active_segid;
-    uint8_t *map = cpi->segmentation_map;
+    uint8_t *map = cpi->enc_seg.map;
     for (int mi_row = 0; mi_row < mi_rows; ++mi_row) {
       for (int mi_col = 0; mi_col < mi_cols; ++mi_col) {
         map[mi_col] = AOMMIN(map[mi_col], last_active_segid);
@@ -5695,54 +6051,71 @@
   }
 
   av1_setup_frame_buf_refs(cm);
-  enforce_max_ref_frames(cpi);
+  enforce_max_ref_frames(cpi, &cpi->ref_frame_flags);
+  set_rel_frame_dist(cpi);
   av1_setup_frame_sign_bias(cm);
 
+#if CHECK_PRECOMPUTED_REF_FRAME_MAP
+  GF_GROUP *gf_group = &cpi->gf_group;
+  // TODO(yuec): The check is disabled on OVERLAY frames for now, because info
+  // in cpi->gf_group has been refreshed for the next GOP when the check is
+  // performed for OVERLAY frames. Since we do not support inter-GOP ref
+  // frame map computation, the precomputed ref map for an OVERLAY frame is all
+  // -1 at this point (though it is meaningful before gf_group is refreshed).
+  if (!frame_is_intra_only(cm) && gf_group->index != 0) {
+    const RefCntBuffer *const golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
+
+    if (golden_buf) {
+      const int golden_order_hint = golden_buf->order_hint;
+
+      for (int ref = LAST_FRAME; ref < EXTREF_FRAME; ++ref) {
+        const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+        const int ref_disp_idx_precomputed =
+            gf_group->ref_frame_disp_idx[gf_group->index][ref - LAST_FRAME];
+
+        (void)ref_disp_idx_precomputed;
+
+        if (buf != NULL) {
+          const int ref_disp_idx =
+              get_relative_dist(&cm->seq_params.order_hint_info,
+                                buf->order_hint, golden_order_hint);
+
+          if (ref_disp_idx >= 0)
+            assert(ref_disp_idx == ref_disp_idx_precomputed);
+          else
+            assert(ref_disp_idx_precomputed == -1);
+        } else {
+          assert(ref_disp_idx_precomputed == -1);
+        }
+      }
+    }
+  }
+#endif
+
 #if CONFIG_MISMATCH_DEBUG
   mismatch_reset_frame(num_planes);
 #else
   (void)num_planes;
 #endif
 
-  if (cpi->sf.frame_parameter_update) {
-    int i;
-    RD_OPT *const rd_opt = &cpi->rd;
+  if (cpi->sf.hl_sf.frame_parameter_update) {
     RD_COUNTS *const rdc = &cpi->td.rd_counts;
 
-    // This code does a single RD pass over the whole frame assuming
-    // either compound, single or hybrid prediction as per whatever has
-    // worked best for that type of frame in the past.
-    // It also predicts whether another coding mode would have worked
-    // better than this coding mode. If that is the case, it remembers
-    // that for subsequent frames.
-    // It does the same analysis for transform size selection also.
-    //
-    // TODO(zoeliu): To investigate whether a frame_type other than
-    // INTRA/ALTREF/GOLDEN/LAST needs to be specified seperately.
-    const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
-    int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
-    const int is_alt_ref = frame_type == ALTREF_FRAME;
-
-    /* prediction (compound, single or hybrid) mode selection */
-    // NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
-    if (is_alt_ref || frame_is_intra_only(cm))
+    if (frame_is_intra_only(cm))
       current_frame->reference_mode = SINGLE_REFERENCE;
     else
       current_frame->reference_mode = REFERENCE_MODE_SELECT;
 
-    cm->interp_filter = SWITCHABLE;
-    if (cm->large_scale_tile) cm->interp_filter = EIGHTTAP_REGULAR;
+    features->interp_filter = SWITCHABLE;
+    if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR;
 
-    cm->switchable_motion_mode = 1;
+    features->switchable_motion_mode = 1;
 
     rdc->compound_ref_used_flag = 0;
     rdc->skip_mode_used_flag = 0;
 
     encode_frame_internal(cpi);
 
-    for (i = 0; i < REFERENCE_MODES; ++i)
-      mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
-
     if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
       // Use a flag that includes 4x4 blocks
       if (rdc->compound_ref_used_flag == 0) {
@@ -5763,19 +6136,20 @@
     if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0)
       skip_mode_info->skip_mode_flag = 0;
 
-    if (!cm->large_scale_tile) {
-      if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
-        cm->tx_mode = TX_MODE_LARGEST;
+    if (!cm->tiles.large_scale) {
+      if (features->tx_mode == TX_MODE_SELECT &&
+          cpi->td.mb.txb_split_count == 0)
+        features->tx_mode = TX_MODE_LARGEST;
     }
   } else {
     encode_frame_internal(cpi);
   }
 }
 
-static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
-                              FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
-                              int blk_row, int blk_col,
-                              uint8_t allow_update_cdf) {
+static AOM_INLINE void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+                                         FRAME_COUNTS *counts, TX_SIZE tx_size,
+                                         int depth, int blk_row, int blk_col,
+                                         uint8_t allow_update_cdf) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int max_blocks_high = max_block_high(xd, bsize, 0);
@@ -5838,30 +6212,33 @@
   }
 }
 
-static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
-                                      BLOCK_SIZE plane_bsize, int mi_row,
-                                      int mi_col, FRAME_COUNTS *td_counts,
-                                      uint8_t allow_update_cdf) {
+static AOM_INLINE void tx_partition_count_update(const AV1_COMMON *const cm,
+                                                 MACROBLOCK *x,
+                                                 BLOCK_SIZE plane_bsize,
+                                                 FRAME_COUNTS *td_counts,
+                                                 uint8_t allow_update_cdf) {
   MACROBLOCKD *xd = &x->e_mbd;
-  const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
   const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
-  int idx, idy;
 
-  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
   xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
 
-  for (idy = 0; idy < mi_height; idy += bh)
-    for (idx = 0; idx < mi_width; idx += bw)
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
       update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
                         allow_update_cdf);
+    }
+  }
 }
 
-static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
-                             int blk_col) {
+static AOM_INLINE void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size,
+                                        int blk_row, int blk_col) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int max_blocks_high = max_block_high(xd, bsize, 0);
@@ -5898,29 +6275,31 @@
   }
 }
 
-static void tx_partition_set_contexts(const AV1_COMMON *const cm,
-                                      MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
-                                      int mi_row, int mi_col) {
-  const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+static AOM_INLINE void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                                 MACROBLOCKD *xd,
+                                                 BLOCK_SIZE plane_bsize) {
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
   const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
-  int idx, idy;
 
-  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
   xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
 
-  for (idy = 0; idy < mi_height; idy += bh)
-    for (idx = 0; idx < mi_width; idx += bw)
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
       set_txfm_context(xd, max_tx_size, idy, idx);
+    }
+  }
 }
 
-static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
-                              ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize,
-                              int *rate) {
+static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
+                                         TileDataEnc *tile_data, ThreadData *td,
+                                         TOKENEXTRA **t, RUN_TYPE dry_run,
+                                         BLOCK_SIZE bsize, int *rate) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
@@ -5929,59 +6308,35 @@
   MB_MODE_INFO *mbmi = mi_4x4[0];
   const int seg_skip =
       segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  const int mis = cm->mi_stride;
+  const int mis = cm->mi_params.mi_stride;
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
   const int is_inter = is_inter_block(mbmi);
 
-  if (cpi->two_pass_partition_search && x->cb_partition_scan) {
-    for (int row = mi_row; row < mi_row + mi_width;
-         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
-      for (int col = mi_col; col < mi_col + mi_height;
-           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
-        const int index = av1_first_partition_pass_stats_index(row, col);
-        FIRST_PARTITION_PASS_STATS *const stats =
-            &x->first_partition_pass_stats[index];
-        // Increase the counter of data samples.
-        ++stats->sample_counts;
-        // Increase the counter for ref_frame[0] and ref_frame[1].
-        if (stats->ref0_counts[mbmi->ref_frame[0]] < 255)
-          ++stats->ref0_counts[mbmi->ref_frame[0]];
-        if (mbmi->ref_frame[1] >= 0 &&
-            stats->ref1_counts[mbmi->ref_frame[1]] < 255)
-          ++stats->ref1_counts[mbmi->ref_frame[1]];
-        if (cpi->sf.use_first_partition_pass_interintra_stats) {
-          // Increase the counter for interintra_motion_mode_count
-          if (mbmi->motion_mode == 0 && mbmi->ref_frame[1] == INTRA_FRAME &&
-              stats->interintra_motion_mode_count[mbmi->ref_frame[0]] < 255) {
-            ++stats->interintra_motion_mode_count[mbmi->ref_frame[0]];
-          }
-        }
-      }
-    }
-  }
+  // Initialize tx_mode and tx_size_search_method
+  set_tx_size_search_method(
+      cm, &cpi->winner_mode_params, x,
+      cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
 
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
   if (!is_inter) {
-    xd->cfl.is_chroma_reference =
-        is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
-                            cm->seq_params.subsampling_y);
     xd->cfl.store_y = store_cfl_required(cm, xd);
     mbmi->skip = 1;
     for (int plane = 0; plane < num_planes; ++plane) {
-      av1_encode_intra_block_plane(cpi, x, bsize, plane,
-                                   cpi->optimize_seg_arr[mbmi->segment_id],
-                                   mi_row, mi_col);
+      av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
+                                   cpi->optimize_seg_arr[mbmi->segment_id]);
     }
 
     // If there is at least one lossless segment, force the skip for intra
     // block to be 0, in order to avoid the segment_id to be changed by in
     // write_segment_id().
     if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
-        cpi->has_lossless_segment)
+        cpi->enc_seg.has_lossless_segment)
       mbmi->skip = 0;
 
     xd->cfl.store_y = 0;
-    if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+    if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
       for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
         if (mbmi->palette_mode_info.palette_size[plane] > 0) {
           if (!dry_run) {
@@ -5996,7 +6351,7 @@
       }
     }
 
-    av1_update_txb_context(cpi, td, dry_run, bsize, rate, mi_row, mi_col,
+    av1_update_txb_context(cpi, td, dry_run, bsize,
                            tile_data->allow_update_cdf);
   } else {
     int ref;
@@ -6010,12 +6365,12 @@
       av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            xd->block_ref_scale_factors[ref], num_planes);
     }
-
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
-                                  av1_num_planes(cm) - 1);
+    int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd) ? 1 : 0;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                  start_plane, av1_num_planes(cm) - 1);
     if (mbmi->motion_mode == OBMC_CAUSAL) {
       assert(cpi->oxcf.enable_obmc == 1);
-      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+      av1_build_obmc_inter_predictors_sb(cm, xd);
     }
 
 #if CONFIG_MISMATCH_DEBUG
@@ -6038,17 +6393,18 @@
     (void)num_planes;
 #endif
 
-    av1_encode_sb(cpi, x, bsize, mi_row, mi_col, dry_run);
-    av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, bsize, rate,
+    av1_encode_sb(cpi, x, bsize, dry_run);
+    av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
                           tile_data->allow_update_cdf);
   }
 
   if (!dry_run) {
     if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1;
-    if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] &&
-        mbmi->sb_type > BLOCK_4X4 && !(is_inter && (mbmi->skip || seg_skip))) {
+    if (x->tx_mode_search_type == TX_MODE_SELECT &&
+        !xd->lossless[mbmi->segment_id] && mbmi->sb_type > BLOCK_4X4 &&
+        !(is_inter && (mbmi->skip || seg_skip))) {
       if (is_inter) {
-        tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts,
+        tx_partition_count_update(cm, x, bsize, td->counts,
                                   tile_data->allow_update_cdf);
       } else {
         if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
@@ -6076,7 +6432,7 @@
         if (xd->lossless[mbmi->segment_id]) {
           intra_tx_size = TX_4X4;
         } else {
-          intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+          intra_tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
         }
       } else {
         intra_tx_size = mbmi->tx_size;
@@ -6084,17 +6440,18 @@
 
       for (j = 0; j < mi_height; j++)
         for (i = 0; i < mi_width; i++)
-          if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
+          if (mi_col + i < cm->mi_params.mi_cols &&
+              mi_row + j < cm->mi_params.mi_rows)
             mi_4x4[mis * j + i]->tx_size = intra_tx_size;
 
       if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count;
     }
   }
 
-  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type) &&
-      is_inter && !(mbmi->skip || seg_skip) &&
-      !xd->lossless[mbmi->segment_id]) {
-    if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+  if (x->tx_mode_search_type == TX_MODE_SELECT &&
+      block_signals_txsize(mbmi->sb_type) && is_inter &&
+      !(mbmi->skip || seg_skip) && !xd->lossless[mbmi->segment_id]) {
+    if (dry_run) tx_partition_set_contexts(cm, xd, bsize);
   } else {
     TX_SIZE tx_size = mbmi->tx_size;
     // The new intra coding scheme requires no change of transform size
@@ -6102,20 +6459,17 @@
       if (xd->lossless[mbmi->segment_id]) {
         tx_size = TX_4X4;
       } else {
-        tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+        tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
       }
     } else {
       tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
     }
     mbmi->tx_size = tx_size;
-    set_txfm_ctxs(tx_size, xd->n4_w, xd->n4_h,
+    set_txfm_ctxs(tx_size, xd->width, xd->height,
                   (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
   }
-  CFL_CTX *const cfl = &xd->cfl;
-  if (is_inter_block(mbmi) &&
-      !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
-                           cfl->subsampling_y) &&
-      is_cfl_allowed(xd)) {
+
+  if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) {
     cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
   }
 }
diff --git a/libaom/av1/encoder/encodeframe.h b/libaom/av1/encoder/encodeframe.h
index 3b1730d..e4c4841 100644
--- a/libaom/av1/encoder/encodeframe.h
+++ b/libaom/av1/encoder/encodeframe.h
@@ -20,7 +20,9 @@
 extern "C" {
 #endif
 
-#define DELTAQ_MODULATION 1  // 0: variance based, 1: wavelet AC energy based
+#define DELTA_Q_PERCEPTUAL_MODULATION \
+  1  // 0: variance based
+     // 1: wavelet AC energy based
 
 struct macroblock;
 struct yv12_buffer_config;
diff --git a/libaom/av1/encoder/encodemb.c b/libaom/av1/encoder/encodemb.c
index 8e9da61..ec33362 100644
--- a/libaom/av1/encoder/encodemb.c
+++ b/libaom/av1/encoder/encodemb.c
@@ -35,30 +35,19 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 
-// Check if one needs to use c version subtraction.
-static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; }
-
-static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
-                           int16_t *diff, ptrdiff_t diff_stride,
-                           const uint8_t *src8, ptrdiff_t src_stride,
-                           const uint8_t *pred8, ptrdiff_t pred_stride) {
-  if (check_subtract_block_size(rows, cols)) {
-    if (is_cur_buf_hbd(xd)) {
-      aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
-                                  src_stride, pred8, pred_stride, xd->bd);
-      return;
-    }
-    aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8,
-                         pred_stride);
-
-    return;
-  }
-
+void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
+                        int16_t *diff, ptrdiff_t diff_stride,
+                        const uint8_t *src8, ptrdiff_t src_stride,
+                        const uint8_t *pred8, ptrdiff_t pred_stride) {
+  assert(rows >= 4 && cols >= 4);
+#if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
                               pred8, pred_stride, xd->bd);
     return;
   }
+#endif
+  (void)xd;
   aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
                      pred_stride);
 }
@@ -73,54 +62,193 @@
   const int dst_stride = pd->dst.stride;
   const int tx1d_width = tx_size_wide[tx_size];
   const int tx1d_height = tx_size_high[tx_size];
-  uint8_t *dst =
-      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-  uint8_t *src =
-      &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+  uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+  uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
   int16_t *src_diff =
-      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
-  subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src,
-                 src_stride, dst, dst_stride);
+      &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
+  av1_subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src,
+                     src_stride, dst, dst_stride);
 }
 
-void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  assert(plane_bsize < BLOCK_SIZES_ALL);
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
   const MACROBLOCKD *xd = &x->e_mbd;
 
-  subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
-                 pd->dst.buf, pd->dst.stride);
+  av1_subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
 }
 
-int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                    int block, TX_SIZE tx_size, TX_TYPE tx_type,
                    const TXB_CTX *const txb_ctx, int fast_mode,
                    int *rate_cost) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  struct macroblock_plane *const p = &mb->plane[plane];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
   const int eob = p->eobs[block];
   const int segment_id = xd->mi[0]->segment_id;
 
   if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
       xd->lossless[segment_id]) {
-    *rate_cost = av1_cost_skip_txb(mb, txb_ctx, plane, tx_size);
+    *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
     return eob;
   }
 
-  return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx,
+  return av1_optimize_txb_new(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
                               rate_cost, cpi->oxcf.sharpness, fast_mode);
 }
 
+// Hyper-parameters for dropout optimization, based on following logics.
+// TODO(yjshen): These settings are tuned by experiments. They may still be
+// optimized for better performance.
+// (1) Coefficients which are large enough will ALWAYS be kept.
+const tran_low_t DROPOUT_COEFF_MAX = 2;  // Max dropout-able coefficient.
+// (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is
+//     NOT required. For example, `5 0 0 0 7` is treated as two continuous
+//     coefficients if three zeros do not fulfill the dropout condition.
+const int DROPOUT_CONTINUITY_MAX = 2;  // Max dropout-able continuous coeff.
+// (3) Dropout operation is NOT applicable to blocks with large or small
+//     quantization index.
+const int DROPOUT_Q_MAX = 128;
+const int DROPOUT_Q_MIN = 16;
+// (4) Recall that dropout optimization will forcibly set some quantized
+//     coefficients to zero. The key logic on determining whether a coefficient
+//     should be dropped is to check the number of continuous zeros before AND
+//     after this coefficient. The exact number of zeros for judgement depends
+//     on block size and quantization index. More concretely, block size
+//     determines the base number of zeros, while quantization index determines
+//     the multiplier. Intuitively, larger block requires more zeros and larger
+//     quantization index also requires more zeros (more information is lost
+//     when using larger quantization index).
+const int DROPOUT_BEFORE_BASE_MAX = 32;  // Max base number for leading zeros.
+const int DROPOUT_BEFORE_BASE_MIN = 16;  // Min base number for leading zeros.
+const int DROPOUT_AFTER_BASE_MAX = 32;   // Max base number for trailing zeros.
+const int DROPOUT_AFTER_BASE_MIN = 16;   // Min base number for trailing zeros.
+const int DROPOUT_MULTIPLIER_MAX = 8;    // Max multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_MIN = 2;    // Min multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_Q_BASE = 32;  // Base Q to compute multiplier.
+
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+                        TX_TYPE tx_type, int qindex) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  const struct macroblock_plane *const p = &mb->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  const int tx_width = tx_size_wide[tx_size];
+  const int tx_height = tx_size_high[tx_size];
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
+  // Early return if `qindex` is out of range.
+  if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) {
+    return;
+  }
+
+  // Compute number of zeros used for dropout judgement.
+  const int base_size = AOMMAX(tx_width, tx_height);
+  const int multiplier = CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE,
+                              DROPOUT_MULTIPLIER_MIN, DROPOUT_MULTIPLIER_MAX);
+  const int dropout_num_before =
+      multiplier *
+      CLIP(base_size, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX);
+  const int dropout_num_after =
+      multiplier *
+      CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX);
+
+  // Early return if there are not enough non-zero coefficients.
+  if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before) {
+    return;
+  }
+
+  int count_zeros_before = 0;
+  int count_zeros_after = 0;
+  int count_nonzeros = 0;
+  // Index of the first non-zero coefficient after sufficient number of
+  // continuous zeros. If equal to `-1`, the number of leading zeros
+  // hasn't reached `dropout_num_before`.
+  int idx = -1;
+  int eob = 0;  // New end of block.
+
+  for (int i = 0; i < p->eobs[block]; ++i) {
+    const int scan_idx = scan_order->scan[i];
+    if (qcoeff[scan_idx] > DROPOUT_COEFF_MAX) {  // Keep large coefficients.
+      count_zeros_before = 0;
+      count_zeros_after = 0;
+      idx = -1;
+      eob = i + 1;
+    } else if (qcoeff[scan_idx] == 0) {  // Count zeros.
+      if (idx == -1) {
+        ++count_zeros_before;
+      } else {
+        ++count_zeros_after;
+      }
+    } else {  // Count non-zeros.
+      if (count_zeros_before >= dropout_num_before) {
+        idx = (idx == -1) ? i : idx;
+        ++count_nonzeros;
+      } else {
+        count_zeros_before = 0;
+        eob = i + 1;
+      }
+    }
+
+    // Handle continuity.
+    if (count_nonzeros > DROPOUT_CONTINUITY_MAX) {
+      count_zeros_before = 0;
+      count_zeros_after = 0;
+      idx = -1;
+      eob = i + 1;
+    }
+
+    // Handle the trailing zeros after original end of block.
+    if (idx != -1 && i == p->eobs[block] - 1) {
+      count_zeros_after += (max_eob - p->eobs[block]);
+    }
+
+    // Set redundant coefficients to zeros if needed.
+    if (count_zeros_after >= dropout_num_after) {
+      for (int j = idx; j <= i; ++j) {
+        qcoeff[scan_order->scan[j]] = 0;
+        dqcoeff[scan_order->scan[j]] = 0;
+      }
+      count_zeros_before += (i - idx + 1);
+      count_zeros_after = 0;
+      count_nonzeros = 0;
+    } else if (i == p->eobs[block] - 1) {
+      eob = i + 1;
+    }
+  }
+
+  if (eob != p->eobs[block]) {
+    p->eobs[block] = eob;
+    p->txb_entropy_ctx[block] =
+        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+  }
+}
+
+// Settings for optimization type. NOTE: To set optimization type for all intra
+// frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set.
+// TODO(yjshen): These settings are hard-coded and look okay for now. They
+// should be made configurable later.
+// Blocks of key frames ONLY.
+const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+// Blocks of intra frames (key frames EXCLUSIVE).
+const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+// Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default
+// if trellis optimization is on for inter frames.)
+const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+
 enum {
   QUANT_FUNC_LOWBD = 0,
   QUANT_FUNC_HIGHBD = 1,
   QUANT_FUNC_TYPES = 2
 } UENUM1BYTE(QUANT_FUNC);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static AV1_QUANT_FACADE
     quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
       { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
@@ -128,119 +256,155 @@
       { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
       { NULL, NULL }
     };
+#else
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = {
+  av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL
+};
+#endif
 
-void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
-                     int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
-                     TX_SIZE tx_size, TX_TYPE tx_type,
-                     AV1_XFORM_QUANT xform_quant_idx) {
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                     int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+                     QUANT_PARAM *qparam) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const SCAN_ORDER *const scan_order =
+      get_scan(txfm_param->tx_size, txfm_param->tx_type);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  tran_low_t *const qcoeff = p->qcoeff + block_offset;
+  tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = block_size_wide[plane_bsize];
-  int seg_id = mbmi->segment_id;
-  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
-  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
-  const qm_val_t *qmatrix =
-      IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][qm_tx_size]
-                               : cm->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
-  const qm_val_t *iqmatrix =
-      IS_2D_TRANSFORM(tx_type)
-          ? pd->seg_iqmatrix[seg_id][qm_tx_size]
-          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
 
   const int src_offset = (blk_row * diff_stride + blk_col);
-  const int16_t *src_diff = &p->src_diff[src_offset << tx_size_wide_log2[0]];
-  QUANT_PARAM qparam;
-  qparam.log_scale = av1_get_tx_scale(tx_size);
-  qparam.tx_size = tx_size;
-  qparam.qmatrix = qmatrix;
-  qparam.iqmatrix = iqmatrix;
-  qparam.use_quant_b_adapt = cm->use_quant_b_adapt;
-  TxfmParam txfm_param;
-  txfm_param.tx_type = tx_type;
-  txfm_param.tx_size = tx_size;
-  txfm_param.lossless = xd->lossless[mbmi->segment_id];
-  txfm_param.tx_set_type = av1_get_ext_tx_set_type(
-      txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used);
+  const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
 
-  txfm_param.bd = xd->bd;
-  txfm_param.is_hbd = is_cur_buf_hbd(xd);
+  av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
 
-  av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
-
-  if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
-    const int n_coeffs = av1_get_max_eob(tx_size);
+  if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+    const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
     if (LIKELY(!x->skip_block)) {
-      quant_func_list[xform_quant_idx][txfm_param.is_hbd](
-          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
+#if CONFIG_AV1_HIGHBITDEPTH
+      quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd](
+          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#else
+      quant_func_list[qparam->xform_quant_idx](
+          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#endif
     } else {
       av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
     }
   }
-  // NOTE: optimize_b_following is true means av1_optimze_b will be called
-  // When the condition of doing optimize_b is changed,
-  // this flag need update simultaneously
-  const int optimize_b_following =
-      (xform_quant_idx != AV1_XFORM_QUANT_FP) || (txfm_param.lossless);
-  if (optimize_b_following) {
+  // use_optimize_b is true means av1_optimze_b will be called,
+  // thus cannot update entropy ctx now (performed in optimize_b)
+  if (qparam->use_optimize_b) {
+    p->txb_entropy_ctx[block] = 0;
+  } else {
     p->txb_entropy_ctx[block] =
         (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
-  } else {
-    p->txb_entropy_ctx[block] = 0;
   }
   return;
 }
 
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  txfm_param->tx_type = tx_type;
+  txfm_param->tx_size = tx_size;
+  txfm_param->lossless = xd->lossless[mbmi->segment_id];
+  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+      tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used);
+
+  txfm_param->bd = xd->bd;
+  txfm_param->is_hbd = is_cur_buf_hbd(xd);
+}
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam) {
+  qparam->log_scale = av1_get_tx_scale(tx_size);
+  qparam->tx_size = tx_size;
+
+  qparam->use_quant_b_adapt = use_quant_b_adapt;
+
+  // TODO(bohanli): optimize_b and quantization idx has relationship,
+  // but is kind of buried and complicated in different encoding stages.
+  // Should have a unified function to derive quant_idx, rather than
+  // determine and pass in the quant_idx
+  qparam->use_optimize_b = use_optimize_b;
+  qparam->xform_quant_idx = xform_quant_idx;
+
+  qparam->qmatrix = NULL;
+  qparam->iqmatrix = NULL;
+}
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+                       const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                       TX_TYPE tx_type, QUANT_PARAM *qparam) {
+  qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type);
+  qparam->iqmatrix =
+      av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type);
+}
+
 static void encode_block(int plane, int block, int blk_row, int blk_col,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg,
-                         int mi_row, int mi_col, RUN_TYPE dry_run) {
-  (void)mi_row;
-  (void)mi_col;
+                         RUN_TYPE dry_run) {
   (void)dry_run;
   struct encode_b_args *const args = arg;
-  const AV1_COMMON *const cm = &args->cpi->common;
+  const AV1_COMP *const cpi = args->cpi;
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
   int dummy_rate_cost = 0;
 
-  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  dst = &pd->dst
-             .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+  const int bw = mi_size_wide[plane_bsize];
+  dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
 
   a = &args->ta[blk_col];
   l = &args->tl[blk_row];
 
+  TX_TYPE tx_type = DCT_DCT;
   if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
-    TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
-                                      tx_size, cm->reduced_tx_set_used);
-    if (args->enable_optimize_b != NO_TRELLIS_OPT) {
-      av1_xform_quant(
-          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
-          USE_B_QUANT_NO_TRELLIS &&
-                  (args->enable_optimize_b == FINAL_PASS_TRELLIS_OPT)
-              ? AV1_XFORM_QUANT_B
-              : AV1_XFORM_QUANT_FP);
+    tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size,
+                              cm->features.reduced_tx_set_used);
+    TxfmParam txfm_param;
+    QUANT_PARAM quant_param;
+    const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run);
+    int quant_idx;
+    if (use_trellis)
+      quant_idx = AV1_XFORM_QUANT_FP;
+    else
+      quant_idx =
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+    av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt,
+                    &quant_param);
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    // Whether trellis or dropout optimization is required for inter frames.
+    const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                            INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+    const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                            INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+
+    if (quant_param.use_optimize_b && do_trellis) {
       TXB_CTX txb_ctx;
       get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
       av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
-                     args->cpi->sf.trellis_eob_fast, &dummy_rate_cost);
-    } else {
-      av1_xform_quant(
-          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
-          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+                     args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost);
+    }
+    if (!quant_param.use_optimize_b && do_dropout) {
+      av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+                         cm->quant_params.base_qindex);
     }
   } else {
     p->eobs[block] = 0;
@@ -251,12 +415,9 @@
 
   if (p->eobs[block]) {
     *(args->skip) = 0;
-
-    TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
-                                      tx_size, cm->reduced_tx_set_used);
     av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
                                 pd->dst.stride, p->eobs[block],
-                                cm->reduced_tx_set_used);
+                                cm->features.reduced_tx_set_used);
   }
 
   // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
@@ -272,13 +433,12 @@
       // enable_optimize_b is true to detect potential RD bug.
       const uint8_t disable_txk_check = args->enable_optimize_b;
       if (!disable_txk_check) {
-        assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row,
-                                                     blk_col)] == DCT_DCT);
+        assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] ==
+            DCT_DCT);
       }
     }
 #endif
-    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                     DCT_DCT);
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
   }
 
 #if CONFIG_MISMATCH_DEBUG
@@ -287,8 +447,8 @@
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
     int blk_w = block_size_wide[bsize];
     int blk_h = block_size_high[bsize];
-    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
-                    pd->subsampling_x, pd->subsampling_y);
+    mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col,
+                    blk_row, pd->subsampling_x, pd->subsampling_y);
     mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
                              plane, pixel_c, pixel_r, blk_w, blk_h,
                              xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
@@ -298,10 +458,7 @@
 
 static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                               void *arg, int mi_row, int mi_col,
-                               RUN_TYPE dry_run) {
-  (void)mi_row;
-  (void)mi_col;
+                               void *arg, RUN_TYPE dry_run) {
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -324,7 +481,7 @@
 
   if (tx_size == plane_tx_size || plane) {
     encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
-                 mi_row, mi_col, dry_run);
+                 dry_run);
   } else {
     assert(tx_size < TX_SIZES_ALL);
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
@@ -344,7 +501,7 @@
         if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
         encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
-                           arg, mi_row, mi_col, dry_run);
+                           arg, dry_run);
         block += step;
       }
     }
@@ -352,44 +509,39 @@
 }
 
 void av1_foreach_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
   // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
   // transform size varies per plane, look it up in a common way.
   const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   const uint8_t txw_unit = tx_size_wide_unit[tx_size];
   const uint8_t txh_unit = tx_size_high_unit[tx_size];
   const int step = txw_unit * txh_unit;
-  int i = 0, r, c;
 
   // If mb_to_right_edge is < 0 we are in a situation in which
   // the current block size extends into the UMV and we won't
   // visit the sub blocks that are wholly within the UMV.
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-
-  int blk_row, blk_col;
-
   const BLOCK_SIZE max_unit_bsize =
       get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
-  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-  mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
-  mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+  const int mu_blocks_wide =
+      AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide);
+  const int mu_blocks_high =
+      AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high);
 
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
-  for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
+  int i = 0;
+  for (int r = 0; r < max_blocks_high; r += mu_blocks_high) {
     const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
     // Skip visiting the sub blocks that are wholly within the UMV.
-    for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+    for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
       const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
-      for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
-        for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+      for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+        for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
           visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
           i += step;
         }
@@ -398,21 +550,8 @@
   }
 }
 
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
-                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                   foreach_transformed_block_visitor visit,
-                                   void *arg, const int num_planes) {
-  for (int plane = 0; plane < num_planes; ++plane) {
-    if (!is_chroma_reference(mi_row, mi_col, bsize,
-                             xd->plane[plane].subsampling_x,
-                             xd->plane[plane].subsampling_y))
-      continue;
-    av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
-  }
-}
-
 typedef struct encode_block_pass1_args {
-  AV1_COMMON *cm;
+  AV1_COMP *cpi;
   MACROBLOCK *x;
 } encode_block_pass1_args;
 
@@ -420,28 +559,31 @@
                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                void *arg) {
   encode_block_pass1_args *args = (encode_block_pass1_args *)arg;
-  AV1_COMMON *cm = args->cm;
+  AV1_COMP *cpi = args->cpi;
+  AV1_COMMON *cm = &cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  TxfmParam txfm_param;
+  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+
   uint8_t *dst;
-  dst = &pd->dst
-             .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
-  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  DCT_DCT, AV1_XFORM_QUANT_B);
+  dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
+
+  TxfmParam txfm_param;
+  QUANT_PARAM quant_param;
+
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+                  &quant_param);
+  av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT,
+                    &quant_param);
+
+  av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                  &quant_param);
 
   if (p->eobs[block] > 0) {
-    txfm_param.bd = xd->bd;
-    txfm_param.is_hbd = is_cur_buf_hbd(xd);
-    txfm_param.tx_type = DCT_DCT;
-    txfm_param.tx_size = tx_size;
     txfm_param.eob = p->eobs[block];
-    txfm_param.lossless = xd->lossless[xd->mi[0]->segment_id];
-    txfm_param.tx_set_type = av1_get_ext_tx_set_type(
-        txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used);
     if (txfm_param.is_hbd) {
       av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
       return;
@@ -450,85 +592,65 @@
   }
 }
 
-void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
-  encode_block_pass1_args args = { cm, x };
+void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  encode_block_pass1_args args = { cpi, x };
   av1_subtract_plane(x, bsize, 0);
   av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
                                          encode_block_pass1, &args);
 }
 
 void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                   int mi_row, int mi_col, RUN_TYPE dry_run) {
-  (void)dry_run;
+                   RUN_TYPE dry_run) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->skip = 1;
+  if (x->force_skip) return;
+
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {
+    cpi,  x,    &ctx,    &mbmi->skip,
+    NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
+  };
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct optimize_ctx ctx;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  struct encode_b_args arg = { cpi,
-                               x,
-                               &ctx,
-                               &mbmi->skip,
-                               NULL,
-                               NULL,
-                               cpi->optimize_seg_arr[mbmi->segment_id] };
-  int plane;
-
-  mbmi->skip = 1;
-
-  if (x->skip) return;
-
-  for (plane = 0; plane < num_planes; ++plane) {
-    const int subsampling_x = xd->plane[plane].subsampling_x;
-    const int subsampling_y = xd->plane[plane].subsampling_y;
-
-    if (!is_chroma_reference(mi_row, mi_col, bsize, subsampling_x,
-                             subsampling_y))
-      continue;
-
-    const BLOCK_SIZE bsizec =
-        scale_chroma_bsize(bsize, subsampling_x, subsampling_y);
-
-    // TODO(jingning): Clean this up.
+  for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int subsampling_x = pd->subsampling_x;
+    const int subsampling_y = pd->subsampling_y;
+    if (plane && !xd->is_chroma_ref) break;
     const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
-    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+        get_plane_block_size(bsize, subsampling_x, subsampling_y);
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    const int mi_width = mi_size_wide[plane_bsize];
+    const int mi_height = mi_size_high[plane_bsize];
     const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
-
     const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
-    const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
-    const int bh = block_size_high[txb_size] >> tx_size_high_log2[0];
-    int idx, idy;
+    const int bw = mi_size_wide[txb_size];
+    const int bh = mi_size_high[txb_size];
     int block = 0;
-    int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
-    av1_get_entropy_contexts(bsizec, pd, ctx.ta[plane], ctx.tl[plane]);
-
-    av1_subtract_plane(x, bsizec, plane);
-
+    const int step =
+        tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+    av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]);
+    av1_subtract_plane(x, plane_bsize, plane);
     arg.ta = ctx.ta[plane];
     arg.tl = ctx.tl[plane];
-
     const BLOCK_SIZE max_unit_bsize =
-        get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
-    int mu_blocks_wide =
-        block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-    int mu_blocks_high =
-        block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-
+        get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y);
+    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+    int mu_blocks_high = mi_size_high[max_unit_bsize];
     mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
     mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
 
-    for (idy = 0; idy < mi_height; idy += mu_blocks_high) {
-      for (idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+    for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+      for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
         int blk_row, blk_col;
         const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
         const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
         for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
           for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
             encode_block_inter(plane, block, blk_row, blk_col, plane_bsize,
-                               max_tx_size, &arg, mi_row, mi_col, dry_run);
+                               max_tx_size, &arg, dry_run);
             block += step;
           }
         }
@@ -555,25 +677,23 @@
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                             void *arg) {
   struct encode_b_args *const args = arg;
-  const AV1_COMMON *const cm = &args->cpi->common;
+  const AV1_COMP *const cpi = args->cpi;
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
   PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
   uint16_t *eob = &p->eobs[block];
   const int dst_stride = pd->dst.stride;
-  uint8_t *dst =
-      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
   int dummy_rate_cost = 0;
 
   av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
 
-  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  TX_TYPE tx_type = DCT_DCT;
+  const int bw = mi_size_wide[plane_bsize];
   if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) {
     *eob = 0;
     p->txb_entropy_ctx[block] = 0;
@@ -582,27 +702,59 @@
 
     const ENTROPY_CONTEXT *a = &args->ta[blk_col];
     const ENTROPY_CONTEXT *l = &args->tl[blk_row];
-    if (args->enable_optimize_b != NO_TRELLIS_OPT) {
-      av1_xform_quant(
-          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
-          USE_B_QUANT_NO_TRELLIS &&
-                  (args->enable_optimize_b == FINAL_PASS_TRELLIS_OPT)
-              ? AV1_XFORM_QUANT_B
-              : AV1_XFORM_QUANT_FP);
+    tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                              cm->features.reduced_tx_set_used);
+    TxfmParam txfm_param;
+    QUANT_PARAM quant_param;
+    const int use_trellis =
+        is_trellis_used(args->enable_optimize_b, args->dry_run);
+    int quant_idx;
+    if (use_trellis)
+      quant_idx = AV1_XFORM_QUANT_FP;
+    else
+      quant_idx =
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+
+    av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt,
+                    &quant_param);
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    // Whether trellis or dropout optimization is required for key frames and
+    // intra frames.
+    const bool do_trellis = (frame_is_intra_only(cm) &&
+                             (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                              KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+                            (!frame_is_intra_only(cm) &&
+                             (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                              INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+    const bool do_dropout = (frame_is_intra_only(cm) &&
+                             (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                              KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+                            (!frame_is_intra_only(cm) &&
+                             (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                              INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+
+    if (quant_param.use_optimize_b && do_trellis) {
       TXB_CTX txb_ctx;
       get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
       av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
-                     args->cpi->sf.trellis_eob_fast, &dummy_rate_cost);
-    } else {
-      av1_xform_quant(
-          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
-          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+                     args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost);
+    }
+    if (do_dropout) {
+      av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+                         cm->quant_params.base_qindex);
     }
   }
 
   if (*eob) {
     av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
-                                dst_stride, *eob, cm->reduced_tx_set_used);
+                                dst_stride, *eob,
+                                cm->features.reduced_tx_set_used);
   }
 
   // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
@@ -614,12 +766,11 @@
 #if 0
     if (args->cpi->oxcf.aq_mode == NO_AQ
         && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
-      assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row,
-                                                   blk_col)] == DCT_DCT);
+      assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] ==
+          DCT_DCT);
     }
 #endif
-    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                     DCT_DCT);
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
   }
 
   // For intra mode, skipped blocks are so rare that transmitting skip=1 is
@@ -632,26 +783,23 @@
 }
 
 void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                                  BLOCK_SIZE bsize, int plane,
-                                  int enable_optimize_b, int mi_row,
-                                  int mi_col) {
+                                  BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+                                  TRELLIS_OPT_TYPE enable_optimize_b) {
+  assert(bsize < BLOCK_SIZES_ALL);
   const MACROBLOCKD *const xd = &x->e_mbd;
+  if (plane && !xd->is_chroma_ref) return;
+
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
   ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
   ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
-
-  struct encode_b_args arg = {
-    cpi, x, NULL, &(xd->mi[0]->skip), ta, tl, enable_optimize_b
-  };
-
-  if (!is_chroma_reference(mi_row, mi_col, bsize,
-                           xd->plane[plane].subsampling_x,
-                           xd->plane[plane].subsampling_y))
-    return;
-
+  struct encode_b_args arg = { cpi, x,  NULL,    &(xd->mi[0]->skip),
+                               ta,  tl, dry_run, enable_optimize_b };
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
   if (enable_optimize_b) {
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    av1_get_entropy_contexts(bsize, pd, ta, tl);
+    av1_get_entropy_contexts(plane_bsize, pd, ta, tl);
   }
   av1_foreach_transformed_block_in_plane(
-      xd, bsize, plane, encode_block_intra_and_set_context, &arg);
+      xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg);
 }
diff --git a/libaom/av1/encoder/encodemb.h b/libaom/av1/encoder/encodemb.h
index d4394cf..a337c83 100644
--- a/libaom/av1/encoder/encodemb.h
+++ b/libaom/av1/encoder/encodemb.h
@@ -14,7 +14,7 @@
 
 #include "config/aom_config.h"
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/txb_common.h"
 #include "av1/encoder/block.h"
 #include "av1/encoder/tokenize.h"
@@ -34,7 +34,8 @@
   int8_t *skip;
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
-  int8_t enable_optimize_b;
+  RUN_TYPE dry_run;
+  TRELLIS_OPT_TYPE enable_optimize_b;
 };
 
 enum {
@@ -45,33 +46,75 @@
   AV1_XFORM_QUANT_TYPES,
 } UENUM1BYTE(AV1_XFORM_QUANT);
 
+// Available optimization types to optimize the quantized coefficients.
+enum {
+  NONE_OPT = 0,            // No optimization.
+  TRELLIS_OPT = 1,         // Trellis optimization. See `av1_optimize_b()`.
+  DROPOUT_OPT = 2,         // Dropout optimization. See `av1_dropout_qcoeff()`.
+  TRELLIS_DROPOUT_OPT = 3  // Perform dropout after trellis optimization.
+} UENUM1BYTE(OPT_TYPE);
+
 void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                   int mi_row, int mi_col, RUN_TYPE dry_run);
+                   RUN_TYPE dry_run);
 
 void av1_foreach_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg);
 
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
-                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                   foreach_transformed_block_visitor visit,
-                                   void *arg, const int num_planes);
+void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize);
 
-void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param);
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam);
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+                       const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                       TX_TYPE tx_type, QUANT_PARAM *qparam);
 
-void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
-                     int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
-                     TX_SIZE tx_size, TX_TYPE tx_type,
-                     AV1_XFORM_QUANT xform_quant_idx);
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                     int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+                     QUANT_PARAM *qparam);
 
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
                    int block, TX_SIZE tx_size, TX_TYPE tx_type,
                    const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
 
+// This function can be used as (i) a further optimization to reduce the
+// redundancy of quantized coefficients (a.k.a., `qcoeff`) after trellis
+// optimization, or (ii) an alternative to trellis optimization in high-speed
+// compression mode (e.g., real-time mode under speed-6) due to its LOW time
+// complexity. The rationale behind it is to drop out the maybe-redundant quantized
+// coefficient which is among a bunch of zeros. NOTE: This algorithm is not as
+// accurate as trellis optimization since the hyper-parameters are hard-coded
+// instead of dynamic search. More adaptive logic may improve the performance.
+// This function should be applied to all or partial block cells.
+// Inputs:
+//   mb: Pointer to the MACROBLOCK to perform dropout on.
+//   plane: Index of the plane to which the target block belongs.
+//   block: Index of the target block.
+//   tx_size: Transform size of the target block.
+//   tx_type: Transform type of the target block. This field is particularly
+//            used to find out the scan order of the block.
+//   qindex: Quantization index used for target block. In general, all blocks
+//           in a same plane share the same quantization index. This field is
+//           particularly used to determine how many zeros should be used to
+//           drop out a coefficient.
+// Returns:
+//   Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as
+//   `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+                        TX_TYPE tx_type, int qindex);
+
+void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
+                        int16_t *diff, ptrdiff_t diff_stride,
+                        const uint8_t *src8, ptrdiff_t src_stride,
+                        const uint8_t *pred8, ptrdiff_t pred_stride);
+
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size);
 
-void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane);
 
 static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
                                        TX_SIZE tx_size, ENTROPY_CONTEXT *a,
@@ -85,10 +128,16 @@
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
 
 void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                                  BLOCK_SIZE bsize, int plane,
-                                  int enable_optimize_b, int mi_row,
-                                  int mi_col);
+                                  BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+                                  TRELLIS_OPT_TYPE enable_optimize_b);
 
+static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b,
+                                  RUN_TYPE dry_run) {
+  if (optimize_b == NO_TRELLIS_OPT) return false;
+  if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED)
+    return false;
+  return true;
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/encodemv.c b/libaom/av1/encoder/encodemv.c
index 42eb5ab..167e9c0 100644
--- a/libaom/av1/encoder/encodemv.c
+++ b/libaom/av1/encoder/encodemv.c
@@ -20,22 +20,58 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/bitops.h"
 
-static INLINE int mv_class_base(MV_CLASS_TYPE c) {
-  return c ? CLASS0_SIZE << (c + 2) : 0;
+static void update_mv_component_stats(int comp, nmv_component *mvcomp,
+                                      MvSubpelPrecision precision) {
+  assert(comp != 0);
+  int offset;
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  const int mv_class = av1_get_mv_class(mag - 1, &offset);
+  const int d = offset >> 3;         // int mv data
+  const int fr = (offset >> 1) & 3;  // fractional mv data
+  const int hp = offset & 1;         // high precision mv data
+
+  // Sign
+  update_cdf(mvcomp->sign_cdf, sign, 2);
+
+  // Class
+  update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES);
+
+  // Integer bits
+  if (mv_class == MV_CLASS_0) {
+    update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE);
+  } else {
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (int i = 0; i < n; ++i)
+      update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2);
+  }
+  // Fractional bits
+  if (precision > MV_SUBPEL_NONE) {
+    aom_cdf_prob *fp_cdf =
+        mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf;
+    update_cdf(fp_cdf, fr, MV_FP_SIZE);
+  }
+
+  // High precision bit
+  if (precision > MV_SUBPEL_LOW_PRECISION) {
+    aom_cdf_prob *hp_cdf =
+        mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf;
+    update_cdf(hp_cdf, hp, 2);
+  }
 }
 
-// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
-static INLINE uint8_t log_in_base_2(unsigned int n) {
-  // get_msb() is only valid when n != 0.
-  return n == 0 ? 0 : get_msb(n);
-}
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+                         MvSubpelPrecision precision) {
+  const MV diff = { mv->row - ref->row, mv->col - ref->col };
+  const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
 
-static INLINE MV_CLASS_TYPE get_mv_class(int z, int *offset) {
-  const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
-                              ? MV_CLASS_10
-                              : (MV_CLASS_TYPE)log_in_base_2(z >> 3);
-  if (offset) *offset = z - mv_class_base(c);
-  return c;
+  update_cdf(mvctx->joints_cdf, j, MV_JOINTS);
+
+  if (mv_joint_vertical(j))
+    update_mv_component_stats(diff.row, &mvctx->comps[0], precision);
+
+  if (mv_joint_horizontal(j))
+    update_mv_component_stats(diff.col, &mvctx->comps[1], precision);
 }
 
 static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
@@ -44,7 +80,7 @@
   int offset;
   const int sign = comp < 0;
   const int mag = sign ? -comp : comp;
-  const int mv_class = get_mv_class(mag - 1, &offset);
+  const int mv_class = av1_get_mv_class(mag - 1, &offset);
   const int d = offset >> 3;         // int mv data
   const int fr = (offset >> 1) & 3;  // fractional mv data
   const int hp = offset & 1;         // high precision mv data
@@ -107,7 +143,7 @@
   for (v = 1; v <= MV_MAX; ++v) {
     int z, c, o, d, e, f, cost = 0;
     z = v - 1;
-    c = get_mv_class(z, &o);
+    c = av1_get_mv_class(z, &o);
     cost += class_cost[c];
     d = (o >> 3);     /* int mv data */
     f = (o >> 1) & 3; /* fractional pel mv data */
@@ -141,7 +177,9 @@
                    nmv_context *mvctx, int usehp) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
   const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
-  if (cpi->common.cur_frame_force_integer_mv) {
+  // If the mv_diff is zero, then we should have used near or nearest instead.
+  assert(j != MV_JOINT_ZERO);
+  if (cpi->common.features.cur_frame_force_integer_mv) {
     usehp = MV_SUBPEL_NONE;
   }
   aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
@@ -153,9 +191,10 @@
 
   // If auto_mv_step_size is enabled then keep track of the largest
   // motion vector component used.
-  if (cpi->sf.mv.auto_mv_step_size) {
-    unsigned int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;
-    cpi->max_mv_magnitude = AOMMAX(maxv, cpi->max_mv_magnitude);
+  if (cpi->sf.mv_sf.auto_mv_step_size) {
+    int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;
+    cpi->mv_search_params.max_mv_magnitude =
+        AOMMAX(maxv, cpi->mv_search_params.max_mv_magnitude);
   }
 }
 
@@ -192,25 +231,17 @@
   const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
   const CANDIDATE_MV *curr_ref_mv_stack =
       mbmi_ext->ref_mv_stack[ref_frame_type];
-  int_mv ref_mv;
-  ref_mv.as_int = INVALID_MV;
 
   if (ref_frame[1] > INTRA_FRAME) {
-    if (ref_idx == 0) {
-      ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
-    } else {
-      assert(ref_idx == 1);
-      ref_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv;
-    }
-  } else {
-    assert(ref_idx == 0);
-    if (ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]) {
-      ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
-    } else {
-      ref_mv = mbmi_ext->global_mvs[ref_frame_type];
-    }
+    assert(ref_idx == 0 || ref_idx == 1);
+    return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+                   : curr_ref_mv_stack[ref_mv_idx].this_mv;
   }
-  return ref_mv;
+
+  assert(ref_idx == 0);
+  return ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]
+             ? curr_ref_mv_stack[ref_mv_idx].this_mv
+             : mbmi_ext->global_mvs[ref_frame_type];
 }
 
 int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) {
diff --git a/libaom/av1/encoder/encodemv.h b/libaom/av1/encoder/encodemv.h
index 37ff547..0d13014 100644
--- a/libaom/av1/encoder/encodemv.h
+++ b/libaom/av1/encoder/encodemv.h
@@ -21,6 +21,9 @@
 void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
                    nmv_context *mvctx, int usehp);
 
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+                         MvSubpelPrecision precision);
+
 void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context *mvctx,
                               MvSubpelPrecision precision);
@@ -41,11 +44,29 @@
                                       int is_integer);
 
 static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
-  if (mv->row == 0) {
-    return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
-  } else {
-    return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
-  }
+  // row:  Z  col:  Z  | MV_JOINT_ZERO   (0)
+  // row:  Z  col: NZ  | MV_JOINT_HNZVZ  (1)
+  // row: NZ  col:  Z  | MV_JOINT_HZVNZ  (2)
+  // row: NZ  col: NZ  | MV_JOINT_HNZVNZ (3)
+  return (!!mv->col) | ((!!mv->row) << 1);
+}
+
+static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) {
+  return c ? CLASS0_SIZE << (c + 2) : 0;
+}
+
+// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
+static INLINE uint8_t av1_log_in_base_2(unsigned int n) {
+  // get_msb() is only valid when n != 0.
+  return n == 0 ? 0 : get_msb(n);
+}
+
+static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
+  const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
+                              ? MV_CLASS_10
+                              : (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3);
+  if (offset) *offset = z - av1_mv_class_base(c);
+  return c;
 }
 
 #ifdef __cplusplus
diff --git a/libaom/av1/encoder/encoder.c b/libaom/av1/encoder/encoder.c
index 818e43c..6c1fb2c 100644
--- a/libaom/av1/encoder/encoder.c
+++ b/libaom/av1/encoder/encoder.c
@@ -10,6 +10,7 @@
  */
 
 #include <limits.h>
+#include <float.h>
 #include <math.h>
 #include <stdio.h>
 
@@ -61,7 +62,7 @@
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/grain_test_vectors.h"
 #include "av1/encoder/hash_motion.h"
-#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/mv_prec.h"
 #include "av1/encoder/pass2_strategy.h"
 #include "av1/encoder/picklpf.h"
 #include "av1/encoder/pickrst.h"
@@ -71,9 +72,14 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/speed_features.h"
+#include "av1/encoder/tpl_model.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/var_based_part.h"
 
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
 #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
 
 #if CONFIG_ENTROPY_STATS
@@ -83,13 +89,6 @@
 #define AM_SEGMENT_ID_INACTIVE 7
 #define AM_SEGMENT_ID_ACTIVE 0
 
-// Whether to use high precision mv for altref computation.
-#define ALTREF_HIGH_PRECISION_MV 1
-
-// Q threshold for high precision mv. Choose a very high value for now so that
-// HIGH_PRECISION is always chosen.
-#define HIGH_PRECISION_MV_QTHRESH 200
-
 // #define OUTPUT_YUV_REC
 #ifdef OUTPUT_YUV_SKINMAP
 FILE *yuv_skinmap_file = NULL;
@@ -99,6 +98,278 @@
 #define FILE_NAME_LEN 100
 #endif
 
+const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = {
+  { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 },
+    { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 },
+    { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 },
+    { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 },
+    { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 },
+    { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 },
+    { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 },
+    { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 },
+    { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 },
+    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 },
+    { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 },
+    { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 },
+    { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 },
+    { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 },
+    { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 },
+    { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 },
+    { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 },
+    { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+  { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 },
+    { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 },
+    { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 },
+    { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 },
+    { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 },
+    { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 },
+    { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 },
+    { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 },
+    { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 },
+    { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 },
+    { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 },
+    { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 },
+    { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 },
+    { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 },
+    { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 },
+    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 },
+    { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 },
+    { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 },
+    { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+  { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 },
+    { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 },
+    { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 },
+    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 },
+    { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 },
+    { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 },
+    { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 },
+    { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 },
+    { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 },
+    { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 },
+    { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 },
+    { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
+};
+
+const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  106, 90, 90, 97, 67, 59, 70, 28,
+    30, 38, 16, 16,  16, 0,  0,  44, 50, 26, 25 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  98, 93, 97, 68, 82, 85, 33, 30,
+    33, 16, 16, 16, 16, 0,  0,  43, 37, 26, 16 },
+  { 0,  0,  0,  91, 80, 76, 78, 55, 49, 24, 16,
+    16, 16, 16, 16, 16, 0,  0,  29, 45, 16, 38 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  103, 89, 89, 89, 62, 63, 76, 34,
+    35, 32, 19, 16,  16, 0,  0,  49, 55, 29, 19 }
+};
+
+const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64,
+                                                       64, 64, 64 };
+
+// TODO(yunqing): the default probs can be trained later from better
+// performance.
+const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+                                         [SWITCHABLE_FILTER_CONTEXTS]
+                                         [SWITCHABLE_FILTERS] = {
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } }
+                                         };
+
 static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -128,17 +399,18 @@
 // Mark all inactive blocks as active. Other segmentation features may be set
 // so memset cannot be used, instead only inactive blocks should be reset.
 static void suppress_active_map(AV1_COMP *cpi) {
-  unsigned char *const seg_map = cpi->segmentation_map;
+  unsigned char *const seg_map = cpi->enc_seg.map;
   int i;
   if (cpi->active_map.enabled || cpi->active_map.update)
-    for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+    for (i = 0;
+         i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; ++i)
       if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
         seg_map[i] = AM_SEGMENT_ID_ACTIVE;
 }
 
 static void apply_active_map(AV1_COMP *cpi) {
   struct segmentation *const seg = &cpi->common.seg;
-  unsigned char *const seg_map = cpi->segmentation_map;
+  unsigned char *const seg_map = cpi->enc_seg.map;
   const unsigned char *const active_map = cpi->active_map.map;
   int i;
 
@@ -151,7 +423,9 @@
 
   if (cpi->active_map.update) {
     if (cpi->active_map.enabled) {
-      for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+      for (i = 0;
+           i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+           ++i)
         if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
       av1_enable_segmentation(seg);
       av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
@@ -185,10 +459,11 @@
 
 int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
                        int cols) {
-  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) {
     unsigned char *const active_map_8x8 = cpi->active_map.map;
-    const int mi_rows = cpi->common.mi_rows;
-    const int mi_cols = cpi->common.mi_cols;
+    const int mi_rows = mi_params->mi_rows;
+    const int mi_cols = mi_params->mi_cols;
     const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
     const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
     cpi->active_map.update = 1;
@@ -214,11 +489,12 @@
 
 int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
                        int cols) {
-  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  if (rows == mi_params->mb_rows && cols == mi_params->mb_cols &&
       new_map_16x16) {
-    unsigned char *const seg_map_8x8 = cpi->segmentation_map;
-    const int mi_rows = cpi->common.mi_rows;
-    const int mi_cols = cpi->common.mi_cols;
+    unsigned char *const seg_map_8x8 = cpi->enc_seg.map;
+    const int mi_rows = mi_params->mi_rows;
+    const int mi_cols = mi_params->mi_cols;
     const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
     const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
 
@@ -243,7 +519,7 @@
 // Compute the horizontal frequency components' energy in a frame
 // by calculuating the 16x4 Horizontal DCT. This is to be used to
 // decide the superresolution parameters.
-void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
   uint64_t freq_energy[16] = { 0 };
   const YV12_BUFFER_CONFIG *buf = cpi->source;
   const int bd = cpi->td.mb.e_mbd.bd;
@@ -300,45 +576,31 @@
   }
 }
 
-static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv,
-                                  int cur_frame_force_integer_mv) {
-  MACROBLOCK *const mb = &cpi->td.mb;
-  cpi->common.allow_high_precision_mv =
-      allow_high_precision_mv && cur_frame_force_integer_mv == 0;
-  const int copy_hp =
-      cpi->common.allow_high_precision_mv && cur_frame_force_integer_mv == 0;
-  int *(*src)[2] = copy_hp ? &mb->nmvcost_hp : &mb->nmvcost;
-  mb->mv_cost_stack = *src;
-}
-
 static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
   const AV1_COMMON *const cm = &cpi->common;
 
   if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
     return BLOCK_64X64;
-#if CONFIG_FILEOPTIONS
-  if (cm->options && cm->options->ext_partition)
-#endif
-    if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
-      return BLOCK_128X128;
+  if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
+    return BLOCK_128X128;
 
   assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
 
-// TODO(any): Possibly could improve this with a heuristic.
-#if CONFIG_FILEOPTIONS
-  if (cm->options && !cm->options->ext_partition) return BLOCK_64X64;
-#endif
+  if (cpi->svc.number_spatial_layers > 1) {
+    // Use the configured size (top resolution) for spatial layers.
+    return AOMMIN(cpi->oxcf.width, cpi->oxcf.height) > 480 ? BLOCK_128X128
+                                                           : BLOCK_64X64;
+  }
 
+  // TODO(any): Possibly could improve this with a heuristic.
   // When superres / resize is on, 'cm->width / height' can change between
-  // calls, so we don't apply this heuristic there. Also, this heuristic gives
-  // compression gain for speed >= 2 only.
-  // Things break if superblock size changes per-frame which is why this
-  // heuristic is set based on configured speed rather than actual
-  // speed-features (which may change per-frame in future)
+  // calls, so we don't apply this heuristic there.
+  // Things break if superblock size changes between the first pass and second
+  // pass encoding, which is why this heuristic is not configured as a
+  // speed-feature.
   if (cpi->oxcf.superres_mode == SUPERRES_NONE &&
-      cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) {
-    return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128
-                                                   : BLOCK_64X64;
+      cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 1) {
+    return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64;
   }
 
   return BLOCK_128X128;
@@ -352,15 +614,16 @@
   // other inter-frames the encoder currently uses only two contexts;
   // context 1 for ALTREF frames and context 0 for the others.
 
-  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
-      cpi->ext_use_primary_ref_none) {
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
     av1_setup_past_independence(cm);
   }
 
-  if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
-    set_sb_size(&cm->seq_params, select_sb_size(cpi));
-  } else if (frame_is_sframe(cm)) {
-    set_sb_size(&cm->seq_params, select_sb_size(cpi));
+  if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
+      frame_is_sframe(cm)) {
+    if (!cpi->seq_params_locked) {
+      set_sb_size(&cm->seq_params, select_sb_size(cpi));
+    }
   } else {
     const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
     if (primary_ref_buf == NULL) {
@@ -377,68 +640,69 @@
   cpi->vaq_refresh = 0;
 }
 
-static void enc_setup_mi(AV1_COMMON *cm) {
-  int i;
-  int mi_rows_sb_aligned = calc_mi_size(cm->mi_rows);
-  cm->mi = cm->mip;
-  memset(cm->mip, 0, cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mip));
-  cm->prev_mi = cm->prev_mip;
-  // Clear top border row
-  memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
-  // Clear left border column
-  for (i = 0; i < mi_rows_sb_aligned; ++i)
-    memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
-  cm->mi_grid_visible = cm->mi_grid_base;
-  cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
+static void set_mb_mi(CommonModeInfoParams *mi_params, int width, int height) {
+  // Ensure that the decoded width and height are both multiples of
+  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
+  // subsampling is used).
+  // This simplifies the implementation of various experiments,
+  // eg. cdef, which operates on units of 8x8 luma pixels.
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
 
-  memset(cm->mi_grid_base, 0,
-         cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mi_grid_base));
+  mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
+  mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
+  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
+
+  mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2;
+  mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
+  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
+
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  mi_params->mi_alloc_stride =
+      (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+
+  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
+         mi_size_high[mi_params->mi_alloc_bsize]);
+
+#if CONFIG_LPF_MASK
+  av1_alloc_loop_filter_mask(mi_params);
+#endif
 }
 
-static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
-  cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
-  if (!cm->mip) return 1;
-  cm->prev_mip = aom_calloc(mi_size, sizeof(*cm->prev_mip));
-  if (!cm->prev_mip) return 1;
-  cm->mi_alloc_size = mi_size;
+static void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+                          int height) {
+  const int is_4k_or_larger = AOMMIN(width, height) >= 2160;
+  mi_params->mi_alloc_bsize = is_4k_or_larger ? BLOCK_8X8 : BLOCK_4X4;
 
-  cm->mi_grid_base =
-      (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
-  if (!cm->mi_grid_base) return 1;
-  cm->prev_mi_grid_base =
-      (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
-  if (!cm->prev_mi_grid_base) return 1;
-
-  return 0;
+  set_mb_mi(mi_params, width, height);
 }
 
-static void enc_free_mi(AV1_COMMON *cm) {
-  aom_free(cm->mip);
-  cm->mip = NULL;
-  aom_free(cm->prev_mip);
-  cm->prev_mip = NULL;
-  aom_free(cm->mi_grid_base);
-  cm->mi_grid_base = NULL;
-  aom_free(cm->prev_mi_grid_base);
-  cm->prev_mi_grid_base = NULL;
-  cm->mi_alloc_size = 0;
+static void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+                                 int height) {
+  mi_params->mi_alloc_bsize = BLOCK_16X16;
+
+  set_mb_mi(mi_params, width, height);
 }
 
-static void swap_mi_and_prev_mi(AV1_COMMON *cm) {
-  // Current mip will be the prev_mip for the next frame.
-  MB_MODE_INFO **temp_base = cm->prev_mi_grid_base;
-  MB_MODE_INFO *temp = cm->prev_mip;
-  cm->prev_mip = cm->mip;
-  cm->mip = temp;
+static void enc_setup_mi(CommonModeInfoParams *mi_params) {
+  const int mi_grid_size =
+      mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
+  memset(mi_params->mi_alloc, 0,
+         mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc));
+  memset(mi_params->mi_grid_base, 0,
+         mi_grid_size * sizeof(*mi_params->mi_grid_base));
+  memset(mi_params->tx_type_map, 0,
+         mi_grid_size * sizeof(*mi_params->tx_type_map));
+}
 
-  // Update the upper left visible macroblock ptrs.
-  cm->mi = cm->mip;
-  cm->prev_mi = cm->prev_mip;
-
-  cm->prev_mi_grid_base = cm->mi_grid_base;
-  cm->mi_grid_base = temp_base;
-  cm->mi_grid_visible = cm->mi_grid_base;
-  cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
+static void enc_free_mi(CommonModeInfoParams *mi_params) {
+  aom_free(mi_params->mi_alloc);
+  mi_params->mi_alloc = NULL;
+  aom_free(mi_params->mi_grid_base);
+  mi_params->mi_grid_base = NULL;
+  mi_params->mi_alloc_size = 0;
+  aom_free(mi_params->tx_type_map);
+  mi_params->tx_type_map = NULL;
 }
 
 void av1_initialize_enc(void) {
@@ -451,20 +715,35 @@
   av1_init_wedge_masks();
 }
 
-static void dealloc_context_buffers_ext(AV1_COMP *cpi) {
-  if (cpi->mbmi_ext_base) {
-    aom_free(cpi->mbmi_ext_base);
-    cpi->mbmi_ext_base = NULL;
+static void dealloc_context_buffers_ext(MBMIExtFrameBufferInfo *mbmi_ext_info) {
+  if (mbmi_ext_info->frame_base) {
+    aom_free(mbmi_ext_info->frame_base);
+    mbmi_ext_info->frame_base = NULL;
+    mbmi_ext_info->alloc_size = 0;
   }
 }
 
-static void alloc_context_buffers_ext(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  int mi_size = cm->mi_cols * cm->mi_rows;
+static void alloc_context_buffers_ext(AV1_COMMON *cm,
+                                      MBMIExtFrameBufferInfo *mbmi_ext_info) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
 
-  dealloc_context_buffers_ext(cpi);
-  CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
-                  aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int mi_alloc_rows =
+      (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int mi_alloc_cols =
+      (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols;
+
+  if (new_ext_mi_size > mbmi_ext_info->alloc_size) {
+    dealloc_context_buffers_ext(mbmi_ext_info);
+    CHECK_MEM_ERROR(
+        cm, mbmi_ext_info->frame_base,
+        aom_calloc(new_ext_mi_size, sizeof(*mbmi_ext_info->frame_base)));
+    mbmi_ext_info->alloc_size = new_ext_mi_size;
+  }
+  // The stride needs to be updated regardless of whether new allocation
+  // happened or not.
+  mbmi_ext_info->stride = mi_alloc_cols;
 }
 
 static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
@@ -527,14 +806,14 @@
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
 
-  dealloc_context_buffers_ext(cpi);
+  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
 
   aom_free(cpi->tile_data);
   cpi->tile_data = NULL;
 
   // Delete sementation map
-  aom_free(cpi->segmentation_map);
-  cpi->segmentation_map = NULL;
+  aom_free(cpi->enc_seg.map);
+  cpi->enc_seg.map = NULL;
 
   av1_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
@@ -542,6 +821,20 @@
   aom_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
+  aom_free(cpi->ssim_rdmult_scaling_factors);
+  cpi->ssim_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_rdmult_scaling_factors);
+  cpi->tpl_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_sb_rdmult_scaling_factors);
+  cpi->tpl_sb_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+  aom_free(cpi->vmaf_rdmult_scaling_factors);
+  cpi->vmaf_rdmult_scaling_factors = NULL;
+#endif
+
   aom_free(cpi->td.mb.above_pred_buf);
   cpi->td.mb.above_pred_buf = NULL;
 
@@ -556,8 +849,8 @@
 
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++) {
-      aom_free(cpi->td.mb.hash_value_buffer[i][j]);
-      cpi->td.mb.hash_value_buffer[i][j] = NULL;
+      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
     }
   aom_free(cpi->td.mb.mask_buf);
   cpi->td.mb.mask_buf = NULL;
@@ -565,6 +858,14 @@
   aom_free(cm->tpl_mvs);
   cm->tpl_mvs = NULL;
 
+  aom_free(cpi->td.mb.mbmi_ext);
+  cpi->td.mb.mbmi_ext = NULL;
+
+  if (cpi->td.vt64x64) {
+    aom_free(cpi->td.vt64x64);
+    cpi->td.vt64x64 = NULL;
+  }
+
   av1_free_ref_frame_buffers(cm->buffer_pool);
   av1_free_txb_buf(cpi);
   av1_free_context_buffers(cm);
@@ -583,10 +884,10 @@
   aom_free(cpi->tplist[0][0]);
   cpi->tplist[0][0] = NULL;
 
-  av1_free_pc_tree(&cpi->td, num_planes);
+  av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
 
   aom_free(cpi->td.mb.palette_buffer);
-
+  av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
   aom_free(cpi->td.mb.tmp_conv_dst);
   for (int j = 0; j < 2; ++j) {
     aom_free(cpi->td.mb.tmp_obmc_bufs[j]);
@@ -602,34 +903,12 @@
     aom_film_grain_table_free(cpi->film_grain_table);
     cpi->film_grain_table = NULL;
   }
-}
 
-static void save_coding_context(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    aom_free(cpi->level_params.level_info[i]);
+  }
 
-  // Stores a snapshot of key state variables which can subsequently be
-  // restored with a call to av1_restore_coding_context. These functions are
-  // intended for use in a re-code loop in av1_compress_frame where the
-  // quantizer value is adjusted between loop iterations.
-  av1_copy(cc->nmv_vec_cost, cpi->td.mb.nmv_vec_cost);
-  av1_copy(cc->nmv_costs, cpi->nmv_costs);
-  av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp);
-
-  cc->fc = *cm->fc;
-}
-
-static void restore_coding_context(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-
-  // Restore key state variables to the snapshot state stored in the
-  // previous call to av1_save_coding_context.
-  av1_copy(cpi->td.mb.nmv_vec_cost, cc->nmv_vec_cost);
-  av1_copy(cpi->nmv_costs, cc->nmv_costs);
-  av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
-
-  *cm->fc = cc->fc;
+  if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi);
 }
 
 static void configure_static_seg_features(AV1_COMP *cpi) {
@@ -643,10 +922,9 @@
   // Disable and clear down for KF
   if (cm->current_frame.frame_type == KEY_FRAME) {
     // Clear down the global segmentation map
-    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     seg->update_map = 0;
     seg->update_data = 0;
-    cpi->static_mb_pct = 0;
 
     // Disable segmentation
     av1_disable_segmentation(seg);
@@ -656,19 +934,14 @@
   } else if (cpi->refresh_alt_ref_frame) {
     // If this is an alt ref frame
     // Clear down the global segmentation map
-    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     seg->update_map = 0;
     seg->update_data = 0;
-    cpi->static_mb_pct = 0;
 
     // Disable segmentation and individual segment features by default
     av1_disable_segmentation(seg);
     av1_clearall_segfeatures(seg);
 
-    // Scan frames from current to arf frame.
-    // This function re-enables segmentation if appropriate.
-    av1_update_mbgraph_stats(cpi);
-
     // If segmentation was enabled set those features needed for the
     // arf itself.
     if (seg->enabled) {
@@ -716,7 +989,7 @@
         av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
 
         // Segment coding disabled for compred testing
-        if (high_q || (cpi->static_mb_pct == 100)) {
+        if (high_q) {
           av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
           av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
           av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
@@ -727,7 +1000,8 @@
 
         av1_disable_segmentation(seg);
 
-        memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+        memset(cpi->enc_seg.map, 0,
+               cm->mi_params.mi_rows * cm->mi_params.mi_cols);
 
         seg->update_map = 0;
         seg->update_data = 0;
@@ -768,42 +1042,31 @@
 
 static void update_reference_segmentation_map(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  MB_MODE_INFO **mi_4x4_ptr = cm->mi_grid_visible;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MB_MODE_INFO **mi_4x4_ptr = mi_params->mi_grid_base;
   uint8_t *cache_ptr = cm->cur_frame->seg_map;
-  int row, col;
 
-  for (row = 0; row < cm->mi_rows; row++) {
+  for (int row = 0; row < mi_params->mi_rows; row++) {
     MB_MODE_INFO **mi_4x4 = mi_4x4_ptr;
     uint8_t *cache = cache_ptr;
-    for (col = 0; col < cm->mi_cols; col++, mi_4x4++, cache++)
+    for (int col = 0; col < mi_params->mi_cols; col++, mi_4x4++, cache++)
       cache[0] = mi_4x4[0]->segment_id;
-    mi_4x4_ptr += cm->mi_stride;
-    cache_ptr += cm->mi_cols;
+    mi_4x4_ptr += mi_params->mi_stride;
+    cache_ptr += mi_params->mi_cols;
   }
 }
 
-static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
+static void alloc_altref_frame_buffer(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   const SequenceHeader *const seq_params = &cm->seq_params;
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
 
-  if (!cpi->lookahead) {
-    int is_scale = (oxcf->resize_mode || oxcf->superres_mode);
-    cpi->lookahead = av1_lookahead_init(
-        oxcf->width, oxcf->height, seq_params->subsampling_x,
-        seq_params->subsampling_y, seq_params->use_highbitdepth,
-        oxcf->lag_in_frames, oxcf->border_in_pixels, is_scale);
-  }
-  if (!cpi->lookahead)
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate lag buffers");
-
   // TODO(agrange) Check if ARF is enabled and skip allocation if not.
   if (aom_realloc_frame_buffer(
           &cpi->alt_ref_buffer, oxcf->width, oxcf->height,
           seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, oxcf->border_in_pixels,
-          cm->byte_alignment, NULL, NULL, NULL))
+          seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+          cm->features.byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -811,10 +1074,11 @@
 static void alloc_util_frame_buffers(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const SequenceHeader *const seq_params = &cm->seq_params;
+  const int byte_alignment = cm->features.byte_alignment;
   if (aom_realloc_frame_buffer(
           &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
+          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
@@ -822,14 +1086,14 @@
           &cpi->trial_frame_rst, cm->superres_upscaled_width,
           cm->superres_upscaled_height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_RESTORATION_FRAME_BORDER, cm->byte_alignment, NULL, NULL, NULL))
+          AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate trial restored frame buffer");
 
   if (aom_realloc_frame_buffer(
           &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
+          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
@@ -837,7 +1101,7 @@
           &cpi->scaled_last_source, cm->width, cm->height,
           seq_params->subsampling_x, seq_params->subsampling_y,
           seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
-          cm->byte_alignment, NULL, NULL, NULL))
+          byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
 }
@@ -846,31 +1110,37 @@
   AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
 
-  av1_alloc_context_buffers(cm, cm->width, cm->height);
+  if (av1_alloc_context_buffers(cm, cm->width, cm->height)) {
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  }
 
   int mi_rows_aligned_to_sb =
-      ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
   int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
 
-  av1_alloc_txb_buf(cpi);
+  if (!is_stat_generation_stage(cpi)) {
+    av1_alloc_txb_buf(cpi);
 
-  alloc_context_buffers_ext(cpi);
+    alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
+  }
 
   aom_free(cpi->tile_tok[0][0]);
-
-  {
-    unsigned int tokens =
-        get_token_alloc(cm->mb_rows, cm->mb_cols, MAX_SB_SIZE_LOG2, num_planes);
-    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
-                    aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
-  }
   aom_free(cpi->tplist[0][0]);
 
-  CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
-                  aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
-                             sizeof(*cpi->tplist[0][0])));
+  if (!is_stat_generation_stage(cpi)) {
+    unsigned int tokens =
+        get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
+                        MAX_SB_SIZE_LOG2, num_planes);
+    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+                    aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
 
-  av1_setup_pc_tree(&cpi->common, &cpi->td);
+    CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
+                    aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+                               sizeof(*cpi->tplist[0][0])));
+  }
+
+  av1_setup_pc_tree(cpi, &cpi->td);
 }
 
 void av1_new_framerate(AV1_COMP *cpi, double framerate) {
@@ -878,71 +1148,100 @@
   av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
 }
 
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+                                 size_t encoded_frame_size) {
+  const int upscaled_width = cm->superres_upscaled_width;
+  const int height = cm->height;
+  const int luma_pic_size = upscaled_width * height;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const int pic_size_profile_factor =
+      profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
+  encoded_frame_size =
+      (encoded_frame_size > 129 ? encoded_frame_size - 128 : 1);
+  const size_t uncompressed_frame_size =
+      (luma_pic_size * pic_size_profile_factor) >> 3;
+  return uncompressed_frame_size / (double)encoded_frame_size;
+}
+
 static void set_tile_info(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  CommonTileParams *const tiles = &cm->tiles;
   int i, start_sb;
 
   av1_get_tile_limits(cm);
 
   // configure tile columns
   if (cpi->oxcf.tile_width_count == 0 || cpi->oxcf.tile_height_count == 0) {
-    cm->uniform_tile_spacing_flag = 1;
-    cm->log2_tile_cols = AOMMAX(cpi->oxcf.tile_columns, cm->min_log2_tile_cols);
-    cm->log2_tile_cols = AOMMIN(cm->log2_tile_cols, cm->max_log2_tile_cols);
+    tiles->uniform_spacing = 1;
+    tiles->log2_cols = AOMMAX(cpi->oxcf.tile_columns, tiles->min_log2_cols);
+    tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols);
   } else {
-    int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
-    int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
+    int mi_cols =
+        ALIGN_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2);
+    int sb_cols = mi_cols >> seq_params->mib_size_log2;
     int size_sb, j = 0;
-    cm->uniform_tile_spacing_flag = 0;
+    tiles->uniform_spacing = 0;
     for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
-      cm->tile_col_start_sb[i] = start_sb;
+      tiles->col_start_sb[i] = start_sb;
       size_sb = cpi->oxcf.tile_widths[j++];
       if (j >= cpi->oxcf.tile_width_count) j = 0;
-      start_sb += AOMMIN(size_sb, cm->max_tile_width_sb);
+      start_sb += AOMMIN(size_sb, tiles->max_width_sb);
     }
-    cm->tile_cols = i;
-    cm->tile_col_start_sb[i] = sb_cols;
+    tiles->cols = i;
+    tiles->col_start_sb[i] = sb_cols;
   }
-  av1_calculate_tile_cols(cm);
+  av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols,
+                          tiles);
 
   // configure tile rows
-  if (cm->uniform_tile_spacing_flag) {
-    cm->log2_tile_rows = AOMMAX(cpi->oxcf.tile_rows, cm->min_log2_tile_rows);
-    cm->log2_tile_rows = AOMMIN(cm->log2_tile_rows, cm->max_log2_tile_rows);
+  if (tiles->uniform_spacing) {
+    tiles->log2_rows = AOMMAX(cpi->oxcf.tile_rows, tiles->min_log2_rows);
+    tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows);
   } else {
-    int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
-    int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+    int mi_rows =
+        ALIGN_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2);
+    int sb_rows = mi_rows >> seq_params->mib_size_log2;
     int size_sb, j = 0;
     for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
-      cm->tile_row_start_sb[i] = start_sb;
+      tiles->row_start_sb[i] = start_sb;
       size_sb = cpi->oxcf.tile_heights[j++];
       if (j >= cpi->oxcf.tile_height_count) j = 0;
-      start_sb += AOMMIN(size_sb, cm->max_tile_height_sb);
+      start_sb += AOMMIN(size_sb, tiles->max_height_sb);
     }
-    cm->tile_rows = i;
-    cm->tile_row_start_sb[i] = sb_rows;
+    tiles->rows = i;
+    tiles->row_start_sb[i] = sb_rows;
   }
-  av1_calculate_tile_rows(cm);
+  av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles);
 }
 
 static void update_frame_size(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
-  av1_set_mb_mi(cm, cm->width, cm->height);
-  av1_init_context_buffers(cm);
+  // We need to reallocate the context buffers here in case we need more mis.
+  if (av1_alloc_context_buffers(cm, cm->width, cm->height)) {
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  }
+  av1_init_mi_buffers(&cm->mi_params);
+
   av1_init_macroblockd(cm, xd, NULL);
-  memset(cpi->mbmi_ext_base, 0,
-         cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+
+  if (!is_stat_generation_stage(cpi))
+    alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
   set_tile_info(cpi);
 }
 
-static void init_buffer_indices(AV1_COMP *cpi) {
+static void init_buffer_indices(ForceIntegerMVInfo *const force_intpel_info,
+                                int *const remapped_ref_idx) {
   int fb_idx;
   for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
-    cpi->common.remapped_ref_idx[fb_idx] = fb_idx;
-  cpi->rate_index = 0;
-  cpi->rate_size = 0;
+    remapped_ref_idx[fb_idx] = fb_idx;
+  force_intpel_info->rate_index = 0;
+  force_intpel_info->rate_size = 0;
 }
 
 static INLINE int does_level_match(int width, int height, double fps,
@@ -997,47 +1296,41 @@
     level = SEQ_LEVEL_6_0;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               8192, 4352, 60.0, 2)) {
+    level = SEQ_LEVEL_6_1;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               8192, 4352, 120.0, 2)) {
     level = SEQ_LEVEL_6_2;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              16384, 8704, 30.0, 2)) {
-    level = SEQ_LEVEL_7_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              16384, 8704, 60.0, 2)) {
-    level = SEQ_LEVEL_7_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              16384, 8704, 120.0, 2)) {
-    level = SEQ_LEVEL_7_2;
   }
+
+  SequenceHeader *const seq_params = &cm->seq_params;
   for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
     seq->seq_level_idx[i] = level;
     // Set the maximum parameters for bitrate and buffer size for this profile,
     // level, and tier
-    cm->op_params[i].bitrate = max_level_bitrate(
+    seq_params->op_params[i].bitrate = av1_max_level_bitrate(
         cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]);
     // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
     // check
-    if (cm->op_params[i].bitrate == 0)
+    if (seq_params->op_params[i].bitrate == 0)
       aom_internal_error(
           &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
           "AV1 does not support this combination of profile, level, and tier.");
     // Buffer size in bits/s is bitrate in bits/s * 1 s
-    cm->op_params[i].buffer_size = cm->op_params[i].bitrate;
+    seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
   }
 }
 
 static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
-                                  const AV1EncoderConfig *oxcf) {
-  seq->still_picture = (oxcf->limit == 1);
+                                  const AV1EncoderConfig *oxcf, int use_svc) {
+  seq->still_picture = (oxcf->force_video_mode == 0) && (oxcf->limit == 1);
   seq->reduced_still_picture_hdr = seq->still_picture;
   seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr;
-  seq->force_screen_content_tools = 2;
+  seq->force_screen_content_tools = (oxcf->mode == REALTIME) ? 0 : 2;
   seq->force_integer_mv = 2;
   seq->order_hint_info.enable_order_hint = oxcf->enable_order_hint;
   seq->frame_id_numbers_present_flag =
       !(seq->still_picture && seq->reduced_still_picture_hdr) &&
-      !oxcf->large_scale_tile && oxcf->error_resilient_mode;
+      !oxcf->large_scale_tile && oxcf->error_resilient_mode && !use_svc;
   if (seq->still_picture && seq->reduced_still_picture_hdr) {
     seq->order_hint_info.enable_order_hint = 0;
     seq->force_screen_content_tools = 2;
@@ -1084,90 +1377,100 @@
   if (seq->operating_points_cnt_minus_1 == 0) {
     seq->operating_point_idc[0] = 0;
   } else {
-    // Set operating_point_idc[] such that for the i-th operating point the
-    // first (operating_points_cnt-i) spatial layers and the first temporal
-    // layer are decoded Note that highest quality operating point should come
-    // first
-    for (int i = 0; i < seq->operating_points_cnt_minus_1 + 1; i++)
-      seq->operating_point_idc[i] =
-          (~(~0u << (seq->operating_points_cnt_minus_1 + 1 - i)) << 8) | 1;
+    // Set operating_point_idc[] such that the i=0 point corresponds to the
+    // highest quality operating point (all layers), and subsequent
+    // operating points (i > 0) are lower quality, corresponding to
+    // skipping decoding of enhancement layers (temporal first).
+    int i = 0;
+    assert(seq->operating_points_cnt_minus_1 ==
+           (int)(cm->number_spatial_layers * cm->number_temporal_layers - 1));
+    for (unsigned int sl = 0; sl < cm->number_spatial_layers; sl++) {
+      for (unsigned int tl = 0; tl < cm->number_temporal_layers; tl++) {
+        seq->operating_point_idc[i] =
+            (~(~0u << (cm->number_spatial_layers - sl)) << 8) |
+            ~(~0u << (cm->number_temporal_layers - tl));
+        i++;
+      }
+    }
   }
 }
 
 static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = &cm->seq_params;
+  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
 
   cpi->oxcf = *oxcf;
   cpi->framerate = oxcf->init_framerate;
 
-  cm->seq_params.profile = oxcf->profile;
-  cm->seq_params.bit_depth = oxcf->bit_depth;
-  cm->seq_params.use_highbitdepth = oxcf->use_highbitdepth;
-  cm->seq_params.color_primaries = oxcf->color_primaries;
-  cm->seq_params.transfer_characteristics = oxcf->transfer_characteristics;
-  cm->seq_params.matrix_coefficients = oxcf->matrix_coefficients;
-  cm->seq_params.monochrome = oxcf->monochrome;
-  cm->seq_params.chroma_sample_position = oxcf->chroma_sample_position;
-  cm->seq_params.color_range = oxcf->color_range;
-  cm->timing_info_present = oxcf->timing_info_present;
-  cm->timing_info.num_units_in_display_tick =
+  seq_params->profile = oxcf->profile;
+  seq_params->bit_depth = oxcf->bit_depth;
+  seq_params->use_highbitdepth = oxcf->use_highbitdepth;
+  seq_params->color_primaries = oxcf->color_primaries;
+  seq_params->transfer_characteristics = oxcf->transfer_characteristics;
+  seq_params->matrix_coefficients = oxcf->matrix_coefficients;
+  seq_params->monochrome = oxcf->monochrome;
+  seq_params->chroma_sample_position = oxcf->chroma_sample_position;
+  seq_params->color_range = oxcf->color_range;
+  seq_params->timing_info_present = oxcf->timing_info_present;
+  seq_params->timing_info.num_units_in_display_tick =
       oxcf->timing_info.num_units_in_display_tick;
-  cm->timing_info.time_scale = oxcf->timing_info.time_scale;
-  cm->timing_info.equal_picture_interval =
+  seq_params->timing_info.time_scale = oxcf->timing_info.time_scale;
+  seq_params->timing_info.equal_picture_interval =
       oxcf->timing_info.equal_picture_interval;
-  cm->timing_info.num_ticks_per_picture =
+  seq_params->timing_info.num_ticks_per_picture =
       oxcf->timing_info.num_ticks_per_picture;
 
-  cm->seq_params.display_model_info_present_flag =
+  seq_params->display_model_info_present_flag =
       oxcf->display_model_info_present_flag;
-  cm->seq_params.decoder_model_info_present_flag =
+  seq_params->decoder_model_info_present_flag =
       oxcf->decoder_model_info_present_flag;
   if (oxcf->decoder_model_info_present_flag) {
     // set the decoder model parameters in schedule mode
-    cm->buffer_model.num_units_in_decoding_tick =
+    seq_params->decoder_model_info.num_units_in_decoding_tick =
         oxcf->buffer_model.num_units_in_decoding_tick;
     cm->buffer_removal_time_present = 1;
-    set_aom_dec_model_info(&cm->buffer_model);
-    set_dec_model_op_parameters(&cm->op_params[0]);
-  } else if (cm->timing_info_present &&
-             cm->timing_info.equal_picture_interval &&
-             !cm->seq_params.decoder_model_info_present_flag) {
+    av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+    av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+  } else if (seq_params->timing_info_present &&
+             seq_params->timing_info.equal_picture_interval &&
+             !seq_params->decoder_model_info_present_flag) {
     // set the decoder model parameters in resource availability mode
-    set_resource_availability_parameters(&cm->op_params[0]);
+    av1_set_resource_availability_parameters(&seq_params->op_params[0]);
   } else {
-    cm->op_params[0].initial_display_delay =
+    seq_params->op_params[0].initial_display_delay =
         10;  // Default value (not signaled)
   }
 
-  if (cm->seq_params.monochrome) {
-    cm->seq_params.subsampling_x = 1;
-    cm->seq_params.subsampling_y = 1;
-  } else if (cm->seq_params.color_primaries == AOM_CICP_CP_BT_709 &&
-             cm->seq_params.transfer_characteristics == AOM_CICP_TC_SRGB &&
-             cm->seq_params.matrix_coefficients == AOM_CICP_MC_IDENTITY) {
-    cm->seq_params.subsampling_x = 0;
-    cm->seq_params.subsampling_y = 0;
+  if (seq_params->monochrome) {
+    seq_params->subsampling_x = 1;
+    seq_params->subsampling_y = 1;
+  } else if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+             seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+             seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+    seq_params->subsampling_x = 0;
+    seq_params->subsampling_y = 0;
   } else {
-    if (cm->seq_params.profile == 0) {
-      cm->seq_params.subsampling_x = 1;
-      cm->seq_params.subsampling_y = 1;
-    } else if (cm->seq_params.profile == 1) {
-      cm->seq_params.subsampling_x = 0;
-      cm->seq_params.subsampling_y = 0;
+    if (seq_params->profile == 0) {
+      seq_params->subsampling_x = 1;
+      seq_params->subsampling_y = 1;
+    } else if (seq_params->profile == 1) {
+      seq_params->subsampling_x = 0;
+      seq_params->subsampling_y = 0;
     } else {
-      if (cm->seq_params.bit_depth == AOM_BITS_12) {
-        cm->seq_params.subsampling_x = oxcf->chroma_subsampling_x;
-        cm->seq_params.subsampling_y = oxcf->chroma_subsampling_y;
+      if (seq_params->bit_depth == AOM_BITS_12) {
+        seq_params->subsampling_x = oxcf->chroma_subsampling_x;
+        seq_params->subsampling_y = oxcf->chroma_subsampling_y;
       } else {
-        cm->seq_params.subsampling_x = 1;
-        cm->seq_params.subsampling_y = 0;
+        seq_params->subsampling_x = 1;
+        seq_params->subsampling_y = 0;
       }
     }
   }
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
-  set_sb_size(&cm->seq_params,
+  set_sb_size(seq_params,
               select_sb_size(cpi));  // set sb size before allocations
   alloc_compressor_data(cpi);
 
@@ -1176,17 +1479,27 @@
   // Single thread case: use counts in common.
   cpi->td.counts = &cpi->counts;
 
+  // Set init SVC parameters.
+  cpi->use_svc = 0;
+  cpi->svc.external_ref_frame_config = 0;
+  cpi->svc.non_reference_frame = 0;
+  cpi->svc.number_spatial_layers = 1;
+  cpi->svc.number_temporal_layers = 1;
+  cm->number_spatial_layers = 1;
+  cm->number_temporal_layers = 1;
+  cm->spatial_layer_id = 0;
+  cm->temporal_layer_id = 0;
+
   // change includes all joint functionality
   av1_change_config(cpi, oxcf);
 
-  cpi->static_mb_pct = 0;
   cpi->ref_frame_flags = 0;
 
   // Reset resize pending flags
-  cpi->resize_pending_width = 0;
-  cpi->resize_pending_height = 0;
+  resize_pending_params->width = 0;
+  resize_pending_params->height = 0;
 
-  init_buffer_indices(cpi);
+  init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx);
 }
 
 static void set_rc_buffer_sizes(RATE_CONTROL *rc,
@@ -1295,6 +1608,7 @@
            4;                                                               \
   }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
@@ -1385,6 +1699,7 @@
 MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
 MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
 MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
   cpi->fn_ptr[BT].msdf = MCSDF;       \
@@ -1415,6 +1730,7 @@
            4;                                                            \
   }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
@@ -1437,6 +1753,7 @@
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+#endif
 
 #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
   cpi->fn_ptr[BT].osdf = OSDF;           \
@@ -1460,6 +1777,7 @@
     return fnname(ref, ref_stride, wsrc, msk) >> 4;                       \
   }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
@@ -2356,24 +2674,108 @@
     }
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static void realloc_segmentation_maps(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
 
   // Create the encoder segmentation map and set all entries to 0
-  aom_free(cpi->segmentation_map);
-  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
-                  aom_calloc(cm->mi_rows * cm->mi_cols, 1));
+  aom_free(cpi->enc_seg.map);
+  CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
+                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
 
   // Create a map used for cyclic background refresh.
   if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
-  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
-                  av1_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+  CHECK_MEM_ERROR(
+      cm, cpi->cyclic_refresh,
+      av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
 
   // Create a map used to mark inactive areas.
   aom_free(cpi->active_map.map);
   CHECK_MEM_ERROR(cm, cpi->active_map.map,
-                  aom_calloc(cm->mi_rows * cm->mi_cols, 1));
+                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+}
+
+static AOM_INLINE void set_tpl_stats_block_size(int width, int height,
+                                                uint8_t *block_mis_log2) {
+  const int is_720p_or_larger = AOMMIN(width, height) >= 720;
+
+  // 0: 4x4, 1: 8x8, 2: 16x16
+  *block_mis_log2 = is_720p_or_larger ? 2 : 1;
+}
+
+void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+                                        CompoundTypeRdBuffers *const bufs) {
+  CHECK_MEM_ERROR(
+      cm, bufs->pred0,
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+  CHECK_MEM_ERROR(
+      cm, bufs->pred1,
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+  CHECK_MEM_ERROR(
+      cm, bufs->residual1,
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+  CHECK_MEM_ERROR(
+      cm, bufs->diff10,
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+  CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+                  (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+                                        sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs) {
+  aom_free(bufs->pred0);
+  aom_free(bufs->pred1);
+  aom_free(bufs->residual1);
+  aom_free(bufs->diff10);
+  aom_free(bufs->tmp_best_mask_buf);
+  av1_zero(*bufs);  // Set all pointers to NULL for safety.
+}
+
+static void config_target_level(AV1_COMP *const cpi, AV1_LEVEL target_level,
+                                int tier) {
+  aom_clear_system_state();
+
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  SequenceHeader *const seq_params = &cpi->common.seq_params;
+
+  // Adjust target bitrate to be no larger than 70% of level limit.
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const double level_bitrate_limit =
+      av1_get_max_bitrate_for_level(target_level, tier, profile);
+  const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
+  oxcf->target_bandwidth = AOMMIN(oxcf->target_bandwidth, max_bitrate);
+  // Also need to update cpi->twopass.bits_left.
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
+  if (stats != NULL)
+    cpi->twopass.bits_left =
+        (int64_t)(stats->duration * cpi->oxcf.target_bandwidth / 10000000.0);
+
+  // Adjust max over-shoot percentage.
+  oxcf->over_shoot_pct = 0;
+
+  // Adjust max quantizer.
+  oxcf->worst_allowed_q = 255;
+
+  // Adjust number of tiles and tile columns to be under level limit.
+  int max_tiles, max_tile_cols;
+  av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
+  while (oxcf->tile_columns > 0 && (1 << oxcf->tile_columns) > max_tile_cols) {
+    --oxcf->tile_columns;
+  }
+  const int tile_cols = (1 << oxcf->tile_columns);
+  while (oxcf->tile_rows > 0 &&
+         tile_cols * (1 << oxcf->tile_rows) > max_tiles) {
+    --oxcf->tile_rows;
+  }
+
+  // Adjust min compression ratio.
+  const int still_picture = seq_params->still_picture;
+  const double min_cr =
+      av1_get_min_cr_for_level(target_level, tier, still_picture);
+  oxcf->min_cr = AOMMAX(oxcf->min_cr, (unsigned int)(min_cr * 100));
 }
 
 void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
@@ -2382,6 +2784,7 @@
   const int num_planes = av1_num_planes(cm);
   RATE_CONTROL *const rc = &cpi->rc;
   MACROBLOCK *const x = &cpi->td.mb;
+  AV1LevelParams *const level_params = &cpi->level_params;
 
   if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
   seq_params->bit_depth = oxcf->bit_depth;
@@ -2395,23 +2798,13 @@
   assert(IMPLIES(seq_params->profile <= PROFILE_1,
                  seq_params->bit_depth <= AOM_BITS_10));
 
-  memcpy(cpi->target_seq_level_idx, oxcf->target_seq_level_idx,
-         sizeof(cpi->target_seq_level_idx));
-  cpi->keep_level_stats = 0;
-  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    if (cpi->target_seq_level_idx[i] < SEQ_LEVELS) {
-      cpi->keep_level_stats = 1;
-      break;
-    }
-  }
-
-  cm->timing_info_present = oxcf->timing_info_present;
-  cm->timing_info.num_units_in_display_tick =
+  seq_params->timing_info_present = oxcf->timing_info_present;
+  seq_params->timing_info.num_units_in_display_tick =
       oxcf->timing_info.num_units_in_display_tick;
-  cm->timing_info.time_scale = oxcf->timing_info.time_scale;
-  cm->timing_info.equal_picture_interval =
+  seq_params->timing_info.time_scale = oxcf->timing_info.time_scale;
+  seq_params->timing_info.equal_picture_interval =
       oxcf->timing_info.equal_picture_interval;
-  cm->timing_info.num_ticks_per_picture =
+  seq_params->timing_info.num_ticks_per_picture =
       oxcf->timing_info.num_ticks_per_picture;
 
   seq_params->display_model_info_present_flag =
@@ -2420,50 +2813,73 @@
       oxcf->decoder_model_info_present_flag;
   if (oxcf->decoder_model_info_present_flag) {
     // set the decoder model parameters in schedule mode
-    cm->buffer_model.num_units_in_decoding_tick =
+    seq_params->decoder_model_info.num_units_in_decoding_tick =
         oxcf->buffer_model.num_units_in_decoding_tick;
     cm->buffer_removal_time_present = 1;
-    set_aom_dec_model_info(&cm->buffer_model);
-    set_dec_model_op_parameters(&cm->op_params[0]);
-  } else if (cm->timing_info_present &&
-             cm->timing_info.equal_picture_interval &&
+    av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+    av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+  } else if (seq_params->timing_info_present &&
+             seq_params->timing_info.equal_picture_interval &&
              !seq_params->decoder_model_info_present_flag) {
     // set the decoder model parameters in resource availability mode
-    set_resource_availability_parameters(&cm->op_params[0]);
+    av1_set_resource_availability_parameters(&seq_params->op_params[0]);
   } else {
-    cm->op_params[0].initial_display_delay =
+    seq_params->op_params[0].initial_display_delay =
         10;  // Default value (not signaled)
   }
 
   update_film_grain_parameters(cpi, oxcf);
 
   cpi->oxcf = *oxcf;
-  cpi->common.options = oxcf->cfg;
+  cpi->superres_mode = oxcf->superres_mode;  // default
   x->e_mbd.bd = (int)seq_params->bit_depth;
   x->e_mbd.global_motion = cm->global_motion;
 
-  if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
+  memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx,
+         sizeof(level_params->target_seq_level_idx));
+  level_params->keep_level_stats = 0;
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    if (level_params->target_seq_level_idx[i] <= SEQ_LEVELS) {
+      level_params->keep_level_stats |= 1u << i;
+      if (!level_params->level_info[i]) {
+        CHECK_MEM_ERROR(cm, level_params->level_info[i],
+                        aom_calloc(1, sizeof(*level_params->level_info[i])));
+      }
+    }
+  }
+
+  // TODO(huisu@): level targeting currently only works for the 0th operating
+  // point, so scalable coding is not supported yet.
+  if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) {
+    // Adjust encoder config in order to meet target level.
+    config_target_level(cpi, level_params->target_seq_level_idx[0],
+                        seq_params->tier[0]);
+  }
+
+  if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
     rc->baseline_gf_interval = FIXED_GF_INTERVAL;
   } else {
     rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
   }
 
-  cpi->refresh_last_frame = 1;
   cpi->refresh_golden_frame = 0;
   cpi->refresh_bwd_ref_frame = 0;
-  cpi->refresh_alt2_ref_frame = 0;
 
-  cm->refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
-                                  ? REFRESH_FRAME_CONTEXT_DISABLED
-                                  : REFRESH_FRAME_CONTEXT_BACKWARD;
+  cm->features.refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
+                                           ? REFRESH_FRAME_CONTEXT_DISABLED
+                                           : REFRESH_FRAME_CONTEXT_BACKWARD;
   if (oxcf->large_scale_tile)
-    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+    cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
   if (x->palette_buffer == NULL) {
     CHECK_MEM_ERROR(cm, x->palette_buffer,
                     aom_memalign(16, sizeof(*x->palette_buffer)));
   }
 
+  if (x->comp_rd_buffer.pred0 == NULL) {
+    av1_alloc_compound_type_rd_buffers(cm, &x->comp_rd_buffer);
+  }
+
   if (x->tmp_conv_dst == NULL) {
     CHECK_MEM_ERROR(
         cm, x->tmp_conv_dst,
@@ -2480,7 +2896,8 @@
   }
 
   av1_reset_segment_features(cm);
-  set_high_precision_mv(cpi, 1, 0);
+
+  av1_set_high_precision_mv(cpi, 1, 0);
 
   set_rc_buffer_sizes(rc, &cpi->oxcf);
 
@@ -2496,8 +2913,9 @@
   rc->worst_quality = cpi->oxcf.worst_allowed_q;
   rc->best_quality = cpi->oxcf.best_allowed_q;
 
-  cm->interp_filter = oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
-  cm->switchable_motion_mode = 1;
+  cm->features.interp_filter =
+      oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+  cm->features.switchable_motion_mode = 1;
 
   if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
     cm->render_width = cpi->oxcf.render_width;
@@ -2521,7 +2939,7 @@
     if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
         seq_params->sb_size != sb_size) {
       av1_free_context_buffers(cm);
-      av1_free_pc_tree(&cpi->td, num_planes);
+      av1_free_pc_tree(cpi, &cpi->td, num_planes, (BLOCK_SIZE)sb_size);
       alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
@@ -2529,43 +2947,93 @@
   }
   update_frame_size(cpi);
 
-  cpi->alt_ref_source = NULL;
   rc->is_src_frame_alt_ref = 0;
 
   set_tile_info(cpi);
 
-  cpi->ext_refresh_frame_flags_pending = 0;
-  cpi->ext_refresh_frame_context_pending = 0;
+  if (!cpi->svc.external_ref_frame_config)
+    cpi->ext_flags.refresh_frame_flags_pending = 0;
+  cpi->ext_flags.refresh_frame_context_pending = 0;
 
+#if CONFIG_AV1_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
+#endif
 
   // Init sequence level coding tools
   // This should not be called after the first key frame.
   if (!cpi->seq_params_locked) {
     seq_params->operating_points_cnt_minus_1 =
-        cm->number_spatial_layers > 1 ? cm->number_spatial_layers - 1 : 0;
-    init_seq_coding_tools(&cm->seq_params, cm, oxcf);
+        (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1)
+            ? cm->number_spatial_layers * cm->number_temporal_layers - 1
+            : 0;
+    init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc);
   }
+
+  if (cpi->use_svc)
+    av1_update_layer_context_change_config(cpi, oxcf->target_bandwidth);
 }
 
-static void init_level_info(AV1LevelInfo *level_info) {
-  memset(level_info, 0, MAX_NUM_OPERATING_POINTS * sizeof(*level_info));
-  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    AV1LevelSpec *const level_spec = &level_info[i].level_spec;
-    level_spec->level = SEQ_LEVEL_MAX;
-    AV1LevelStats *const level_stats = &level_info[i].level_stats;
-    level_stats->min_cropped_tile_width = INT_MAX;
-    level_stats->min_cropped_tile_height = INT_MAX;
-    level_stats->min_frame_width = INT_MAX;
-    level_stats->min_frame_height = INT_MAX;
-    level_stats->tile_width_is_valid = 1;
-    level_stats->min_cr = 1e8;
+static INLINE void setup_tpl_buffers(AV1_COMMON *const cm,
+                                     TplParams *const tpl_data) {
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  set_tpl_stats_block_size(cm->width, cm->height,
+                           &tpl_data->tpl_stats_block_mis_log2);
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+  for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
+    const int mi_cols =
+        ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
+    const int mi_rows =
+        ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+
+    tpl_data->tpl_stats_buffer[frame].is_valid = 0;
+    tpl_data->tpl_stats_buffer[frame].width = mi_cols >> block_mis_log2;
+    tpl_data->tpl_stats_buffer[frame].height = mi_rows >> block_mis_log2;
+    tpl_data->tpl_stats_buffer[frame].stride =
+        tpl_data->tpl_stats_buffer[frame].width;
+    tpl_data->tpl_stats_buffer[frame].mi_rows = mi_params->mi_rows;
+    tpl_data->tpl_stats_buffer[frame].mi_cols = mi_params->mi_cols;
   }
+
+  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+    CHECK_MEM_ERROR(
+        cm, tpl_data->tpl_stats_pool[frame],
+        aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
+                       tpl_data->tpl_stats_buffer[frame].height,
+                   sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
+    if (aom_alloc_frame_buffer(
+            &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
+            cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+            cm->seq_params.use_highbitdepth, AOM_ENC_NO_SCALE_BORDER,
+            cm->features.byte_alignment))
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate frame buffer");
+  }
+
+  tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
 }
 
-AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
-                                BufferPool *const pool) {
-  unsigned int i;
+static INLINE void init_frame_info(FRAME_INFO *frame_info,
+                                   const AV1_COMMON *const cm) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  frame_info->frame_width = cm->width;
+  frame_info->frame_height = cm->height;
+  frame_info->mi_cols = mi_params->mi_cols;
+  frame_info->mi_rows = mi_params->mi_rows;
+  frame_info->mb_cols = mi_params->mb_cols;
+  frame_info->mb_rows = mi_params->mb_rows;
+  frame_info->num_mbs = mi_params->MBs;
+  frame_info->bit_depth = seq_params->bit_depth;
+  frame_info->subsampling_x = seq_params->subsampling_x;
+  frame_info->subsampling_y = seq_params->subsampling_y;
+}
+
+AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
+                                FIRSTPASS_STATS *frame_stats_buf,
+                                COMPRESSOR_STAGE stage, int num_lap_buffers,
+                                int lap_lag_in_frames,
+                                STATS_BUFFER_CTX *stats_buf_context) {
   AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
   AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
 
@@ -2583,9 +3051,17 @@
   }
 
   cm->error.setjmp = 1;
-  cm->alloc_mi = enc_alloc_mi;
-  cm->free_mi = enc_free_mi;
-  cm->setup_mi = enc_setup_mi;
+  cpi->lap_enabled = num_lap_buffers > 0;
+  cpi->compressor_stage = stage;
+
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  mi_params->free_mi = enc_free_mi;
+  mi_params->setup_mi = enc_setup_mi;
+  mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE)
+                             ? stat_stage_set_mb_mi
+                             : enc_set_mb_mi;
+
+  mi_params->mi_alloc_bsize = BLOCK_4X4;
 
   CHECK_MEM_ERROR(cm, cm->fc,
                   (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
@@ -2595,15 +3071,21 @@
   memset(cm->fc, 0, sizeof(*cm->fc));
   memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
 
-  cpi->resize_state = 0;
-  cpi->resize_avg_qp = 0;
-  cpi->resize_buffer_underflow = 0;
-
   cpi->common.buffer_pool = pool;
 
   init_config(cpi, oxcf);
+  if (cpi->compressor_stage == LAP_STAGE) {
+    cpi->oxcf.lag_in_frames = lap_lag_in_frames;
+  }
+
   av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
+  cpi->rc.enable_scenecut_detection = 1;
+  if (cpi->lap_enabled &&
+      (num_lap_buffers < (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)))
+    cpi->rc.enable_scenecut_detection = 0;
+  init_frame_info(&cpi->frame_info, cm);
+
   cm->current_frame.frame_number = 0;
   cm->current_frame_id = -1;
   cpi->seq_params_locked = 0;
@@ -2612,20 +3094,8 @@
   cpi->last_show_frame_buf = NULL;
   realloc_segmentation_maps(cpi);
 
-  memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
-  memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
-
-  for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
-       i++) {
-    CHECK_MEM_ERROR(
-        cm, cpi->mbgraph_stats[i].mb_stats,
-        aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
-  }
-
   cpi->refresh_alt_ref_frame = 0;
 
-  init_level_info(cpi->level_info);
-
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_blockiness = 1;
@@ -2657,9 +3127,10 @@
   }
 
   if (cpi->b_calculate_consistency) {
-    CHECK_MEM_ERROR(cm, cpi->ssim_vars,
-                    aom_malloc(sizeof(*cpi->ssim_vars) * 4 *
-                               cpi->common.mi_rows * cpi->common.mi_cols));
+    CHECK_MEM_ERROR(
+        cm, cpi->ssim_vars,
+        aom_malloc(sizeof(*cpi->ssim_vars) * 4 * cpi->common.mi_params.mi_rows *
+                   cpi->common.mi_params.mi_cols));
     cpi->worst_consistency = 100.0;
   }
 #endif
@@ -2667,12 +3138,7 @@
   av1_zero(aggregate_fc);
 #endif  // CONFIG_ENTROPY_STATS
 
-  cpi->first_time_stamp_ever = INT64_MAX;
-
-  cpi->td.mb.nmvcost[0] = &cpi->nmv_costs[0][MV_MAX];
-  cpi->td.mb.nmvcost[1] = &cpi->nmv_costs[1][MV_MAX];
-  cpi->td.mb.nmvcost_hp[0] = &cpi->nmv_costs_hp[0][MV_MAX];
-  cpi->td.mb.nmvcost_hp[1] = &cpi->nmv_costs_hp[1][MV_MAX];
+  cpi->time_stamps.first_ever = INT64_MAX;
 
 #ifdef OUTPUT_YUV_SKINMAP
   yuv_skinmap_file = fopen("skinmap.yuv", "ab");
@@ -2681,18 +3147,35 @@
   yuv_rec_file = fopen("rec.yuv", "wb");
 #endif
 
-  if (oxcf->pass == 1) {
-    av1_init_first_pass(cpi);
-  } else if (oxcf->pass == 2) {
+  assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
+  int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
+  for (int i = 0; i < size; i++)
+    cpi->twopass.frame_stats_arr[i] = &frame_stats_buf[i];
+
+  cpi->twopass.stats_buf_ctx = stats_buf_context;
+  cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
+
+#if !CONFIG_REALTIME_ONLY
+  if (is_stat_consumption_stage(cpi)) {
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
 
-    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
-    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
-    cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+    if (!cpi->lap_enabled) {
+      /*Re-initialize to stats buffer, populated by application in the case of
+       * two pass*/
+      cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->two_pass_stats_in.buf;
+      cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
+      cpi->twopass.stats_buf_ctx->stats_in_end =
+          &cpi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
 
-    av1_init_second_pass(cpi);
+      av1_init_second_pass(cpi);
+    } else {
+      av1_init_single_pass_lap(cpi);
+    }
   }
+#endif
+
+  int sb_mi_size = av1_get_sb_mi_size(cm);
 
   CHECK_MEM_ERROR(
       cm, cpi->td.mb.above_pred_buf,
@@ -2714,32 +3197,64 @@
   for (int x = 0; x < 2; x++)
     for (int y = 0; y < 2; y++)
       CHECK_MEM_ERROR(
-          cm, cpi->td.mb.hash_value_buffer[x][y],
-          (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
-                                 sizeof(*cpi->td.mb.hash_value_buffer[0][0])));
+          cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+          (uint32_t *)aom_malloc(
+              AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+              sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
 
-  cpi->td.mb.g_crc_initialized = 0;
+  cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
 
   CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
                   (int32_t *)aom_memalign(
                       16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
 
+  CHECK_MEM_ERROR(cm, cpi->td.mb.mbmi_ext,
+                  aom_calloc(sb_mi_size, sizeof(*cpi->td.mb.mbmi_ext)));
+
   av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
   av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
 
-  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
-    int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-    int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+  {
+    const int bsize = BLOCK_16X16;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->tpl_rdmult_scaling_factors)));
+    CHECK_MEM_ERROR(cm, cpi->tpl_sb_rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->tpl_sb_rdmult_scaling_factors)));
+  }
 
-    CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,
-                    aom_calloc(mi_rows * mi_cols,
-                               sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
-    cpi->tpl_stats[frame].is_valid = 0;
-    cpi->tpl_stats[frame].width = mi_cols;
-    cpi->tpl_stats[frame].height = mi_rows;
-    cpi->tpl_stats[frame].stride = mi_cols;
-    cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
-    cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
+  {
+    const int bsize = BLOCK_16X16;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->ssim_rdmult_scaling_factors)));
+  }
+
+#if CONFIG_TUNE_VMAF
+  {
+    const int bsize = BLOCK_64X64;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(cm, cpi->vmaf_rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->vmaf_rdmult_scaling_factors)));
+    cpi->last_frame_unsharp_amount = 0.0;
+  }
+#endif
+
+  if (!is_stat_generation_stage(cpi)) {
+    setup_tpl_buffers(cm, &cpi->tpl_data);
   }
 
 #if CONFIG_COLLECT_PARTITION_STATS == 2
@@ -2946,15 +3461,18 @@
 
   MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
 
+#if CONFIG_AV1_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
+#endif
 
   /* av1_init_quantizer() is first called here. Add check in
    * av1_frame_init_quantizer() so that av1_init_quantizer is only
    * called later when needed. This will avoid unnecessary calls of
    * av1_init_quantizer() for every frame.
    */
-  av1_init_quantizer(cpi);
-  av1_qm_init(cm);
+  av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                     cm->seq_params.bit_depth);
+  av1_qm_init(&cm->quant_params, av1_num_planes(cm));
 
   av1_loop_filter_init(cm);
   cm->superres_scale_denominator = SCALE_NUMERATOR;
@@ -2976,7 +3494,7 @@
 
 void av1_remove_compressor(AV1_COMP *cpi) {
   AV1_COMMON *cm;
-  unsigned int i;
+  TplParams *const tpl_data = &cpi->tpl_data;
   int t;
 
   if (!cpi) return;
@@ -2986,7 +3504,7 @@
 
   if (cm->current_frame.frame_number > 0) {
 #if CONFIG_ENTROPY_STATS
-    if (cpi->oxcf.pass != 1) {
+    if (!is_stat_generation_stage(cpi)) {
       fprintf(stderr, "Writing counts.stt\n");
       FILE *f = fopen("counts.stt", "wb");
       fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
@@ -2996,12 +3514,12 @@
 #if CONFIG_INTERNAL_STATS
     aom_clear_system_state();
 
-    if (cpi->oxcf.pass != 1) {
+    if (!is_stat_generation_stage(cpi)) {
       char headings[512] = { 0 };
       char results[512] = { 0 };
       FILE *f = fopen("opsnr.stt", "a");
       double time_encoded =
-          (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+          (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
           10000000.000;
       double total_encode_time =
           (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
@@ -3050,30 +3568,38 @@
           SNPRINT2(results, "\t%7.3f", consistency);
           SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
         }
-        fprintf(f, "%s\t    Time\tRcErr\tAbsErr\n", headings);
-        fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time,
-                rate_err, fabs(rate_err));
+
+        SNPRINT(headings, "\t    Time\tRcErr\tAbsErr");
+        SNPRINT2(results, "\t%8.0f", total_encode_time);
+        SNPRINT2(results, "\t%7.2f", rate_err);
+        SNPRINT2(results, "\t%7.2f", fabs(rate_err));
+
+        fprintf(f, "%s\tAPsnr611\n", headings);
+        fprintf(f, "%s\t%7.3f\n", results,
+                (6 * cpi->psnr.stat[STAT_Y] + cpi->psnr.stat[STAT_U] +
+                 cpi->psnr.stat[STAT_V]) /
+                    (cpi->count * 8));
       }
 
       fclose(f);
     }
 #endif  // CONFIG_INTERNAL_STATS
 #if CONFIG_SPEED_STATS
-    if (cpi->oxcf.pass != 1) {
+    if (!is_stat_generation_stage(cpi)) {
       fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
     }
 #endif  // CONFIG_SPEED_STATS
 
 #if CONFIG_COLLECT_PARTITION_STATS == 2
-    if (cpi->oxcf.pass != 1) {
+    if (!is_stat_generation_stage(cpi)) {
       av1_print_partition_stats(&cpi->partition_stats);
     }
 #endif
   }
 
   for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
-    aom_free(cpi->tpl_stats[frame].tpl_stats_ptr);
-    cpi->tpl_stats[frame].is_valid = 0;
+    aom_free(tpl_data->tpl_stats_pool[frame]);
+    aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
   }
 
   for (t = cpi->num_workers - 1; t >= 0; --t) {
@@ -3084,16 +3610,18 @@
     aom_get_worker_interface()->end(worker);
 
     // Deallocate allocated thread data.
-    if (cpi->row_mt == 1) aom_free(thread_data->td->tctx);
+    aom_free(thread_data->td->tctx);
     if (t > 0) {
       aom_free(thread_data->td->palette_buffer);
       aom_free(thread_data->td->tmp_conv_dst);
+      av1_release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
       for (int j = 0; j < 2; ++j) {
         aom_free(thread_data->td->tmp_obmc_bufs[j]);
       }
       aom_free(thread_data->td->above_pred_buf);
       aom_free(thread_data->td->left_pred_buf);
       aom_free(thread_data->td->wsrc_buf);
+      aom_free(thread_data->td->vt64x64);
 
       aom_free(thread_data->td->inter_modes_info);
       for (int x = 0; x < 2; x++) {
@@ -3104,16 +3632,16 @@
       }
       aom_free(thread_data->td->mask_buf);
       aom_free(thread_data->td->counts);
-      av1_free_pc_tree(thread_data->td, num_planes);
+      av1_free_pc_tree(cpi, thread_data->td, num_planes,
+                       cm->seq_params.sb_size);
+      aom_free(thread_data->td->mbmi_ext);
       aom_free(thread_data->td);
     }
   }
 #if CONFIG_MULTITHREAD
-  if (cpi->row_mt == 1) {
-    if (cpi->row_mt_mutex_ != NULL) {
-      pthread_mutex_destroy(cpi->row_mt_mutex_);
-      aom_free(cpi->row_mt_mutex_);
-    }
+  if (cpi->row_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(cpi->row_mt_mutex_);
+    aom_free(cpi->row_mt_mutex_);
   }
 #endif
   av1_row_mt_mem_dealloc(cpi);
@@ -3127,22 +3655,17 @@
 
   dealloc_compressor_data(cpi);
 
-  for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]);
-       ++i) {
-    aom_free(cpi->mbgraph_stats[i].mb_stats);
-  }
-
 #if CONFIG_INTERNAL_STATS
   aom_free(cpi->ssim_vars);
   cpi->ssim_vars = NULL;
 #endif  // CONFIG_INTERNAL_STATS
 
   av1_remove_common(cm);
-  for (i = 0; i < FRAME_BUFFERS; ++i) {
-    av1_hash_table_destroy(&cm->buffer_pool->frame_bufs[i].hash_table);
-  }
+#if CONFIG_HTB_TRELLIS
   if (cpi->sf.use_hash_based_trellis) hbt_destroy();
+#endif  // CONFIG_HTB_TRELLIS
   av1_free_ref_frame_buffers(cm->buffer_pool);
+
   aom_free(cpi);
 
 #ifdef OUTPUT_YUV_SKINMAP
@@ -3157,8 +3680,14 @@
   struct aom_codec_cx_pkt pkt;
   int i;
   PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
   aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
-                       cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+                       bit_depth, in_bit_depth);
+#else
+  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+#endif
 
   for (i = 0; i < 4; ++i) {
     pkt.data.psnr.samples[i] = psnr.samples[i];
@@ -3169,10 +3698,10 @@
   aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
 
-int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) {
   if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
 
-  cpi->ext_ref_frame_flags = ref_frame_flags;
+  *ext_ref_frame_flags = ref_frame_flags;
   return 0;
 }
 
@@ -3200,9 +3729,10 @@
   }
 }
 
-int av1_update_entropy(AV1_COMP *cpi, int update) {
-  cpi->ext_refresh_frame_context = update;
-  cpi->ext_refresh_frame_context_pending = 1;
+int av1_update_entropy(bool *ext_refresh_frame_context,
+                       bool *ext_refresh_frame_context_pending, bool update) {
+  *ext_refresh_frame_context = update;
+  *ext_refresh_frame_context_pending = 1;
   return 0;
 }
 
@@ -3297,18 +3827,18 @@
 #endif  // OUTPUT_YUV_REC
 
 #define GM_RECODE_LOOP_NUM4X4_FACTOR 192
-static int recode_loop_test_global_motion(AV1_COMP *cpi) {
+static int recode_loop_test_global_motion(
+    WarpedMotionParams *const global_motion,
+    const int *const global_motion_used, int *const gm_params_cost) {
   int i;
   int recode = 0;
-  RD_COUNTS *const rdc = &cpi->td.rd_counts;
-  AV1_COMMON *const cm = &cpi->common;
   for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-    if (cm->global_motion[i].wmtype != IDENTITY &&
-        rdc->global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
-            cpi->gmparams_cost[i]) {
-      cm->global_motion[i] = default_warp_params;
-      assert(cm->global_motion[i].wmtype == IDENTITY);
-      cpi->gmparams_cost[i] = 0;
+    if (global_motion[i].wmtype != IDENTITY &&
+        global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
+            gm_params_cost[i]) {
+      global_motion[i] = default_warp_params;
+      assert(global_motion[i].wmtype == IDENTITY);
+      gm_params_cost[i] = 0;
       recode = 1;
       // TODO(sarahparker): The earlier condition for recoding here was:
       // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something
@@ -3328,8 +3858,9 @@
   int force_recode = 0;
 
   if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
-      (cpi->sf.recode_loop == ALLOW_RECODE) ||
-      (frame_is_kfgfarf && (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+      (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) ||
+      (frame_is_kfgfarf &&
+       (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
     // TODO(agrange) high_limit could be greater than the scale-down threshold.
     if ((rc->projected_frame_size > high_limit && q < maxq) ||
         (rc->projected_frame_size < low_limit && q > minq)) {
@@ -3350,14 +3881,10 @@
   AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MV_REFERENCE_FRAME ref_frame;
-  const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = {
-    AOM_LAST_FLAG, AOM_LAST2_FLAG, AOM_LAST3_FLAG, AOM_GOLD_FLAG,
-    AOM_BWD_FLAG,  AOM_ALT2_FLAG,  AOM_ALT_FLAG
-  };
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
-    if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
       BufferPool *const pool = cm->buffer_pool;
       const YV12_BUFFER_CONFIG *const ref =
           get_ref_frame_yv12_buf(cm, ref_frame);
@@ -3376,8 +3903,8 @@
             ref->border < AOM_BORDER_IN_PIXELS) {
           RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
           if (aom_yv12_realloc_with_new_border(
-                  &ref_fb->buf, AOM_BORDER_IN_PIXELS, cm->byte_alignment,
-                  num_planes) != 0) {
+                  &ref_fb->buf, AOM_BORDER_IN_PIXELS,
+                  cm->features.byte_alignment, num_planes) != 0) {
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           }
@@ -3400,7 +3927,7 @@
                   &new_fb->buf, cm->width, cm->height,
                   cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
                   cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                  cm->byte_alignment, NULL, NULL, NULL)) {
+                  cm->features.byte_alignment, NULL, NULL, NULL)) {
             if (force_scaling) {
               // Release the reference acquired in the get_free_fb() call above.
               --new_fb->ref_count;
@@ -3421,7 +3948,7 @@
         ++buf->ref_count;
       }
     } else {
-      if (cpi->oxcf.pass != 0) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
     }
   }
 }
@@ -3439,51 +3966,54 @@
 
 static void set_mv_search_params(AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
-  const unsigned int max_mv_def = AOMMIN(cm->width, cm->height);
+  MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
+  const int max_mv_def = AOMMAX(cm->width, cm->height);
 
   // Default based on max resolution.
-  cpi->mv_step_param = av1_init_search_range(max_mv_def);
+  mv_search_params->mv_step_param = av1_init_search_range(max_mv_def);
 
-  if (cpi->sf.mv.auto_mv_step_size) {
+  if (cpi->sf.mv_sf.auto_mv_step_size) {
     if (frame_is_intra_only(cm)) {
       // Initialize max_mv_magnitude for use in the first INTER frame
       // after a key/intra-only frame.
-      cpi->max_mv_magnitude = max_mv_def;
+      mv_search_params->max_mv_magnitude = max_mv_def;
     } else {
-      if (cm->show_frame) {
+      // Use cpi->max_mv_magnitude == -1 to exclude first pass case.
+      if (cm->show_frame && mv_search_params->max_mv_magnitude != -1) {
         // Allow mv_steps to correspond to twice the max mv magnitude found
         // in the previous frame, capped by the default max_mv_magnitude based
         // on resolution.
-        cpi->mv_step_param = av1_init_search_range(
-            AOMMIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+        mv_search_params->mv_step_param = av1_init_search_range(
+            AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude));
       }
-      cpi->max_mv_magnitude = 0;
+      mv_search_params->max_mv_magnitude = -1;
     }
   }
 }
 
-static void set_screen_content_options(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
+void av1_set_screen_content_options(const AV1_COMP *cpi,
+                                    FeatureFlags *features) {
+  const AV1_COMMON *const cm = &cpi->common;
 
   if (cm->seq_params.force_screen_content_tools != 2) {
-    cm->allow_screen_content_tools = cm->allow_intrabc =
+    features->allow_screen_content_tools = features->allow_intrabc =
         cm->seq_params.force_screen_content_tools;
     return;
   }
 
   if (cpi->oxcf.content == AOM_CONTENT_SCREEN) {
-    cm->allow_screen_content_tools = cm->allow_intrabc = 1;
+    features->allow_screen_content_tools = features->allow_intrabc = 1;
     return;
   }
 
   // Estimate if the source frame is screen content, based on the portion of
   // blocks that have few luma colors.
-  const uint8_t *src = cpi->source->y_buffer;
+  const uint8_t *src = cpi->unfiltered_source->y_buffer;
   assert(src != NULL);
-  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
-  const int stride = cpi->source->y_stride;
-  const int width = cpi->source->y_width;
-  const int height = cpi->source->y_height;
+  const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int stride = cpi->unfiltered_source->y_stride;
+  const int width = cpi->unfiltered_source->y_width;
+  const int height = cpi->unfiltered_source->y_height;
   const int bd = cm->seq_params.bit_depth;
   const int blk_w = 16;
   const int blk_h = 16;
@@ -3519,70 +4049,225 @@
   }
 
   // The threshold values are selected experimentally.
-  cm->allow_screen_content_tools =
+  features->allow_screen_content_tools =
       counts_1 * blk_h * blk_w * 10 > width * height;
   // IntraBC would force loop filters off, so we use more strict rules that also
   // requires that the block has high variance.
-  cm->allow_intrabc = cm->allow_screen_content_tools &&
-                      counts_2 * blk_h * blk_w * 15 > width * height;
+  features->allow_intrabc = features->allow_screen_content_tools &&
+                            counts_2 * blk_h * blk_w * 12 > width * height;
 }
 
 static void set_size_independent_vars(AV1_COMP *cpi) {
   int i;
-  AV1_COMMON *cm = &cpi->common;
+  AV1_COMMON *const cm = &cpi->common;
   for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
     cm->global_motion[i] = default_warp_params;
   }
-  cpi->global_motion_search_done = 0;
-
-  if (frame_is_intra_only(cm)) set_screen_content_options(cpi);
-  cpi->is_screen_content_type = (cm->allow_screen_content_tools != 0);
+  cpi->gm_info.search_done = 0;
 
   av1_set_speed_features_framesize_independent(cpi, cpi->speed);
   av1_set_rd_speed_thresholds(cpi);
-  cm->interp_filter = SWITCHABLE;
-  cm->switchable_motion_mode = 1;
+  cm->features.interp_filter = SWITCHABLE;
+  cm->features.switchable_motion_mode = 1;
 }
 
+#if !CONFIG_REALTIME_ONLY
+double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor,
+                                           int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor = (200.0 + 10.0 * factor);
+  return factor;
+}
+
+static int get_gfu_boost_from_r0_lap(double min_factor, double max_factor,
+                                     double r0, int frames_to_key) {
+  double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
+                                                      frames_to_key);
+  const int boost = (int)rint(factor / r0);
+  return boost;
+}
+
+double av1_get_kf_boost_projection_factor(int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, 10.0);
+  factor = AOMMAX(factor, 4.0);
+  factor = (75.0 + 14.0 * factor);
+  return factor;
+}
+
+static int get_kf_boost_from_r0(double r0, int frames_to_key) {
+  double factor = av1_get_kf_boost_projection_factor(frames_to_key);
+  const int boost = (int)rint(factor / r0);
+  return boost;
+}
+#endif
+
+#define MIN_BOOST_COMBINE_FACTOR 4.0
+#define MAX_BOOST_COMBINE_FACTOR 12.0
+int combine_prior_with_tpl_boost(double min_factor, double max_factor,
+                                 int prior_boost, int tpl_boost,
+                                 int frames_to_key) {
+  double factor = sqrt((double)frames_to_key);
+  double range = max_factor - min_factor;
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor -= min_factor;
+  int boost =
+      (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
+  return boost;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void process_tpl_stats_frame(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+
+  assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size));
+
+  const int tpl_idx = gf_group->index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+  if (tpl_frame->is_valid) {
+    int tpl_stride = tpl_frame->stride;
+    int64_t intra_cost_base = 0;
+    int64_t mc_dep_cost_base = 0;
+    int64_t mc_saved_base = 0;
+    int64_t mc_count_base = 0;
+    const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+    const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+    for (int row = 0; row < cm->mi_params.mi_rows; row += step) {
+      for (int col = 0; col < mi_cols_sr; col += step) {
+        TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+            row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+        int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS);
+        mc_dep_cost_base +=
+            (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+        mc_count_base += this_stats->mc_count;
+        mc_saved_base += this_stats->mc_saved;
+      }
+    }
+
+    if (mc_dep_cost_base == 0) {
+      tpl_frame->is_valid = 0;
+    } else {
+      aom_clear_system_state();
+      cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
+      if (is_frame_arf_and_tpl_eligible(gf_group)) {
+        cpi->rd.arf_r0 = cpi->rd.r0;
+        if (cpi->lap_enabled) {
+          double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval);
+          const int gfu_boost = get_gfu_boost_from_r0_lap(
+              min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.arf_r0,
+              cpi->rc.num_stats_required_for_gfu_boost);
+          // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
+          //        gfu_boost);
+          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
+              min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost,
+              gfu_boost, cpi->rc.num_stats_used_for_gfu_boost);
+        } else {
+          const int gfu_boost = (int)(200.0 / cpi->rd.r0);
+          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
+              MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+              cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
+        }
+      } else if (frame_is_intra_only(cm)) {
+        // TODO(debargha): Turn off q adjustment for kf temporarily to
+        // reduce impact on speed of encoding. Need to investigate how
+        // to mitigate the issue.
+        if (cpi->oxcf.rc_mode == AOM_Q) {
+          const int kf_boost =
+              get_kf_boost_from_r0(cpi->rd.r0, cpi->rc.frames_to_key);
+          if (cpi->lap_enabled) {
+            cpi->rc.kf_boost = combine_prior_with_tpl_boost(
+                MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+                cpi->rc.kf_boost, kf_boost,
+                cpi->rc.num_stats_used_for_kf_boost);
+          } else {
+            cpi->rc.kf_boost = combine_prior_with_tpl_boost(
+                MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+                cpi->rc.kf_boost, kf_boost, cpi->rc.frames_to_key);
+          }
+        }
+      }
+      cpi->rd.mc_count_base = (double)mc_count_base /
+                              (cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+      cpi->rd.mc_saved_base = (double)mc_saved_base /
+                              (cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+      aom_clear_system_state();
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
 static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
                                     int *top_index) {
   AV1_COMMON *const cm = &cpi->common;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
   // Setup variables that depend on the dimensions of the frame.
   av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
 
-  // Decide q and q bounds.
-  *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index,
-                                top_index);
-
-  if (!frame_is_intra_only(cm)) {
-    set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH,
-                          cpi->common.cur_frame_force_integer_mv);
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->oxcf.enable_tpl_model && is_frame_tpl_eligible(cpi)) {
+    process_tpl_stats_frame(cpi);
+    av1_tpl_rdmult_setup(cpi);
   }
+#endif
+
+  // Decide q and q bounds.
+  *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height,
+                                cpi->gf_group.index, bottom_index, top_index);
 
   // Configure experimental use of segmentation for enhanced coding of
   // static regions if indicated.
   // Only allowed in the second pass of a two pass encode, as it requires
   // lagged coding, and if the relevant speed feature flag is set.
-  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+  if (is_stat_consumption_stage_twopass(cpi) &&
+      cpi->sf.hl_sf.static_segmentation)
     configure_static_seg_features(cpi);
 }
 
 static void init_motion_estimation(AV1_COMP *cpi) {
-  int y_stride = cpi->scaled_source.y_stride;
-  int y_stride_src = (cpi->oxcf.resize_mode || cpi->oxcf.superres_mode)
-                         ? y_stride
-                         : cpi->lookahead->buf->img.y_stride;
+  AV1_COMMON *const cm = &cpi->common;
+  MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
+  const int y_stride = cpi->scaled_source.y_stride;
+  const int y_stride_src =
+      ((cpi->oxcf.width != cm->width || cpi->oxcf.height != cm->height) ||
+       av1_superres_scaled(cm))
+          ? y_stride
+          : cpi->lookahead->buf->img.y_stride;
+  int fpf_y_stride = cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride
+                                           : cpi->scaled_source.y_stride;
 
-  if (cpi->sf.mv.search_method == NSTEP) {
-    av1_init3smotion_compensation(&cpi->ss_cfg[SS_CFG_SRC], y_stride);
-    av1_init3smotion_compensation(&cpi->ss_cfg[SS_CFG_LOOKAHEAD], y_stride_src);
-  } else if (cpi->sf.mv.search_method == DIAMOND) {
-    av1_init_dsmotion_compensation(&cpi->ss_cfg[SS_CFG_SRC], y_stride);
-    av1_init_dsmotion_compensation(&cpi->ss_cfg[SS_CFG_LOOKAHEAD],
-                                   y_stride_src);
+  // Update if ss_cfg is uninitialized or the current frame has a new stride
+  const int should_update =
+      !mv_search_params->ss_cfg[SS_CFG_SRC].stride ||
+      !mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD].stride ||
+      (y_stride != mv_search_params->ss_cfg[SS_CFG_SRC].stride);
+
+  if (!should_update) {
+    return;
   }
+
+  if (cpi->sf.mv_sf.search_method == DIAMOND) {
+    av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC],
+                                   y_stride);
+    av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD],
+                                   y_stride_src);
+  } else {
+    av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC],
+                                  y_stride);
+    av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD],
+                                  y_stride_src);
+  }
+  av1_init_motion_fpf(&mv_search_params->ss_cfg[SS_CFG_FPF], fpf_y_stride);
 }
 
 #define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
@@ -3617,15 +4302,10 @@
   for (i = 0; i < FRAME_BUFFERS; ++i) {
     pool->frame_bufs[i].ref_count = 0;
   }
-  if (cm->seq_params.force_screen_content_tools) {
-    for (i = 0; i < FRAME_BUFFERS; ++i) {
-      av1_hash_table_init(&pool->frame_bufs[i].hash_table, &cpi->td.mb);
-    }
-  }
 }
 
-static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
-                                int subsampling_x, int subsampling_y) {
+void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+                             int subsampling_x, int subsampling_y) {
   AV1_COMMON *const cm = &cpi->common;
   SequenceHeader *const seq_params = &cm->seq_params;
 
@@ -3636,25 +4316,30 @@
     seq_params->subsampling_y = subsampling_y;
     seq_params->use_highbitdepth = use_highbitdepth;
 
-    alloc_raw_frame_buffers(cpi);
+    av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+    av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed);
+
+    if (!is_stat_generation_stage(cpi)) {
+      alloc_altref_frame_buffer(cpi);
+      alloc_util_frame_buffers(cpi);
+    }
     init_ref_frame_bufs(cpi);
-    alloc_util_frame_buffers(cpi);
 
     init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
 
     cpi->initial_width = cm->width;
     cpi->initial_height = cm->height;
-    cpi->initial_mbs = cm->MBs;
+    cpi->initial_mbs = cm->mi_params.MBs;
   }
 }
 
 // Returns 1 if the assigned width or height was <= 0.
-static int set_size_literal(AV1_COMP *cpi, int width, int height) {
+int av1_set_size_literal(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  check_initial_width(cpi, cm->seq_params.use_highbitdepth,
-                      cm->seq_params.subsampling_x,
-                      cm->seq_params.subsampling_y);
+  av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth,
+                          cm->seq_params.subsampling_x,
+                          cm->seq_params.subsampling_y);
 
   if (width <= 0 || height <= 0) return 1;
 
@@ -3664,7 +4349,7 @@
   if (cpi->initial_width && cpi->initial_height &&
       (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
     av1_free_context_buffers(cm);
-    av1_free_pc_tree(&cpi->td, num_planes);
+    av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
     alloc_compressor_data(cpi);
     realloc_segmentation_maps(cpi);
     cpi->initial_width = cpi->initial_height = 0;
@@ -3683,24 +4368,28 @@
 
   if (width != cm->width || height != cm->height) {
     // There has been a change in the encoded frame size
-    set_size_literal(cpi, width, height);
-    set_mv_search_params(cpi);
+    av1_set_size_literal(cpi, width, height);
     // Recalculate 'all_lossless' in case super-resolution was (un)selected.
-    cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+    cm->features.all_lossless =
+        cm->features.coded_lossless && !av1_superres_scaled(cm);
   }
+  set_mv_search_params(cpi);
 
-  if (cpi->oxcf.pass == 2) {
+  if (is_stat_consumption_stage(cpi)) {
     av1_set_target_rate(cpi, cm->width, cm->height);
   }
 
   alloc_frame_mvs(cm, cm->cur_frame);
 
   // Allocate above context buffers
-  if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
-      cm->num_allocated_above_context_mi_col < cm->mi_cols ||
-      cm->num_allocated_above_contexts < cm->tile_rows) {
-    av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
-    if (av1_alloc_above_context_buffers(cm, cm->tile_rows))
+  CommonContexts *const above_contexts = &cm->above_contexts;
+  if (above_contexts->num_planes < av1_num_planes(cm) ||
+      above_contexts->num_mi_cols < cm->mi_params.mi_cols ||
+      above_contexts->num_tile_rows < cm->tiles.rows) {
+    av1_free_above_context_buffers(above_contexts);
+    if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
+                                        cm->mi_params.mi_cols,
+                                        av1_num_planes(cm)))
       aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate context buffers");
   }
@@ -3709,7 +4398,8 @@
   if (aom_realloc_frame_buffer(
           &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
+          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+          NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
 
@@ -3722,7 +4412,7 @@
     cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
 
   av1_alloc_restoration_buffers(cm);
-  alloc_util_frame_buffers(cpi);
+  if (!is_stat_generation_stage(cpi)) alloc_util_frame_buffers(cpi);
   init_motion_estimation(cpi);
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
@@ -3746,7 +4436,7 @@
   // Choose an arbitrary random number
   static unsigned int seed = 56789;
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (oxcf->pass == 1) return SCALE_NUMERATOR;
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
   uint8_t new_denom = SCALE_NUMERATOR;
 
   if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
@@ -3764,8 +4454,37 @@
   return new_denom;
 }
 
-#define ENERGY_BY_Q2_THRESH 0.01
-#define ENERGY_BY_AC_THRESH 0.2
+#if CONFIG_SUPERRES_IN_RECODE
+static int superres_in_recode_allowed(const AV1_COMP *const cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  // Empirically found to not be beneficial for AOM_Q mode and images coding.
+  return oxcf->superres_mode == SUPERRES_AUTO &&
+         (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) &&
+         cpi->rc.frames_to_key > 1;
+}
+#endif  // CONFIG_SUPERRES_IN_RECODE
+
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
+#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
+#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
+
+static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
+                                      const RATE_CONTROL *rc) {
+  // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
+  // level.
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+    return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME;
+  } else if (gf_group->update_type[gf_group->index] == KF_UPDATE) {
+    if (rc->frames_to_key <= 1)
+      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO;
+    else
+      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME;
+  } else {
+    assert(0);
+  }
+  return 0;
+}
 
 static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
                                                      double threshq,
@@ -3775,29 +4494,64 @@
   const double tp = threshp * energy[1];
   const double thresh = AOMMIN(tq, tp);
   int k;
-  for (k = 16; k > 8; --k) {
+  for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) {
     if (energy[k - 1] > thresh) break;
   }
   return 3 * SCALE_NUMERATOR - k;
 }
 
-static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex) {
+static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
+                                             int sr_kf, int sr_arf) {
+  // Use superres for Key-frames and Alt-ref frames only.
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  if (gf_group->update_type[gf_group->index] != KF_UPDATE &&
+      gf_group->update_type[gf_group->index] != ARF_UPDATE) {
+    return SCALE_NUMERATOR;
+  }
+  if (gf_group->update_type[gf_group->index] == KF_UPDATE && !sr_kf) {
+    return SCALE_NUMERATOR;
+  }
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE && !sr_arf) {
+    return SCALE_NUMERATOR;
+  }
+
   double energy[16];
   analyze_hor_freq(cpi, energy);
+
+  const double energy_by_q2_thresh =
+      get_energy_by_q2_thresh(gf_group, &cpi->rc);
+  int denom = get_superres_denom_from_qindex_energy(
+      qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH);
   /*
   printf("\nenergy = [");
   for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
   printf("]\n");
+  printf("boost = %d\n",
+         (gf_group->update_type[gf_group->index] == KF_UPDATE)
+             ? cpi->rc.kf_boost
+             : cpi->rc.gfu_boost);
+  printf("denom = %d\n", denom);
   */
-  return get_superres_denom_from_qindex_energy(
-      qindex, energy, ENERGY_BY_Q2_THRESH, ENERGY_BY_AC_THRESH);
+#if CONFIG_SUPERRES_IN_RECODE
+  if (superres_in_recode_allowed(cpi)) {
+    assert(cpi->superres_mode != SUPERRES_NONE);
+    // Force superres to be tried in the recode loop, as full-res is also going
+    // to be tried anyway.
+    denom = AOMMAX(denom, SCALE_NUMERATOR + 1);
+  }
+#endif  // CONFIG_SUPERRES_IN_RECODE
+  return denom;
 }
 
+// If true, SUPERRES_AUTO mode will exhaustively search over all superres
+// denominators for all frames (except overlay and internal overlay frames).
+#define SUPERRES_RECODE_ALL_RATIOS 0
+
 static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
   // Choose an arbitrary random number
   static unsigned int seed = 34567;
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (oxcf->pass == 1) return SCALE_NUMERATOR;
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
   uint8_t new_denom = SCALE_NUMERATOR;
 
   // Make sure that superres mode of the frame is consistent with the
@@ -3806,8 +4560,14 @@
                  cpi->common.seq_params.enable_superres));
   assert(IMPLIES(!cpi->common.seq_params.enable_superres,
                  oxcf->superres_mode == SUPERRES_NONE));
+  // Make sure that superres mode for current encoding is consistent with user
+  // provided superres mode.
+  assert(IMPLIES(oxcf->superres_mode != SUPERRES_AUTO,
+                 cpi->superres_mode == oxcf->superres_mode));
 
-  switch (oxcf->superres_mode) {
+  // Note: we must look at the current superres_mode to be tried in 'cpi' here,
+  // not the user given mode in 'oxcf'.
+  switch (cpi->superres_mode) {
     case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
     case SUPERRES_FIXED:
       if (cpi->common.current_frame.frame_type == KEY_FRAME)
@@ -3818,12 +4578,15 @@
     case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
     case SUPERRES_QTHRESH: {
       // Do not use superres when screen content tools are used.
-      if (cpi->common.allow_screen_content_tools) break;
+      if (cpi->common.features.allow_screen_content_tools) break;
       if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ)
         av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
+
+      // Now decide the use of superres based on 'q'.
       int bottom_index, top_index;
       const int q = av1_rc_pick_q_and_bounds(
-          cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index);
+          cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index,
+          &bottom_index, &top_index);
 
       const int qthresh = (frame_is_intra_only(&cpi->common))
                               ? oxcf->superres_kf_qthresh
@@ -3831,28 +4594,34 @@
       if (q <= qthresh) {
         new_denom = SCALE_NUMERATOR;
       } else {
-        new_denom = get_superres_denom_for_qindex(cpi, q);
+        new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
       }
       break;
     }
     case SUPERRES_AUTO: {
-      // Don't use when screen content tools are used.
-      if (cpi->common.allow_screen_content_tools) break;
-      // Don't use for inter frames.
-      if (!frame_is_intra_only(&cpi->common)) break;
-      // Don't use for keyframes that can be used as references.
-      if (cpi->rc.frames_to_key != 1) break;
+      // Do not use superres when screen content tools are used.
+      if (cpi->common.features.allow_screen_content_tools) break;
+      if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ)
+        av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
 
       // Now decide the use of superres based on 'q'.
       int bottom_index, top_index;
       const int q = av1_rc_pick_q_and_bounds(
-          cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index);
+          cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index,
+          &bottom_index, &top_index);
 
       const int qthresh = 128;
       if (q <= qthresh) {
         new_denom = SCALE_NUMERATOR;
       } else {
-        new_denom = get_superres_denom_for_qindex(cpi, q);
+#if SUPERRES_RECODE_ALL_RATIOS
+        if (cpi->common.current_frame.frame_type == KEY_FRAME)
+          new_denom = oxcf->superres_kf_scale_denominator;
+        else
+          new_denom = oxcf->superres_scale_denominator;
+#else
+        new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+#endif  // SUPERRES_RECODE_ALL_RATIOS
       }
       break;
     }
@@ -3931,22 +4700,29 @@
 // Calculates resize and superres params for next frame
 static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
   size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR };
-  int resize_denom;
-  if (oxcf->pass == 1) return rsz;
-  if (cpi->resize_pending_width && cpi->resize_pending_height) {
-    rsz.resize_width = cpi->resize_pending_width;
-    rsz.resize_height = cpi->resize_pending_height;
-    cpi->resize_pending_width = cpi->resize_pending_height = 0;
+  int resize_denom = SCALE_NUMERATOR;
+  if (has_no_stats_stage(cpi) && cpi->use_svc &&
+      cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) {
+    rsz.resize_width = cpi->common.width;
+    rsz.resize_height = cpi->common.height;
+    return rsz;
+  }
+  if (is_stat_generation_stage(cpi)) return rsz;
+  if (resize_pending_params->width && resize_pending_params->height) {
+    rsz.resize_width = resize_pending_params->width;
+    rsz.resize_height = resize_pending_params->height;
+    resize_pending_params->width = resize_pending_params->height = 0;
   } else {
     resize_denom = calculate_next_resize_scale(cpi);
-    rsz.resize_width = cpi->oxcf.width;
-    rsz.resize_height = cpi->oxcf.height;
+    rsz.resize_width = oxcf->width;
+    rsz.resize_height = oxcf->height;
     av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
                               resize_denom);
   }
   rsz.superres_denom = calculate_next_superres_scale(cpi);
-  if (!validate_size_scales(oxcf->resize_mode, oxcf->superres_mode, oxcf->width,
+  if (!validate_size_scales(oxcf->resize_mode, cpi->superres_mode, oxcf->width,
                             oxcf->height, &rsz))
     assert(0 && "Invalid scale parameters");
   return rsz;
@@ -3973,7 +4749,7 @@
   const size_params_type rsz = calculate_next_size_params(cpi);
   setup_frame_size_from_params(cpi, &rsz);
 
-  assert(is_min_tile_width_satisfied(cm));
+  assert(av1_is_min_tile_width_satisfied(cm));
 }
 
 static void superres_post_encode(AV1_COMP *cpi) {
@@ -3984,7 +4760,7 @@
 
   assert(cpi->oxcf.enable_superres);
   assert(!is_lossless_requested(&cpi->oxcf));
-  assert(!cm->all_lossless);
+  assert(!cm->features.all_lossless);
 
   av1_superres_upscale(cm, NULL);
 
@@ -4003,7 +4779,8 @@
             &cpi->scaled_source, cm->superres_upscaled_width,
             cm->superres_upscaled_height, cm->seq_params.subsampling_x,
             cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+            AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL,
+            NULL))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate scaled source buffer for superres");
@@ -4015,51 +4792,9 @@
   }
 }
 
-static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
-
-  assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
-                 cm->coded_lossless && cm->all_lossless));
-
-  const int use_loopfilter = !cm->coded_lossless && !cm->large_scale_tile;
-  const int use_cdef = cm->seq_params.enable_cdef && !cm->coded_lossless &&
-                       !cm->large_scale_tile;
-  const int use_restoration = cm->seq_params.enable_restoration &&
-                              !cm->all_lossless && !cm->large_scale_tile;
-
-  struct loopfilter *lf = &cm->lf;
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, loop_filter_time);
-#endif
-  if (use_loopfilter) {
-    aom_clear_system_state();
-    av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick);
-  } else {
-    lf->filter_level[0] = 0;
-    lf->filter_level[1] = 0;
-  }
-
-  if (lf->filter_level[0] || lf->filter_level[1]) {
-    if (cpi->num_workers > 1)
-      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
-#if LOOP_FILTER_BITMASK
-                               0,
-#endif
-                               cpi->workers, cpi->num_workers,
-                               &cpi->lf_row_sync);
-    else
-      av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
-#if LOOP_FILTER_BITMASK
-                            0,
-#endif
-                            0, num_planes, 0);
-  }
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, loop_filter_time);
-#endif
-
+static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
+                                   MACROBLOCKD *xd, int use_restoration,
+                                   int use_cdef) {
   if (use_restoration)
     av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
 
@@ -4069,7 +4804,7 @@
 #endif
     // Find CDEF parameters
     av1_cdef_search(&cm->cur_frame->buf, cpi->source, cm, xd,
-                    cpi->sf.fast_cdef_search);
+                    cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult);
 
     // Apply the filter
     av1_cdef_frame(&cm->cur_frame->buf, cm, xd);
@@ -4112,6 +4847,56 @@
 #endif
 }
 
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+  assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
+                 cm->features.coded_lossless && cm->features.all_lossless));
+
+  const int use_loopfilter =
+      !cm->features.coded_lossless && !cm->tiles.large_scale;
+  const int use_cdef = cm->seq_params.enable_cdef &&
+                       !cm->features.coded_lossless && !cm->tiles.large_scale;
+  const int use_restoration = cm->seq_params.enable_restoration &&
+                              !cm->features.all_lossless &&
+                              !cm->tiles.large_scale;
+
+  struct loopfilter *lf = &cm->lf;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, loop_filter_time);
+#endif
+  if (use_loopfilter) {
+    aom_clear_system_state();
+    av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
+  } else {
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+  }
+
+  if (lf->filter_level[0] || lf->filter_level[1]) {
+    if (cpi->num_workers > 1)
+      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+#if CONFIG_LPF_MASK
+                               0,
+#endif
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
+    else
+      av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
+#if CONFIG_LPF_MASK
+                            0,
+#endif
+                            0, num_planes, 0);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loop_filter_time);
+#endif
+
+  cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef);
+}
+
 static void fix_interp_filter(InterpFilter *const interp_filter,
                               const FRAME_COUNTS *const counts) {
   if (*interp_filter == SWITCHABLE) {
@@ -4170,14 +4955,14 @@
   }
 
   // Initialise all tiles' contexts from the global frame context
-  for (int tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
-    for (int tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
-      const int tile_idx = tile_row * cm->tile_cols + tile_col;
+  for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
+    for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
+      const int tile_idx = tile_row * cm->tiles.cols + tile_col;
       cpi->tile_data[tile_idx].tctx = *cm->fc;
     }
   }
 
-  fix_interp_filter(&cm->interp_filter, cpi->td.counts);
+  fix_interp_filter(&cm->features.interp_filter, cpi->td.counts);
 }
 
 static int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low, int q_high,
@@ -4225,36 +5010,59 @@
 // its bitstream.  This function works out whether we under- or over-shot
 // our bitrate target and adjusts q as appropriate.  Also decides whether
 // or not we should do another recode loop, indicated by *loop
-static void recode_loop_update_q(AV1_COMP *const cpi, int *const loop,
-                                 int *const q, int *const q_low,
-                                 int *const q_high, const int top_index,
-                                 const int bottom_index,
-                                 int *const undershoot_seen,
-                                 int *const overshoot_seen,
-                                 const int loop_at_this_size) {
+static void recode_loop_update_q(
+    AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
+    int *const q_high, const int top_index, const int bottom_index,
+    int *const undershoot_seen, int *const overshoot_seen,
+    int *const low_cr_seen, const int loop_at_this_size) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  *loop = 0;
 
+  const int min_cr = cpi->oxcf.min_cr;
+  if (min_cr > 0) {
+    aom_clear_system_state();
+    const double compression_ratio =
+        av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
+    const double target_cr = min_cr / 100.0;
+    if (compression_ratio < target_cr) {
+      *low_cr_seen = 1;
+      if (*q < rc->worst_quality) {
+        const double cr_ratio = target_cr / compression_ratio;
+        const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
+        *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
+        *q_low = AOMMAX(*q, *q_low);
+        *q_high = AOMMAX(*q, *q_high);
+        *loop = 1;
+      }
+    }
+    if (*low_cr_seen) return;
+  }
+
+  if (cpi->oxcf.rc_mode == AOM_Q) return;
+
+  const int last_q = *q;
   int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
   av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
                                    &frame_under_shoot_limit,
                                    &frame_over_shoot_limit);
   if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
 
-  if ((cm->current_frame.frame_type == KEY_FRAME) &&
-      rc->this_key_frame_forced &&
-      (rc->projected_frame_size < rc->max_frame_bandwidth)) {
-    int last_q = *q;
+  if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced &&
+      rc->projected_frame_size < rc->max_frame_bandwidth) {
     int64_t kf_err;
+    const int64_t high_err_target = cpi->ambient_err;
+    const int64_t low_err_target = cpi->ambient_err >> 1;
 
-    int64_t high_err_target = cpi->ambient_err;
-    int64_t low_err_target = cpi->ambient_err >> 1;
-
+#if CONFIG_AV1_HIGHBITDEPTH
     if (cm->seq_params.use_highbitdepth) {
       kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
     } else {
       kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
     }
+#else
+    kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
     // Prevent possible divide by zero error below for perfect KF
     kf_err += !kf_err;
 
@@ -4265,7 +5073,7 @@
         (kf_err > low_err_target &&
          rc->projected_frame_size <= frame_under_shoot_limit)) {
       // Lower q_high
-      *q_high = *q > *q_low ? *q - 1 : *q_low;
+      *q_high = AOMMAX(*q - 1, *q_low);
 
       // Adjust Q
       *q = (int)((*q * high_err_target) / kf_err);
@@ -4274,7 +5082,7 @@
                rc->projected_frame_size >= frame_under_shoot_limit) {
       // The key frame is much better than the previous frame
       // Raise q_low
-      *q_low = *q < *q_high ? *q + 1 : *q_high;
+      *q_low = AOMMIN(*q + 1, *q_high);
 
       // Adjust Q
       *q = (int)((*q * low_err_target) / kf_err);
@@ -4283,25 +5091,33 @@
 
     // Clamp Q to upper and lower limits:
     *q = clamp(*q, *q_low, *q_high);
+    *loop = (*q != last_q);
+    return;
+  }
 
-    *loop = *q != last_q;
-  } else if (recode_loop_test(cpi, frame_over_shoot_limit,
-                              frame_under_shoot_limit, *q,
-                              AOMMAX(*q_high, top_index), bottom_index)) {
+  if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q,
+                       AOMMAX(*q_high, top_index), bottom_index)) {
     // Is the projected frame size out of range and are we allowed
     // to attempt to recode.
-    int last_q = *q;
 
     // Frame size out of permitted range:
     // Update correction factor & compute new Q to try...
     // Frame is too large
     if (rc->projected_frame_size > rc->this_frame_target) {
       // Special case if the projected size is > the max allowed.
-      if (rc->projected_frame_size >= rc->max_frame_bandwidth)
-        *q_high = rc->worst_quality;
+      if (*q == *q_high &&
+          rc->projected_frame_size >= rc->max_frame_bandwidth) {
+        const double q_val_high_current =
+            av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth);
+        const double q_val_high_new =
+            q_val_high_current *
+            ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth,
+                                  rc->best_quality, rc->worst_quality);
+      }
 
       // Raise Qlow as to at least the current value
-      *q_low = *q < *q_high ? *q + 1 : *q_high;
+      *q_low = AOMMIN(*q + 1, *q_high);
 
       if (*undershoot_seen || loop_at_this_size > 2 ||
           (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
@@ -4323,7 +5139,7 @@
       *overshoot_seen = 1;
     } else {
       // Frame is too small
-      *q_high = *q > *q_low ? *q - 1 : *q_low;
+      *q_high = AOMMAX(*q - 1, *q_low);
 
       if (*overshoot_seen || loop_at_this_size > 2 ||
           (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
@@ -4361,49 +5177,316 @@
 
     // Clamp Q to upper and lower limits:
     *q = clamp(*q, *q_low, *q_high);
+  }
 
-    *loop = (*q != last_q);
+  *loop = (*q != last_q);
+}
+
+static int get_interp_filter_selected(const AV1_COMMON *const cm,
+                                      MV_REFERENCE_FRAME ref,
+                                      InterpFilter ifilter) {
+  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+  if (buf == NULL) return 0;
+  return buf->interp_filter_selected[ifilter];
+}
+
+static uint16_t setup_interp_filter_search_mask(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int ref_total[REF_FRAMES] = { 0 };
+  uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;
+
+  if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
+    return mask;
+
+  for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+         ++ifilter) {
+      ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
+    }
+  }
+  int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
+                         ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
+                         ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
+
+  for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+       ++ifilter) {
+    int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
+    if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
+      int filter_score =
+          get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
+          get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
+          get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
+      if (filter_score < ref_total_total) {
+        DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter;
+        reset_interp_filter_allowed_mask(&mask, filt_type);
+      }
+    }
+  }
+  return mask;
+}
+
+#if !CONFIG_REALTIME_ONLY
+#define STRICT_PSNR_DIFF_THRESH 0.9
+// Encode key frame with/without screen content tools to determine whether
+// screen content tools should be enabled for this key frame group or not.
+// The first encoding is without screen content tools.
+// The second encoding is with screen content tools.
+// We compare the psnr and frame size to make the decision.
+static void screen_content_tools_determination(
+    AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
+    const int allow_intrabc_orig_decision,
+    const int is_screen_content_type_orig_decision, const int pass,
+    int *projected_size_pass, PSNR_STATS *psnr) {
+  AV1_COMMON *const cm = &cpi->common;
+  FeatureFlags *const features = &cm->features;
+  projected_size_pass[pass] = cpi->rc.projected_frame_size;
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
+                       bit_depth, in_bit_depth);
+#else
+  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
+#endif
+  if (pass != 1) return;
+
+  const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
+  const int is_sc_encoding_much_better = psnr_diff > STRICT_PSNR_DIFF_THRESH;
+  if (is_sc_encoding_much_better) {
+    // Use screen content tools, if we get coding gain.
+    features->allow_screen_content_tools = 1;
+    features->allow_intrabc = cpi->intrabc_used;
+    cpi->is_screen_content_type = 1;
   } else {
-    *loop = 0;
+    // Use original screen content decision.
+    features->allow_screen_content_tools =
+        allow_screen_content_tools_orig_decision;
+    features->allow_intrabc = allow_intrabc_orig_decision;
+    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
   }
 }
 
+// Set some encoding parameters to make the encoding process fast.
+// A fixed block partition size, and a large q is used.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+                                                   const int pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (pass == 0) {
+    // In the first pass, encode without screen content tools.
+    // Use a high q, and a fixed block size for fast encoding.
+    cm->features.allow_screen_content_tools = 0;
+    cm->features.allow_intrabc = 0;
+    cpi->is_screen_content_type = 0;
+    cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+    cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
+    return;
+  }
+  assert(pass == 1);
+  // In the second pass, encode with screen content tools.
+  // Use a high q, and a fixed block size for fast encoding.
+  cm->features.allow_screen_content_tools = 1;
+  // TODO(chengchen): turn intrabc on could lead to data race issue.
+  // cm->allow_intrabc = 1;
+  cpi->is_screen_content_type = 1;
+  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+  cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->is_screen_content_type".
+static void determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
+  AV1_COMMON *const cm = &cpi->common;
+  // Variables to help determine if we should allow screen content tools.
+  int projected_size_pass[3] = { 0 };
+  PSNR_STATS psnr[3];
+  const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
+  const int allow_screen_content_tools_orig_decision =
+      cm->features.allow_screen_content_tools;
+  const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
+  const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
+  // Turn off the encoding trial for forward key frame and superres.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode || cpi->oxcf.fwd_kf_enabled ||
+      cpi->superres_mode != SUPERRES_NONE || cpi->oxcf.mode == REALTIME ||
+      is_screen_content_type_orig_decision || !is_key_frame) {
+    return;
+  }
+
+  // TODO(chengchen): multiple encoding for the lossless mode is time consuming.
+  // Find a better way to determine whether screen content tools should be used
+  // for lossless coding.
+  // Use a high q and a fixed partition to do quick encoding.
+  const int q_for_screen_content_quick_run =
+      is_lossless_requested(&cpi->oxcf) ? q_orig : AOMMAX(q_orig, 244);
+  const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
+  const BLOCK_SIZE fixed_partition_block_size_orig =
+      cpi->sf.part_sf.always_this_block_size;
+
+  // Setup necessary params for encoding, including frame source, etc.
+  aom_clear_system_state();
+
+  cpi->source =
+      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+  if (cpi->unscaled_last_source != NULL) {
+    cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+                                             &cpi->scaled_last_source);
+  }
+
+  setup_frame(cpi);
+
+  if (cm->seg.enabled) {
+    if (!cm->seg.update_data && cm->prev_frame) {
+      segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+      cm->seg.enabled = cm->prev_frame->seg.enabled;
+    } else {
+      av1_calculate_segdata(&cm->seg);
+    }
+  } else {
+    memset(&cm->seg, 0, sizeof(cm->seg));
+  }
+  segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+  cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+  // The two encoding passes aim to help determine whether to use screen
+  // content tools, with a high q and fixed partition.
+  for (int pass = 0; pass < 2; ++pass) {
+    set_encoding_params_for_screen_content(cpi, pass);
+#if CONFIG_TUNE_VMAF
+    if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
+        cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+        cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+      av1_set_quantizer(
+          cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
+          av1_get_vmaf_base_qindex(cpi, q_for_screen_content_quick_run));
+    } else {
+#endif
+      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
+                        q_for_screen_content_quick_run);
+#if CONFIG_TUNE_VMAF
+    }
+#endif
+    av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+    if (cpi->oxcf.deltaq_mode != NO_DELTA_Q)
+      av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                         cm->seq_params.bit_depth);
+
+    av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
+                                          0);
+    // transform / motion compensation build reconstruction frame
+    av1_encode_frame(cpi);
+    // Screen content decision
+    screen_content_tools_determination(
+        cpi, allow_screen_content_tools_orig_decision,
+        allow_intrabc_orig_decision, is_screen_content_type_orig_decision, pass,
+        projected_size_pass, psnr);
+  }
+
+  // Set partition speed feature back.
+  cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
+  cpi->sf.part_sf.always_this_block_size = fixed_partition_block_size_orig;
+}
+#endif  // CONFIG_REALTIME_ONLY
+
 static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
-  const int allow_recode = cpi->sf.recode_loop != DISALLOW_RECODE;
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE);
+  // Must allow recode if minimum compression ratio is set.
+  assert(IMPLIES(cpi->oxcf.min_cr > 0, allow_recode));
 
   set_size_independent_vars(cpi);
-
+  if (is_stat_consumption_stage_twopass(cpi) &&
+      cpi->sf.interp_sf.adaptive_interp_filter_search)
+    cpi->interp_search_flags.interp_filter_search_mask =
+        setup_interp_filter_search_mask(cpi);
   cpi->source->buf_8bit_valid = 0;
 
   av1_setup_frame_size(cpi);
 
+#if CONFIG_SUPERRES_IN_RECODE
+  if (superres_in_recode_allowed(cpi) && cpi->superres_mode != SUPERRES_NONE &&
+      cm->superres_scale_denominator == SCALE_NUMERATOR) {
+    // Superres mode is currently enabled, but the denominator selected will
+    // disable superres. So no need to continue, as we will go through another
+    // recode loop for full-resolution after this anyway.
+    return -1;
+  }
+#endif  // CONFIG_SUPERRES_IN_RECODE
+
   int top_index = 0, bottom_index = 0;
   int q = 0, q_low = 0, q_high = 0;
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
   q_low = bottom_index;
   q_high = top_index;
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+    const int num_64x64_blocks =
+        (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+    if (cpi->td.vt64x64) {
+      if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
+        aom_free(cpi->td.vt64x64);
+        cpi->td.vt64x64 = NULL;
+      }
+    }
+    if (!cpi->td.vt64x64) {
+      CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
+                      aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
+      cpi->td.num_64x64_blocks = num_64x64_blocks;
+    }
+  }
 
-  // Loop variables
-  int loop_count = 0;
-  int loop_at_this_size = 0;
-  int loop = 0;
-  int overshoot_seen = 0;
-  int undershoot_seen = 0;
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    FrameProbInfo *const frame_probs = &cpi->frame_probs;
+
+    if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+      av1_copy(frame_probs->tx_type_probs, default_tx_type_probs);
+    }
+
+    if (!cpi->sf.inter_sf.disable_obmc &&
+        cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) {
+      av1_copy(frame_probs->obmc_probs, default_obmc_probs);
+    }
+
+    if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+      av1_copy(frame_probs->warped_probs, default_warped_probs);
+    }
+
+    if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+      av1_copy(frame_probs->switchable_interp_probs,
+               default_switchable_interp_probs);
+    }
+  }
+#if !CONFIG_REALTIME_ONLY
+  // Determine whether to use screen content tools using two fast encoding.
+  determine_sc_tools_with_encoding(cpi, q);
+#endif  // CONFIG_REALTIME_ONLY
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   printf("\n Encoding a frame:");
 #endif
+
+  // Loop variables
+  int loop = 0;
+  int loop_count = 0;
+  int loop_at_this_size = 0;
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
+  int low_cr_seen = 0;
+  int last_loop_allow_hp = 0;
+
   do {
+    loop = 0;
     aom_clear_system_state();
 
     // if frame was scaled calculate global_motion_search again if already
     // done
-    if (loop_count > 0 && cpi->source && cpi->global_motion_search_done) {
+    if (loop_count > 0 && cpi->source && gm_info->search_done) {
       if (cpi->source->y_crop_width != cm->width ||
           cpi->source->y_crop_height != cm->height) {
-        cpi->global_motion_search_done = 0;
+        gm_info->search_done = 0;
       }
     }
     cpi->source =
@@ -4419,8 +5502,23 @@
       }
       scale_references(cpi);
     }
-    av1_set_quantizer(cm, q);
-    av1_init_quantizer(cpi);
+#if CONFIG_TUNE_VMAF
+    if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
+        cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+        cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
+                        av1_get_vmaf_base_qindex(cpi, q));
+    } else {
+#endif
+      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, q);
+#if CONFIG_TUNE_VMAF
+    }
+#endif
+    av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+
+    if (cpi->oxcf.deltaq_mode != NO_DELTA_Q)
+      av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                         cm->seq_params.bit_depth);
 
     av1_set_variance_partition_thresholds(cpi, q, 0);
 
@@ -4450,20 +5548,48 @@
     if (cm->seg.enabled) {
       if (!cm->seg.update_data && cm->prev_frame) {
         segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+        cm->seg.enabled = cm->prev_frame->seg.enabled;
       } else {
-        calculate_segdata(&cm->seg);
+        av1_calculate_segdata(&cm->seg);
       }
     } else {
       memset(&cm->seg, 0, sizeof(cm->seg));
     }
     segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+    cm->cur_frame->seg.enabled = cm->seg.enabled;
 
-    if (allow_recode) save_coding_context(cpi);
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, av1_encode_frame_time);
 #endif
+    // Set the motion vector precision based on mv stats from the last coded
+    // frame.
+    if (!frame_is_intra_only(cm)) {
+      av1_pick_and_set_high_precision_mv(cpi, q);
+
+      // If the precision has changed during different iteration of the loop,
+      // then we need to reset the global motion vectors
+      if (loop_count > 0 &&
+          cm->features.allow_high_precision_mv != last_loop_allow_hp) {
+        gm_info->search_done = 0;
+      }
+      last_loop_allow_hp = cm->features.allow_high_precision_mv;
+    }
+
     // transform / motion compensation build reconstruction frame
     av1_encode_frame(cpi);
+#if !CONFIG_REALTIME_ONLY
+    // Reset the mv_stats in case we are interrupted by an intraframe or an
+    // overlay frame.
+    if (cpi->mv_stats.valid) {
+      av1_zero(cpi->mv_stats);
+    }
+    // Gather the mv_stats for the next frame
+    if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
+        av1_frame_allows_smart_mv(cpi)) {
+      av1_collect_mv_stats(cpi, q);
+    }
+#endif  // !CONFIG_REALTIME_ONLY
+
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, av1_encode_frame_time);
 #endif
@@ -4473,32 +5599,38 @@
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
     // to recode.
-    if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
-      restore_coding_context(cpi);
-
+    const int do_dummy_pack =
+        (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
+         cpi->oxcf.rc_mode != AOM_Q) ||
+        cpi->oxcf.min_cr > 0;
+    if (do_dummy_pack) {
       finalize_encoded_frame(cpi);
       int largest_tile_id = 0;  // Output from bitstream: unused here
-      if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
+      if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) !=
+          AOM_CODEC_OK) {
         return AOM_CODEC_ERROR;
+      }
 
       rc->projected_frame_size = (int)(*size) << 3;
-      restore_coding_context(cpi);
     }
 
-    if (allow_recode && cpi->oxcf.rc_mode != AOM_Q) {
+    if (allow_recode) {
       // Update q and decide whether to do a recode loop
       recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index,
                            bottom_index, &undershoot_seen, &overshoot_seen,
-                           loop_at_this_size);
+                           &low_cr_seen, loop_at_this_size);
     }
 
     // Special case for overlay frame.
-    if (rc->is_src_frame_alt_ref &&
-        rc->projected_frame_size < rc->max_frame_bandwidth)
+    if (loop && rc->is_src_frame_alt_ref &&
+        rc->projected_frame_size < rc->max_frame_bandwidth) {
       loop = 0;
+    }
 
-    if (allow_recode && !cpi->sf.gm_disable_recode &&
-        recode_loop_test_global_motion(cpi)) {
+    if (allow_recode && !cpi->sf.gm_sf.gm_disable_recode &&
+        recode_loop_test_global_motion(cm->global_motion,
+                                       cpi->td.rd_counts.global_motion_used,
+                                       gm_info->params_cost)) {
       loop = 1;
     }
 
@@ -4515,9 +5647,318 @@
 #endif
   } while (loop);
 
+  // Update some stats from cyclic refresh.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !frame_is_intra_only(cm))
+    av1_cyclic_refresh_postencode(cpi);
+
   return AOM_CODEC_OK;
 }
 
+static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
+                                              uint8_t *dest, int64_t *sse,
+                                              int64_t *rate,
+                                              int *largest_tile_id) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_with_recode_loop_time);
+#endif
+  int err = encode_with_recode_loop(cpi, size, dest);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_with_recode_loop_time);
+#endif
+  if (err != AOM_CODEC_OK) {
+    if (err == -1) {
+      // special case as described in encode_with_recode_loop().
+      // Encoding was skipped.
+      err = AOM_CODEC_OK;
+      if (sse != NULL) *sse = INT64_MAX;
+      if (rate != NULL) *rate = INT64_MAX;
+      *largest_tile_id = 0;
+    }
+    return err;
+  }
+
+#ifdef OUTPUT_YUV_SKINMAP
+  if (cpi->common.current_frame.frame_number > 1) {
+    av1_compute_skin_map(cpi, yuv_skinmap_file);
+  }
+#endif  // OUTPUT_YUV_SKINMAP
+
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = &cm->seq_params;
+
+  // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Note the reconstruction error if it is the frame before
+  // the force key frame
+  if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (seq_params->use_highbitdepth) {
+      cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    } else {
+      cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    }
+#else
+    cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+  }
+
+  cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+  cm->cur_frame->buf.transfer_characteristics =
+      seq_params->transfer_characteristics;
+  cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+  cm->cur_frame->buf.monochrome = seq_params->monochrome;
+  cm->cur_frame->buf.chroma_sample_position =
+      seq_params->chroma_sample_position;
+  cm->cur_frame->buf.color_range = seq_params->color_range;
+  cm->cur_frame->buf.render_width = cm->render_width;
+  cm->cur_frame->buf.render_height = cm->render_height;
+
+  // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
+  // off.
+
+  // Pick the loop filter level for the frame.
+  if (!cm->features.allow_intrabc) {
+    loopfilter_frame(cpi, cm);
+  } else {
+    cm->lf.filter_level[0] = 0;
+    cm->lf.filter_level[1] = 0;
+    cm->cdef_info.cdef_bits = 0;
+    cm->cdef_info.cdef_strengths[0] = 0;
+    cm->cdef_info.nb_cdef_strengths = 1;
+    cm->cdef_info.cdef_uv_strengths[0] = 0;
+    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+  }
+
+  // TODO(debargha): Fix mv search range on encoder side
+  // aom_extend_frame_inner_borders(&cm->cur_frame->buf, av1_num_planes(cm));
+  aom_extend_frame_borders(&cm->cur_frame->buf, av1_num_planes(cm));
+
+#ifdef OUTPUT_YUV_REC
+  aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
+#endif
+
+  finalize_encoded_frame(cpi);
+  // Build the bitstream
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+  if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK)
+    return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+
+  // Compute sse and rate.
+  if (sse != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    *sse = (seq_params->use_highbitdepth)
+               ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
+               : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#else
+    *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+  }
+  if (rate != NULL) {
+    const int64_t bits = (*size << 3);
+    *rate = (bits << 5);  // To match scale.
+  }
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_SUPERRES_IN_RECODE
+
+static void save_cur_buf(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
+  memset(&cc->copy_buffer, 0, sizeof(cc->copy_buffer));
+  if (aom_alloc_frame_buffer(&cc->copy_buffer, ybf->y_crop_width,
+                             ybf->y_crop_height, ybf->subsampling_x,
+                             ybf->subsampling_y,
+                             ybf->flags & YV12_FLAG_HIGHBITDEPTH, ybf->border,
+                             cm->features.byte_alignment) != AOM_CODEC_OK) {
+    aom_internal_error(
+        &cm->error, AOM_CODEC_MEM_ERROR,
+        "Failed to allocate copy buffer for saving coding context");
+  }
+  aom_yv12_copy_frame(ybf, &cc->copy_buffer, av1_num_planes(cm));
+}
+
+// Coding context that only needs to be saved when recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoraton).
+static void save_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+
+  cc->lf = cm->lf;
+  cc->cdef_info = cm->cdef_info;
+  cc->rc = cpi->rc;
+}
+
+static void save_all_coding_context(AV1_COMP *cpi) {
+  save_cur_buf(cpi);
+  save_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+static void restore_cur_buf(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  aom_yv12_copy_frame(&cc->copy_buffer, &cm->cur_frame->buf,
+                      av1_num_planes(cm));
+}
+
+// Coding context that only needs to be restored when recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoraton).
+static void restore_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  cm->lf = cc->lf;
+  cm->cdef_info = cc->cdef_info;
+  cpi->rc = cc->rc;
+}
+
+static void restore_all_coding_context(AV1_COMP *cpi) {
+  restore_cur_buf(cpi);
+  restore_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+static void release_copy_buffer(CODING_CONTEXT *cc) {
+  aom_free_frame_buffer(&cc->copy_buffer);
+}
+
+static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size,
+                                            uint8_t *dest,
+                                            int *largest_tile_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  assert(cm->seq_params.enable_superres);
+  assert(superres_in_recode_allowed(cpi));
+  aom_codec_err_t err = AOM_CODEC_OK;
+  save_all_coding_context(cpi);
+
+  // Encode with superres.
+#if SUPERRES_RECODE_ALL_RATIOS
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  int64_t superres_sses[SCALE_NUMERATOR];
+  int64_t superres_rates[SCALE_NUMERATOR];
+  int superres_largest_tile_ids[SCALE_NUMERATOR];
+  // Use superres for Key-frames and Alt-ref frames only.
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  if (gf_group->update_type[gf_group->index] != OVERLAY_UPDATE &&
+      gf_group->update_type[gf_group->index] != INTNL_OVERLAY_UPDATE) {
+    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+         ++denom) {
+      oxcf->superres_scale_denominator = denom;
+      oxcf->superres_kf_scale_denominator = denom;
+      const int this_index = denom - (SCALE_NUMERATOR + 1);
+      err = encode_with_recode_loop_and_filter(
+          cpi, size, dest, &superres_sses[this_index],
+          &superres_rates[this_index], &superres_largest_tile_ids[this_index]);
+      if (err != AOM_CODEC_OK) return err;
+      restore_all_coding_context(cpi);
+    }
+    // Reset.
+    oxcf->superres_scale_denominator = SCALE_NUMERATOR;
+    oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
+  } else {
+    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+         ++denom) {
+      const int this_index = denom - (SCALE_NUMERATOR + 1);
+      superres_sses[this_index] = INT64_MAX;
+      superres_rates[this_index] = INT64_MAX;
+    }
+  }
+#else
+  int64_t sse1 = INT64_MAX;
+  int64_t rate1 = INT64_MAX;
+  int largest_tile_id1;
+  err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1,
+                                           &largest_tile_id1);
+  if (err != AOM_CODEC_OK) return err;
+  restore_all_coding_context(cpi);
+#endif  // SUPERRES_RECODE_ALL_RATIOS
+
+  // Encode without superres.
+  int64_t sse2 = INT64_MAX;
+  int64_t rate2 = INT64_MAX;
+  int largest_tile_id2;
+  cpi->superres_mode = SUPERRES_NONE;  // To force full-res.
+  err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+                                           &largest_tile_id2);
+  cpi->superres_mode = cpi->oxcf.superres_mode;  // Reset.
+  assert(cpi->oxcf.superres_mode == SUPERRES_AUTO);
+  if (err != AOM_CODEC_OK) return err;
+
+  // Note: Both use common rdmult based on base qindex of fullres.
+  const int64_t rdmult =
+      av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex);
+
+#if SUPERRES_RECODE_ALL_RATIOS
+  // Find the best rdcost among all superres denoms.
+  double proj_rdcost1 = DBL_MAX;
+  int64_t sse1 = INT64_MAX;
+  int64_t rate1 = INT64_MAX;
+  int largest_tile_id1 = 0;
+  (void)sse1;
+  (void)rate1;
+  (void)largest_tile_id1;
+  int best_denom = -1;
+  for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) {
+    const int this_index = denom - (SCALE_NUMERATOR + 1);
+    const int64_t this_sse = superres_sses[this_index];
+    const int64_t this_rate = superres_rates[this_index];
+    const int this_largest_tile_id = superres_largest_tile_ids[this_index];
+    const double this_rdcost = RDCOST_DBL(rdmult, this_rate, this_sse);
+    if (this_rdcost < proj_rdcost1) {
+      sse1 = this_sse;
+      rate1 = this_rate;
+      largest_tile_id1 = this_largest_tile_id;
+      proj_rdcost1 = this_rdcost;
+      best_denom = denom;
+    }
+  }
+#else
+  const double proj_rdcost1 = RDCOST_DBL(rdmult, rate1, sse1);
+#endif  // SUPERRES_RECODE_ALL_RATIOS
+  const double proj_rdcost2 = RDCOST_DBL(rdmult, rate2, sse2);
+
+  // Re-encode with superres if it's better.
+  if (proj_rdcost1 < proj_rdcost2) {
+    restore_all_coding_context(cpi);
+    // TODO(urvang): We should avoid rerunning the recode loop by saving
+    // previous output+state, or running encode only for the selected 'q' in
+    // previous step.
+#if SUPERRES_RECODE_ALL_RATIOS
+    // Again, temporarily force the best denom.
+    oxcf->superres_scale_denominator = best_denom;
+    oxcf->superres_kf_scale_denominator = best_denom;
+#endif  // SUPERRES_RECODE_ALL_RATIOS
+    int64_t sse3 = INT64_MAX;
+    int64_t rate3 = INT64_MAX;
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+                                             largest_tile_id);
+    assert(sse1 == sse3);
+    assert(rate1 == rate3);
+    assert(largest_tile_id1 == *largest_tile_id);
+#if SUPERRES_RECODE_ALL_RATIOS
+    // Reset.
+    oxcf->superres_scale_denominator = SCALE_NUMERATOR;
+    oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
+#endif  // SUPERRES_RECODE_ALL_RATIOS
+  } else {
+    *largest_tile_id = largest_tile_id2;
+  }
+
+  release_copy_buffer(&cpi->coding_context);
+
+  return err;
+}
+#endif  // CONFIG_SUPERRES_IN_RECODE
+
 #define DUMP_RECON_FRAMES 0
 
 #if DUMP_RECON_FRAMES == 1
@@ -4580,11 +6021,11 @@
       "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
       "refresh_alt_ref_frame=%d, "
       "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
-      current_frame->frame_number, cpi->twopass.gf_group.index,
-      cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
-      current_frame->order_hint, cm->show_frame, cm->show_existing_frame,
-      cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame,
-      recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+      current_frame->frame_number, cpi->gf_group.index,
+      cpi->gf_group.update_type[cpi->gf_group.index], current_frame->order_hint,
+      cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active,
+      cpi->refresh_alt_ref_frame, recon_buf->y_stride, recon_buf->uv_stride,
+      cm->width, cm->height);
 #if 0
   int ref_frame;
   printf("get_ref_frame_map_idx: [");
@@ -4613,66 +6054,20 @@
 }
 #endif  // DUMP_RECON_FRAMES
 
-static int get_interp_filter_selected(const AV1_COMMON *const cm,
-                                      MV_REFERENCE_FRAME ref,
-                                      InterpFilters ifilter) {
-  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
-  if (buf == NULL) return 0;
-  return buf->interp_filter_selected[ifilter];
-}
-
-static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-  int ref_total[REF_FRAMES] = { 0 };
-
-  if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
-    return 0;
-
-  for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
-    for (InterpFilters ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
-         ++ifilter) {
-      ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
-    }
-  }
-  int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
-                         ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
-                         ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
-
-  int mask = 0;
-  for (InterpFilters ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
-       ++ifilter) {
-    int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
-    if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
-      int filter_score =
-          get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
-          get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
-          get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
-      if (filter_score < ref_total_total) mask |= 1 << ifilter;
-    }
-  }
-  return mask;
-}
-
-static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
+static int is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
                          const YV12_BUFFER_CONFIG *last_picture,
-                         hash_table *last_hash_table) {
+                         ForceIntegerMVInfo *const force_intpel_info) {
   aom_clear_system_state();
   // check use hash ME
   int k;
-  uint32_t hash_value_1;
-  uint32_t hash_value_2;
 
-  const int block_size = 8;
+  const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE;
   const double threshold_current = 0.8;
   const double threshold_average = 0.95;
   const int max_history_size = 32;
   int T = 0;  // total block
   int C = 0;  // match with collocated block
   int S = 0;  // smooth region but not match with collocated block
-  int M = 0;  // match with other block
 
   const int pic_width = cur_picture->y_width;
   const int pic_height = cur_picture->y_height;
@@ -4726,33 +6121,21 @@
         S++;
         continue;
       }
-
-      av1_get_block_hash_value(
-          cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
-          block_size, &hash_value_1, &hash_value_2,
-          (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
-      // Hashing does not work for highbitdepth currently.
-      // TODO(Roger): Make it work for highbitdepth.
-      if (av1_use_hash_me(&cpi->common)) {
-        if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
-          M++;
-        }
-      }
     }
   }
 
   assert(T > 0);
-  double csm_rate = ((double)(C + S + M)) / ((double)(T));
-  double m_rate = ((double)(M)) / ((double)(T));
+  double cs_rate = ((double)(C + S)) / ((double)(T));
 
-  cpi->csm_rate_array[cpi->rate_index] = csm_rate;
-  cpi->m_rate_array[cpi->rate_index] = m_rate;
+  force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate;
 
-  cpi->rate_index = (cpi->rate_index + 1) % max_history_size;
-  cpi->rate_size++;
-  cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size);
+  force_intpel_info->rate_index =
+      (force_intpel_info->rate_index + 1) % max_history_size;
+  force_intpel_info->rate_size++;
+  force_intpel_info->rate_size =
+      AOMMIN(force_intpel_info->rate_size, max_history_size);
 
-  if (csm_rate < threshold_current) {
+  if (cs_rate < threshold_current) {
     return 0;
   }
 
@@ -4760,29 +6143,22 @@
     return 1;
   }
 
-  double csm_average = 0.0;
-  double m_average = 0.0;
+  double cs_average = 0.0;
 
-  for (k = 0; k < cpi->rate_size; k++) {
-    csm_average += cpi->csm_rate_array[k];
-    m_average += cpi->m_rate_array[k];
+  for (k = 0; k < force_intpel_info->rate_size; k++) {
+    cs_average += force_intpel_info->cs_rate_array[k];
   }
-  csm_average /= cpi->rate_size;
-  m_average /= cpi->rate_size;
+  cs_average /= force_intpel_info->rate_size;
 
-  if (csm_average < threshold_average) {
+  if (cs_average < threshold_average) {
     return 0;
   }
 
-  if (M > (T - C - S) / 3) {
+  if ((T - C - S) < 0) {
     return 1;
   }
 
-  if (csm_rate > 0.99 && m_rate > 0.01) {
-    return 1;
-  }
-
-  if (csm_average + m_average > 1.01) {
+  if (cs_average > 1.01) {
     return 1;
   }
 
@@ -4801,6 +6177,74 @@
   }
 }
 
+static void set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  uint8_t *y_buffer = cpi->source->y_buffer;
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = BLOCK_16X16;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+  double log_sum = 0.0;
+  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  // Loop through each 16x16 block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      double var = 0.0, num_of_var = 0.0;
+      const int index = row * num_cols + col;
+
+      // Loop through each 8x8 block.
+      for (int mi_row = row * num_mi_h;
+           mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+           mi_row += 2) {
+        for (int mi_col = col * num_mi_w;
+             mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+             mi_col += 2) {
+          struct buf_2d buf;
+          const int row_offset_y = mi_row << 2;
+          const int col_offset_y = mi_col << 2;
+
+          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+          buf.stride = y_stride;
+
+          if (use_hbd) {
+            var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8,
+                                                      xd->bd);
+          } else {
+            var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
+          }
+
+          num_of_var += 1.0;
+        }
+      }
+      var = var / num_of_var;
+
+      // Curve fitting with an exponential model on all 16x16 blocks from the
+      // midres dataset.
+      var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+      cpi->ssim_rdmult_scaling_factors[index] = var;
+      log_sum += log(var);
+    }
+  }
+  log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+    }
+  }
+}
+
+extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc,
+                                     const char *filename);
+
 static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
                                      uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
@@ -4808,6 +6252,7 @@
   CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
+  FeatureFlags *const features = &cm->features;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, encode_frame_to_data_rate_time);
@@ -4816,28 +6261,21 @@
   // frame type has been decided outside of this function call
   cm->cur_frame->frame_type = current_frame->frame_type;
 
-  cm->large_scale_tile = cpi->oxcf.large_scale_tile;
-  cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
+  cm->tiles.large_scale = cpi->oxcf.large_scale_tile;
+  cm->tiles.single_tile_decoding = cpi->oxcf.single_tile_decoding;
 
-  cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
-  // cm->allow_ref_frame_mvs needs to be written into the frame header while
-  // cm->large_scale_tile is 1, therefore, "cm->large_scale_tile=1" case is
-  // separated from frame_might_allow_ref_frame_mvs().
-  cm->allow_ref_frame_mvs &= !cm->large_scale_tile;
+  features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
+  // features->allow_ref_frame_mvs needs to be written into the frame header
+  // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case
+  // is separated from frame_might_allow_ref_frame_mvs().
+  features->allow_ref_frame_mvs &= !cm->tiles.large_scale;
 
-  cm->allow_warped_motion =
+  features->allow_warped_motion =
       cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
 
-  cm->last_frame_type = current_frame->frame_type;
-  if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
-    cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
-
-  cpi->two_pass_partition_search = cpi->sf.two_pass_partition_search &&
-                                   !cpi->partition_search_skippable_frame;
+  cpi->last_frame_type = current_frame->frame_type;
 
   if (encode_show_existing_frame(cm)) {
-    restore_coding_context(cpi);
-
     finalize_encoded_frame(cpi);
     // Build the bitstream
     int largest_tile_id = 0;  // Output from bitstream: unused here
@@ -4878,23 +6316,23 @@
   }
 
   // Work out whether to force_integer_mv this frame
-  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools &&
+  if (!is_stat_generation_stage(cpi) &&
+      cpi->common.features.allow_screen_content_tools &&
       !frame_is_intra_only(cm)) {
     if (cpi->common.seq_params.force_integer_mv == 2) {
       // Adaptive mode: see what previous frame encoded did
       if (cpi->unscaled_last_source != NULL) {
-        cm->cur_frame_force_integer_mv =
-            is_integer_mv(cpi, cpi->source, cpi->unscaled_last_source,
-                          cpi->previous_hash_table);
+        features->cur_frame_force_integer_mv = is_integer_mv(
+            cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info);
       } else {
-        cpi->common.cur_frame_force_integer_mv = 0;
+        cpi->common.features.cur_frame_force_integer_mv = 0;
       }
     } else {
-      cpi->common.cur_frame_force_integer_mv =
+      cpi->common.features.cur_frame_force_integer_mv =
           cpi->common.seq_params.force_integer_mv;
     }
   } else {
-    cpi->common.cur_frame_force_integer_mv = 0;
+    cpi->common.features.cur_frame_force_integer_mv = 0;
   }
 
   // Set default state for segment based loop filter update flags.
@@ -4915,16 +6353,16 @@
     cpi->rc.source_alt_ref_active = 0;
   }
   if (cpi->oxcf.mtu == 0) {
-    cm->num_tg = cpi->oxcf.num_tile_groups;
+    cpi->num_tg = cpi->oxcf.num_tile_groups;
   } else {
     // Use a default value for the purposes of weighting costs in probability
     // updates
-    cm->num_tg = DEFAULT_MAX_NUM_TG;
+    cpi->num_tg = DEFAULT_MAX_NUM_TG;
   }
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
-  if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR &&
+  if (has_no_stats_stage(cpi) && oxcf->rc_mode == AOM_CBR &&
       current_frame->frame_type != KEY_FRAME) {
     if (av1_rc_drop_frame(cpi)) {
       av1_rc_postencode_update_drop_frame(cpi);
@@ -4933,6 +6371,15 @@
     }
   }
 
+  if (oxcf->tuning == AOM_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi);
+
+#if CONFIG_TUNE_VMAF
+  if (oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+      oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+    av1_set_mb_vmaf_rdmult_scaling(cpi);
+  }
+#endif
+
   aom_clear_system_state();
 
 #if CONFIG_INTERNAL_STATS
@@ -4971,10 +6418,10 @@
 
   switch (cpi->oxcf.cdf_update_mode) {
     case 0:  // No CDF update for any frames(4~6% compression loss).
-      cm->disable_cdf_update = 1;
+      features->disable_cdf_update = 1;
       break;
     case 1:  // Enable CDF update for all frames.
-      cm->disable_cdf_update = 0;
+      features->disable_cdf_update = 0;
       break;
     case 2:
       // Strategically determine at which frames to do CDF update.
@@ -4982,86 +6429,28 @@
       // compression loss).
       // TODO(huisu@google.com): design schemes for various trade-offs between
       // compression quality and decoding speed.
-      cm->disable_cdf_update =
+      features->disable_cdf_update =
           (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
       break;
   }
-  cm->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+  seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
 
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, encode_with_recode_loop_time);
-#endif
-  if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK)
-    return AOM_CODEC_ERROR;
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, encode_with_recode_loop_time);
-#endif
-
-#ifdef OUTPUT_YUV_SKINMAP
-  if (cpi->common.current_frame.frame_number > 1) {
-    av1_compute_skin_map(cpi, yuv_skinmap_file);
-  }
-#endif  // OUTPUT_YUV_SKINMAP
-
-  // Special case code to reduce pulsing when key frames are forced at a
-  // fixed interval. Note the reconstruction error if it is the frame before
-  // the force key frame
-  if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
-    if (seq_params->use_highbitdepth) {
-      cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    } else {
-      cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+  int largest_tile_id = 0;
+#if CONFIG_SUPERRES_IN_RECODE
+  if (superres_in_recode_allowed(cpi)) {
+    if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
     }
-  }
-
-  cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
-  cm->cur_frame->buf.transfer_characteristics =
-      seq_params->transfer_characteristics;
-  cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
-  cm->cur_frame->buf.monochrome = seq_params->monochrome;
-  cm->cur_frame->buf.chroma_sample_position =
-      seq_params->chroma_sample_position;
-  cm->cur_frame->buf.color_range = seq_params->color_range;
-  cm->cur_frame->buf.render_width = cm->render_width;
-  cm->cur_frame->buf.render_height = cm->render_height;
-
-  // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
-  // off.
-
-  // Pick the loop filter level for the frame.
-  if (!cm->allow_intrabc) {
-    loopfilter_frame(cpi, cm);
   } else {
-    cm->lf.filter_level[0] = 0;
-    cm->lf.filter_level[1] = 0;
-    cm->cdef_info.cdef_bits = 0;
-    cm->cdef_info.cdef_strengths[0] = 0;
-    cm->cdef_info.nb_cdef_strengths = 1;
-    cm->cdef_info.cdef_uv_strengths[0] = 0;
-    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
-    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
-    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+#endif  // CONFIG_SUPERRES_IN_RECODE
+    if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL,
+                                           &largest_tile_id) != AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+#if CONFIG_SUPERRES_IN_RECODE
   }
-
-  // TODO(debargha): Fix mv search range on encoder side
-  // aom_extend_frame_inner_borders(&cm->cur_frame->buf, av1_num_planes(cm));
-  aom_extend_frame_borders(&cm->cur_frame->buf, av1_num_planes(cm));
-
-#ifdef OUTPUT_YUV_REC
-  aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
-#endif
-
-  finalize_encoded_frame(cpi);
-  // Build the bitstream
-  int largest_tile_id = 0;  // Output from pack_bitstream
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, av1_pack_bitstream_final_time);
-#endif
-  if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
-    return AOM_CODEC_ERROR;
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, av1_pack_bitstream_final_time);
-#endif
+#endif  // CONFIG_SUPERRES_IN_RECODE
 
   cpi->seq_params_locked = 1;
 
@@ -5084,7 +6473,7 @@
       update_reference_segmentation_map(cpi);
     } else if (cm->last_frame_seg_map) {
       memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
-             cm->mi_cols * cm->mi_rows * sizeof(uint8_t));
+             cm->mi_params.mi_cols * cm->mi_params.mi_rows * sizeof(uint8_t));
     }
   }
 
@@ -5102,25 +6491,25 @@
   av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts);
 #endif  // CONFIG_ENTROPY_STATS
 
-  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+  if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
     *cm->fc = cpi->tile_data[largest_tile_id].tctx;
     av1_reset_cdf_symbol_counters(cm->fc);
   }
-  if (!cm->large_scale_tile) {
+  if (!cm->tiles.large_scale) {
     cm->cur_frame->frame_context = *cm->fc;
   }
-#define EXT_TILE_DEBUG 0
-#if EXT_TILE_DEBUG
-  if (cm->large_scale_tile && oxcf->pass == 2) {
-    char fn[20] = "./fc";
-    fn[4] = current_frame->frame_number / 100 + '0';
-    fn[5] = (current_frame->frame_number % 100) / 10 + '0';
-    fn[6] = (current_frame->frame_number % 10) + '0';
-    fn[7] = '\0';
-    av1_print_frame_contexts(cm->fc, fn);
+
+  if (cpi->oxcf.ext_tile_debug) {
+    // (yunqing) This test ensures the correctness of large scale tile coding.
+    if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) {
+      char fn[20] = "./fc";
+      fn[4] = current_frame->frame_number / 100 + '0';
+      fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+      fn[6] = (current_frame->frame_number % 10) + '0';
+      fn[7] = '\0';
+      av1_print_frame_contexts(cm->fc, fn);
+    }
   }
-#endif  // EXT_TILE_DEBUG
-#undef EXT_TILE_DEBUG
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, encode_frame_to_data_rate_time);
@@ -5139,15 +6528,10 @@
   }
 #endif
 
-  cm->last_frame_type = current_frame->frame_type;
+  cpi->last_frame_type = current_frame->frame_type;
 
   av1_rc_postencode_update(cpi, *size);
 
-  // Store encoded frame's hash table for is_integer_mv() next time
-  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
-    cpi->previous_hash_table = &cm->cur_frame->hash_table;
-  }
-
   // Clear the one shot update flags for segmentation map and mode/ref loop
   // filter deltas.
   cm->seg.update_map = 0;
@@ -5159,13 +6543,8 @@
   // it is not shown, we still need update the count down.
 
   if (cm->show_frame) {
-    // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that
-    // are
-    // being used as reference.
-    swap_mi_and_prev_mi(cm);
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
-
     ++current_frame->frame_number;
   }
 
@@ -5184,8 +6563,8 @@
   cpi->unscaled_last_source = frame_input->last_source;
 
   current_frame->refresh_frame_flags = frame_params->refresh_frame_flags;
-  cm->error_resilient_mode = frame_params->error_resilient_mode;
-  cm->primary_ref_frame = frame_params->primary_ref_frame;
+  cm->features.error_resilient_mode = frame_params->error_resilient_mode;
+  cm->features.primary_ref_frame = frame_params->primary_ref_frame;
   cm->current_frame.frame_type = frame_params->frame_type;
   cm->show_frame = frame_params->show_frame;
   cpi->ref_frame_flags = frame_params->ref_frame_flags;
@@ -5196,26 +6575,23 @@
   memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
          REF_FRAMES * sizeof(*cm->remapped_ref_idx));
 
-  cpi->refresh_last_frame = frame_params->refresh_last_frame;
   cpi->refresh_golden_frame = frame_params->refresh_golden_frame;
   cpi->refresh_bwd_ref_frame = frame_params->refresh_bwd_ref_frame;
-  cpi->refresh_alt2_ref_frame = frame_params->refresh_alt2_ref_frame;
   cpi->refresh_alt_ref_frame = frame_params->refresh_alt_ref_frame;
 
   if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
     current_frame->frame_number = 0;
 
-  if (cm->show_existing_frame) {
-    current_frame->order_hint = cm->cur_frame->order_hint;
-  } else {
-    current_frame->order_hint =
-        current_frame->frame_number + frame_params->order_offset;
-    current_frame->order_hint %=
-        (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
-  }
+  current_frame->order_hint =
+      current_frame->frame_number + frame_params->order_offset;
+  current_frame->display_order_hint = current_frame->order_hint;
+  current_frame->order_hint %=
+      (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
 
-  if (cpi->oxcf.pass == 1) {
+  if (is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
     av1_first_pass(cpi, frame_input->ts_duration);
+#endif
   } else if (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) {
     if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
         AOM_CODEC_OK) {
@@ -5272,7 +6648,16 @@
   const int subsampling_y = sd->subsampling_y;
   const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
 
-  check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#if CONFIG_TUNE_VMAF
+  if (!is_stat_generation_stage(cpi) &&
+      cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
+    av1_vmaf_frame_preprocessing(cpi, sd);
+  }
+  if (!is_stat_generation_stage(cpi) &&
+      cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+    av1_vmaf_blk_preprocessing(cpi, sd);
+  }
+#endif
 
 #if CONFIG_INTERNAL_STATS
   struct aom_usec_timer timer;
@@ -5332,18 +6717,13 @@
 static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
   AV1_COMMON *const cm = &cpi->common;
   double samples = 0.0;
-  uint32_t in_bit_depth = 8;
-  uint32_t bit_depth = 8;
+  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
 
 #if CONFIG_INTER_STATS_ONLY
   if (cm->current_frame.frame_type == KEY_FRAME) return;  // skip key frame
 #endif
   cpi->bytes += frame_bytes;
-
-  if (cm->seq_params.use_highbitdepth) {
-    in_bit_depth = cpi->oxcf.input_bit_depth;
-    bit_depth = cm->seq_params.bit_depth;
-  }
   if (cm->show_frame) {
     const YV12_BUFFER_CONFIG *orig = cpi->source;
     const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
@@ -5354,9 +6734,11 @@
       PSNR_STATS psnr;
       double frame_ssim2 = 0.0, weight = 0.0;
       aom_clear_system_state();
-      // TODO(yaowu): unify these two versions into one.
+#if CONFIG_AV1_HIGHBITDEPTH
       aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
-
+#else
+      aom_calc_psnr(orig, recon, &psnr);
+#endif
       adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
                         &cpi->psnr);
       cpi->total_sq_error += psnr.sse[0];
@@ -5424,7 +6806,7 @@
 int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
                             int64_t *time_end, int flush,
-                            const aom_rational_t *timebase) {
+                            const aom_rational64_t *timestamp_ratio) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
 
@@ -5432,13 +6814,12 @@
   assert(cpi->oxcf.max_threads == 0 &&
          "bitstream debug tool does not support multithreading");
   bitstream_queue_record_write();
-  bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 +
-                                  cm->show_frame);
+  aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 +
+                                      cm->show_frame);
 #endif
-
-  // Indicates whether or not to use an adaptive quantize b rather than
-  // the traditional version
-  cm->use_quant_b_adapt = cpi->oxcf.quant_b_adapt;
+  if (cpi->use_svc && cm->number_spatial_layers > 1) {
+    av1_one_pass_cbr_svc_start_layer(cpi);
+  }
 
   cm->showable_frame = 0;
   *size = 0;
@@ -5446,22 +6827,23 @@
   struct aom_usec_timer cmptimer;
   aom_usec_timer_start(&cmptimer);
 #endif
-  set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
+  av1_set_high_precision_mv(cpi, 1, 0);
 
   // Normal defaults
-  cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode
-                                  ? REFRESH_FRAME_CONTEXT_DISABLED
-                                  : REFRESH_FRAME_CONTEXT_BACKWARD;
+  cm->features.refresh_frame_context = oxcf->frame_parallel_decoding_mode
+                                           ? REFRESH_FRAME_CONTEXT_DISABLED
+                                           : REFRESH_FRAME_CONTEXT_BACKWARD;
   if (oxcf->large_scale_tile)
-    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+    cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
   // Initialize fields related to forward keyframes
   cpi->no_show_kf = 0;
 
   if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR;
 
-  const int result = av1_encode_strategy(cpi, size, dest, frame_flags,
-                                         time_stamp, time_end, timebase, flush);
+  const int result =
+      av1_encode_strategy(cpi, size, dest, frame_flags, time_stamp, time_end,
+                          timestamp_ratio, flush);
   if (result != AOM_CODEC_OK && result != -1) {
     return AOM_CODEC_ERROR;
   } else if (result == -1) {
@@ -5471,22 +6853,38 @@
 #if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
-#endif
+#endif  // CONFIG_INTERNAL_STATS
   if (cpi->b_calculate_psnr) {
-    if (cm->show_existing_frame || (oxcf->pass != 1 && cm->show_frame)) {
+    if (cm->show_existing_frame ||
+        (!is_stat_generation_stage(cpi) && cm->show_frame)) {
       generate_psnr_packet(cpi);
     }
   }
-  if (cpi->keep_level_stats && oxcf->pass != 1)
+
+#if CONFIG_TUNE_VMAF
+  if (!is_stat_generation_stage(cpi) &&
+      (oxcf->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
+       oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+       oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN)) {
+    av1_update_vmaf_curve(cpi, cpi->source, &cpi->common.cur_frame->buf);
+  }
+#endif
+
+  if (cpi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
+    // Initialize level info. at the beginning of each sequence.
+    if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
+      av1_init_level_info(cpi);
+    }
     av1_update_level_info(cpi, *size, *time_stamp, *time_end);
+  }
 
 #if CONFIG_INTERNAL_STATS
-  if (oxcf->pass != 1) {
+  if (!is_stat_generation_stage(cpi)) {
     compute_internal_stats(cpi, (int)(*size));
   }
 #endif  // CONFIG_INTERNAL_STATS
 #if CONFIG_SPEED_STATS
-  if (cpi->oxcf.pass != 1 && !cm->show_existing_frame) {
+  if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
     cpi->tx_search_count += cpi->td.mb.tx_search_count;
     cpi->td.mb.tx_search_count = 0;
   }
@@ -5494,7 +6892,7 @@
 
   aom_clear_system_state();
 
-  return 0;
+  return AOM_CODEC_OK;
 }
 
 int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
@@ -5548,8 +6946,9 @@
   return cm->error.error_code;
 }
 
-int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
-                          AOM_SCALING vert_mode) {
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+                          ResizePendingParams *resize_pending_params,
+                          AOM_SCALING horiz_mode, AOM_SCALING vert_mode) {
   int hr = 0, hs = 0, vr = 0, vs = 0;
 
   if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
@@ -5558,13 +6957,15 @@
   Scale2Ratio(vert_mode, &vr, &vs);
 
   // always go to the next whole number
-  cpi->resize_pending_width = (hs - 1 + cpi->oxcf.width * hr) / hs;
-  cpi->resize_pending_height = (vs - 1 + cpi->oxcf.height * vr) / vs;
+  resize_pending_params->width = (hs - 1 + oxcf->width * hr) / hs;
+  resize_pending_params->height = (vs - 1 + oxcf->height * vr) / vs;
 
   return 0;
 }
 
-int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.base_qindex; }
+int av1_get_quantizer(AV1_COMP *cpi) {
+  return cpi->common.quant_params.base_qindex;
+}
 
 int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
   size_t output_size = 0;
@@ -5623,38 +7024,68 @@
   return AOM_CODEC_OK;
 }
 
+static void svc_set_updates_external_ref_frame_config(
+    ExternalFlags *const ext_flags, SVC *const svc) {
+  ext_flags->refresh_frame_flags_pending = 1;
+  ext_flags->refresh_last_frame = svc->refresh[svc->ref_idx[0]];
+  ext_flags->refresh_golden_frame = svc->refresh[svc->ref_idx[3]];
+  ext_flags->refresh_bwd_ref_frame = svc->refresh[svc->ref_idx[4]];
+  ext_flags->refresh_alt2_ref_frame = svc->refresh[svc->ref_idx[5]];
+  ext_flags->refresh_alt_ref_frame = svc->refresh[svc->ref_idx[6]];
+  svc->non_reference_frame = 1;
+  for (int i = 0; i < REF_FRAMES; i++) {
+    if (svc->refresh[i] == 1) {
+      svc->non_reference_frame = 0;
+      break;
+    }
+  }
+}
+
+static int svc_set_references_external_ref_frame_config(AV1_COMP *cpi) {
+  // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+  // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  int ref = AOM_REFFRAME_ALL;
+  for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+    if (!cpi->svc.reference[i]) ref ^= (1 << i);
+  }
+  return ref;
+}
+
 void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
   // TODO(yunqingwang): For what references to use, external encoding flags
   // should be consistent with internal reference frame selection. Need to
   // ensure that there is not conflict between the two. In AV1 encoder, the
   // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3,
-  // GOLDEN, BWDREF, ALTREF2. If only one reference frame is used, it must be
-  // LAST.
-  cpi->ext_ref_frame_flags = AOM_REFFRAME_ALL;
+  // GOLDEN, BWDREF, ALTREF2.
+
+  ExternalFlags *const ext_flags = &cpi->ext_flags;
+  ext_flags->ref_frame_flags = AOM_REFFRAME_ALL;
   if (flags &
       (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
        AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
        AOM_EFLAG_NO_REF_ARF2)) {
-    if (flags & AOM_EFLAG_NO_REF_LAST) {
-      cpi->ext_ref_frame_flags = 0;
+    int ref = AOM_REFFRAME_ALL;
+
+    if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG;
+    if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG;
+    if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG;
+
+    if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
+
+    if (flags & AOM_EFLAG_NO_REF_ARF) {
+      ref ^= AOM_ALT_FLAG;
+      ref ^= AOM_BWD_FLAG;
+      ref ^= AOM_ALT2_FLAG;
     } else {
-      int ref = AOM_REFFRAME_ALL;
+      if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG;
+      if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG;
+    }
 
-      if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG;
-      if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG;
-
-      if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
-
-      if (flags & AOM_EFLAG_NO_REF_ARF) {
-        ref ^= AOM_ALT_FLAG;
-        ref ^= AOM_BWD_FLAG;
-        ref ^= AOM_ALT2_FLAG;
-      } else {
-        if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG;
-        if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG;
-      }
-
-      av1_use_as_reference(cpi, ref);
+    av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
+  } else {
+    if (cpi->svc.external_ref_frame_config) {
+      int ref = svc_set_references_external_ref_frame_config(cpi);
+      av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
     }
   }
 
@@ -5673,26 +7104,31 @@
       upd ^= AOM_ALT2_FLAG;
     }
 
-    cpi->ext_refresh_last_frame = (upd & AOM_LAST_FLAG) != 0;
-    cpi->ext_refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0;
-    cpi->ext_refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
-    cpi->ext_refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
-    cpi->ext_refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
-    cpi->ext_refresh_frame_flags_pending = 1;
+    ext_flags->refresh_last_frame = (upd & AOM_LAST_FLAG) != 0;
+    ext_flags->refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0;
+    ext_flags->refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
+    ext_flags->refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
+    ext_flags->refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
+    ext_flags->refresh_frame_flags_pending = 1;
   } else {
-    cpi->ext_refresh_frame_flags_pending = 0;
+    if (cpi->svc.external_ref_frame_config)
+      svc_set_updates_external_ref_frame_config(ext_flags, &cpi->svc);
+    else
+      ext_flags->refresh_frame_flags_pending = 0;
   }
 
-  cpi->ext_use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs &
-                               ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0);
-  cpi->ext_use_error_resilient = cpi->oxcf.error_resilient_mode |
-                                 ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0);
-  cpi->ext_use_s_frame =
+  ext_flags->use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs &
+                                 ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0);
+  ext_flags->use_error_resilient = cpi->oxcf.error_resilient_mode |
+                                   ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0);
+  ext_flags->use_s_frame =
       cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
-  cpi->ext_use_primary_ref_none = (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
+  ext_flags->use_primary_ref_none =
+      (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
 
   if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
-    av1_update_entropy(cpi, 0);
+    av1_update_entropy(&ext_flags->refresh_frame_context,
+                       &ext_flags->refresh_frame_context_pending, 0);
   }
 }
 
@@ -5701,7 +7137,7 @@
 
   uint8_t header_buf[512] = { 0 };
   const uint32_t sequence_header_size =
-      write_sequence_header_obu(cpi, &header_buf[0]);
+      av1_write_sequence_header_obu(&cpi->common.seq_params, &header_buf[0]);
   assert(sequence_header_size <= sizeof(header_buf));
   if (sequence_header_size == 0) return NULL;
 
@@ -5712,8 +7148,8 @@
   if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
   memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
 
-  if (av1_write_obu_header(cpi, OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
-      obu_header_size) {
+  if (av1_write_obu_header(&cpi->level_params, OBU_SEQUENCE_HEADER, 0,
+                           &header_buf[0]) != obu_header_size) {
     return NULL;
   }
 
diff --git a/libaom/av1/encoder/encoder.h b/libaom/av1/encoder/encoder.h
index bf02394..82d00cb 100644
--- a/libaom/av1/encoder/encoder.h
+++ b/libaom/av1/encoder/encoder.h
@@ -20,27 +20,27 @@
 #include "aom/aomcx.h"
 
 #include "av1/common/alloccommon.h"
-#include "av1/common/entropymode.h"
-#include "av1/common/thread_common.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/resize.h"
-#include "av1/common/timing.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
 #include "av1/common/enums.h"
+#include "av1/common/resize.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/timing.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/level.h"
 #include "av1/encoder/lookahead.h"
-#include "av1/encoder/mbgraph.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/speed_features.h"
+#include "av1/encoder/svc_layercontext.h"
 #include "av1/encoder/tokenize.h"
-#include "av1/encoder/block.h"
 
 #if CONFIG_INTERNAL_STATS
 #include "aom_dsp/ssim.h"
@@ -56,25 +56,26 @@
 extern "C" {
 #endif
 
+// Number of frames required to test for scene cut detection
+#define SCENE_CUT_KEY_TEST_INTERVAL 16
+
+// Rational number with an int64 numerator
+// This structure holds a fractional value
+typedef struct aom_rational64 {
+  int64_t num;       // fraction numerator
+  int den;           // fraction denominator
+} aom_rational64_t;  // alias for struct aom_rational
+
 typedef struct {
-  int nmv_vec_cost[MV_JOINTS];
-  int nmv_costs[2][MV_VALS];
-  int nmv_costs_hp[2][MV_VALS];
-
-  FRAME_CONTEXT fc;
+#if CONFIG_SUPERRES_IN_RECODE
+  struct loopfilter lf;
+  CdefInfo cdef_info;
+  YV12_BUFFER_CONFIG copy_buffer;
+  RATE_CONTROL rc;
+#endif  // CONFIG_SUPERRES_IN_RECODE
 } CODING_CONTEXT;
 
 enum {
-  REGULAR_FRAME,       // regular inter frame
-  ARF_FRAME,           // alternate reference frame
-  OVERLAY_FRAME,       // overlay frame
-  GLD_FRAME,           // golden frame
-  BRF_FRAME,           // backward reference frame
-  INTERNAL_ARF_FRAME,  // internal alternate reference frame
-  FRAME_CONTEXT_INDEXES
-} UENUM1BYTE(FRAME_CONTEXT_INDEX);
-
-enum {
   NORMAL = 0,
   FOURFIVE = 1,
   THREEFIVE = 2,
@@ -110,9 +111,9 @@
 } UENUM1BYTE(AQ_MODE);
 enum {
   NO_DELTA_Q = 0,
-  DELTA_Q_ONLY = 1,
-  DELTA_Q_LF = 2,
-  DELTAQ_MODE_COUNT  // This should always be the last member of the enum
+  DELTA_Q_OBJECTIVE = 1,   // Modulation to improve objective quality
+  DELTA_Q_PERCEPTUAL = 2,  // Modulation to improve perceptual quality
+  DELTA_Q_MODE_COUNT       // This should always be the last member of the enum
 } UENUM1BYTE(DELTAQ_MODE);
 
 enum {
@@ -136,45 +137,79 @@
 
 typedef enum {
   kInvalid = 0,
-  kLowSadLowSumdiff = 1,
-  kLowSadHighSumdiff = 2,
-  kHighSadLowSumdiff = 3,
-  kHighSadHighSumdiff = 4,
-  kLowVarHighSumdiff = 5,
-  kVeryHighSad = 6,
+  kLowSad = 1,
+  kHighSad = 2,
+  kLowVarHighSumdiff = 3,
 } CONTENT_STATE_SB;
 
 enum {
   SS_CFG_SRC = 0,
   SS_CFG_LOOKAHEAD = 1,
-  SS_CFG_TOTAL = 2
+  SS_CFG_FPF = 2,
+  SS_CFG_TOTAL = 3
 } UENUM1BYTE(SS_CFG_OFFSET);
 
+// TODO(jingning): This needs to be cleaned up next.
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TOTAL_BUFFERS + REF_FRAMES + 1)
+
 typedef struct TplDepStats {
   int64_t intra_cost;
   int64_t inter_cost;
-  int64_t mc_flow;
-  int64_t mc_dep_cost;
-
+  int64_t srcrf_dist;
+  int64_t recrf_dist;
+  int64_t srcrf_rate;
+  int64_t recrf_rate;
+  int64_t mc_dep_rate;
+  int64_t mc_dep_dist;
+  int_mv mv[INTER_REFS_PER_FRAME];
   int ref_frame_index;
-  int_mv mv;
+  int64_t pred_error[INTER_REFS_PER_FRAME];
+  int64_t mc_count;
+  int64_t mc_saved;
 } TplDepStats;
 
 typedef struct TplDepFrame {
   uint8_t is_valid;
   TplDepStats *tpl_stats_ptr;
+  const YV12_BUFFER_CONFIG *gf_picture;
+  YV12_BUFFER_CONFIG *rec_picture;
+  int ref_map_index[REF_FRAMES];
   int stride;
   int width;
   int height;
   int mi_rows;
   int mi_cols;
-  int base_qindex;
+  unsigned int frame_display_index;
+  int base_rdmult;
 } TplDepFrame;
 
+typedef struct TplParams {
+  // Block granularity of tpl score storage.
+  uint8_t tpl_stats_block_mis_log2;
+
+  // Buffer to store the frame level tpl information for each frame in a gf
+  // group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
+  // group
+  TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
+
+  // Buffer to store tpl stats at block granularity.
+  // tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
+  // group.
+  TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+
+  // Buffer to store tpl reconstructed frame.
+  // tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
+  YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+
+  // Pointer to tpl_stats_buffer.
+  TplDepFrame *tpl_frame;
+} TplParams;
+
 typedef enum {
   COST_UPD_SB,
   COST_UPD_SBROW,
   COST_UPD_TILE,
+  COST_UPD_OFF,
 } COST_UPDATE_TYPE;
 
 #define TPL_DEP_COST_SCALE_LOG2 4
@@ -235,10 +270,13 @@
   int worst_allowed_q;
   int best_allowed_q;
   int cq_level;
+  int enable_chroma_deltaq;
   AQ_MODE aq_mode;  // Adaptive Quantization mode
   DELTAQ_MODE deltaq_mode;
+  int deltalf_mode;
   int enable_cdef;
   int enable_restoration;
+  int force_video_mode;
   int enable_obmc;
   int disable_trellis_quant;
   int using_qm;
@@ -247,9 +285,6 @@
   int qm_v;
   int qm_minlevel;
   int qm_maxlevel;
-#if CONFIG_DIST_8X8
-  int using_dist_8x8;
-#endif
   unsigned int num_tile_groups;
   unsigned int mtu;
 
@@ -299,6 +334,7 @@
 
   int min_gf_interval;
   int max_gf_interval;
+  int gf_min_pyr_height;
   int gf_max_pyr_height;
 
   int row_mt;
@@ -310,12 +346,14 @@
   int tile_heights[MAX_TILE_ROWS];
 
   int enable_tpl_model;
+  int enable_keyframe_filtering;
 
   int max_threads;
 
   aom_fixed_buf_t two_pass_stats_in;
 
   aom_tune_metric tuning;
+  const char *vmaf_model_path;
   aom_tune_content content;
   int use_highbitdepth;
   aom_color_primaries_t color_primaries;
@@ -342,7 +380,8 @@
   unsigned int full_still_picture_hdr;
   int enable_dual_filter;
   unsigned int motion_vector_unit_test;
-  const cfg_options_t *cfg;
+  unsigned int sb_multipass_unit_test;
+  unsigned int ext_tile_debug;
   int enable_rect_partitions;
   int enable_ab_partitions;
   int enable_1to4_partitions;
@@ -350,7 +389,6 @@
   int max_partition_size;
   int enable_intra_edge_filter;
   int enable_tx64;
-  int tx_size_search_method;
   int enable_flip_idtx;
   int enable_order_hint;
   int enable_dist_wtd_comp;
@@ -373,6 +411,7 @@
   int enable_paeth_intra;
   int enable_cfl_intra;
   int enable_superres;
+  int enable_overlay;
   int enable_palette;
   int enable_intrabc;
   int enable_angle_delta;
@@ -392,17 +431,54 @@
   int quant_b_adapt;
   COST_UPDATE_TYPE coeff_cost_upd_freq;
   COST_UPDATE_TYPE mode_cost_upd_freq;
+  COST_UPDATE_TYPE mv_cost_upd_freq;
   int border_in_pixels;
   AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
   // Bit mask to specify which tier each of the 32 possible operating points
   // conforms to.
   unsigned int tier_mask;
+  // If true, encoder will use fixed QP offsets, that are either:
+  // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
+  // - Picked automatically from cq_level.
+  int use_fixed_qp_offsets;
+  // List of QP offsets for: keyframe, ALTREF, and 3 levels of internal ARFs.
+  // If any of these values are negative, fixed offsets are disabled.
+  // Uses internal q range.
+  double fixed_qp_offsets[FIXED_QP_OFFSET_COUNT];
+  // min_cr / 100 is the target minimum compression ratio for each frame.
+  unsigned int min_cr;
+  const cfg_options_t *encoder_cfg;
 } AV1EncoderConfig;
 
 static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
   return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
 }
 
+typedef struct {
+  // obmc_probs[i][j] is the probability of OBMC being the best motion mode for
+  // jth block size and ith frame update type, averaged over past frames. If
+  // obmc_probs[i][j] < thresh, then OBMC search is pruned.
+  int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+  // warped_probs[i] is the probability of warped motion being the best motion
+  // mode for ith frame update type, averaged over past frames. If
+  // warped_probs[i] < thresh, then warped motion search is pruned.
+  int warped_probs[FRAME_UPDATE_TYPES];
+
+  // tx_type_probs[i][j][k] is the probability of kth tx_type being the best
+  // for jth transform size and ith frame update type, averaged over past
+  // frames. If tx_type_probs[i][j][k] < thresh, then transform search for that
+  // type is pruned.
+  int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES];
+
+  // switchable_interp_probs[i][j][k] is the probability of kth interpolation
+  // filter being the best for jth filter context and ith frame update type,
+  // averaged over past frames. If switchable_interp_probs[i][j][k] < thresh,
+  // then interpolation filter search is pruned for that case.
+  int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS]
+                             [SWITCHABLE_FILTERS];
+} FrameProbInfo;
+
 typedef struct FRAME_COUNTS {
 // Note: This structure should only contain 'unsigned int' fields, or
 // aggregates built solely from 'unsigned int' fields/elements
@@ -522,8 +598,6 @@
   int64_t sse_arr[MAX_INTER_MODES];
   int64_t est_rd_arr[MAX_INTER_MODES];
   RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
-  bool true_rd_arr[MAX_INTER_MODES];
-  uint8_t blk_skip_arr[MAX_INTER_MODES][MAX_MIB_SIZE * MAX_MIB_SIZE];
   RD_STATS rd_cost_arr[MAX_INTER_MODES];
   RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
   RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
@@ -546,12 +620,73 @@
   int num_threads_working;
 } AV1RowMTInfo;
 
+typedef struct {
+  // TODO(kyslov): consider changing to 64bit
+
+  // This struct is used for computing variance in choose_partitioning(), where
+  // the max number of samples within a superblock is 32x32 (with 4x4 avg).
+  // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32
+  // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit
+  uint32_t sum_square_error;
+  int32_t sum_error;
+  int log2_count;
+  int variance;
+} VPartVar;
+
+typedef struct {
+  VPartVar none;
+  VPartVar horz[2];
+  VPartVar vert[2];
+} VPVariance;
+
+typedef struct {
+  VPVariance part_variances;
+  VPartVar split[4];
+} VP4x4;
+
+typedef struct {
+  VPVariance part_variances;
+  VP4x4 split[4];
+} VP8x8;
+
+typedef struct {
+  VPVariance part_variances;
+  VP8x8 split[4];
+} VP16x16;
+
+typedef struct {
+  VPVariance part_variances;
+  VP16x16 split[4];
+} VP32x32;
+
+typedef struct {
+  VPVariance part_variances;
+  VP32x32 split[4];
+} VP64x64;
+
+typedef struct {
+  VPVariance part_variances;
+  VP64x64 *split;
+} VP128x128;
+
+typedef struct {
+  // Thresholds for variance based partitioning. If block variance > threshold,
+  // then that block is forced to split.
+  // thresholds[0] - threshold for 128x128;
+  // thresholds[1] - threshold for 64x64;
+  // thresholds[2] - threshold for 32x32;
+  // thresholds[3] - threshold for 16x16;
+  // thresholds[4] - threshold for 8x8;
+  int64_t thresholds[5];
+
+  // MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual
+  // minmax > threshold_minmax, the 16x16 is forced to split.
+  int64_t threshold_minmax;
+} VarBasedPartitionInfo;
+
 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
 typedef struct TileDataEnc {
   TileInfo tile_info;
-  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
-  int m_search_count;
-  int ex_search_count;
   CFL_CTX cfl;
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
   FRAME_CONTEXT *row_ctx;
@@ -580,6 +715,9 @@
   int global_motion_used[REF_FRAMES];
   int compound_ref_used_flag;
   int skip_mode_used_flag;
+  int tx_type_used[TX_SIZES_ALL][TX_TYPES];
+  int obmc_used[BLOCK_SIZES_ALL][2];
+  int warped_used[2];
 } RD_COUNTS;
 
 typedef struct ThreadData {
@@ -587,7 +725,7 @@
   RD_COUNTS rd_counts;
   FRAME_COUNTS *counts;
   PC_TREE *pc_tree;
-  PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+  PC_TREE *pc_root;
   tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
   tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
   tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
@@ -598,10 +736,15 @@
   uint8_t *above_pred_buf;
   uint8_t *left_pred_buf;
   PALETTE_BUFFER *palette_buffer;
+  CompoundTypeRdBuffers comp_rd_buffer;
   CONV_BUF_TYPE *tmp_conv_dst;
   uint8_t *tmp_obmc_bufs[2];
   int intrabc_used;
+  int deltaq_used;
   FRAME_CONTEXT *tctx;
+  MB_MODE_INFO_EXT *mbmi_ext;
+  VP64x64 *vt64x64;
+  int32_t num_64x64_blocks;
 } ThreadData;
 
 struct EncWorkerData;
@@ -612,6 +755,16 @@
   unsigned char *map;
 } ActiveMap;
 
+typedef struct {
+  // cs_rate_array[i] is the fraction of blocks in a frame which either match
+  // with the collocated block or are smooth, where i is the rate_index.
+  double cs_rate_array[32];
+  // rate_index is used to index cs_rate_array.
+  int rate_index;
+  // rate_size is the total number of entries populated in cs_rate_array.
+  int rate_size;
+} ForceIntegerMVInfo;
+
 #if CONFIG_INTERNAL_STATS
 // types of stats
 enum {
@@ -633,6 +786,17 @@
   YV12_BUFFER_CONFIG buf;
 } EncRefCntBuffer;
 
+typedef struct {
+  // Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level for
+  // use in bitstream preparation. frame_base[mi_row * stride + mi_col] stores
+  // the mode information of block (mi_row,mi_col).
+  MB_MODE_INFO_EXT_FRAME *frame_base;
+  // Size of frame_base buffer.
+  int alloc_size;
+  // Stride of frame_base buffer.
+  int stride;
+} MBMIExtFrameBufferInfo;
+
 #if CONFIG_COLLECT_PARTITION_STATS == 2
 typedef struct PartitionStats {
   int partition_decisions[6][EXT_PARTITION_TYPES];
@@ -657,13 +821,11 @@
   av1_compute_global_motion_time,
   av1_setup_motion_field_time,
   encode_sb_time,
-  first_partition_search_pass_time,
   rd_pick_partition_time,
   rd_pick_sb_modes_time,
   av1_rd_pick_intra_mode_sb_time,
   av1_rd_pick_inter_mode_sb_time,
   handle_intra_mode_time,
-  handle_inter_mode_time,
   do_tx_search_time,
   handle_newmv_time,
   compound_type_rd_time,
@@ -686,8 +848,6 @@
       return "av1_compute_global_motion_time";
     case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
     case encode_sb_time: return "encode_sb_time";
-    case first_partition_search_pass_time:
-      return "first_partition_search_pass_time";
     case rd_pick_partition_time: return "rd_pick_partition_time";
     case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
     case av1_rd_pick_intra_mode_sb_time:
@@ -695,7 +855,6 @@
     case av1_rd_pick_inter_mode_sb_time:
       return "av1_rd_pick_inter_mode_sb_time";
     case handle_intra_mode_time: return "handle_intra_mode_time";
-    case handle_inter_mode_time: return "handle_inter_mode_time";
     case do_tx_search_time: return "do_tx_search_time";
     case handle_newmv_time: return "handle_newmv_time";
     case compound_type_rd_time: return "compound_type_rd_time";
@@ -711,20 +870,225 @@
 // The maximum number of internal ARFs except ALTREF_FRAME
 #define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
 
+typedef struct {
+  // Array to store the cost for signalling each global motion model.
+  // type_cost[i] stores the cost of signalling the ith Global Motion model.
+  int type_cost[TRANS_TYPES];
+
+  // Array to store the cost for signalling a particular global motion model for
+  // each reference frame. params_cost[i] stores the cost of signalling global
+  // motion for the ith reference frame.
+  int params_cost[REF_FRAMES];
+
+  // Flag to indicate if global motion search needs to be rerun.
+  bool search_done;
+} GlobalMotionInfo;
+
+typedef struct {
+  // Stores the default value of skip flag depending on chroma format
+  // Set as 1 for monochrome and 3 for other color formats
+  int default_interp_skip_flags;
+  // Filter mask to allow certain interp_filter type.
+  uint16_t interp_filter_search_mask;
+} InterpSearchFlags;
+
+typedef struct {
+  // Largest MV component used in a frame.
+  // The value from the previous frame is used to set the full pixel search
+  // range for the current frame.
+  int max_mv_magnitude;
+  // Parameter indicating initial search window to be used in full-pixel search.
+  // Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+  int mv_step_param;
+  // Pointer to sub-pixel search function.
+  // In encoder: av1_find_best_sub_pixel_tree
+  //             av1_find_best_sub_pixel_tree_pruned
+  //             av1_find_best_sub_pixel_tree_pruned_more
+  //             av1_find_best_sub_pixel_tree_pruned_evenmore
+  // In MV unit test: av1_return_max_sub_pixel_mv
+  //                  av1_return_min_sub_pixel_mv
+  fractional_mv_step_fp *find_fractional_mv_step;
+  // Search site configuration for full-pel MV search.
+  // ss_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple motion
+  // search.
+  // ss_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal filter
+  // ss_cfg[SS_CFG_FPF]: Used during first pass and lookahead
+  search_site_config ss_cfg[SS_CFG_TOTAL];
+} MotionVectorSearchParams;
+
+typedef struct {
+  // When resize is triggered externally, the desired dimensions are stored in
+  // this struct until used in the next frame to be coded. These values are
+  // effective only for one frame and are reset after they are used.
+  int width;
+  int height;
+} ResizePendingParams;
+
+typedef struct {
+  // Threshold of transform domain distortion
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation.
+  // Index 2: Winner mode evaluation.
+  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
+  // speed feature is ON
+  unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+  // Factor to control R-D optimization of coeffs based on block
+  // mse.
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc). Index 1: Mode evaluation.
+  // Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
+  // feature is ON
+  unsigned int coeff_opt_dist_threshold[MODE_EVAL_TYPES];
+
+  // Transform size to be used in transform search
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed
+  // feature is ON
+  TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
+
+  // Transform domain distortion levels
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
+  // speed feature is ON
+  unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
+
+  // Predict transform skip levels to be used for default, mode and winner mode
+  // evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+  // applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+  unsigned int predict_skip_level[MODE_EVAL_TYPES];
+} WinnerModeParams;
+
+typedef struct {
+  // Bit mask to disable certain reference frame types.
+  int ref_frame_flags;
+
+  // Flags to determine which reference buffers are refreshed by this frame.
+  // When set, the encoder will update the particular reference frame buffer
+  // with the contents of the current frame.
+  bool refresh_last_frame;
+  bool refresh_golden_frame;
+  bool refresh_bwd_ref_frame;
+  bool refresh_alt2_ref_frame;
+  bool refresh_alt_ref_frame;
+
+  // Flag to indicate that an update of refresh frame flags from the external
+  // interface is pending.
+  bool refresh_frame_flags_pending;
+
+  // Flag to enable the update of frame contexts at the end of a frame decode.
+  bool refresh_frame_context;
+
+  // Flag to indicate that an update of refresh_frame_context from the external
+  // interface is pending.
+  bool refresh_frame_context_pending;
+
+  // Flag to enable temporal MV prediction.
+  bool use_ref_frame_mvs;
+
+  // Flag to code the frame as error-resilient.
+  bool use_error_resilient;
+
+  // Flag to code the frame as s-frame.
+  bool use_s_frame;
+
+  // Flag to set the frame's primary_ref_frame to PRIMARY_REF_NONE.
+  bool use_primary_ref_none;
+} ExternalFlags;
+
+typedef struct {
+  int arf_stack[FRAME_BUFFERS];
+  int arf_stack_size;
+  int lst_stack[FRAME_BUFFERS];
+  int lst_stack_size;
+  int gld_stack[FRAME_BUFFERS];
+  int gld_stack_size;
+} RefBufferStack;
+
+typedef struct {
+  // Some misc info
+  int high_prec;
+  int q;
+  int order;
+
+  // MV counters
+  int inter_count;
+  int intra_count;
+  int default_mvs;
+  int mv_joint_count[4];
+  int last_bit_zero;
+  int last_bit_nonzero;
+
+  // Keep track of the rates
+  int total_mv_rate;
+  int hp_total_mv_rate;
+  int lp_total_mv_rate;
+
+  // Texture info
+  int horz_text;
+  int vert_text;
+  int diag_text;
+
+  // Whether the current struct contains valid data
+  int valid;
+} MV_STATS;
+
+typedef struct {
+  int frame_width;
+  int frame_height;
+  int mi_rows;
+  int mi_cols;
+  int mb_rows;
+  int mb_cols;
+  int num_mbs;
+  aom_bit_depth_t bit_depth;
+  int subsampling_x;
+  int subsampling_y;
+} FRAME_INFO;
+
+typedef struct {
+  // 3-bit number containing the segment affiliation for each 4x4 block in the
+  // frame. map[y * stride + x] contains the segment id of the 4x4 block at
+  // (x,y) position.
+  uint8_t *map;
+  // Flag to indicate if current frame has lossless segments or not.
+  // 1: frame has at least one lossless segment.
+  // 0: frame has no lossless segments.
+  bool has_lossless_segment;
+} EncSegmentationInfo;
+
+typedef struct {
+  // Start time stamp of the previous frame
+  int64_t prev_start_seen;
+  // End time stamp of the previous frame
+  int64_t prev_end_seen;
+  // Start time stamp of the first frame
+  int64_t first_ever;
+} TimeStamps;
+
 typedef struct AV1_COMP {
-  QUANTS quants;
+  // Quantization and dequantization parameters for internal quantizer setup
+  // in the encoder.
+  EncQuantDequantParams enc_quant_dequant_params;
   ThreadData td;
   FRAME_COUNTS counts;
-  MB_MODE_INFO_EXT *mbmi_ext_base;
+
+  // Holds buffer storing mode information at 4x4/8x8 level.
+  MBMIExtFrameBufferInfo mbmi_ext_info;
+
   CB_COEFF_BUFFER *coeff_buffer_base;
-  Dequants dequants;
   AV1_COMMON common;
   AV1EncoderConfig oxcf;
   struct lookahead_ctx *lookahead;
-  struct lookahead_entry *alt_ref_source;
   int no_show_kf;
 
-  int optimize_seg_arr[MAX_SEGMENTS];
+  TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS];
 
   YV12_BUFFER_CONFIG *source;
   YV12_BUFFER_CONFIG *last_source;  // NULL for first frame and alt_ref frames
@@ -732,21 +1096,15 @@
   YV12_BUFFER_CONFIG scaled_source;
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
+  YV12_BUFFER_CONFIG *unfiltered_source;
 
-  TplDepFrame tpl_stats[MAX_LAG_BUFFERS];
-  YV12_BUFFER_CONFIG *tpl_recon_frames[INTER_REFS_PER_FRAME + 1];
+  TplParams tpl_data;
 
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
-  // The following item corresponds to two_pass_partition_search speed features.
-  int two_pass_partition_search;
 
-  double csm_rate_array[32];
-  double m_rate_array[32];
-  int rate_size;
-  int rate_index;
-  hash_table *previous_hash_table;
-  int previous_index;
+  // Variables related to forcing integer mv decisions for the current frame.
+  ForceIntegerMVInfo force_intpel_info;
 
   unsigned int row_mt;
   RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
@@ -757,17 +1115,10 @@
   // after the current frame is encoded, the XYZ reference frame gets refreshed
   // (updated) to be the current frame.
   //
-  // Special case: 'refresh_last_frame' specifies that:
-  // - LAST_FRAME reference should be updated to be the current frame (as usual)
-  // - Also, LAST2_FRAME and LAST3_FRAME references are implicitly updated to be
-  // the two past reference frames just before LAST_FRAME that are available.
-  //
   // Note: Usually at most one of these refresh flags is true at a time.
   // But a key-frame is special, for which all the flags are true at once.
-  int refresh_last_frame;
   int refresh_golden_frame;
   int refresh_bwd_ref_frame;
-  int refresh_alt2_ref_frame;
   int refresh_alt_ref_frame;
 
   // For each type of reference frame, this contains the index of a reference
@@ -776,19 +1127,8 @@
   // frame of the same type as the current frame).
   int fb_of_context_type[REF_FRAMES];
 
-  int ext_refresh_frame_flags_pending;
-  int ext_refresh_last_frame;
-  int ext_refresh_golden_frame;
-  int ext_refresh_bwd_ref_frame;
-  int ext_refresh_alt2_ref_frame;
-  int ext_refresh_alt_ref_frame;
-
-  int ext_refresh_frame_context_pending;
-  int ext_refresh_frame_context;
-  int ext_use_ref_frame_mvs;
-  int ext_use_error_resilient;
-  int ext_use_s_frame;
-  int ext_use_primary_ref_none;
+  // Flags signalled by the external interface at frame level.
+  ExternalFlags ext_flags;
 
   YV12_BUFFER_CONFIG last_frame_uf;
   YV12_BUFFER_CONFIG trial_frame_rst;
@@ -800,44 +1140,38 @@
 
   CODING_CONTEXT coding_context;
 
-  int gmtype_cost[TRANS_TYPES];
-  int gmparams_cost[REF_FRAMES];
+  // Parameters related to global motion search.
+  GlobalMotionInfo gm_info;
 
-  int nmv_costs[2][MV_VALS];
-  int nmv_costs_hp[2][MV_VALS];
+  // Parameters related to winner mode processing.
+  WinnerModeParams winner_mode_params;
 
-  int64_t last_time_stamp_seen;
-  int64_t last_end_time_stamp_seen;
-  int64_t first_time_stamp_ever;
+  // Frame time stamps
+  TimeStamps time_stamps;
 
   RATE_CONTROL rc;
   double framerate;
 
   struct aom_codec_pkt_list *output_pkt_list;
 
-  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
-  int mbgraph_n_frames;  // number of frames filled in the above
-  int static_mb_pct;     // % forced skip mbs by segmentation
   int ref_frame_flags;
-  int ext_ref_frame_flags;
 
   // speed is passed as a per-frame parameter into the encoder
   int speed;
   // sf contains fine-grained config set internally based on speed
   SPEED_FEATURES sf;
 
-  unsigned int max_mv_magnitude;
-  int mv_step_param;
+  // Parameters for motion vector search process.
+  MotionVectorSearchParams mv_search_params;
 
   int all_one_sided_refs;
 
-  uint8_t *segmentation_map;
+  // Segmentation related information for current frame.
+  EncSegmentationInfo enc_seg;
 
   CYCLIC_REFRESH *cyclic_refresh;
   ActiveMap active_map;
 
-  fractional_mv_step_fp *find_fractional_mv_step;
-  av1_diamond_search_fn_t diamond_search_sad;
   aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
 
 #if CONFIG_INTERNAL_STATS
@@ -845,10 +1179,21 @@
   uint64_t time_compress_data;
 #endif
 
+  // number of show frames encoded in current gf_group
+  int num_gf_group_show_frames;
+
   TWO_PASS twopass;
 
+  GF_GROUP gf_group;
+
+  // To control the reference frame buffer and selection.
+  RefBufferStack ref_buffer_stack;
+
   YV12_BUFFER_CONFIG alt_ref_buffer;
 
+  // Tell if OVERLAY frame shows existing alt_ref frame.
+  int show_existing_alt_ref;
+
 #if CONFIG_INTERNAL_STATS
   unsigned int mode_chosen_counts[MAX_MODES];
 
@@ -884,25 +1229,16 @@
 
   int droppable;
 
+  FRAME_INFO frame_info;
+
   int initial_width;
   int initial_height;
   int initial_mbs;  // Number of MBs in the full-size frame; to be used to
                     // normalize the firstpass stats. This will differ from the
                     // number of MBs in the current frame when the frame is
                     // scaled.
-
-  // When resize is triggered through external control, the desired width/height
-  // are stored here until use in the next frame coded. They are effective only
-  // for
-  // one frame and are reset after use.
-  int resize_pending_width;
-  int resize_pending_height;
-
-  // ss_cfg[SS_CFG_LOOKAHEAD] : used in following cases
-  //                           -> temporal filtering
-  //                           -> intrabc
-  // ss_cfg[SS_CFG_SRC] : used everywhere except above mentioned cases
-  search_site_config ss_cfg[SS_CFG_TOTAL];
+  // Resize related parameters
+  ResizePendingParams resize_pending_params;
 
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
@@ -910,10 +1246,6 @@
   TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
   TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
 
-  int resize_state;
-  int resize_avg_qp;
-  int resize_buffer_underflow;
-
   // Sequence parameters have been transmitted already and locked
   // or not. Once locked av1_change_config cannot change the seq
   // parameters.
@@ -922,41 +1254,26 @@
   // VARIANCE_AQ segment map refresh
   int vaq_refresh;
 
-  // VAR_BASED_PARTITION thresholds
-  // 0 - threshold_128x128; 1 - threshold_64x64;
-  // 2 - threshold_32x32; 3 - threshold_16x16;
-  // 4 - vbp_threshold_8x8;
-  int64_t vbp_thresholds[5];
-  int64_t vbp_threshold_minmax;
-  int64_t vbp_threshold_sad;
-  int64_t vbp_threshold_copy;
-  BLOCK_SIZE vbp_bsize_min;
+  // Thresholds for variance based partitioning.
+  VarBasedPartitionInfo vbp_info;
+
+  // Probabilities for pruning of various AV1 tools.
+  FrameProbInfo frame_probs;
 
   // Multi-threading
   int num_workers;
   AVxWorker *workers;
   struct EncWorkerData *tile_thr_data;
   int existing_fb_idx_to_show;
-  int is_arf_filter_off[MAX_INTERNAL_ARFS + 1];
-  int global_motion_search_done;
   int internal_altref_allowed;
   // A flag to indicate if intrabc is ever used in current frame.
   int intrabc_used;
-  int dv_cost[2][MV_VALS];
-  // TODO(huisu@google.com): we can update dv_joint_cost per SB.
-  int dv_joint_cost[MV_JOINTS];
-  int has_lossless_segment;
 
-  // Factors to control gating of compound type selection based on best
-  // approximate rd so far
-  int max_comp_type_rd_threshold_mul;
-  int max_comp_type_rd_threshold_div;
+  // Tables to calculate IntraBC MV cost.
+  IntraBCMVCosts dv_costs;
 
-  unsigned int tx_domain_dist_threshold;
-
-  // Factor to control R-D optimization of coeffs based on block
-  // mse.
-  unsigned int coeff_opt_dist_threshold;
+  // Mark which ref frames can be skipped for encoding current frame during RDO.
+  int prune_ref_frame_mask;
 
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
@@ -966,10 +1283,10 @@
 #if CONFIG_DENOISE
   struct aom_denoise_and_model_t *denoise_and_model;
 #endif
-  // Stores the default value of skip flag depending on chroma format
-  // Set as 1 for monochrome and 3 for other color formats
-  int default_interp_skip_flags;
-  int preserve_arf_as_gld;
+
+  // Flags related to interpolation filter search.
+  InterpSearchFlags interp_search_flags;
+
   MultiThreadHandle multi_thread_ctxt;
   void (*row_mt_sync_read_ptr)(AV1RowMTSync *const, int, int);
   void (*row_mt_sync_write_ptr)(AV1RowMTSync *const, int, int, const int);
@@ -990,13 +1307,50 @@
   uint64_t frame_component_time[kTimingComponents];
 #endif
 
-  // The following data are for AV1 bitstream levels.
-  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
-  int keep_level_stats;
-  AV1LevelInfo level_info[MAX_NUM_OPERATING_POINTS];
-  // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
-  int frame_header_count;
-  FrameWindowBuffer frame_window_buffer;
+  // Parameters for AV1 bitstream levels.
+  AV1LevelParams level_params;
+
+  // whether any non-zero delta_q was actually used
+  int deltaq_used;
+
+  // Indicates the true relative distance of ref frame w.r.t. current frame
+  int ref_relative_dist[INTER_REFS_PER_FRAME];
+
+  // Indicate nearest references w.r.t. current frame in past and future
+  int8_t nearest_past_ref;
+  int8_t nearest_future_ref;
+
+  // TODO(sdeng): consider merge the following arrays.
+  double *tpl_rdmult_scaling_factors;
+  double *tpl_sb_rdmult_scaling_factors;
+  double *ssim_rdmult_scaling_factors;
+
+#if CONFIG_TUNE_VMAF
+  double *vmaf_rdmult_scaling_factors;
+  double last_frame_ysse;
+  double last_frame_vmaf;
+  double last_frame_unsharp_amount;
+#endif
+
+  int use_svc;
+  SVC svc;
+
+  int lap_enabled;
+  COMPRESSOR_STAGE compressor_stage;
+
+  // Some motion vector stats from the last encoded frame to help us decide what
+  // precision to use to encode the current frame.
+  MV_STATS mv_stats;
+
+  // Frame type of the last frame. May be used in some heuristics for speeding
+  // up the encoding.
+  FRAME_TYPE last_frame_type;
+  int num_tg;
+
+  // Super-resolution mode currently being used by the encoder.
+  // This may / may not be same as user-supplied mode in oxcf->superres_mode
+  // (when we are recoding to try multiple options for example).
+  SUPERRES_MODE superres_mode;
 } AV1_COMP;
 
 typedef struct {
@@ -1025,10 +1379,8 @@
   int remapped_ref_idx[REF_FRAMES];
 
   // Flags which determine which reference buffers are refreshed by this frame
-  int refresh_last_frame;
   int refresh_golden_frame;
   int refresh_bwd_ref_frame;
-  int refresh_alt2_ref_frame;
   int refresh_alt_ref_frame;
 
   // Speed level to use for this frame: Bigger number means faster.
@@ -1046,11 +1398,19 @@
 void av1_initialize_enc(void);
 
 struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
-                                       BufferPool *const pool);
+                                       BufferPool *const pool,
+                                       FIRSTPASS_STATS *frame_stats_buf,
+                                       COMPRESSOR_STAGE stage,
+                                       int num_lap_buffers,
+                                       int lap_lag_in_frames,
+                                       STATS_BUFFER_CTX *stats_buf_context);
 void av1_remove_compressor(AV1_COMP *cpi);
 
 void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
 
+void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+                             int subsampling_x, int subsampling_y);
+
 // receive a frames worth of data. caller can assume that a copy of this
 // frame is made and not just a copy of the pointer..
 int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
@@ -1060,7 +1420,7 @@
 int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
                             int64_t *time_end, int flush,
-                            const aom_rational_t *timebase);
+                            const aom_rational64_t *timebase);
 
 int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
                const EncodeFrameInput *const frame_input,
@@ -1075,57 +1435,106 @@
                                        YV12_BUFFER_CONFIG *new_frame,
                                        YV12_BUFFER_CONFIG *sd);
 
-int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags);
 
 int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
 
 int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
 
+int av1_set_size_literal(AV1_COMP *cpi, int width, int height);
+
 void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
 
-int av1_update_entropy(AV1_COMP *cpi, int update);
+int av1_update_entropy(bool *ext_refresh_frame_context,
+                       bool *ext_refresh_frame_context_pending, bool update);
 
 int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
 
 int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
 
-int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
-                          AOM_SCALING vert_mode);
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+                          ResizePendingParams *resize_pending_params,
+                          AOM_SCALING horiz_mode, AOM_SCALING vert_mode);
 
 int av1_get_quantizer(struct AV1_COMP *cpi);
 
 int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
 
+void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+                                        CompoundTypeRdBuffers *const bufs);
+void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs);
+
+// Set screen content options.
+// This function estimates whether to use screen content tools, by counting
+// the portion of blocks that have few luma colors.
+// Modifies:
+//   cpi->common.allow_screen_content_tools
+//   cpi->common.allow_intrabc
+// However, the estimation is not accurate and may misclassify videos.
+// A slower but more accurate approach that determines whether to use screen
+// content tools is employed later. See determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(const struct AV1_COMP *cpi,
+                                    FeatureFlags *features);
+
+// TODO(jingning): Move these functions as primitive members for the new cpi
+// class.
+static INLINE void stack_push(int *stack, int *stack_size, int item) {
+  for (int i = *stack_size - 1; i >= 0; --i) stack[i + 1] = stack[i];
+  stack[0] = item;
+  ++*stack_size;
+}
+
+static INLINE int stack_pop(int *stack, int *stack_size) {
+  if (*stack_size <= 0) return -1;
+
+  int item = stack[0];
+  for (int i = 0; i < *stack_size; ++i) stack[i] = stack[i + 1];
+  --*stack_size;
+
+  return item;
+}
+
+static INLINE int stack_pop_end(int *stack, int *stack_size) {
+  int item = stack[*stack_size - 1];
+  stack[*stack_size - 1] = -1;
+  --*stack_size;
+
+  return item;
+}
+
+static INLINE void stack_reset(int *stack, int *stack_size) {
+  for (int i = 0; i < *stack_size; ++i) stack[i] = INVALID_IDX;
+  *stack_size = 0;
+}
+
 // av1 uses 10,000,000 ticks/second as time stamp
 #define TICKS_PER_SEC 10000000LL
 
-static INLINE int64_t timebase_units_to_ticks(const aom_rational_t *timebase,
-                                              int64_t n) {
-  return n * TICKS_PER_SEC * timebase->num / timebase->den;
+static INLINE int64_t
+timebase_units_to_ticks(const aom_rational64_t *timestamp_ratio, int64_t n) {
+  return n * timestamp_ratio->num / timestamp_ratio->den;
 }
 
-static INLINE int64_t ticks_to_timebase_units(const aom_rational_t *timebase,
-                                              int64_t n) {
-  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
-  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
+static INLINE int64_t
+ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) {
+  int64_t round = timestamp_ratio->num / 2;
+  if (round > 0) --round;
+  return (n * timestamp_ratio->den + round) / timestamp_ratio->num;
 }
 
 static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
-  return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
-         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+
+  return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE ||
+         update_type == GF_UPDATE;
 }
 
 // TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
-static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) {
-  return cm->allow_screen_content_tools;
-}
-
-static INLINE hash_table *av1_get_ref_frame_hash_map(
-    const AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame) {
-  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
-  RefCntBuffer *buf =
-      (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
-  return buf ? &buf->hash_table : NULL;
+static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) {
+  return (cpi->common.features.allow_screen_content_tools &&
+          cpi->common.features.allow_intrabc &&
+          frame_is_intra_only(&cpi->common));
 }
 
 static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
@@ -1183,7 +1592,7 @@
                                  int mi_row, TOKENEXTRA **tok, int sb_size_log2,
                                  int num_planes) {
   AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
+  const int tile_cols = cm->tiles.cols;
   TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
   const TileInfo *const tile_info = &this_tile->tile_info;
 
@@ -1202,6 +1611,36 @@
   return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf;
 }
 
+// Check if statistics generation stage
+static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
+  assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
+                 cpi->oxcf.pass == 0 && cpi->lap_enabled));
+  return (cpi->oxcf.pass == 1 || (cpi->compressor_stage == LAP_STAGE));
+}
+// Check if statistics consumption stage
+static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) {
+  return (cpi->oxcf.pass == 2);
+}
+
+// Check if statistics consumption stage
+static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
+  return (is_stat_consumption_stage_twopass(cpi) ||
+          (cpi->oxcf.pass == 0 && (cpi->compressor_stage == ENCODE_STAGE) &&
+           cpi->lap_enabled));
+}
+
+// Check if the current stage has no statistics (single pass without lookahead)
+static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
+  assert(IMPLIES(!cpi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+  return (cpi->oxcf.pass == 0 && !cpi->lap_enabled);
+}
+
+// Returns the size of the frame stats buffer.
+static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
+  /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */
+  return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer);
+}
+
 // TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
 
 static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -1217,10 +1656,23 @@
   return frame_index & 0x1;
 }
 
-static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
-  return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
+static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi,
+                                              const int *cost_list) {
+  const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+                            cpi->sf.mv_sf.use_fullpel_costlist;
+  return use_cost_list ? cost_list : NULL;
 }
 
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+  const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+                            cpi->sf.mv_sf.use_fullpel_costlist;
+  return use_cost_list ? cost_list : NULL;
+}
+
+// Compression ratio of current frame.
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+                                 size_t encoded_frame_size);
+
 void av1_new_framerate(AV1_COMP *cpi, double framerate);
 
 void av1_setup_frame_size(AV1_COMP *cpi);
@@ -1241,21 +1693,31 @@
 // frame. An exception can be made for a forward keyframe since it has no
 // previous dependencies.
 static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
-  return cm->show_existing_frame && (!cm->error_resilient_mode ||
+  return cm->show_existing_frame && (!cm->features.error_resilient_mode ||
                                      cm->current_frame.frame_type == KEY_FRAME);
 }
 
+// Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given
+// 'mi_row' and 'mi_col'.
+static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col,
+                                 const BLOCK_SIZE mi_alloc_bsize,
+                                 const int mbmi_ext_stride) {
+  const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize];
+  const int mi_ext_row = mi_row / mi_ext_size_1d;
+  const int mi_ext_col = mi_col / mi_ext_size_1d;
+  return mi_ext_row * mbmi_ext_stride + mi_ext_col;
+}
+
 // Lighter version of set_offsets that only sets the mode info
 // pointers.
-static INLINE void set_mode_info_offsets(const AV1_COMP *const cpi,
-                                         MACROBLOCK *const x,
-                                         MACROBLOCKD *const xd, int mi_row,
-                                         int mi_col) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int idx_str = xd->mi_stride * mi_row + mi_col;
-  xd->mi = cm->mi_grid_visible + idx_str;
-  xd->mi[0] = cm->mi + idx_str;
-  x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+static INLINE void set_mode_info_offsets(
+    const CommonModeInfoParams *const mi_params,
+    const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x,
+    MACROBLOCKD *const xd, int mi_row, int mi_col) {
+  set_mi_offsets(mi_params, xd, mi_row, mi_col);
+  const int ext_idx = get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize,
+                                     mbmi_ext_info->stride);
+  x->mbmi_ext_frame = mbmi_ext_info->frame_base + ext_idx;
 }
 
 // Check to see if the given partition size is allowed for a specified number
@@ -1287,6 +1749,91 @@
                                                              AOM_ALT2_FLAG,
                                                              AOM_ALT_FLAG };
 
+// When more than 'max_allowed_refs' are available, we reduce the number of
+// reference frames one at a time based on this order.
+static const MV_REFERENCE_FRAME disable_order[] = {
+  LAST3_FRAME,
+  LAST2_FRAME,
+  ALTREF2_FRAME,
+  GOLDEN_FRAME,
+};
+
+static INLINE int get_max_allowed_ref_frames(const AV1_COMP *cpi) {
+  const unsigned int max_allowed_refs_for_given_speed =
+      (cpi->sf.inter_sf.selective_ref_frame >= 3) ? INTER_REFS_PER_FRAME - 1
+                                                  : INTER_REFS_PER_FRAME;
+  return AOMMIN(max_allowed_refs_for_given_speed,
+                cpi->oxcf.max_reference_frames);
+}
+
+static const MV_REFERENCE_FRAME
+    ref_frame_priority_order[INTER_REFS_PER_FRAME] = {
+      LAST_FRAME,    ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME,
+      ALTREF2_FRAME, LAST2_FRAME,  LAST3_FRAME,
+    };
+
+static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
+                                      const YV12_BUFFER_CONFIG **ref_frames,
+                                      const int ext_ref_frame_flags) {
+  // cpi->ext_flags.ref_frame_flags allows certain reference types to be
+  // disabled by the external interface.  These are set by
+  // av1_apply_encoding_flags(). Start with what the external interface allows,
+  // then suppress any reference types which we have found to be duplicates.
+  int flags = ext_ref_frame_flags;
+
+  for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) {
+    const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i];
+    // If this_ref has appeared before, mark the corresponding ref frame as
+    // invalid. For nonrd mode, only disable GOLDEN_FRAME if it's the same
+    // as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd).
+    int index = (sf->rt_sf.use_nonrd_pick_mode &&
+                 ref_frame_priority_order[i] == GOLDEN_FRAME)
+                    ? (1 + sf->rt_sf.use_nonrd_altref_frame)
+                    : i;
+    for (int j = 0; j < index; ++j) {
+      if (this_ref == ref_frames[j]) {
+        flags &= ~(1 << (ref_frame_priority_order[i] - 1));
+        break;
+      }
+    }
+  }
+  return flags;
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
+static AOM_INLINE void enforce_max_ref_frames(AV1_COMP *cpi,
+                                              int *ref_frame_flags) {
+  MV_REFERENCE_FRAME ref_frame;
+  int total_valid_refs = 0;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      total_valid_refs++;
+    }
+  }
+
+  const int max_allowed_refs = get_max_allowed_ref_frames(cpi);
+
+  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+
+    if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) {
+      continue;
+    }
+
+    switch (ref_frame_to_disable) {
+      case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+      case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+      case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+      case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break;
+      default: assert(0);
+    }
+    --total_valid_refs;
+  }
+  assert(total_valid_refs <= max_allowed_refs);
+}
+
 // Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
 // failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
 // function, the memory must be freed by the caller. Both the buf member of the
@@ -1298,6 +1845,45 @@
 // field.
 aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
 
+#define MAX_GFUBOOST_FACTOR 10.0
+#define MIN_GFUBOOST_FACTOR 4.0
+double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor,
+                                           int frame_count);
+double av1_get_kf_boost_projection_factor(int frame_count);
+
+#define ENABLE_KF_TPL 1
+#define MAX_PYR_LEVEL_FROMTOP_DELTAQ 0
+
+static INLINE int is_frame_kf_and_tpl_eligible(AV1_COMP *const cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  return (cm->current_frame.frame_type == KEY_FRAME) && cm->show_frame &&
+         (cpi->rc.frames_to_key > 1);
+}
+
+static INLINE int is_frame_arf_and_tpl_eligible(const GF_GROUP *gf_group) {
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+  return update_type == ARF_UPDATE || update_type == GF_UPDATE;
+}
+
+static INLINE int is_frame_tpl_eligible(AV1_COMP *const cpi) {
+#if ENABLE_KF_TPL
+  return is_frame_kf_and_tpl_eligible(cpi) ||
+         is_frame_arf_and_tpl_eligible(&cpi->gf_group);
+#else
+  return is_frame_arf_and_tpl_eligible(&cpi->gf_group);
+#endif  // ENABLE_KF_TPL
+}
+
+// Get update type of the current frame.
+static INLINE FRAME_UPDATE_TYPE
+get_frame_update_type(const GF_GROUP *gf_group) {
+  return gf_group->update_type[gf_group->index];
+}
+
+static INLINE int av1_pixels_to_mi(int pixels) {
+  return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2;
+}
+
 #if CONFIG_COLLECT_PARTITION_STATS == 2
 static INLINE void av1_print_partition_stats(PartitionStats *part_stats) {
   FILE *f = fopen("partition_stats.csv", "w");
diff --git a/libaom/av1/encoder/encodetxb.c b/libaom/av1/encoder/encodetxb.c
index 37f4bb9..825d52a 100644
--- a/libaom/av1/encoder/encodetxb.c
+++ b/libaom/av1/encoder/encodetxb.c
@@ -23,6 +23,7 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
+#if CONFIG_HTB_TRELLIS
 static int hbt_needs_init = 1;
 static CRC32C crc_calculator;
 static const int HBT_EOB = 16;            // also the length in opt_qcoeff
@@ -41,6 +42,7 @@
 } OptTxbQcoeff;
 
 OptTxbQcoeff *hbt_hash_table;
+#endif  // CONFIG_HTB_TRELLIS
 
 typedef struct LevelDownStats {
   int update;
@@ -60,10 +62,19 @@
   int new_eob;
 } LevelDownStats;
 
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+                          const qm_val_t *iqmatrix) {
+  int dqv = dequant[!!coeff_idx];
+  if (iqmatrix != NULL)
+    dqv =
+        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+  return dqv;
+}
+
 void av1_alloc_txb_buf(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) *
-             ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1);
+  int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) *
+             ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1);
 
   av1_free_txb_buf(cpi);
   // TODO(jingning): This should be further reduced.
@@ -73,17 +84,6 @@
 
 void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
 
-void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                          int mi_row, int mi_col) {
-  const AV1_COMMON *const cm = &cpi->common;
-  int mib_size_log2 = cm->seq_params.mib_size_log2;
-  int stride = (cm->mi_cols >> mib_size_log2) + 1;
-  int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
-  x->mbmi_ext->cb_coef_buff = &cpi->coeff_buffer_base[offset];
-  x->mbmi_ext->cb_offset = x->cb_offset;
-  assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size]));
-}
-
 static void write_golomb(aom_writer *w, int level) {
   int x = level + 1;
   int i = x;
@@ -151,7 +151,7 @@
     t = eob_to_pos_large[e];
   }
 
-  *extra = eob - k_eob_group_start[t];
+  *extra = eob - av1_eob_group_start[t];
 
   return t;
 }
@@ -234,9 +234,9 @@
       break;
   }
 
-  if (k_eob_offset_bits[eob_pt] > 0) {
+  if (av1_eob_offset_bits[eob_pt] > 0) {
     int eob_ctx = eob_pt - 3;
-    int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
     int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
 #if CONFIG_ENTROPY_STATS
     counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++;
@@ -254,12 +254,12 @@
   const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
   eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
 
-  if (k_eob_offset_bits[eob_pt] > 0) {
+  if (av1_eob_offset_bits[eob_pt] > 0) {
     const int eob_ctx = eob_pt - 3;
-    const int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    const int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
     const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
     eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
-    const int offset_bits = k_eob_offset_bits[eob_pt];
+    const int offset_bits = av1_eob_offset_bits[eob_pt];
     if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
   }
   return eob_cost;
@@ -502,36 +502,35 @@
   }
 }
 
-void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
                           aom_writer *w, int blk_row, int blk_col, int plane,
-                          TX_SIZE tx_size, const tran_low_t *tcoeff,
-                          uint16_t eob, TXB_CTX *txb_ctx) {
+                          int block, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+  const int txb_offset =
+      x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+  const uint16_t eob = eob_txb[block];
+  const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+  const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK;
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  aom_write_symbol(w, eob == 0,
-                   ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+  aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2);
   if (eob == 0) return;
+
   const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
-  const TX_CLASS tx_class = tx_type_to_class[tx_type];
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const int16_t *const scan = scan_order->scan;
-  int c;
-  const int bwl = get_txb_bwl(tx_size);
-  const int width = get_txb_wide(tx_size);
-  const int height = get_txb_high(tx_size);
-
-  uint8_t levels_buf[TX_PAD_2D];
-  uint8_t *const levels = set_levels(levels_buf, width);
-  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
-  av1_txb_init_levels(tcoeff, width, height, levels);
-
-  av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w);
+  const TX_TYPE tx_type =
+      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                      cm->features.reduced_tx_set_used);
+  // Only y plane's tx_type is transmitted
+  if (plane == 0) {
+    av1_write_tx_type(cm, xd, tx_type, tx_size, w);
+  }
 
   int eob_extra;
   const int eob_pt = get_eob_pos_token(eob, &eob_extra);
   const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
   const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
   switch (eob_multi_size) {
     case 0:
@@ -564,7 +563,7 @@
       break;
   }
 
-  const int eob_offset_bits = k_eob_offset_bits[eob_pt];
+  const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
   if (eob_offset_bits > 0) {
     const int eob_ctx = eob_pt - 3;
     int eob_shift = eob_offset_bits - 1;
@@ -578,9 +577,21 @@
     }
   }
 
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  const tran_low_t *tcoeff_txb =
+      cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
+  const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
+  av1_txb_init_levels(tcoeff, width, height, levels);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const int16_t *const scan = scan_order->scan;
+  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
   av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
 
-  for (c = eob - 1; c >= 0; --c) {
+  const int bwl = get_txb_bwl(tx_size);
+  for (int c = eob - 1; c >= 0; --c) {
     const int pos = scan[c];
     const int coeff_ctx = coeff_contexts[pos];
     const tran_low_t v = tcoeff[pos];
@@ -611,14 +622,16 @@
 
   // Loop to code all signs in the transform block,
   // starting with the sign of DC (if applicable)
-  for (c = 0; c < eob; ++c) {
+  for (int c = 0; c < eob; ++c) {
     const tran_low_t v = tcoeff[scan[c]];
     const tran_low_t level = abs(v);
     const int sign = (v < 0) ? 1 : 0;
     if (level) {
       if (c == 0) {
-        aom_write_symbol(
-            w, sign, ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2);
+        const int dc_sign_ctx =
+            (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK;
+        aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+                         2);
       } else {
         aom_write_bit(w, sign);
       }
@@ -634,28 +647,8 @@
   aom_writer *w;
 } ENCODE_TXB_ARGS;
 
-static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x,
-                                  aom_writer *w, int plane, int block,
-                                  int blk_row, int blk_col, TX_SIZE tx_size) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int txb_offset =
-      x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-  tran_low_t *tcoeff_txb =
-      x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
-  uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
-  uint8_t *txb_skip_ctx_txb =
-      x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
-  int *dc_sign_ctx_txb =
-      x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
-  tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
-  uint16_t eob = eob_txb[block];
-  TXB_CTX txb_ctx = { txb_skip_ctx_txb[block], dc_sign_ctx_txb[block] };
-  av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob,
-                       &txb_ctx);
-}
-
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
-                         int mi_col, aom_writer *w, BLOCK_SIZE bsize) {
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                         aom_writer *w, BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   const int num_planes = av1_num_planes(cm);
   int block[MAX_MB_PLANE] = { 0 };
@@ -665,23 +658,20 @@
   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
   const int max_blocks_high = max_block_high(xd, bsize, 0);
   const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
-  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+  int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+  int mu_blocks_high = mi_size_high[max_unit_bsize];
   mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
   mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
 
   for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
     for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
       for (int plane = 0; plane < num_planes; ++plane) {
-        const struct macroblockd_plane *const pd = &xd->plane[plane];
-        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                                 pd->subsampling_y))
-          continue;
+        if (plane && !xd->is_chroma_ref) break;
         const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
         const int stepr = tx_size_high_unit[tx_size];
         const int stepc = tx_size_wide_unit[tx_size];
         const int step = stepr * stepc;
-
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
         const int unit_height = ROUND_POWER_OF_TWO(
             AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
         const int unit_width = ROUND_POWER_OF_TWO(
@@ -690,8 +680,8 @@
              blk_row += stepr) {
           for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
                blk_col += stepc) {
-            write_coeffs_txb_wrap(cm, x, w, plane, block[plane], blk_row,
-                                  blk_col, tx_size);
+            av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane,
+                                 block[plane], tx_size);
             block[plane] += step;
           }
         }
@@ -701,19 +691,19 @@
 }
 
 // TODO(angiebird): use this function whenever it's possible
-static int get_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
-                            const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
-                            TX_TYPE tx_type) {
+static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd,
+                            int plane, TX_SIZE tx_size, TX_TYPE tx_type,
+                            int reduced_tx_set_used) {
   if (plane > 0) return 0;
 
   const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
 
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const int is_inter = is_inter_block(mbmi);
-  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+  if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
       !xd->lossless[xd->mi[0]->segment_id]) {
     const int ext_tx_set =
-        get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+        get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
     if (is_inter) {
       if (ext_tx_set > 0)
         return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
@@ -733,14 +723,44 @@
   return 0;
 }
 
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+                                         const int16_t *dequant_ptr,
+                                         const int16_t *scan,
+                                         const tran_low_t *coeff_ptr,
+                                         tran_low_t *qcoeff_ptr,
+                                         tran_low_t *dqcoeff_ptr) {
+  // TODO(sarahparker) make this work for aomqm
+  int eob_out = *eob;
+  int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
+                  dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
+
+  for (int i = *eob - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = AOMSIGN(coeff);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+    if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
+      eob_out--;
+      qcoeff_ptr[rc] = 0;
+      dqcoeff_ptr[rc] = 0;
+    } else {
+      break;
+    }
+  }
+
+  *eob = eob_out;
+}
+
 static AOM_FORCE_INLINE int warehouse_efficients_txb(
-    const AV1_COMMON *const cm, const MACROBLOCK *x, const int plane,
-    const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+    const MACROBLOCK *x, const int plane, const int block,
+    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
     const struct macroblock_plane *p, const int eob,
     const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
-    const MACROBLOCKD *const xd, const TX_TYPE tx_type,
-    const TX_CLASS tx_class) {
-  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+    int reduced_tx_set_used) {
+  const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
   const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
   const int bwl = get_txb_bwl(tx_size);
   const int width = get_txb_wide(tx_size);
@@ -757,7 +777,7 @@
 
   av1_txb_init_levels(qcoeff, width, height, levels);
 
-  cost += get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
 
   cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
 
@@ -769,7 +789,7 @@
   {
     const int pos = scan[c];
     const tran_low_t v = qcoeff[pos];
-    const int sign = v >> 31;
+    const int sign = AOMSIGN(v);
     const int level = (v ^ sign) - sign;
     const int coeff_ctx = coeff_contexts[pos];
     cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
@@ -796,7 +816,7 @@
     const int coeff_ctx = coeff_contexts[pos];
     const tran_low_t v = qcoeff[pos];
     const int level = abs(v);
-    const int cost0 = base_cost[coeff_ctx][AOMMIN(level, 3)];
+    cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
     if (v) {
       // sign bit cost
       cost += av1_cost_literal(1);
@@ -805,13 +825,13 @@
         cost += get_br_cost(level, lps_cost[ctx]);
       }
     }
-    cost += cost0;
   }
-  if (c == 0) {
+  // c == 0 after previous loop
+  {
     const int pos = scan[c];
     const tran_low_t v = qcoeff[pos];
     const int coeff_ctx = coeff_contexts[pos];
-    const int sign = v >> 31;
+    const int sign = AOMSIGN(v);
     const int level = (v ^ sign) - sign;
     cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
 
@@ -829,9 +849,74 @@
   return cost;
 }
 
-int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
-                        const int plane, const int block, const TX_SIZE tx_size,
-                        const TX_TYPE tx_type, const TXB_CTX *const txb_ctx) {
+static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
+    const MACROBLOCK *x, const int plane, const int block,
+    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
+    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+    int reduced_tx_set_used) {
+  const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *const eob_costs =
+      &x->eob_costs[eob_multi_size][plane_type];
+  int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+  cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+  cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
+  return cost;
+}
+
+// Look up table of individual cost of coefficient by its quantization level.
+// determined based on Laplacian distribution conditioned on estimated context
+static const int costLUT[15] = { -1143, 53,   545,  825,  1031,
+                                 1209,  1393, 1577, 1763, 1947,
+                                 2132,  2317, 2501, 2686, 2871 };
+static const int const_term = (1 << AV1_PROB_COST_SHIFT);
+static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+                                 const int block, const TX_SIZE tx_size,
+                                 const TX_TYPE tx_type) {
+  assert(plane == 0);
+
+  int cost = 0;
+  const struct macroblock_plane *p = &x->plane[plane];
+  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+
+  int eob = p->eobs[block];
+
+  // coeffs
+  int c = eob - 1;
+  // eob
+  {
+    const int pos = scan[c];
+    const tran_low_t v = abs(qcoeff[pos]) - 1;
+    cost += (v << (AV1_PROB_COST_SHIFT + 2));
+  }
+  // other coeffs
+  for (c = eob - 2; c >= 0; c--) {
+    const int pos = scan[c];
+    const tran_low_t v = abs(qcoeff[pos]);
+    const int idx = AOMMIN(v, 14);
+
+    cost += costLUT[idx];
+  }
+
+  // const_term does not contain DC, and log(e) does not contain eob, so both
+  // (eob-1)
+  cost += (const_term + loge_par) * (eob - 1);
+
+  return cost;
+}
+
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+                        const TX_SIZE tx_size, const TX_TYPE tx_type,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
   const struct macroblock_plane *p = &x->plane[plane];
   const int eob = p->eobs[block];
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
@@ -845,18 +930,47 @@
   const MACROBLOCKD *const xd = &x->e_mbd;
   const TX_CLASS tx_class = tx_type_to_class[tx_type];
 
-#define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal)                        \
-  case tx_class_literal:                                                       \
-    return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p,  \
-                                    eob, plane_type, coeff_costs, xd, tx_type, \
-                                    tx_class_literal);
-  switch (tx_class) {
-    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_2D);
-    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_HORIZ);
-    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_VERT);
-#undef WAREHOUSE_EFFICIENTS_TXB_CASE
-    default: assert(false); return 0;
+  return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob,
+                                  plane_type, coeff_costs, xd, tx_type,
+                                  tx_class, reduced_tx_set_used);
+}
+
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+                                  const int block, const TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
+                                  const TXB_CTX *const txb_ctx,
+                                  const int reduced_tx_set_used,
+                                  const int adjust_eob) {
+  const struct macroblock_plane *p = &x->plane[plane];
+  int eob = p->eobs[block];
+
+  if (adjust_eob) {
+    const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+    const int16_t *scan = scan_order->scan;
+    tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
+    tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+    update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
+                          tcoeff, qcoeff, dqcoeff);
+    p->eobs[block] = eob;
   }
+
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs[txs_ctx][plane_type];
+  if (eob == 0) {
+    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+  return warehouse_efficients_txb_laplacian(
+      x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
+      tx_type, tx_class, reduced_tx_set_used);
 }
 
 static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
@@ -1024,6 +1138,7 @@
   return update;
 }
 
+#if CONFIG_HTB_TRELLIS
 static void hbt_init() {
   hbt_hash_table =
       aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
@@ -1293,6 +1408,7 @@
   return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
                           txb_eob_costs, p, block, fast_mode, rate_cost);
 }
+#endif  // CONFIG_HTB_TRELLIS
 
 static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
     int ci, tran_low_t abs_qc, int coeff_ctx,
@@ -1386,8 +1502,9 @@
     TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift,
     int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
     const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
-    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels) {
-  const int dqv = dequant[si != 0];
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels,
+    const qm_val_t *iqmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
   const int ci = scan[si];
   const tran_low_t qc = qcoeff[ci];
   const int is_last = si == (eob - 1);
@@ -1443,8 +1560,8 @@
     int bwl, int64_t rdmult, int shift, const int16_t *dequant,
     const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
     const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
-    uint8_t *levels) {
-  const int dqv = dequant[1];
+    uint8_t *levels, const qm_val_t *iqmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
   (void)eob;
   // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
   // and not the last (scan_idx != eob - 1)
@@ -1488,44 +1605,15 @@
   }
 }
 
-static INLINE void update_coeff_eob_fast(int *eob, int shift,
-                                         const int16_t *dequant_ptr,
-                                         const int16_t *scan,
-                                         const tran_low_t *coeff_ptr,
-                                         tran_low_t *qcoeff_ptr,
-                                         tran_low_t *dqcoeff_ptr) {
-  // TODO(sarahparker) make this work for aomqm
-  int eob_out = *eob;
-  int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
-                  dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
-
-  for (int i = *eob - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const int qcoeff = qcoeff_ptr[rc];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-    if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
-      eob_out--;
-      qcoeff_ptr[rc] = 0;
-      dqcoeff_ptr[rc] = 0;
-    } else {
-      break;
-    }
-  }
-
-  *eob = eob_out;
-}
-
 static AOM_FORCE_INLINE void update_coeff_eob(
     int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
     int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
     int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
     const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
     const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
-    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness) {
-  const int dqv = dequant[si != 0];
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
+    const qm_val_t *iqmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
   assert(si != *eob - 1);
   const int ci = scan[si];
   const tran_low_t qc = qcoeff[ci];
@@ -1662,9 +1750,15 @@
   const int shift = av1_get_tx_scale(tx_size);
   int eob = p->eobs[block];
   const int16_t *dequant = p->dequant_QTX;
-  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
+  const qm_val_t *iqmatrix =
+      av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
+  tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
+  const tran_low_t *tcoeff = p->coeff + block_offset;
+
+  // This function is not called if eob = 0.
+  assert(eob > 0);
 
   if (fast_mode) {
     update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff);
@@ -1696,7 +1790,8 @@
             ? 7 - mbmi->segment_id
             : 2) +
        (cpi->oxcf.aq_mode != VARIANCE_AQ &&
-                cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0
+                cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL &&
+                cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0
             ? (3 - x->sb_energy_level)
             : 0));
   const int64_t rdmult =
@@ -1729,7 +1824,7 @@
     update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
                          bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
                          dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
-                         levels);
+                         levels, iqmatrix);
     --si;
   } else {
     assert(abs_qc == 1);
@@ -1752,7 +1847,7 @@
                        tx_size, tx_class_literal, bwl, height,             \
                        txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
                        txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
-                       levels, sharpness);                                 \
+                       levels, sharpness, iqmatrix);                       \
     }                                                                      \
     break;
   switch (tx_class) {
@@ -1773,7 +1868,7 @@
     for (; si >= 1; --si) {                                                    \
       update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \
                           rdmult, shift, dequant, scan, txb_costs, tcoeff,     \
-                          qcoeff, dqcoeff, levels);                            \
+                          qcoeff, dqcoeff, levels, iqmatrix);                  \
     }                                                                          \
     break;
   switch (tx_class) {
@@ -1791,10 +1886,11 @@
     update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
                          bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
                          dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
-                         levels);
+                         levels, iqmatrix);
   }
 
-  const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type,
+                                            cm->features.reduced_tx_set_used);
   if (eob == 0)
     accu_rate += skip_cost;
   else
@@ -1814,18 +1910,20 @@
                      int blk_row, int blk_col, int block, TX_SIZE tx_size,
                      TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
   const AV1_COMMON *cm = &cpi->common;
+  const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
   MACROBLOCKD *const xd = &x->e_mbd;
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
+  const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
+                                          tx_size, reduced_tx_set_used);
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const int eob = p->eobs[block];
-  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
+  tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
+  const tran_low_t *tcoeff = p->coeff + block_offset;
   const int16_t *dequant = p->dequant_QTX;
   const int seg_eob = av1_get_max_eob(tx_size);
   const int bwl = get_txb_bwl(tx_size);
@@ -1846,19 +1944,18 @@
       2;
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
-  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
   const qm_val_t *iqmatrix =
-      IS_2D_TRANSFORM(tx_type)
-          ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
-          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+      av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
   assert(width == (1 << bwl));
-  const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  const int tx_type_cost =
+      get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
   TxbInfo txb_info = {
     qcoeff,     levels,  dqcoeff, tcoeff,   dequant,      shift, tx_size,
     txs_ctx,    tx_type, bwl,     width,    height,       eob,   seg_eob,
     scan_order, txb_ctx, rdmult,  iqmatrix, tx_type_cost,
   };
 
+#if CONFIG_HTB_TRELLIS
   // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
   // by storing the coefficient deltas in a hash table.
   // Currently disabled in speedfeatures.c
@@ -1866,7 +1963,9 @@
     return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block,
                              fast_mode, rate_cost);
   }
-
+#else
+  (void)fast_mode;
+#endif  // CONFIG_HTB_TRELLIS
   av1_txb_init_levels(qcoeff, width, height, levels);
 
   const int update =
@@ -1898,34 +1997,14 @@
   return cul_level;
 }
 
-void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
-                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                              void *arg) {
-  struct tokenize_b_args *const args = arg;
-  const AV1_COMP *cpi = args->cpi;
-  const AV1_COMMON *cm = &cpi->common;
-  ThreadData *const td = args->td;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  const uint16_t eob = p->eobs[block];
-  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  const PLANE_TYPE plane_type = pd->plane_type;
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob);
-  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
-                   blk_row);
-}
-
-static void update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                 int blk_row, int blk_col, int plane,
-                                 TX_SIZE tx_size, FRAME_COUNTS *counts,
+static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
+                                 MACROBLOCKD *xd, int blk_row, int blk_col,
+                                 int plane, TX_SIZE tx_size,
+                                 FRAME_COUNTS *counts,
                                  uint8_t allow_update_cdf) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   int is_inter = is_inter_block(mbmi);
+  const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
   FRAME_CONTEXT *fc = xd->tile_ctx;
 #if !CONFIG_ENTROPY_STATS
   (void)counts;
@@ -1933,15 +2012,30 @@
 
   // Only y plane's tx_type is updated
   if (plane > 0) return;
-  TX_TYPE tx_type = av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, tx_size,
-                                    cm->reduced_tx_set_used);
-  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
-      cm->base_qindex > 0 && !mbmi->skip &&
+  const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col,
+                                          tx_size, reduced_tx_set_used);
+  if (is_inter) {
+    if (cpi->oxcf.use_inter_dct_only) {
+      assert(tx_type == DCT_DCT);
+    }
+  } else {
+    if (cpi->oxcf.use_intra_dct_only) {
+      assert(tx_type == DCT_DCT);
+    } else if (cpi->oxcf.use_intra_default_tx_only) {
+      const TX_TYPE default_type = get_default_tx_type(
+          PLANE_TYPE_Y, xd, tx_size, cpi->is_screen_content_type);
+      (void)default_type;
+      assert(tx_type == default_type);
+    }
+  }
+
+  if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+      cm->quant_params.base_qindex > 0 && !mbmi->skip &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+    const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
     if (eset > 0) {
       const TxSetType tx_set_type =
-          av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+          av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used);
       if (is_inter) {
         if (allow_update_cdf) {
           update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
@@ -1985,91 +2079,98 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = xd->mi[0];
   const int eob = p->eobs[block];
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col,
-              pd->left_context + blk_row, &txb_ctx);
-  const int bwl = get_txb_bwl(tx_size);
-  const int width = get_txb_wide(tx_size);
-  const int height = get_txb_high(tx_size);
-  const uint8_t allow_update_cdf = args->allow_update_cdf;
-  const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-#if CONFIG_ENTROPY_STATS
-  int cdf_idx = cm->coef_cdf_category;
-#endif  // CONFIG_ENTROPY_STATS
-
-#if CONFIG_ENTROPY_STATS
-  ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
-#endif  // CONFIG_ENTROPY_STATS
-  if (allow_update_cdf) {
-    update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0,
-               2);
-  }
-
-  const int txb_offset =
-      x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-  uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
-  uint8_t *txb_skip_ctx_txb =
-      x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
-  txb_skip_ctx_txb[block] = txb_ctx.txb_skip_ctx;
-  eob_txb[block] = eob;
-
-  if (eob == 0) {
-    av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row);
-    return;
-  }
-
-  tran_low_t *tcoeff_txb =
-      x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
-  tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
-  const int segment_id = mbmi->segment_id;
-  const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
-  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
-
-  uint8_t levels_buf[TX_PAD_2D];
-  uint8_t *const levels = set_levels(levels_buf, width);
-  av1_txb_init_levels(tcoeff, width, height, levels);
-  update_tx_type_count(cm, xd, blk_row, blk_col, plane, tx_size, td->counts,
-                       allow_update_cdf);
-
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
   const PLANE_TYPE plane_type = pd->plane_type;
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
-  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+  const TX_TYPE tx_type =
+      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                      cm->features.reduced_tx_set_used);
   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const int16_t *const scan = scan_order->scan;
+  tran_low_t *tcoeff;
+  assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+  if (args->dry_run == OUTPUT_ENABLED) {
+    MB_MODE_INFO *mbmi = xd->mi[0];
+    TXB_CTX txb_ctx;
+    get_txb_ctx(plane_bsize, tx_size, plane,
+                pd->above_entropy_context + blk_col,
+                pd->left_entropy_context + blk_row, &txb_ctx);
+    const int bwl = get_txb_bwl(tx_size);
+    const int width = get_txb_wide(tx_size);
+    const int height = get_txb_high(tx_size);
+    const uint8_t allow_update_cdf = args->allow_update_cdf;
+    const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 #if CONFIG_ENTROPY_STATS
-  av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+    int cdf_idx = cm->coef_cdf_category;
+    ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif  // CONFIG_ENTROPY_STATS
+    if (allow_update_cdf) {
+      update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx],
+                 eob == 0, 2);
+    }
+
+    CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+    const int txb_offset =
+        x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+    uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+    uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+    entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+    eob_txb[block] = eob;
+
+    if (eob == 0) {
+      av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+                               blk_row);
+      return;
+    }
+    const int segment_id = mbmi->segment_id;
+    const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+    tran_low_t *tcoeff_txb =
+        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
+    tcoeff = tcoeff_txb + block_offset;
+    memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+    uint8_t levels_buf[TX_PAD_2D];
+    uint8_t *const levels = set_levels(levels_buf, width);
+    av1_txb_init_levels(tcoeff, width, height, levels);
+    update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
                          td->counts, allow_update_cdf);
+
+    const TX_CLASS tx_class = tx_type_to_class[tx_type];
+    const int16_t *const scan = scan_order->scan;
+
+    // record tx type usage
+    td->rd_counts.tx_type_used[tx_size][tx_type]++;
+
+#if CONFIG_ENTROPY_STATS
+    av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+                           td->counts, allow_update_cdf);
 #else
-  av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
-                         allow_update_cdf);
+    av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
+                           allow_update_cdf);
 #endif
 
-  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
-  av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+    DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+    av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+                            coeff_contexts);
 
-  for (int c = eob - 1; c >= 0; --c) {
-    const int pos = scan[c];
-    const int coeff_ctx = coeff_contexts[pos];
-    const tran_low_t v = qcoeff[pos];
-    const tran_low_t level = abs(v);
+    for (int c = eob - 1; c >= 0; --c) {
+      const int pos = scan[c];
+      const int coeff_ctx = coeff_contexts[pos];
+      const tran_low_t v = qcoeff[pos];
+      const tran_low_t level = abs(v);
 
-    if (allow_update_cdf) {
-      if (c == eob - 1) {
-        assert(coeff_ctx < 4);
-        update_cdf(
-            ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
-            AOMMIN(level, 3) - 1, 3);
-      } else {
-        update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
-                   AOMMIN(level, 3), 4);
+      if (allow_update_cdf) {
+        if (c == eob - 1) {
+          assert(coeff_ctx < 4);
+          update_cdf(
+              ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
+              AOMMIN(level, 3) - 1, 3);
+        } else {
+          update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
+                     AOMMIN(level, 3), 4);
+        }
       }
-    }
-    {
       if (c == eob - 1) {
         assert(coeff_ctx < 4);
 #if CONFIG_ENTROPY_STATS
@@ -2080,78 +2181,81 @@
                                       [coeff_ctx][AOMMIN(level, 3)];
 #endif
       }
-    }
-    if (level > NUM_BASE_LEVELS) {
-      const int base_range = level - 1 - NUM_BASE_LEVELS;
-      const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
-      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
-        const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
-        if (allow_update_cdf) {
-          update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
-                                         [plane_type][br_ctx],
-                     k, BR_CDF_SIZE);
-        }
-        for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+      if (level > NUM_BASE_LEVELS) {
+        const int base_range = level - 1 - NUM_BASE_LEVELS;
+        const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+        for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+          const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+          if (allow_update_cdf) {
+            update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
+                                           [plane_type][br_ctx],
+                       k, BR_CDF_SIZE);
+          }
+          for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
 #if CONFIG_ENTROPY_STATS
-          ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type][lps]
-                                 [br_ctx][lps == k];
+            ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+                                   [lps][br_ctx][lps == k];
 #endif  // CONFIG_ENTROPY_STATS
-          if (lps == k) break;
-        }
+            if (lps == k) break;
+          }
 #if CONFIG_ENTROPY_STATS
-        ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
-                                     [plane_type][br_ctx][k];
+          ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+                                       [plane_type][br_ctx][k];
 #endif
-        if (k < BR_CDF_SIZE - 1) break;
+          if (k < BR_CDF_SIZE - 1) break;
+        }
       }
     }
-  }
-
-  // Update the context needed to code the DC sign (if applicable)
-  if (tcoeff[0] != 0) {
-    const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
-    const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+    // Update the context needed to code the DC sign (if applicable)
+    if (tcoeff[0] != 0) {
+      const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+      const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
 #if CONFIG_ENTROPY_STATS
-    ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+      ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
 #endif  // CONFIG_ENTROPY_STATS
-    if (allow_update_cdf)
-      update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
-    int *dc_sign_ctx_txb =
-        x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
-    dc_sign_ctx_txb[block] = dc_sign_ctx;
+      if (allow_update_cdf)
+        update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
+      entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+    }
+  } else {
+    tcoeff = qcoeff;
   }
-
   const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
-  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
-                   blk_row);
+  av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+                           blk_col, blk_row);
 }
 
 void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
-                            RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
-                            int mi_row, int mi_col, uint8_t allow_update_cdf) {
+                            RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                            uint8_t allow_update_cdf) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  struct tokenize_b_args arg = { cpi, td, NULL, 0, allow_update_cdf };
-  (void)rate;
-  (void)mi_row;
-  (void)mi_col;
+  struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
   if (mbmi->skip) {
-    av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+    av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
 
-  if (!dry_run) {
-    av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
-                                  av1_update_and_record_txb_context, &arg,
-                                  num_planes);
-  } else if (dry_run == DRY_RUN_NORMAL) {
-    av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
-                                  av1_update_txb_context_b, &arg, num_planes);
-  } else {
-    printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
-    assert(0);
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int ss_x = pd->subsampling_x;
+    const int ss_y = pd->subsampling_y;
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+    av1_foreach_transformed_block_in_plane(
+        xd, plane_bsize, plane, av1_update_and_record_txb_context, &arg);
   }
 }
+
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+                                         int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int mib_size_log2 = cm->seq_params.mib_size_log2;
+  const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
+  const int offset =
+      (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
+  return cpi->coeff_buffer_base + offset;
+}
diff --git a/libaom/av1/encoder/encodetxb.h b/libaom/av1/encoder/encodetxb.h
index 0682590..7122895 100644
--- a/libaom/av1/encoder/encodetxb.h
+++ b/libaom/av1/encoder/encodetxb.h
@@ -14,8 +14,8 @@
 
 #include "config/aom_config.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/txb_common.h"
 #include "av1/encoder/block.h"
 #include "av1/encoder/encoder.h"
@@ -24,6 +24,10 @@
 extern "C" {
 #endif
 
+#define TXB_SKIP_CTX_MASK 15
+#define DC_SIGN_CTX_SHIFT 4
+#define DC_SIGN_CTX_MASK 3
+
 typedef struct TxbInfo {
   tran_low_t *qcoeff;
   uint8_t *levels;  // absolute values and clamped to 255.
@@ -48,38 +52,42 @@
 
 void av1_alloc_txb_buf(AV1_COMP *cpi);
 void av1_free_txb_buf(AV1_COMP *cpi);
-int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
-                        const int plane, const int block, const TX_SIZE tx_size,
-                        const TX_TYPE tx_type, const TXB_CTX *const txb_ctx);
-void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+                        const TX_SIZE tx_size, const TX_TYPE tx_type,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used);
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+                                  const int block, const TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
+                                  const TXB_CTX *const txb_ctx,
+                                  const int reduced_tx_set_used,
+                                  const int adjust_eob);
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+                                 const int block, const TX_SIZE tx_size,
+                                 const TX_TYPE tx_type);
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
                           aom_writer *w, int blk_row, int blk_col, int plane,
-                          TX_SIZE tx_size, const tran_low_t *tcoeff,
-                          uint16_t eob, TXB_CTX *txb_ctx);
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
-                         int mi_col, aom_writer *w, BLOCK_SIZE bsize);
+                          int block, TX_SIZE tx_size);
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                         aom_writer *w, BLOCK_SIZE bsize);
 int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
                                 const SCAN_ORDER *scan_order, int eob);
 void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
-                            RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
-                            int mi_row, int mi_col, uint8_t allow_update_cdf);
-
-void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
-                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                              void *arg);
-
+                            RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                            uint8_t allow_update_cdf);
 void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                                        int blk_col, BLOCK_SIZE plane_bsize,
                                        TX_SIZE tx_size, void *arg);
-
-void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                          int mi_row, int mi_col);
-
+#if CONFIG_HTB_TRELLIS
 void hbt_destroy();
+#endif  // CONFIG_HTB_TRELLIS
 int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                          int block, TX_SIZE tx_size, TX_TYPE tx_type,
                          const TXB_CTX *const txb_ctx, int *rate_cost,
                          int sharpness, int fast_mode);
 
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+                                         int mi_col);
+
 // These numbers are empirically obtained.
 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
   { 17, 13 },
diff --git a/libaom/av1/encoder/ethread.c b/libaom/av1/encoder/ethread.c
index c8c2107..693270b 100644
--- a/libaom/av1/encoder/ethread.c
+++ b/libaom/av1/encoder/ethread.c
@@ -16,7 +16,7 @@
 #include "av1/encoder/rdopt.h"
 #include "aom_dsp/aom_dsp_common.h"
 
-static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   for (int i = 0; i < REFERENCE_MODES; i++)
     td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
 
@@ -27,17 +27,32 @@
   td->rd_counts.compound_ref_used_flag |=
       td_t->rd_counts.compound_ref_used_flag;
   td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
+
+  for (int i = 0; i < TX_SIZES_ALL; i++) {
+    for (int j = 0; j < TX_TYPES; j++)
+      td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j];
+  }
+
+  for (int i = 0; i < BLOCK_SIZES_ALL; i++) {
+    for (int j = 0; j < 2; j++) {
+      td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j];
+    }
+  }
+
+  for (int i = 0; i < 2; i++) {
+    td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i];
+  }
 }
 
-static void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
+static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
   const int mib_size = cm->seq_params.mib_size;
   const int frame_lf_count =
       av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-  for (int row = 0; row < cm->tile_rows; row++) {
-    for (int col = 0; col < cm->tile_cols; col++) {
-      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tile_cols + col];
+  for (int row = 0; row < cm->tiles.rows; row++) {
+    for (int col = 0; col < cm->tiles.cols; col++) {
+      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col];
       const TileInfo *const tile_info = &tile_data->tile_info;
       for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
            mi_row += mib_size) {
@@ -45,8 +60,8 @@
           av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
         for (int mi_col = tile_info->mi_col_start;
              mi_col < tile_info->mi_col_end; mi_col += mib_size) {
-          const int idx_str = cm->mi_stride * mi_row + mi_col;
-          MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+          const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
+          MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
           MB_MODE_INFO *mbmi = mi[0];
           if (mbmi->skip == 1 && (mbmi->sb_type == cm->seq_params.sb_size)) {
             for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
@@ -194,8 +209,8 @@
   }
 }
 
-static void assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
-                                  int num_tiles, int num_workers) {
+static AOM_INLINE void assign_tile_to_thread(
+    MultiThreadHandle *multi_thread_ctxt, int num_tiles, int num_workers) {
   int tile_id = 0;
   int i;
 
@@ -220,12 +235,13 @@
   return 0;
 }
 
-static void switch_tile_and_get_next_job(AV1_COMP *const cpi, int *cur_tile_id,
-                                         int *current_mi_row,
-                                         int *end_of_frame) {
+static AOM_INLINE void switch_tile_and_get_next_job(AV1_COMP *const cpi,
+                                                    int *cur_tile_id,
+                                                    int *current_mi_row,
+                                                    int *end_of_frame) {
   AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
 
   int tile_id = -1;  // Stores the tile ID with minimum proc done
   int max_mis_to_encode = 0;
@@ -326,12 +342,8 @@
       memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
     }
 
-    av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
-
-    // Disable exhaustive search speed features for row based multi-threading of
-    // encoder.
-    td->mb.m_search_count_ptr = NULL;
-    td->mb.ex_search_count_ptr = NULL;
+    av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+                           &td->mb.e_mbd);
 
     cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
     av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
@@ -353,8 +365,8 @@
   EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   AV1_COMP *const cpi = thread_data->cpi;
   const AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   int t;
 
   (void)unused;
@@ -365,7 +377,7 @@
     int tile_col = t % tile_cols;
 
     TileDataEnc *const this_tile =
-        &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
+        &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
     thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
     thread_data->td->mb.tile_pb_ctx = &this_tile->tctx;
     av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
@@ -374,9 +386,10 @@
   return 1;
 }
 
-static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
+static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
   AV1_COMMON *const cm = &cpi->common;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int sb_mi_size = av1_get_sb_mi_size(cm);
 
   CHECK_MEM_ERROR(cm, cpi->workers,
                   aom_malloc(num_workers * sizeof(*cpi->workers)));
@@ -385,7 +398,7 @@
                   aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
 
 #if CONFIG_MULTITHREAD
-  if (cpi->row_mt == 1) {
+  if (cpi->oxcf.row_mt == 1) {
     if (cpi->row_mt_mutex_ == NULL) {
       CHECK_MEM_ERROR(cm, cpi->row_mt_mutex_,
                       aom_malloc(sizeof(*(cpi->row_mt_mutex_))));
@@ -413,7 +426,7 @@
 
       // Set up pc_tree.
       thread_data->td->pc_tree = NULL;
-      av1_setup_pc_tree(cm, thread_data->td);
+      av1_setup_pc_tree(cpi, thread_data->td);
 
       CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
                       (uint8_t *)aom_memalign(
@@ -454,6 +467,8 @@
           cm, thread_data->td->palette_buffer,
           aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
 
+      av1_alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer);
+
       CHECK_MEM_ERROR(
           cm, thread_data->td->tmp_conv_dst,
           aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
@@ -465,6 +480,18 @@
                                  sizeof(*thread_data->td->tmp_obmc_bufs[j])));
       }
 
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->mbmi_ext,
+          aom_calloc(sb_mi_size, sizeof(*thread_data->td->mbmi_ext)));
+
+      if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+        const int num_64x64_blocks =
+            (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+        CHECK_MEM_ERROR(
+            cm, thread_data->td->vt64x64,
+            aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
+      }
+
       // Create threads
       if (!winterface->reset(worker))
         aom_internal_error(&cm->error, AOM_CODEC_ERROR,
@@ -473,7 +500,7 @@
       // Main thread acts as a worker and uses the thread data in cpi.
       thread_data->td = &cpi->td;
     }
-    if (cpi->row_mt == 1)
+    if (cpi->oxcf.row_mt == 1)
       CHECK_MEM_ERROR(
           cm, thread_data->td->tctx,
           (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
@@ -481,7 +508,7 @@
   }
 }
 
-static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
+static AOM_INLINE void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   // Encode a frame
   for (int i = num_workers - 1; i >= 0; i--) {
@@ -498,7 +525,7 @@
   }
 }
 
-static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
+static AOM_INLINE void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int had_error = 0;
 
@@ -513,11 +540,14 @@
                        "Failed to encode tile data");
 }
 
-static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
+static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
+                                                       int num_workers) {
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
     cpi->intrabc_used |= thread_data->td->intrabc_used;
+    cpi->deltaq_used |= thread_data->td->deltaq_used;
+
     // Accumulate counters.
     if (i > 0) {
       av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
@@ -530,8 +560,8 @@
   }
 }
 
-static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
-                                int num_workers) {
+static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                           int num_workers) {
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
@@ -541,6 +571,7 @@
     worker->data2 = NULL;
 
     thread_data->td->intrabc_used = 0;
+    thread_data->td->deltaq_used = 0;
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -554,14 +585,15 @@
       for (int x = 0; x < 2; x++) {
         for (int y = 0; y < 2; y++) {
           memcpy(thread_data->td->hash_value_buffer[x][y],
-                 cpi->td.mb.hash_value_buffer[x][y],
+                 cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
                  AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
                      sizeof(*thread_data->td->hash_value_buffer[0][0]));
-          thread_data->td->mb.hash_value_buffer[x][y] =
+          thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] =
               thread_data->td->hash_value_buffer[x][y];
         }
       }
       thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
+      thread_data->td->mb.mbmi_ext = thread_data->td->mbmi_ext;
     }
     if (thread_data->td->counts != &cpi->counts) {
       memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
@@ -569,6 +601,7 @@
 
     if (i > 0) {
       thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
+      thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {
         thread_data->td->mb.tmp_obmc_bufs[j] =
@@ -586,8 +619,8 @@
 
 void av1_encode_tiles_mt(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows);
 
   if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
@@ -620,8 +653,8 @@
 
 void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
   int num_workers = 0;
   int total_num_threads_row_mt = 0;
@@ -636,7 +669,7 @@
 
   for (int row = 0; row < tile_rows; row++) {
     for (int col = 0; col < tile_cols; col++) {
-      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tile_cols + col];
+      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col];
       int num_sb_rows_in_tile =
           av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
       int num_sb_cols_in_tile =
@@ -678,8 +711,6 @@
       av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
                              this_tile->tile_info.mi_col_start,
                              this_tile->tile_info.mi_col_end, tile_row);
-      this_tile->m_search_count = 0;   // Count of motion search hits.
-      this_tile->ex_search_count = 0;  // Exhaustive mesh search hits.
     }
   }
 
diff --git a/libaom/av1/encoder/extend.c b/libaom/av1/encoder/extend.c
index e9621a5..934cf56 100644
--- a/libaom/av1/encoder/extend.c
+++ b/libaom/av1/encoder/extend.c
@@ -103,18 +103,14 @@
 void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst) {
   // Extend src frame in buffer
-  // Altref filtering assumes 16 pixel extension
-  const int et_y = 16;
-  const int el_y = 16;
-  // Motion estimation may use src block variance with the block size up
-  // to 64x64, so the right and bottom need to be extended to 64 multiple
-  // or up to 16, whichever is greater.
+  const int et_y = dst->border;
+  const int el_y = dst->border;
   const int er_y =
-      AOMMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+      AOMMAX(src->y_width + dst->border, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
       src->y_crop_width;
-  const int eb_y =
-      AOMMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
-      src->y_crop_height;
+  const int eb_y = AOMMAX(src->y_height + dst->border,
+                          ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+                   src->y_crop_height;
   const int uv_width_subsampling = (src->uv_width != src->y_width);
   const int uv_height_subsampling = (src->uv_height != src->y_height);
   const int et_uv = et_y >> uv_height_subsampling;
@@ -126,63 +122,30 @@
     highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                                  dst->y_stride, src->y_crop_width,
                                  src->y_crop_height, et_y, el_y, eb_y, er_y);
-
-    highbd_copy_and_extend_plane(
-        src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
-        src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
-
-    highbd_copy_and_extend_plane(
-        src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
-        src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+    if (src->u_buffer) {
+      highbd_copy_and_extend_plane(
+          src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+          src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+    }
+    if (src->v_buffer) {
+      highbd_copy_and_extend_plane(
+          src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+          src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+    }
     return;
   }
 
   copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                         dst->y_stride, src->y_crop_width, src->y_crop_height,
                         et_y, el_y, eb_y, er_y);
-
-  copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
-                        dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
-                        et_uv, el_uv, eb_uv, er_uv);
-
-  copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
-                        dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
-                        et_uv, el_uv, eb_uv, er_uv);
-}
-
-void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst, int srcy,
-                                         int srcx, int srch, int srcw) {
-  // If the side is not touching the bounder then don't extend.
-  const int et_y = srcy ? 0 : dst->border;
-  const int el_y = srcx ? 0 : dst->border;
-  const int eb_y = srcy + srch != src->y_height
-                       ? 0
-                       : dst->border + dst->y_height - src->y_height;
-  const int er_y = srcx + srcw != src->y_width
-                       ? 0
-                       : dst->border + dst->y_width - src->y_width;
-  const int src_y_offset = srcy * src->y_stride + srcx;
-  const int dst_y_offset = srcy * dst->y_stride + srcx;
-
-  const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
-  const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
-  const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
-  const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
-  const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
-  const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
-  const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
-  const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
-
-  copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
-                        dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
-                        et_y, el_y, eb_y, er_y);
-
-  copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
-                        dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
-                        srch_uv, et_uv, el_uv, eb_uv, er_uv);
-
-  copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
-                        dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
-                        srch_uv, et_uv, el_uv, eb_uv, er_uv);
+  if (src->u_buffer) {
+    copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+                          dst->uv_stride, src->uv_crop_width,
+                          src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+  }
+  if (src->v_buffer) {
+    copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
+                          dst->uv_stride, src->uv_crop_width,
+                          src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+  }
 }
diff --git a/libaom/av1/encoder/extend.h b/libaom/av1/encoder/extend.h
index e0432cc..b8cc5b9 100644
--- a/libaom/av1/encoder/extend.h
+++ b/libaom/av1/encoder/extend.h
@@ -22,9 +22,6 @@
 void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst);
 
-void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst, int srcy,
-                                         int srcx, int srch, int srcw);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/firstpass.c b/libaom/av1/encoder/firstpass.c
index f6a0fb2..0955510 100644
--- a/libaom/av1/encoder/firstpass.c
+++ b/libaom/av1/encoder/firstpass.c
@@ -17,13 +17,13 @@
 #include "config/aom_scale_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/variance.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
 #include "aom_scale/yv12config.h"
 
-#include "aom_dsp/variance.h"
 #include "av1/common/entropymv.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"  // av1_setup_dst_planes()
@@ -53,13 +53,13 @@
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
 
-static void output_stats(FIRSTPASS_STATS *stats,
-                         struct aom_codec_pkt_list *pktlist) {
+static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats,
+                                    struct aom_codec_pkt_list *pktlist) {
   struct aom_codec_cx_pkt pkt;
   pkt.kind = AOM_CODEC_STATS_PKT;
   pkt.data.twopass_stats.buf = stats;
   pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
-  aom_codec_pkt_list_add(pktlist, &pkt);
+  if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt);
 
 // TEMP debug code
 #if OUTPUT_FPF
@@ -109,8 +109,8 @@
   section->duration = 1.0;
 }
 
-static void accumulate_stats(FIRSTPASS_STATS *section,
-                             const FIRSTPASS_STATS *frame) {
+static AOM_INLINE void accumulate_stats(FIRSTPASS_STATS *section,
+                                        const FIRSTPASS_STATS *frame) {
   section->frame += frame->frame;
   section->weight += frame->weight;
   section->intra_error += frame->intra_error;
@@ -136,12 +136,9 @@
   section->duration += frame->duration;
 }
 
-void av1_init_first_pass(AV1_COMP *cpi) {
-  av1_twopass_zero_stats(&cpi->twopass.total_stats);
-}
-
 void av1_end_first_pass(AV1_COMP *cpi) {
-  output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
+  if (cpi->twopass.stats_buf_ctx->total_stats)
+    output_stats(cpi->twopass.stats_buf_ctx->total_stats, cpi->output_pkt_list);
 }
 
 static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
@@ -162,6 +159,7 @@
   return sse;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
                                                       int bd) {
   switch (bd) {
@@ -201,6 +199,7 @@
   fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
   return sse;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 // Refine the motion search range according to the frame dimension
 // for first pass test.
@@ -212,78 +211,52 @@
   return sr;
 }
 
-static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
-                                     const MV *ref_mv, MV *best_mv,
-                                     int *best_motion_err) {
+static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+                                                const MV *ref_mv,
+                                                FULLPEL_MV *best_mv,
+                                                int *best_motion_err) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MV tmp_mv = kZeroMv;
-  MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
-  int num00, tmp_err, n;
+  FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
+  int tmp_err;
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
-
-  int step_param = 3;
-  int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   const int sr = get_search_range(cpi);
-  step_param += sr;
-  further_steps -= sr;
+  const int step_param = 3 + sr;
 
-  // Override the default variance function to use MSE.
-  v_fn_ptr.vf = get_block_variance_fn(bsize);
-  if (is_cur_buf_hbd(xd)) {
-    v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+  const search_site_config *first_pass_search_sites =
+      &cpi->mv_search_params.ss_cfg[SS_CFG_FPF];
+  FULLPEL_MOTION_SEARCH_PARAMS ms_params;
+  av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
+                                     first_pass_search_sites);
+  ms_params.search_method = NSTEP;
+
+  FULLPEL_MV this_best_mv;
+  tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL,
+                                  &this_best_mv, NULL);
+
+  if (tmp_err < INT_MAX) {
+    tmp_err = av1_get_mvpred_sse(x, &this_best_mv, ref_mv, &v_fn_ptr) +
+              new_mv_mode_penalty;
   }
 
-  // Center the initial step/diamond search on best mv.
-  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg[SS_CFG_SRC], &ref_mv_full,
-                                    &tmp_mv, step_param, x->sadperbit16, &num00,
-                                    &v_fn_ptr, ref_mv);
-  if (tmp_err < INT_MAX)
-    tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
-  if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
-
   if (tmp_err < *best_motion_err) {
     *best_motion_err = tmp_err;
-    *best_mv = tmp_mv;
-  }
-
-  // Carry out further step/diamond searches as necessary.
-  n = num00;
-  num00 = 0;
-
-  while (n < further_steps) {
-    ++n;
-
-    if (num00) {
-      --num00;
-    } else {
-      tmp_err = cpi->diamond_search_sad(
-          x, &cpi->ss_cfg[SS_CFG_SRC], &ref_mv_full, &tmp_mv, step_param + n,
-          x->sadperbit16, &num00, &v_fn_ptr, ref_mv);
-      if (tmp_err < INT_MAX)
-        tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
-      if (tmp_err < INT_MAX - new_mv_mode_penalty)
-        tmp_err += new_mv_mode_penalty;
-
-      if (tmp_err < *best_motion_err) {
-        *best_motion_err = tmp_err;
-        *best_mv = tmp_mv;
-      }
-    }
+    *best_mv = this_best_mv;
   }
 }
 
-static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) {
+static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params,
+                            int mb_row, int mb_col) {
   if (mi_size_wide[BLOCK_16X16] * mb_col + mi_size_wide[BLOCK_8X8] <
-      cm->mi_cols) {
+      mi_params->mi_cols) {
     return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] <
-                   cm->mi_rows
+                   mi_params->mi_rows
                ? BLOCK_16X16
                : BLOCK_16X8;
   } else {
     return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] <
-                   cm->mi_rows
+                   mi_params->mi_rows
                ? BLOCK_8X16
                : BLOCK_8X8;
   }
@@ -316,416 +289,716 @@
   return raw_err_stdev;
 }
 
+// This structure contains several key parameters to be accumulated for this
+// frame.
+typedef struct {
+  // Intra prediction error.
+  int64_t intra_error;
+  // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+  int64_t frame_avg_wavelet_energy;
+  // Best of intra pred error and inter pred error using last frame as ref.
+  int64_t coded_error;
+  // Best of intra pred error and inter pred error using golden frame as ref.
+  int64_t sr_coded_error;
+  // Best of intra pred error and inter pred error using altref frame as ref.
+  int64_t tr_coded_error;
+  // Count of motion vector.
+  int mv_count;
+  // Count of blocks that pick inter prediction (inter pred error is smaller
+  // than intra pred error).
+  int inter_count;
+  // Count of blocks that pick second ref (golden frame).
+  int second_ref_count;
+  // Count of blocks that pick third ref (altref frame).
+  int third_ref_count;
+  // Count of blocks where the inter and intra are very close and very low.
+  double neutral_count;
+  // Count of blocks where intra error is very small.
+  int intra_skip_count;
+  // Start row.
+  int image_data_start_row;
+  // Count of unique non-zero motion vectors.
+  int new_mv_count;
+  // Sum of inward motion vectors.
+  int sum_in_vectors;
+  // Sum of motion vector row.
+  int sum_mvr;
+  // Sum of motion vector column.
+  int sum_mvc;
+  // Sum of absolute value of motion vector row.
+  int sum_mvr_abs;
+  // Sum of absolute value of motion vector column.
+  int sum_mvc_abs;
+  // Sum of the square of motion vector row.
+  int64_t sum_mvrs;
+  // Sum of the square of motion vector column.
+  int64_t sum_mvcs;
+  // A factor calculated using intra pred error.
+  double intra_factor;
+  // A factor that measures brightness.
+  double brightness_factor;
+} FRAME_STATS;
+
 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
+// Computes and returns the intra pred error of a block.
+// intra pred error: sum of squared error of the intra predicted residual.
+// Inputs:
+//   cpi: the encoder setting. Only a few params in it will be used.
+//   this_frame: the current frame buffer.
+//   tile: tile information (not used in first pass, already init to zero)
+//   mb_row: row index in the unit of first pass block size.
+//   mb_col: column index in the unit of first pass block size.
+//   y_offset: the offset of y frame buffer, indicating the starting point of
+//             the current block.
+//   uv_offset: the offset of u and v frame buffer, indicating the starting
+//              point of the current block.
+//   fp_block_size: first pass block size.
+//   qindex: quantization step size to encode the frame.
+//   stats: frame encoding stats.
+// Modifies:
+//   stats->intra_skip_count
+//   stats->image_data_start_row
+//   stats->intra_factor
+//   stats->brightness_factor
+//   stats->intra_error
+//   stats->frame_avg_wavelet_energy
+// Returns:
+//   this_intra_error.
+static int firstpass_intra_prediction(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *const this_frame,
+    const TileInfo *const tile, const int mb_row, const int mb_col,
+    const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size,
+    const int qindex, FRAME_STATS *const stats) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mb_scale = mi_size_wide[fp_block_size];
+  const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col);
+
+  aom_clear_system_state();
+  set_mi_offsets(mi_params, xd, mb_row * mb_scale, mb_col * mb_scale);
+  xd->plane[0].dst.buf = this_frame->y_buffer + y_offset;
+  xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
+  xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset;
+  xd->left_available = (mb_col != 0);
+  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+  set_mi_row_col(xd, tile, mb_row * mb_scale, mi_size_high[bsize],
+                 mb_col * mb_scale, mi_size_wide[bsize], mi_params->mi_rows,
+                 mi_params->mi_cols);
+  set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
+  xd->mi[0]->segment_id = 0;
+  xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
+  xd->mi[0]->mode = DC_PRED;
+  xd->mi[0]->tx_size =
+      use_dc_pred ? (bsize >= fp_block_size ? TX_16X16 : TX_8X8) : TX_4X4;
+
+  av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
+  int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
+
+  if (this_intra_error < UL_INTRA_THRESH) {
+    ++stats->intra_skip_count;
+  } else if ((mb_col > 0) && (stats->image_data_start_row == INVALID_ROW)) {
+    stats->image_data_start_row = mb_row;
+  }
+
+  if (seq_params->use_highbitdepth) {
+    switch (seq_params->bit_depth) {
+      case AOM_BITS_8: break;
+      case AOM_BITS_10: this_intra_error >>= 4; break;
+      case AOM_BITS_12: this_intra_error >>= 8; break;
+      default:
+        assert(0 &&
+               "seq_params->bit_depth should be AOM_BITS_8, "
+               "AOM_BITS_10 or AOM_BITS_12");
+        return -1;
+    }
+  }
+
+  aom_clear_system_state();
+  double log_intra = log(this_intra_error + 1.0);
+  if (log_intra < 10.0) {
+    stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+  } else {
+    stats->intra_factor += 1.0;
+  }
+
+  int level_sample;
+  if (seq_params->use_highbitdepth) {
+    level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+  } else {
+    level_sample = x->plane[0].src.buf[0];
+  }
+  if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+    stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+  } else {
+    stats->brightness_factor += 1.0;
+  }
+
+  // Intrapenalty below deals with situations where the intra and inter
+  // error scores are very low (e.g. a plain black frame).
+  // We do not have special cases in first pass for 0,0 and nearest etc so
+  // all inter modes carry an overhead cost estimate for the mv.
+  // When the error score is very low this causes us to pick all or lots of
+  // INTRA modes and throw lots of key frames.
+  // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+  this_intra_error += INTRA_MODE_PENALTY;
+
+  // Accumulate the intra error.
+  stats->intra_error += (int64_t)this_intra_error;
+
+  const int hbd = is_cur_buf_hbd(xd);
+  const int stride = x->plane[0].src.stride;
+  uint8_t *buf = x->plane[0].src.buf;
+  for (int r8 = 0; r8 < 2; ++r8) {
+    for (int c8 = 0; c8 < 2; ++c8) {
+      stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
+          buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
+    }
+  }
+
+  return this_intra_error;
+}
+
+// Returns the sum of square error between source and reference blocks.
+static int get_prediction_error_bitdepth(const int is_high_bitdepth,
+                                         const int bitdepth,
+                                         const BLOCK_SIZE block_size,
+                                         const struct buf_2d *src,
+                                         const struct buf_2d *ref) {
+  (void)is_high_bitdepth;
+  (void)bitdepth;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_high_bitdepth) {
+    return highbd_get_prediction_error(block_size, src, ref, bitdepth);
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  return get_prediction_error(block_size, src, ref);
+}
+
+// Accumulates motion vector stats.
+// Modifies member variables of "stats".
+static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
+                                const int mb_row, const int mb_col,
+                                const int mb_rows, const int mb_cols,
+                                MV *last_mv, FRAME_STATS *stats) {
+  if (is_zero_mv(&best_mv)) return;
+
+  ++stats->mv_count;
+  // Non-zero vector, was it different from the last non zero vector?
+  if (!is_equal_mv(&best_mv, last_mv)) ++stats->new_mv_count;
+  *last_mv = best_mv;
+
+  // Does the row vector point inwards or outwards?
+  if (mb_row < mb_rows / 2) {
+    if (mv.row > 0) {
+      --stats->sum_in_vectors;
+    } else if (mv.row < 0) {
+      ++stats->sum_in_vectors;
+    }
+  } else if (mb_row > mb_rows / 2) {
+    if (mv.row > 0) {
+      ++stats->sum_in_vectors;
+    } else if (mv.row < 0) {
+      --stats->sum_in_vectors;
+    }
+  }
+
+  // Does the col vector point inwards or outwards?
+  if (mb_col < mb_cols / 2) {
+    if (mv.col > 0) {
+      --stats->sum_in_vectors;
+    } else if (mv.col < 0) {
+      ++stats->sum_in_vectors;
+    }
+  } else if (mb_col > mb_cols / 2) {
+    if (mv.col > 0) {
+      ++stats->sum_in_vectors;
+    } else if (mv.col < 0) {
+      --stats->sum_in_vectors;
+    }
+  }
+}
+
+#define LOW_MOTION_ERROR_THRESH 25
+// Computes and returns the inter prediction error from the last frame.
+// Computes inter prediction errors from the golden and alt ref frames and
+// updates stats accordingly.
+// Inputs:
+//   cpi: the encoder setting. Only a few params in it will be used.
+//   last_frame: the frame buffer of the last frame.
+//   golden_frame: the frame buffer of the golden frame.
+//   alt_ref_frame: the frame buffer of the alt ref frame.
+//   mb_row: row index in the unit of first pass block size.
+//   mb_col: column index in the unit of first pass block size.
+//   recon_yoffset: the y offset of the reconstructed frame buffer,
+//                  indicating the starting point of the current block.
+//   recon_uvoffset: the u/v offset of the reconstructed frame buffer,
+//                    indicating the starting point of the current block.
+//   src_yoffset: the y offset of the source frame buffer.
+//   alt_ref_frame_yoffset: the y offset of the alt ref frame buffer.
+//   fp_block_size: first pass block size.
+//   this_intra_error: the intra prediction error of this block.
+//   raw_motion_err_counts: the count of raw motion vectors.
+//   raw_motion_err_list: the array that records the raw motion error.
+//   best_ref_mv: best reference mv found so far.
+//   last_mv: last mv.
+//   stats: frame encoding stats.
+//  Modifies:
+//    raw_motion_err_list
+//    best_ref_mv
+//    last_mv
+//    stats: many member params in it.
+//  Returns:
+//    this_inter_error
+static int firstpass_inter_prediction(
+    AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const last_frame,
+    const YV12_BUFFER_CONFIG *const golden_frame,
+    const YV12_BUFFER_CONFIG *const alt_ref_frame, const int mb_row,
+    const int mb_col, const int recon_yoffset, const int recon_uvoffset,
+    const int src_yoffset, const int alt_ref_frame_yoffset,
+    const BLOCK_SIZE fp_block_size, const int this_intra_error,
+    const int raw_motion_err_counts, int *raw_motion_err_list, MV *best_ref_mv,
+    MV *last_mv, FRAME_STATS *stats) {
+  int this_inter_error = this_intra_error;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int is_high_bitdepth = is_cur_buf_hbd(xd);
+  const int bitdepth = xd->bd;
+  const int mb_scale = mi_size_wide[fp_block_size];
+  const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col);
+  const int fp_block_size_height = block_size_wide[fp_block_size];
+  // Assume 0,0 motion with no mv overhead.
+  FULLPEL_MV mv = kZeroFullMv;
+  FULLPEL_MV tmp_mv = kZeroFullMv;
+  xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+  // Set up limit values for motion vectors to prevent them extending
+  // outside the UMV borders.
+  av1_set_mv_col_limits(mi_params, &x->mv_limits, (mb_col << 2),
+                        (fp_block_size_height >> MI_SIZE_LOG2),
+                        cpi->oxcf.border_in_pixels);
+
+  int motion_error =
+      get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+                                    &x->plane[0].src, &xd->plane[0].pre[0]);
+
+  // Compute the motion error of the 0,0 motion using the last source
+  // frame as the reference. Skip the further motion search on
+  // reconstructed frame if this error is small.
+  struct buf_2d unscaled_last_source_buf_2d;
+  unscaled_last_source_buf_2d.buf =
+      cpi->unscaled_last_source->y_buffer + src_yoffset;
+  unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
+  const int raw_motion_error = get_prediction_error_bitdepth(
+      is_high_bitdepth, bitdepth, bsize, &x->plane[0].src,
+      &unscaled_last_source_buf_2d);
+  raw_motion_err_list[raw_motion_err_counts] = raw_motion_error;
+
+  // TODO(pengchong): Replace the hard-coded threshold
+  if (raw_motion_error > LOW_MOTION_ERROR_THRESH) {
+    // Test last reference frame using the previous best mv as the
+    // starting point (best reference) for the search.
+    first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error);
+
+    // If the current best reference mv is not centered on 0,0 then do a
+    // 0,0 based search as well.
+    if (!is_zero_mv(best_ref_mv)) {
+      int tmp_err = INT_MAX;
+      first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
+
+      if (tmp_err < motion_error) {
+        motion_error = tmp_err;
+        mv = tmp_mv;
+      }
+    }
+
+    // Motion search in 2nd reference frame.
+    int gf_motion_error = motion_error;
+    if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+      // Assume 0,0 motion with no mv overhead.
+      xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset;
+      xd->plane[0].pre[0].stride = golden_frame->y_stride;
+      gf_motion_error =
+          get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+                                        &x->plane[0].src, &xd->plane[0].pre[0]);
+      first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error);
+    }
+    if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) {
+      ++stats->second_ref_count;
+    }
+    // In accumulating a score for the 2nd reference frame take the
+    // best of the motion predicted score and the intra coded error
+    // (just as will be done for) accumulation of "coded_error" for
+    // the last frame.
+    if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+      stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error);
+    } else {
+      // TODO(chengchen): I believe logically this should also be changed to
+      // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error).
+      stats->sr_coded_error += motion_error;
+    }
+
+    // Motion search in 3rd reference frame.
+    int alt_motion_error = motion_error;
+    if (alt_ref_frame != NULL) {
+      xd->plane[0].pre[0].buf = alt_ref_frame->y_buffer + alt_ref_frame_yoffset;
+      xd->plane[0].pre[0].stride = alt_ref_frame->y_stride;
+      alt_motion_error =
+          get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+                                        &x->plane[0].src, &xd->plane[0].pre[0]);
+      first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error);
+    }
+    if (alt_motion_error < motion_error && alt_motion_error < gf_motion_error &&
+        alt_motion_error < this_intra_error) {
+      ++stats->third_ref_count;
+    }
+    // In accumulating a score for the 3rd reference frame take the
+    // best of the motion predicted score and the intra coded error
+    // (just as will be done for) accumulation of "coded_error" for
+    // the last frame.
+    if (alt_ref_frame != NULL) {
+      stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error);
+    } else {
+      // TODO(chengchen): I believe logically this should also be changed to
+      // stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error).
+      stats->tr_coded_error += motion_error;
+    }
+
+    // Reset to last frame as reference buffer.
+    xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+    xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset;
+    xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset;
+  } else {
+    stats->sr_coded_error += motion_error;
+    stats->tr_coded_error += motion_error;
+  }
+
+  // Start by assuming that intra mode is best.
+  best_ref_mv->row = 0;
+  best_ref_mv->col = 0;
+
+  if (motion_error <= this_intra_error) {
+    aom_clear_system_state();
+
+    // Keep a count of cases where the inter and intra were very close
+    // and very low. This helps with scene cut detection for example in
+    // cropped clips with black bars at the sides or top and bottom.
+    if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) &&
+        (this_intra_error < (2 * INTRA_MODE_PENALTY))) {
+      stats->neutral_count += 1.0;
+      // Also track cases where the intra is not much worse than the inter
+      // and use this in limiting the GF/arf group length.
+    } else if ((this_intra_error > NCOUNT_INTRA_THRESH) &&
+               (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+      stats->neutral_count +=
+          (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error);
+    }
+
+    const MV best_mv = get_mv_from_fullmv(&mv);
+    this_inter_error = motion_error;
+    xd->mi[0]->mode = NEWMV;
+    xd->mi[0]->mv[0].as_mv = best_mv;
+    xd->mi[0]->tx_size = TX_4X4;
+    xd->mi[0]->ref_frame[0] = LAST_FRAME;
+    xd->mi[0]->ref_frame[1] = NONE_FRAME;
+    av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale, mb_col * mb_scale,
+                                  NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y);
+    av1_encode_sby_pass1(cpi, x, bsize);
+    stats->sum_mvr += best_mv.row;
+    stats->sum_mvr_abs += abs(best_mv.row);
+    stats->sum_mvc += best_mv.col;
+    stats->sum_mvc_abs += abs(best_mv.col);
+    stats->sum_mvrs += best_mv.row * best_mv.row;
+    stats->sum_mvcs += best_mv.col * best_mv.col;
+    ++stats->inter_count;
+
+    *best_ref_mv = best_mv;
+    accumulate_mv_stats(best_mv, mv, mb_row, mb_col, mi_params->mb_rows,
+                        mi_params->mb_cols, last_mv, stats);
+  }
+
+  return this_inter_error;
+}
+
+// Updates the first pass stats of this frame.
+// Input:
+//   cpi: the encoder setting. Only a few params in it will be used.
+//   stats: stats accumulated for this frame.
+//   raw_err_stdev: the standard deviation for the motion error of all the
+//                  inter blocks of the (0,0) motion using the last source
+//                  frame as the reference.
+//   frame_number: current frame number.
+//   ts_duration: Duration of the frame / collection of frames.
+// Updates:
+//   twopass->total_stats: the accumulated stats.
+//   twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats,
+//                                         update its value and its position
+//                                         in the buffer.
+static void update_firstpass_stats(AV1_COMP *cpi,
+                                   const FRAME_STATS *const stats,
+                                   const double raw_err_stdev,
+                                   const int frame_number,
+                                   const int64_t ts_duration) {
+  TWO_PASS *twopass = &cpi->twopass;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+  FIRSTPASS_STATS fps;
+  // The minimum error here ensures some bit allocation to frames even
+  // in static regions. The allocation per MB declines for larger formats
+  // where the typical "real" energy per MB also falls.
+  // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+  // number of mbs is proportional to the image area.
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : mi_params->MBs;
+  const double min_err = 200 * sqrt(num_mbs);
+
+  fps.weight = stats->intra_factor * stats->brightness_factor;
+  fps.frame = frame_number;
+  fps.coded_error = (double)(stats->coded_error >> 8) + min_err;
+  fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err;
+  fps.tr_coded_error = (double)(stats->tr_coded_error >> 8) + min_err;
+  fps.intra_error = (double)(stats->intra_error >> 8) + min_err;
+  fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy;
+  fps.count = 1.0;
+  fps.pcnt_inter = (double)stats->inter_count / num_mbs;
+  fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs;
+  fps.pcnt_third_ref = (double)stats->third_ref_count / num_mbs;
+  fps.pcnt_neutral = (double)stats->neutral_count / num_mbs;
+  fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs;
+  fps.inactive_zone_rows = (double)stats->image_data_start_row;
+  fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
+  fps.raw_error_stdev = raw_err_stdev;
+
+  if (stats->mv_count > 0) {
+    fps.MVr = (double)stats->sum_mvr / stats->mv_count;
+    fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count;
+    fps.MVc = (double)stats->sum_mvc / stats->mv_count;
+    fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count;
+    fps.MVrv = ((double)stats->sum_mvrs -
+                ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) /
+               stats->mv_count;
+    fps.MVcv = ((double)stats->sum_mvcs -
+                ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) /
+               stats->mv_count;
+    fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2);
+    fps.new_mv_count = stats->new_mv_count;
+    fps.pcnt_motion = (double)stats->mv_count / num_mbs;
+  } else {
+    fps.MVr = 0.0;
+    fps.mvr_abs = 0.0;
+    fps.MVc = 0.0;
+    fps.mvc_abs = 0.0;
+    fps.MVrv = 0.0;
+    fps.MVcv = 0.0;
+    fps.mv_in_out_count = 0.0;
+    fps.new_mv_count = 0.0;
+    fps.pcnt_motion = 0.0;
+  }
+
+  // TODO(paulwilkins):  Handle the case when duration is set to 0, or
+  // something less than the full time between subsequent values of
+  // cpi->source_time_stamp.
+  fps.duration = (double)ts_duration;
+
+  // We will store the stats inside the persistent twopass struct (and NOT the
+  // local variable 'fps'), and then cpi->output_pkt_list will point to it.
+  *this_frame_stats = fps;
+  output_stats(this_frame_stats, cpi->output_pkt_list);
+  if (cpi->twopass.stats_buf_ctx->total_stats != NULL) {
+    accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps);
+  }
+  /*In the case of two pass, first pass uses it as a circular buffer,
+   * when LAP is enabled it is used as a linear buffer*/
+  twopass->stats_buf_ctx->stats_in_end++;
+  if ((cpi->oxcf.pass == 1) && (twopass->stats_buf_ctx->stats_in_end >=
+                                twopass->stats_buf_ctx->stats_in_buf_end)) {
+    twopass->stats_buf_ctx->stats_in_end =
+        twopass->stats_buf_ctx->stats_in_start;
+  }
+}
+
+static void print_reconstruction_frame(
+    const YV12_BUFFER_CONFIG *const last_frame, int frame_number,
+    int do_print) {
+  if (!do_print) return;
+
+  char filename[512];
+  FILE *recon_file;
+  snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number);
+
+  if (frame_number == 0) {
+    recon_file = fopen(filename, "wb");
+  } else {
+    recon_file = fopen(filename, "ab");
+  }
+
+  fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file);
+  fclose(recon_file);
+}
+
+#define FIRST_PASS_ALT_REF_DISTANCE 16
 void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
-  int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->td.mb;
   AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CurrentFrame *const current_frame = &cm->current_frame;
   const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
-  TileInfo tile;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx =
-      &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
-  int i;
-
-  int recon_yoffset, src_yoffset, recon_uvoffset;
-  int64_t intra_error = 0;
-  int64_t frame_avg_wavelet_energy = 0;
-  int64_t coded_error = 0;
-  int64_t sr_coded_error = 0;
-
-  int sum_mvr = 0, sum_mvc = 0;
-  int sum_mvr_abs = 0, sum_mvc_abs = 0;
-  int64_t sum_mvrs = 0, sum_mvcs = 0;
-  int mvcount = 0;
-  int intercount = 0;
-  int second_ref_count = 0;
-  const int intrapenalty = INTRA_MODE_PENALTY;
-  double neutral_count;
-  int intra_skip_count = 0;
-  int image_data_start_row = INVALID_ROW;
-  int new_mv_count = 0;
-  int sum_in_vectors = 0;
-  MV lastmv = kZeroMv;
-  TWO_PASS *twopass = &cpi->twopass;
-  int recon_y_stride, src_y_stride, recon_uv_stride, uv_mb_height;
-
-  const YV12_BUFFER_CONFIG *const lst_yv12 =
-      get_ref_frame_yv12_buf(cm, LAST_FRAME);
-  const YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
-  YV12_BUFFER_CONFIG *const new_yv12 = &cm->cur_frame->buf;
-  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
-  double intra_factor;
-  double brightness_factor;
+  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+  MV last_mv = kZeroMv;
   const int qindex = find_fp_qindex(seq_params->bit_depth);
-  const int mb_scale = mi_size_wide[BLOCK_16X16];
-
+  // Detect if the key frame is screen content type.
+  if (frame_is_intra_only(cm)) {
+    FeatureFlags *const features = &cm->features;
+    av1_set_screen_content_options(cpi, features);
+    cpi->is_screen_content_type = features->allow_screen_content_tools;
+  }
+  // First pass coding proceeds in raster scan order with unit size of 16x16.
+  const BLOCK_SIZE fp_block_size = BLOCK_16X16;
+  const int fp_block_size_width = block_size_high[fp_block_size];
+  const int fp_block_size_height = block_size_wide[fp_block_size];
   int *raw_motion_err_list;
   int raw_motion_err_counts = 0;
-  CHECK_MEM_ERROR(
-      cm, raw_motion_err_list,
-      aom_calloc(cm->mb_rows * cm->mb_cols, sizeof(*raw_motion_err_list)));
+  CHECK_MEM_ERROR(cm, raw_motion_err_list,
+                  aom_calloc(mi_params->mb_rows * mi_params->mb_cols,
+                             sizeof(*raw_motion_err_list)));
+  // Tiling is ignored in the first pass.
+  TileInfo tile;
+  av1_tile_init(&tile, cm, 0, 0);
+  FRAME_STATS stats = { 0 };
+  stats.image_data_start_row = INVALID_ROW;
+
+  const YV12_BUFFER_CONFIG *const last_frame =
+      get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  const YV12_BUFFER_CONFIG *golden_frame =
+      get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+  const YV12_BUFFER_CONFIG *alt_ref_frame = NULL;
+  const int alt_ref_offset =
+      FIRST_PASS_ALT_REF_DISTANCE -
+      (current_frame->frame_number % FIRST_PASS_ALT_REF_DISTANCE);
+  if (alt_ref_offset < FIRST_PASS_ALT_REF_DISTANCE) {
+    const struct lookahead_entry *const alt_ref_frame_buffer =
+        av1_lookahead_peek(cpi->lookahead, alt_ref_offset,
+                           cpi->compressor_stage);
+    if (alt_ref_frame_buffer != NULL) {
+      alt_ref_frame = &alt_ref_frame_buffer->img;
+    }
+  }
+  YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
   // First pass code requires valid last and new frame buffers.
-  assert(new_yv12 != NULL);
-  assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+  assert(this_frame != NULL);
+  assert(frame_is_intra_only(cm) || (last_frame != NULL));
 
   av1_setup_frame_size(cpi);
   aom_clear_system_state();
 
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
-  x->e_mbd.mi[0]->sb_type = BLOCK_16X16;
-
-  intra_factor = 0.0;
-  brightness_factor = 0.0;
-  neutral_count = 0.0;
+  set_mi_offsets(mi_params, xd, 0, 0);
+  xd->mi[0]->sb_type = fp_block_size;
 
   // Do not use periodic key frames.
   cpi->rc.frames_to_key = INT_MAX;
 
-  av1_set_quantizer(cm, qindex);
+  av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, qindex);
 
-  av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x,
+  av1_setup_block_planes(xd, seq_params->subsampling_x,
                          seq_params->subsampling_y, num_planes);
 
-  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes,
-                       x->e_mbd.mi[0]->sb_type);
-  av1_setup_dst_planes(xd->plane, seq_params->sb_size, new_yv12, 0, 0, 0,
+  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size);
+  av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0,
                        num_planes);
 
   if (!frame_is_intra_only(cm)) {
-    av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL, num_planes);
+    av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes);
   }
 
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
+  set_mi_offsets(mi_params, xd, 0, 0);
 
   // Don't store luma on the fist pass since chroma is not computed
   xd->cfl.store_y = 0;
   av1_frame_init_quantizer(cpi);
 
-  for (i = 0; i < num_planes; ++i) {
-    p[i].coeff = ctx->coeff[i];
-    p[i].qcoeff = ctx->qcoeff[i];
-    pd[i].dqcoeff = ctx->dqcoeff[i];
-    p[i].eobs = ctx->eobs[i];
-    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  for (int i = 0; i < num_planes; ++i) {
+    x->plane[i].coeff = ctx->coeff[i];
+    x->plane[i].qcoeff = ctx->qcoeff[i];
+    x->plane[i].eobs = ctx->eobs[i];
+    x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+    xd->plane[i].dqcoeff = ctx->dqcoeff[i];
   }
 
   av1_init_mv_probs(cm);
   av1_initialize_rd_consts(cpi);
 
-  // Tiling is ignored in the first pass.
-  av1_tile_init(&tile, cm, 0, 0);
-  src_y_stride = cpi->source->y_stride;
-  recon_y_stride = new_yv12->y_stride;
-  recon_uv_stride = new_yv12->uv_stride;
-  uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+  const int src_y_stride = cpi->source->y_stride;
+  const int recon_y_stride = this_frame->y_stride;
+  const int recon_uv_stride = this_frame->uv_stride;
+  const int uv_mb_height =
+      fp_block_size_height >> (this_frame->y_height > this_frame->uv_height);
 
-  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+  for (int mb_row = 0; mb_row < mi_params->mb_rows; ++mb_row) {
     MV best_ref_mv = kZeroMv;
 
     // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-    src_yoffset = (mb_row * src_y_stride * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+    int recon_yoffset = (mb_row * recon_y_stride * fp_block_size_height);
+    int src_yoffset = (mb_row * src_y_stride * fp_block_size_height);
+    int recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+    int alt_ref_frame_yoffset =
+        (alt_ref_frame != NULL)
+            ? mb_row * alt_ref_frame->y_stride * fp_block_size_height
+            : -1;
 
     // Set up limit values for motion vectors to prevent them extending
     // outside the UMV borders.
-    x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
-    x->mv_limits.row_max =
-        ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+    av1_set_mv_row_limits(mi_params, &x->mv_limits, (mb_row << 2),
+                          (fp_block_size_height >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
 
-    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
-      int this_error;
-      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
-      double log_intra;
-      int level_sample;
+    for (int mb_col = 0; mb_col < mi_params->mb_cols; ++mb_col) {
+      int this_intra_error = firstpass_intra_prediction(
+          cpi, this_frame, &tile, mb_row, mb_col, recon_yoffset, recon_uvoffset,
+          fp_block_size, qindex, &stats);
 
-      aom_clear_system_state();
-
-      const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale;
-      xd->mi = cm->mi_grid_visible + idx_str;
-      xd->mi[0] = cm->mi + idx_str;
-      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
-      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
-      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
-      xd->left_available = (mb_col != 0);
-      xd->mi[0]->sb_type = bsize;
-      xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-      set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize],
-                     mb_col * mb_scale, mi_size_wide[bsize], cm->mi_rows,
-                     cm->mi_cols);
-
-      set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
-
-      // Do intra 16x16 prediction.
-      xd->mi[0]->segment_id = 0;
-      xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
-      xd->mi[0]->mode = DC_PRED;
-      xd->mi[0]->tx_size =
-          use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-      av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
-      this_error = aom_get_mb_ss(x->plane[0].src_diff);
-
-      if (this_error < UL_INTRA_THRESH) {
-        ++intra_skip_count;
-      } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
-        image_data_start_row = mb_row;
-      }
-
-      if (seq_params->use_highbitdepth) {
-        switch (seq_params->bit_depth) {
-          case AOM_BITS_8: break;
-          case AOM_BITS_10: this_error >>= 4; break;
-          case AOM_BITS_12: this_error >>= 8; break;
-          default:
-            assert(0 &&
-                   "seq_params->bit_depth should be AOM_BITS_8, "
-                   "AOM_BITS_10 or AOM_BITS_12");
-            return;
-        }
-      }
-
-      aom_clear_system_state();
-      log_intra = log(this_error + 1.0);
-      if (log_intra < 10.0)
-        intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
-      else
-        intra_factor += 1.0;
-
-      if (seq_params->use_highbitdepth)
-        level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
-      else
-        level_sample = x->plane[0].src.buf[0];
-      if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
-        brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
-      else
-        brightness_factor += 1.0;
-
-      // Intrapenalty below deals with situations where the intra and inter
-      // error scores are very low (e.g. a plain black frame).
-      // We do not have special cases in first pass for 0,0 and nearest etc so
-      // all inter modes carry an overhead cost estimate for the mv.
-      // When the error score is very low this causes us to pick all or lots of
-      // INTRA modes and throw lots of key frames.
-      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
-      this_error += intrapenalty;
-
-      // Accumulate the intra error.
-      intra_error += (int64_t)this_error;
-
-      const int hbd = is_cur_buf_hbd(xd);
-      const int stride = x->plane[0].src.stride;
-      uint8_t *buf = x->plane[0].src.buf;
-      for (int r8 = 0; r8 < 2; ++r8) {
-        for (int c8 = 0; c8 < 2; ++c8) {
-          frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
-              buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
-        }
-      }
-
-      // Set up limit values for motion vectors to prevent them extending
-      // outside the UMV borders.
-      x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
-      x->mv_limits.col_max =
-          ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
-
-      if (!frame_is_intra_only(cm)) {  // Do a motion search
-        int tmp_err, motion_error, raw_motion_error;
-        // Assume 0,0 motion with no mv overhead.
-        MV mv = kZeroMv, tmp_mv = kZeroMv;
-        struct buf_2d unscaled_last_source_buf_2d;
-
-        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-        if (is_cur_buf_hbd(xd)) {
-          motion_error = highbd_get_prediction_error(
-              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
-        } else {
-          motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                              &xd->plane[0].pre[0]);
-        }
-
-        // Compute the motion error of the 0,0 motion using the last source
-        // frame as the reference. Skip the further motion search on
-        // reconstructed frame if this error is small.
-        unscaled_last_source_buf_2d.buf =
-            cpi->unscaled_last_source->y_buffer + src_yoffset;
-        unscaled_last_source_buf_2d.stride =
-            cpi->unscaled_last_source->y_stride;
-        if (is_cur_buf_hbd(xd)) {
-          raw_motion_error = highbd_get_prediction_error(
-              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
-        } else {
-          raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                  &unscaled_last_source_buf_2d);
-        }
-
-        // TODO(pengchong): Replace the hard-coded threshold
-        if (raw_motion_error > 25) {
-          // Test last reference frame using the previous best mv as the
-          // starting point (best reference) for the search.
-          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
-
-          // If the current best reference mv is not centered on 0,0 then do a
-          // 0,0 based search as well.
-          if (!is_zero_mv(&best_ref_mv)) {
-            tmp_err = INT_MAX;
-            first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
-
-            if (tmp_err < motion_error) {
-              motion_error = tmp_err;
-              mv = tmp_mv;
-            }
-          }
-
-          // Search in an older reference frame.
-          if ((current_frame->frame_number > 1) && gld_yv12 != NULL) {
-            // Assume 0,0 motion with no mv overhead.
-            int gf_motion_error;
-
-            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-            if (is_cur_buf_hbd(xd)) {
-              gf_motion_error = highbd_get_prediction_error(
-                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
-            } else {
-              gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                     &xd->plane[0].pre[0]);
-            }
-
-            first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv,
-                                     &gf_motion_error);
-
-            if (gf_motion_error < motion_error && gf_motion_error < this_error)
-              ++second_ref_count;
-
-            // Reset to last frame as reference buffer.
-            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
-            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
-
-            // In accumulating a score for the older reference frame take the
-            // best of the motion predicted score and the intra coded error
-            // (just as will be done for) accumulation of "coded_error" for
-            // the last frame.
-            if (gf_motion_error < this_error)
-              sr_coded_error += gf_motion_error;
-            else
-              sr_coded_error += this_error;
-          } else {
-            sr_coded_error += motion_error;
-          }
-        } else {
-          sr_coded_error += motion_error;
-        }
-
-        // Start by assuming that intra mode is best.
-        best_ref_mv.row = 0;
-        best_ref_mv.col = 0;
-
-        if (motion_error <= this_error) {
-          aom_clear_system_state();
-
-          // Keep a count of cases where the inter and intra were very close
-          // and very low. This helps with scene cut detection for example in
-          // cropped clips with black bars at the sides or top and bottom.
-          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
-              (this_error < (2 * intrapenalty))) {
-            neutral_count += 1.0;
-            // Also track cases where the intra is not much worse than the inter
-            // and use this in limiting the GF/arf group length.
-          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
-                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
-            neutral_count +=
-                (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
-          }
-
-          mv.row *= 8;
-          mv.col *= 8;
-          this_error = motion_error;
-          xd->mi[0]->mode = NEWMV;
-          xd->mi[0]->mv[0].as_mv = mv;
-          xd->mi[0]->tx_size = TX_4X4;
-          xd->mi[0]->ref_frame[0] = LAST_FRAME;
-          xd->mi[0]->ref_frame[1] = NONE_FRAME;
-          av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale,
-                                        mb_col * mb_scale, NULL, bsize,
-                                        AOM_PLANE_Y, AOM_PLANE_Y);
-          av1_encode_sby_pass1(cm, x, bsize);
-          sum_mvr += mv.row;
-          sum_mvr_abs += abs(mv.row);
-          sum_mvc += mv.col;
-          sum_mvc_abs += abs(mv.col);
-          sum_mvrs += mv.row * mv.row;
-          sum_mvcs += mv.col * mv.col;
-          ++intercount;
-
-          best_ref_mv = mv;
-
-          if (!is_zero_mv(&mv)) {
-            ++mvcount;
-
-            // Non-zero vector, was it different from the last non zero vector?
-            if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count;
-            lastmv = mv;
-
-            // Does the row vector point inwards or outwards?
-            if (mb_row < cm->mb_rows / 2) {
-              if (mv.row > 0)
-                --sum_in_vectors;
-              else if (mv.row < 0)
-                ++sum_in_vectors;
-            } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.row > 0)
-                ++sum_in_vectors;
-              else if (mv.row < 0)
-                --sum_in_vectors;
-            }
-
-            // Does the col vector point inwards or outwards?
-            if (mb_col < cm->mb_cols / 2) {
-              if (mv.col > 0)
-                --sum_in_vectors;
-              else if (mv.col < 0)
-                ++sum_in_vectors;
-            } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.col > 0)
-                ++sum_in_vectors;
-              else if (mv.col < 0)
-                --sum_in_vectors;
-            }
-          }
-        }
-        raw_motion_err_list[raw_motion_err_counts++] = raw_motion_error;
+      if (!frame_is_intra_only(cm)) {
+        const int this_inter_error = firstpass_inter_prediction(
+            cpi, last_frame, golden_frame, alt_ref_frame, mb_row, mb_col,
+            recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset,
+            fp_block_size, this_intra_error, raw_motion_err_counts,
+            raw_motion_err_list, &best_ref_mv, &last_mv, &stats);
+        stats.coded_error += this_inter_error;
+        ++raw_motion_err_counts;
       } else {
-        sr_coded_error += (int64_t)this_error;
+        stats.sr_coded_error += this_intra_error;
+        stats.tr_coded_error += this_intra_error;
+        stats.coded_error += this_intra_error;
       }
-      coded_error += (int64_t)this_error;
 
       // Adjust to the next column of MBs.
-      x->plane[0].src.buf += 16;
+      x->plane[0].src.buf += fp_block_size_width;
       x->plane[1].src.buf += uv_mb_height;
       x->plane[2].src.buf += uv_mb_height;
 
-      recon_yoffset += 16;
-      src_yoffset += 16;
+      recon_yoffset += fp_block_size_width;
+      src_yoffset += fp_block_size_width;
       recon_uvoffset += uv_mb_height;
+      alt_ref_frame_yoffset += fp_block_size_width;
     }
     // Adjust to the next row of MBs.
-    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
-    x->plane[1].src.buf +=
-        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
-    x->plane[2].src.buf +=
-        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
-
-    aom_clear_system_state();
+    x->plane[0].src.buf += fp_block_size_height * x->plane[0].src.stride -
+                           fp_block_size_width * mi_params->mb_cols;
+    x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride -
+                           uv_mb_height * mi_params->mb_cols;
+    x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride -
+                           uv_mb_height * mi_params->mb_cols;
   }
   const double raw_err_stdev =
       raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts);
@@ -733,89 +1006,34 @@
 
   // Clamp the image start to rows/2. This number of rows is discarded top
   // and bottom as dead data so rows / 2 means the frame is blank.
-  if ((image_data_start_row > cm->mb_rows / 2) ||
-      (image_data_start_row == INVALID_ROW)) {
-    image_data_start_row = cm->mb_rows / 2;
+  if ((stats.image_data_start_row > mi_params->mb_rows / 2) ||
+      (stats.image_data_start_row == INVALID_ROW)) {
+    stats.image_data_start_row = mi_params->mb_rows / 2;
   }
   // Exclude any image dead zone
-  if (image_data_start_row > 0) {
-    intra_skip_count =
-        AOMMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+  if (stats.image_data_start_row > 0) {
+    stats.intra_skip_count =
+        AOMMAX(0, stats.intra_skip_count -
+                      (stats.image_data_start_row * mi_params->mb_cols * 2));
   }
 
-  {
-    FIRSTPASS_STATS fps;
-    // The minimum error here insures some bit allocation to frames even
-    // in static regions. The allocation per MB declines for larger formats
-    // where the typical "real" energy per MB also falls.
-    // Initial estimate here uses sqrt(mbs) to define the min_err, where the
-    // number of mbs is proportional to the image area.
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    const double min_err = 200 * sqrt(num_mbs);
+  TWO_PASS *twopass = &cpi->twopass;
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : mi_params->MBs;
+  stats.intra_factor = stats.intra_factor / (double)num_mbs;
+  stats.brightness_factor = stats.brightness_factor / (double)num_mbs;
+  FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+  update_firstpass_stats(cpi, &stats, raw_err_stdev,
+                         current_frame->frame_number, ts_duration);
 
-    intra_factor = intra_factor / (double)num_mbs;
-    brightness_factor = brightness_factor / (double)num_mbs;
-    fps.weight = intra_factor * brightness_factor;
-
-    fps.frame = current_frame->frame_number;
-    fps.coded_error = (double)(coded_error >> 8) + min_err;
-    fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
-    fps.intra_error = (double)(intra_error >> 8) + min_err;
-    fps.frame_avg_wavelet_energy = (double)frame_avg_wavelet_energy;
-    fps.count = 1.0;
-    fps.pcnt_inter = (double)intercount / num_mbs;
-    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
-    fps.pcnt_neutral = (double)neutral_count / num_mbs;
-    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
-    fps.inactive_zone_rows = (double)image_data_start_row;
-    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
-    fps.raw_error_stdev = raw_err_stdev;
-
-    if (mvcount > 0) {
-      fps.MVr = (double)sum_mvr / mvcount;
-      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
-      fps.MVc = (double)sum_mvc / mvcount;
-      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
-      fps.MVrv =
-          ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
-      fps.MVcv =
-          ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
-      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
-      fps.new_mv_count = new_mv_count;
-      fps.pcnt_motion = (double)mvcount / num_mbs;
-    } else {
-      fps.MVr = 0.0;
-      fps.mvr_abs = 0.0;
-      fps.MVc = 0.0;
-      fps.mvc_abs = 0.0;
-      fps.MVrv = 0.0;
-      fps.MVcv = 0.0;
-      fps.mv_in_out_count = 0.0;
-      fps.new_mv_count = 0.0;
-      fps.pcnt_motion = 0.0;
-    }
-
-    // TODO(paulwilkins):  Handle the case when duration is set to 0, or
-    // something less than the full time between subsequent values of
-    // cpi->source_time_stamp.
-    fps.duration = (double)ts_duration;
-
-    // Don't want to do output stats with a stack variable!
-    twopass->this_frame_stats = fps;
-    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
-    accumulate_stats(&twopass->total_stats, &fps);
-  }
-
-  // Copy the previous Last Frame back into gf and and arf buffers if
-  // the prediction is good enough... but also don't allow it to lag too far.
+  // Copy the previous Last Frame back into gf buffer if the prediction is good
+  // enough... but also don't allow it to lag too far.
   if ((twopass->sr_update_lag > 3) ||
       ((current_frame->frame_number > 0) &&
-       (twopass->this_frame_stats.pcnt_inter > 0.20) &&
-       ((twopass->this_frame_stats.intra_error /
-         DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
-    if (gld_yv12 != NULL) {
+       (this_frame_stats->pcnt_inter > 0.20) &&
+       ((this_frame_stats->intra_error /
+         DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) {
+    if (golden_frame != NULL) {
       assign_frame_buffer_p(
           &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
           cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
@@ -825,7 +1043,7 @@
     ++twopass->sr_update_lag;
   }
 
-  aom_extend_frame_borders(new_yv12, num_planes);
+  aom_extend_frame_borders(this_frame, num_planes);
 
   // The frame we just compressed now becomes the last frame.
   assign_frame_buffer_p(
@@ -840,21 +1058,8 @@
         cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
   }
 
-  // Use this to see what the first pass reconstruction looks like.
-  if (0) {
-    char filename[512];
-    FILE *recon_file;
-    snprintf(filename, sizeof(filename), "enc%04d.yuv",
-             (int)current_frame->frame_number);
-
-    if (current_frame->frame_number == 0)
-      recon_file = fopen(filename, "wb");
-    else
-      recon_file = fopen(filename, "ab");
-
-    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
-    fclose(recon_file);
-  }
+  print_reconstruction_frame(last_frame, current_frame->frame_number,
+                             /*do_print=*/0);
 
   ++current_frame->frame_number;
 }
diff --git a/libaom/av1/encoder/firstpass.h b/libaom/av1/encoder/firstpass.h
index 1b8636c..99d4445 100644
--- a/libaom/av1/encoder/firstpass.h
+++ b/libaom/av1/encoder/firstpass.h
@@ -12,8 +12,8 @@
 #ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
 #define AOM_AV1_ENCODER_FIRSTPASS_H_
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/enums.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/encoder/lookahead.h"
 #include "av1/encoder/ratectrl.h"
 
@@ -46,14 +46,18 @@
   double coded_error;
   // Best of intra pred error and inter pred error using golden frame as ref.
   double sr_coded_error;
+  // Best of intra pred error and inter pred error using altref frame as ref.
+  double tr_coded_error;
   // Percentage of blocks with inter pred error < intra pred error.
   double pcnt_inter;
   // Percentage of blocks using (inter prediction and) non-zero motion vectors.
   double pcnt_motion;
-  // Percentage of blocks where golden frame was the best reference. That is:
+  // Percentage of blocks where golden frame was better than last or intra:
   // inter pred error using golden frame < inter pred error using last frame and
   // inter pred error using golden frame < intra pred error
   double pcnt_second_ref;
+  // Percentage of blocks where altref frame was better than intra, last, golden
+  double pcnt_third_ref;
   // Percentage of blocks where intra and inter prediction errors were very
   // close. Note that this is a 'weighted count', that is, the so blocks may be
   // weighted by how close the two errors were.
@@ -95,17 +99,6 @@
   double raw_error_stdev;
 } FIRSTPASS_STATS;
 
-enum {
-  KF_UPDATE,
-  LF_UPDATE,
-  GF_UPDATE,
-  ARF_UPDATE,
-  OVERLAY_UPDATE,
-  INTNL_OVERLAY_UPDATE,  // Internal Overlay Frame
-  INTNL_ARF_UPDATE,      // Internal Altref Frame
-  FRAME_UPDATE_TYPES
-} UENUM1BYTE(FRAME_UPDATE_TYPE);
-
 #define FC_ANIMATION_THRESH 0.15
 enum {
   FC_NORMAL = 0,
@@ -115,25 +108,44 @@
 
 typedef struct {
   unsigned char index;
-  FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char arf_pos_in_gf[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char pyramid_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char pyramid_height;
-  unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
-  int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1];
+  FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH];
+  unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+  // The number of frames displayed so far within the GOP at a given coding
+  // frame.
+  unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH];
+  unsigned char frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH];
+  int ref_frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+  int ref_frame_gop_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+
+  // TODO(jingning): Unify the data structure used here after the new control
+  // mechanism is in place.
+  int layer_depth[MAX_STATIC_GF_GROUP_LENGTH];
+  int arf_boost[MAX_STATIC_GF_GROUP_LENGTH];
+  int max_layer_depth;
+  int max_layer_depth_allowed;
+  // This is currently only populated for AOM_Q mode
+  unsigned char q_val[MAX_STATIC_GF_GROUP_LENGTH];
+  int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
   int size;
 } GF_GROUP;
 
 typedef struct {
+  FIRSTPASS_STATS *stats_in_start;
+  FIRSTPASS_STATS *stats_in_end;
+  FIRSTPASS_STATS *stats_in_buf_end;
+  FIRSTPASS_STATS *total_stats;
+  FIRSTPASS_STATS *total_left_stats;
+} STATS_BUFFER_CTX;
+
+typedef struct {
   unsigned int section_intra_rating;
-  FIRSTPASS_STATS total_stats;
-  FIRSTPASS_STATS this_frame_stats;
+  // Circular queue of first pass stats stored for most recent frames.
+  // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored
+  // here.
+  FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1];
+  int frame_stats_next_idx;  // Index to next unused element in frame_stats_arr.
   const FIRSTPASS_STATS *stats_in;
-  const FIRSTPASS_STATS *stats_in_start;
-  const FIRSTPASS_STATS *stats_in_end;
-  FIRSTPASS_STATS total_left_stats;
+  STATS_BUFFER_CTX *stats_buf_ctx;
   int first_pass_done;
   int64_t bits_left;
   double modified_error_min;
@@ -151,27 +163,26 @@
   // Error score of frames still to be coded in kf group
   int64_t kf_group_error_left;
 
-  // The fraction for a kf groups total bits allocated to the inter frames
-  double kfgroup_inter_fraction;
+  // Over time correction for bits per macro block estimation
+  double bpm_factor;
+
+  // Record of target and actual bits spent in current ARF group
+  int rolling_arf_group_target_bits;
+  int rolling_arf_group_actual_bits;
 
   int sr_update_lag;
 
   int kf_zeromotion_pct;
   int last_kfgroup_zeromotion_pct;
-  int active_worst_quality;
-  int baseline_active_worst_quality;
   int extend_minq;
   int extend_maxq;
   int extend_minq_fast;
-
-  GF_GROUP gf_group;
 } TWO_PASS;
 
 struct AV1_COMP;
 struct EncodeFrameParams;
 struct AV1EncoderConfig;
 
-void av1_init_first_pass(struct AV1_COMP *cpi);
 void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
 void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
 void av1_end_first_pass(struct AV1_COMP *cpi);
diff --git a/libaom/av1/encoder/global_motion.c b/libaom/av1/encoder/global_motion.c
index b8b13c3..9623ec3 100644
--- a/libaom/av1/encoder/global_motion.c
+++ b/libaom/av1/encoder/global_motion.c
@@ -28,7 +28,6 @@
 #include "av1/encoder/corner_match.h"
 #include "av1/encoder/ransac.h"
 
-#define MAX_CORNERS 4096
 #define MIN_INLIER_PROB 0.1
 
 #define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
@@ -65,9 +64,6 @@
   double *level_dy_buffer;
 } ImagePyramid;
 
-static const double erroradv_tr[] = { 0.65, 0.60, 0.55 };
-static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
-
 int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost,
                                  int erroradv_type) {
   assert(erroradv_type < GM_ERRORADV_TR_TYPES);
@@ -165,13 +161,122 @@
   wm->wmtype = wmtype;
 }
 
-int64_t av1_refine_integerized_param(WarpedMotionParams *wm,
-                                     TransformationType wmtype, int use_hbd,
-                                     int bd, uint8_t *ref, int r_width,
-                                     int r_height, int r_stride, uint8_t *dst,
-                                     int d_width, int d_height, int d_stride,
-                                     int n_refinements,
-                                     int64_t best_frame_error) {
+#if CONFIG_AV1_HIGHBITDEPTH
+static int64_t highbd_warp_error(
+    WarpedMotionParams *wm, const uint16_t *const ref, int width, int height,
+    int stride, const uint16_t *const dst, int p_col, int p_row, int p_width,
+    int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
+    int64_t best_error, uint8_t *segment_map, int segment_map_stride) {
+  int64_t gm_sumerr = 0;
+  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+
+  ConvolveParams conv_params = get_conv_params(0, 0, bd);
+  conv_params.use_dist_wtd_comp_avg = 0;
+  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+      // Only compute the error if this block contains inliers from the motion
+      // model
+      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+      // avoid warping extra 8x8 blocks in the padded region of the frame
+      // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
+      const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
+      const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
+      highbd_warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w,
+                        warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
+                        bd, &conv_params);
+      gm_sumerr += av1_calc_highbd_frame_error(tmp, WARP_ERROR_BLOCK,
+                                               dst + j + i * p_stride, warp_w,
+                                               warp_h, p_stride, bd);
+      if (gm_sumerr > best_error) return INT64_MAX;
+    }
+  }
+  return gm_sumerr;
+}
+#endif
+
+static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
+                          int width, int height, int stride,
+                          const uint8_t *const dst, int p_col, int p_row,
+                          int p_width, int p_height, int p_stride,
+                          int subsampling_x, int subsampling_y,
+                          int64_t best_error, uint8_t *segment_map,
+                          int segment_map_stride) {
+  int64_t gm_sumerr = 0;
+  int warp_w, warp_h;
+  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]);
+  ConvolveParams conv_params = get_conv_params(0, 0, 8);
+  conv_params.use_dist_wtd_comp_avg = 0;
+
+  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+      // Only compute the error if this block contains inliers from the motion
+      // model
+      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+      // avoid warping extra 8x8 blocks in the padded region of the frame
+      // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
+      warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
+      warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
+      warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
+                 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
+
+      gm_sumerr +=
+          av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
+                               warp_w, warp_h, p_stride);
+      if (gm_sumerr > best_error) return INT64_MAX;
+    }
+  }
+  return gm_sumerr;
+}
+
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+                       const uint8_t *ref, int width, int height, int stride,
+                       uint8_t *dst, int p_col, int p_row, int p_width,
+                       int p_height, int p_stride, int subsampling_x,
+                       int subsampling_y, int64_t best_error,
+                       uint8_t *segment_map, int segment_map_stride) {
+  if (wm->wmtype <= AFFINE)
+    if (!av1_get_shear_params(wm)) return INT64_MAX;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd)
+    return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), width, height,
+                             stride, CONVERT_TO_SHORTPTR(dst), p_col, p_row,
+                             p_width, p_height, p_stride, subsampling_x,
+                             subsampling_y, bd, best_error, segment_map,
+                             segment_map_stride);
+#endif
+  (void)use_hbd;
+  (void)bd;
+  return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
+                    p_height, p_stride, subsampling_x, subsampling_y,
+                    best_error, segment_map, segment_map_stride);
+}
+
+// Factors used to calculate the thresholds for av1_warp_error
+static double thresh_factors[GM_REFINEMENT_COUNT] = { 1.25, 1.20, 1.15, 1.10,
+                                                      1.05 };
+
+static INLINE int64_t calc_approx_erroradv_threshold(
+    double scaling_factor, int64_t erroradv_threshold) {
+  return erroradv_threshold <
+                 (int64_t)(((double)INT64_MAX / scaling_factor) + 0.5)
+             ? (int64_t)(scaling_factor * erroradv_threshold + 0.5)
+             : INT64_MAX;
+}
+
+int64_t av1_refine_integerized_param(
+    WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+    uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+    int d_width, int d_height, int d_stride, int n_refinements,
+    int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
+    int64_t erroradv_threshold) {
   static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
   const int border = ERRORADV_BORDER;
   int i = 0, p;
@@ -184,13 +289,16 @@
   int32_t best_param;
 
   force_wmtype(wm, wmtype);
-  best_error = av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
-                              dst + border * d_stride + border, border, border,
-                              d_width - 2 * border, d_height - 2 * border,
-                              d_stride, 0, 0, best_frame_error);
+  best_error =
+      av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+                     dst + border * d_stride + border, border, border,
+                     d_width - 2 * border, d_height - 2 * border, d_stride, 0,
+                     0, best_frame_error, segment_map, segment_map_stride);
   best_error = AOMMIN(best_error, best_frame_error);
   step = 1 << (n_refinements - 1);
   for (i = 0; i < n_refinements; i++, step >>= 1) {
+    int64_t error_adv_thresh =
+        calc_approx_erroradv_threshold(thresh_factors[i], erroradv_threshold);
     for (p = 0; p < n_params; ++p) {
       int step_dir = 0;
       // Skip searches for parameters that are forced to be 0
@@ -203,7 +311,8 @@
           av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
                          dst + border * d_stride + border, border, border,
                          d_width - 2 * border, d_height - 2 * border, d_stride,
-                         0, 0, best_error);
+                         0, 0, AOMMIN(best_error, error_adv_thresh),
+                         segment_map, segment_map_stride);
       if (step_error < best_error) {
         best_error = step_error;
         best_param = *param;
@@ -216,7 +325,8 @@
           av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
                          dst + border * d_stride + border, border, border,
                          d_width - 2 * border, d_height - 2 * border, d_stride,
-                         0, 0, best_error);
+                         0, 0, AOMMIN(best_error, error_adv_thresh),
+                         segment_map, segment_map_stride);
       if (step_error < best_error) {
         best_error = step_error;
         best_param = *param;
@@ -232,7 +342,8 @@
             av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
                            dst + border * d_stride + border, border, border,
                            d_width - 2 * border, d_height - 2 * border,
-                           d_stride, 0, 0, best_error);
+                           d_stride, 0, 0, AOMMIN(best_error, error_adv_thresh),
+                           segment_map, segment_map_stride);
         if (step_error < best_error) {
           best_error = step_error;
           best_param = *param;
@@ -248,17 +359,7 @@
   return best_error;
 }
 
-static INLINE RansacFunc get_ransac_type(TransformationType type) {
-  switch (type) {
-    case AFFINE: return ransac_affine;
-    case ROTZOOM: return ransac_rotzoom;
-    case TRANSLATION: return ransac_translation;
-    default: assert(0); return NULL;
-  }
-}
-
-static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
-                                        int bit_depth) {
+unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth) {
   int i, j;
   uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer);
   uint8_t *buf_8bit = frm->y_buffer_8bit;
@@ -275,54 +376,94 @@
   return buf_8bit;
 }
 
+static void get_inliers_from_indices(MotionModel *params,
+                                     int *correspondences) {
+  int *inliers_tmp = (int *)aom_malloc(2 * MAX_CORNERS * sizeof(*inliers_tmp));
+  memset(inliers_tmp, 0, 2 * MAX_CORNERS * sizeof(*inliers_tmp));
+
+  for (int i = 0; i < params->num_inliers; i++) {
+    int index = params->inliers[i];
+    inliers_tmp[2 * i] = correspondences[4 * index];
+    inliers_tmp[2 * i + 1] = correspondences[4 * index + 1];
+  }
+  memcpy(params->inliers, inliers_tmp, sizeof(*inliers_tmp) * 2 * MAX_CORNERS);
+  aom_free(inliers_tmp);
+}
+
+#define FEAT_COUNT_TR 3
+#define SEG_COUNT_TR 0.40
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+                                          int height, int *inliers,
+                                          int num_inliers) {
+  int seg_count = 0;
+  memset(segment_map, 0, sizeof(*segment_map) * width * height);
+
+  for (int i = 0; i < num_inliers; i++) {
+    int x = inliers[i * 2];
+    int y = inliers[i * 2 + 1];
+    int seg_x = x >> WARP_ERROR_BLOCK_LOG;
+    int seg_y = y >> WARP_ERROR_BLOCK_LOG;
+    segment_map[seg_y * width + seg_x] += 1;
+  }
+
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      uint8_t feat_count = segment_map[i * width + j];
+      segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR);
+      seg_count += (segment_map[i * width + j]);
+    }
+  }
+
+  // If this motion does not make up a large enough portion of the frame,
+  // use the unsegmented version of the error metric
+  if (seg_count < (width * height * SEG_COUNT_TR))
+    memset(segment_map, 1, width * height * sizeof(*segment_map));
+}
+
 static int compute_global_motion_feature_based(
-    TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
-    int bit_depth, int *num_inliers_by_motion, double *params_by_motion,
-    int num_motions) {
+    TransformationType type, unsigned char *frm_buffer, int frm_width,
+    int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
+    YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
+    MotionModel *params_by_motion, int num_motions) {
   int i;
-  int num_frm_corners, num_ref_corners;
+  int num_ref_corners;
   int num_correspondences;
   int *correspondences;
-  int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS];
-  unsigned char *frm_buffer = frm->y_buffer;
+  int ref_corners[2 * MAX_CORNERS];
   unsigned char *ref_buffer = ref->y_buffer;
-  RansacFunc ransac = get_ransac_type(type);
+  RansacFunc ransac = av1_get_ransac_type(type);
 
-  if (frm->flags & YV12_FLAG_HIGHBITDEPTH) {
-    // The frame buffer is 16-bit, so we need to convert to 8 bits for the
-    // following code. We cache the result until the frame is released.
-    frm_buffer = downconvert_frame(frm, bit_depth);
-  }
   if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
-    ref_buffer = downconvert_frame(ref, bit_depth);
+    ref_buffer = av1_downconvert_frame(ref, bit_depth);
   }
 
-  // compute interest points in images using FAST features
-  num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height,
-                                       frm->y_stride, frm_corners, MAX_CORNERS);
-  num_ref_corners = fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
-                                       ref->y_stride, ref_corners, MAX_CORNERS);
+  num_ref_corners =
+      av1_fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
+                             ref->y_stride, ref_corners, MAX_CORNERS);
 
   // find correspondences between the two images
   correspondences =
       (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
-  num_correspondences = determine_correspondence(
+  num_correspondences = av1_determine_correspondence(
       frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
-      (int *)ref_corners, num_ref_corners, frm->y_width, frm->y_height,
-      frm->y_stride, ref->y_stride, correspondences);
+      (int *)ref_corners, num_ref_corners, frm_width, frm_height, frm_stride,
+      ref->y_stride, correspondences);
 
   ransac(correspondences, num_correspondences, num_inliers_by_motion,
          params_by_motion, num_motions);
 
-  free(correspondences);
-
   // Set num_inliers = 0 for motions with too few inliers so they are ignored.
   for (i = 0; i < num_motions; ++i) {
-    if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
+    if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences ||
+        num_correspondences == 0) {
       num_inliers_by_motion[i] = 0;
+    } else {
+      get_inliers_from_indices(&params_by_motion[i], correspondences);
     }
   }
 
+  free(correspondences);
+
   // Return true if any one of the motions has inliers.
   for (i = 0; i < num_motions; ++i) {
     if (num_inliers_by_motion[i] > 0) return 1;
@@ -330,16 +471,6 @@
   return 0;
 }
 
-static INLINE RansacFuncDouble
-get_ransac_double_prec_type(TransformationType type) {
-  switch (type) {
-    case AFFINE: return ransac_affine_double_prec;
-    case ROTZOOM: return ransac_rotzoom_double_prec;
-    case TRANSLATION: return ransac_translation_double_prec;
-    default: assert(0); return NULL;
-  }
-}
-
 // Don't use points around the frame border since they are less reliable
 static INLINE int valid_point(int x, int y, int width, int height) {
   return (x > (PATCH_SIZE + PATCH_CENTER)) &&
@@ -369,22 +500,22 @@
   return num_correspondences;
 }
 
-double getCubicValue(double p[4], double x) {
+static double getCubicValue(double p[4], double x) {
   return p[1] + 0.5 * x *
                     (p[2] - p[0] +
                      x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
                           x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
 }
 
-void get_subcolumn(unsigned char *ref, double col[4], int stride, int x,
-                   int y_start) {
+static void get_subcolumn(unsigned char *ref, double col[4], int stride, int x,
+                          int y_start) {
   int i;
   for (i = 0; i < 4; ++i) {
     col[i] = ref[(i + y_start) * stride + x];
   }
 }
 
-double bicubic(unsigned char *ref, double x, double y, int stride) {
+static double bicubic(unsigned char *ref, double x, double y, int stride) {
   double arr[4];
   int k;
   int i = (int)x;
@@ -398,8 +529,8 @@
 }
 
 // Interpolate a warped block using bicubic interpolation when possible
-unsigned char interpolate(unsigned char *ref, double x, double y, int width,
-                          int height, int stride) {
+static unsigned char interpolate(unsigned char *ref, double x, double y,
+                                 int width, int height, int stride) {
   if (x < 0 && y < 0)
     return ref[0];
   else if (x < 0 && y > height - 1)
@@ -470,9 +601,9 @@
 }
 
 // Warps a block using flow vector [u, v] and computes the mse
-double compute_warp_and_error(unsigned char *ref, unsigned char *frm, int width,
-                              int height, int stride, int x, int y, double u,
-                              double v, int16_t *dt) {
+static double compute_warp_and_error(unsigned char *ref, unsigned char *frm,
+                                     int width, int height, int stride, int x,
+                                     int y, double u, double v, int16_t *dt) {
   int i, j;
   unsigned char warped;
   double x_w, y_w;
@@ -777,21 +908,17 @@
 }
 
 static int compute_global_motion_disflow_based(
-    TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
-    int bit_depth, int *num_inliers_by_motion, double *params_by_motion,
-    int num_motions) {
-  unsigned char *frm_buffer = frm->y_buffer;
+    TransformationType type, unsigned char *frm_buffer, int frm_width,
+    int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
+    YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
+    MotionModel *params_by_motion, int num_motions) {
   unsigned char *ref_buffer = ref->y_buffer;
-  const int frm_width = frm->y_width;
-  const int frm_height = frm->y_height;
   const int ref_width = ref->y_width;
   const int ref_height = ref->y_height;
   const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD);
-  int num_frm_corners;
   int num_correspondences;
   double *correspondences;
-  int frm_corners[2 * MAX_CORNERS];
-  RansacFuncDouble ransac = get_ransac_double_prec_type(type);
+  RansacFuncDouble ransac = av1_get_ransac_double_prec_type(type);
   assert(frm_width == ref_width);
   assert(frm_height == ref_height);
 
@@ -800,13 +927,8 @@
       frm_width < frm_height ? get_msb(frm_width) : get_msb(frm_height);
   const int n_levels = AOMMIN(msb, N_LEVELS);
 
-  if (frm->flags & YV12_FLAG_HIGHBITDEPTH) {
-    // The frame buffer is 16-bit, so we need to convert to 8 bits for the
-    // following code. We cache the result until the frame is released.
-    frm_buffer = downconvert_frame(frm, bit_depth);
-  }
   if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
-    ref_buffer = downconvert_frame(ref, bit_depth);
+    ref_buffer = av1_downconvert_frame(ref, bit_depth);
   }
 
   // TODO(sarahparker) We will want to do the source pyramid computation
@@ -819,8 +941,8 @@
   int compute_gradient = 1;
   ImagePyramid *frm_pyr =
       alloc_pyramid(frm_width, frm_height, pad_size, compute_gradient);
-  compute_flow_pyramids(frm_buffer, frm_width, frm_height, frm->y_stride,
-                        n_levels, pad_size, compute_gradient, frm_pyr);
+  compute_flow_pyramids(frm_buffer, frm_width, frm_height, frm_stride, n_levels,
+                        pad_size, compute_gradient, frm_pyr);
   // Allocate ref image pyramids
   compute_gradient = 0;
   ImagePyramid *ref_pyr =
@@ -840,9 +962,6 @@
 
   compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v);
 
-  // compute interest points in images using FAST features
-  num_frm_corners = fast_corner_detect(frm_buffer, frm_width, frm_height,
-                                       frm->y_stride, frm_corners, MAX_CORNERS);
   // find correspondences between the two images using the flow field
   correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences));
   num_correspondences = determine_disflow_correspondence(
@@ -870,20 +989,25 @@
   return 0;
 }
 
-int av1_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *frm,
-                              YV12_BUFFER_CONFIG *ref, int bit_depth,
+int av1_compute_global_motion(TransformationType type,
+                              unsigned char *frm_buffer, int frm_width,
+                              int frm_height, int frm_stride, int *frm_corners,
+                              int num_frm_corners, YV12_BUFFER_CONFIG *ref,
+                              int bit_depth,
                               GlobalMotionEstimationType gm_estimation_type,
                               int *num_inliers_by_motion,
-                              double *params_by_motion, int num_motions) {
+                              MotionModel *params_by_motion, int num_motions) {
   switch (gm_estimation_type) {
     case GLOBAL_MOTION_FEATURE_BASED:
-      return compute_global_motion_feature_based(type, frm, ref, bit_depth,
-                                                 num_inliers_by_motion,
-                                                 params_by_motion, num_motions);
+      return compute_global_motion_feature_based(
+          type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners,
+          num_frm_corners, ref, bit_depth, num_inliers_by_motion,
+          params_by_motion, num_motions);
     case GLOBAL_MOTION_DISFLOW_BASED:
-      return compute_global_motion_disflow_based(type, frm, ref, bit_depth,
-                                                 num_inliers_by_motion,
-                                                 params_by_motion, num_motions);
+      return compute_global_motion_disflow_based(
+          type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners,
+          num_frm_corners, ref, bit_depth, num_inliers_by_motion,
+          params_by_motion, num_motions);
     default: assert(0 && "Unknown global motion estimation type");
   }
   return 0;
diff --git a/libaom/av1/encoder/global_motion.h b/libaom/av1/encoder/global_motion.h
index 2cfddad..0a6d0ec 100644
--- a/libaom/av1/encoder/global_motion.h
+++ b/libaom/av1/encoder/global_motion.h
@@ -15,34 +15,62 @@
 #include "aom/aom_integer.h"
 #include "aom_scale/yv12config.h"
 #include "av1/common/mv.h"
+#include "av1/common/warped_motion.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define MAX_CORNERS 4096
 #define RANSAC_NUM_MOTIONS 1
+#define GM_REFINEMENT_COUNT 5
 
 typedef enum {
   GLOBAL_MOTION_FEATURE_BASED,
   GLOBAL_MOTION_DISFLOW_BASED,
 } GlobalMotionEstimationType;
 
+unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth);
+
+typedef struct {
+  double params[MAX_PARAMDIM - 1];
+  int *inliers;
+  int num_inliers;
+} MotionModel;
+
 void av1_convert_model_to_params(const double *params,
                                  WarpedMotionParams *model);
 
+// TODO(sarahparker) These need to be retuned for speed 0 and 1 to
+// maximize gains from segmented error metric
+static const double erroradv_tr[] = { 0.65, 0.60, 0.65 };
+static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
+
 int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost,
                                  int erroradv_type);
 
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+                                          int height, int *inliers,
+                                          int num_inliers);
+
+// Returns the error between the result of applying motion 'wm' to the frame
+// described by 'ref' and the frame described by 'dst'.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+                       const uint8_t *ref, int width, int height, int stride,
+                       uint8_t *dst, int p_col, int p_row, int p_width,
+                       int p_height, int p_stride, int subsampling_x,
+                       int subsampling_y, int64_t best_error,
+                       uint8_t *segment_map, int segment_map_stride);
+
 // Returns the av1_warp_error between "dst" and the result of applying the
 // motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
 // modified in place.
-int64_t av1_refine_integerized_param(WarpedMotionParams *wm,
-                                     TransformationType wmtype, int use_hbd,
-                                     int bd, uint8_t *ref, int r_width,
-                                     int r_height, int r_stride, uint8_t *dst,
-                                     int d_width, int d_height, int d_stride,
-                                     int n_refinements,
-                                     int64_t best_frame_error);
+int64_t av1_refine_integerized_param(
+    WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+    uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+    int d_width, int d_height, int d_stride, int n_refinements,
+    int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
+    int64_t erroradv_threshold);
 
 /*
   Computes "num_motions" candidate global motion parameters between two frames.
@@ -59,11 +87,14 @@
   number of inlier feature points for each motion. Params for which the
   num_inliers entry is 0 should be ignored by the caller.
 */
-int av1_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *frm,
-                              YV12_BUFFER_CONFIG *ref, int bit_depth,
+int av1_compute_global_motion(TransformationType type,
+                              unsigned char *frm_buffer, int frm_width,
+                              int frm_height, int frm_stride, int *frm_corners,
+                              int num_frm_corners, YV12_BUFFER_CONFIG *ref,
+                              int bit_depth,
                               GlobalMotionEstimationType gm_estimation_type,
                               int *num_inliers_by_motion,
-                              double *params_by_motion, int num_motions);
+                              MotionModel *params_by_motion, int num_motions);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/gop_structure.c b/libaom/av1/encoder/gop_structure.c
index 73cb0ed..1ed71a0 100644
--- a/libaom/av1/encoder/gop_structure.c
+++ b/libaom/av1/encoder/gop_structure.c
@@ -19,67 +19,78 @@
 
 #include "aom_ports/system_state.h"
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/gop_structure.h"
 
 // Set parameters for frames between 'start' and 'end' (excluding both).
-static void set_multi_layer_params(GF_GROUP *const gf_group, int start, int end,
-                                   int *frame_ind, int arf_ind, int level) {
-  assert(level >= MIN_PYRAMID_LVL);
+static void set_multi_layer_params(const TWO_PASS *twopass,
+                                   GF_GROUP *const gf_group, RATE_CONTROL *rc,
+                                   FRAME_INFO *frame_info, int start, int end,
+                                   int *cur_frame_idx, int *frame_ind,
+                                   int arf_ind, int layer_depth) {
   const int num_frames_to_process = end - start - 1;
   assert(num_frames_to_process >= 0);
   if (num_frames_to_process == 0) return;
 
   // Either we are at the last level of the pyramid, or we don't have enough
   // frames between 'l' and 'r' to create one more level.
-  if (level == MIN_PYRAMID_LVL || num_frames_to_process < 3) {
+  if (layer_depth > gf_group->max_layer_depth_allowed ||
+      num_frames_to_process < 3) {
     // Leaf nodes.
     while (++start < end) {
       gf_group->update_type[*frame_ind] = LF_UPDATE;
       gf_group->arf_src_offset[*frame_ind] = 0;
-      gf_group->arf_pos_in_gf[*frame_ind] = 0;
-      gf_group->arf_update_idx[*frame_ind] = arf_ind;
-      gf_group->pyramid_level[*frame_ind] = MIN_PYRAMID_LVL;
-      ++gf_group->pyramid_lvl_nodes[MIN_PYRAMID_LVL];
+      ++*cur_frame_idx;
+      gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+      gf_group->frame_disp_idx[*frame_ind] = start;
+      gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+      gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
+          twopass, rc, frame_info, start, end - start, 0, NULL, NULL);
+      gf_group->max_layer_depth =
+          AOMMAX(gf_group->max_layer_depth, layer_depth);
       ++(*frame_ind);
     }
   } else {
     const int m = (start + end) / 2;
-    const int arf_pos_in_gf = *frame_ind;
 
     // Internal ARF.
     gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
     gf_group->arf_src_offset[*frame_ind] = m - start - 1;
-    gf_group->arf_pos_in_gf[*frame_ind] = 0;
-    gf_group->arf_update_idx[*frame_ind] = 1;  // mark all internal ARF 1
-    gf_group->pyramid_level[*frame_ind] = level;
-    ++gf_group->pyramid_lvl_nodes[level];
+    gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+    gf_group->frame_disp_idx[*frame_ind] = m;
+    gf_group->layer_depth[*frame_ind] = layer_depth;
+
+    // Get the boost factor for intermediate ARF frames.
+    gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
+        twopass, rc, frame_info, m, end - m, m - start, NULL, NULL);
     ++(*frame_ind);
 
     // Frames displayed before this internal ARF.
-    set_multi_layer_params(gf_group, start, m, frame_ind, 1, level - 1);
+    set_multi_layer_params(twopass, gf_group, rc, frame_info, start, m,
+                           cur_frame_idx, frame_ind, 1, layer_depth + 1);
 
     // Overlay for internal ARF.
     gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
     gf_group->arf_src_offset[*frame_ind] = 0;
-    gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf;  // For bit allocation.
-    gf_group->arf_update_idx[*frame_ind] = 1;
-    gf_group->pyramid_level[*frame_ind] = MIN_PYRAMID_LVL;
+    gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+    gf_group->frame_disp_idx[*frame_ind] = m;
+    gf_group->arf_boost[*frame_ind] = 0;
+    gf_group->layer_depth[*frame_ind] = layer_depth;
     ++(*frame_ind);
 
     // Frames displayed after this internal ARF.
-    set_multi_layer_params(gf_group, m, end, frame_ind, arf_ind, level - 1);
+    set_multi_layer_params(twopass, gf_group, rc, frame_info, m, end,
+                           cur_frame_idx, frame_ind, arf_ind, layer_depth + 1);
   }
 }
 
 static int construct_multi_layer_gf_structure(
-    GF_GROUP *const gf_group, int gf_interval, int pyr_height,
+    AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group,
+    RATE_CONTROL *rc, FRAME_INFO *const frame_info, int gf_interval,
     FRAME_UPDATE_TYPE first_frame_update_type) {
-  gf_group->pyramid_height = pyr_height;
-  av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
   int frame_index = 0;
 
   // Keyframe / Overlay frame / Golden frame.
@@ -87,30 +98,37 @@
   assert(first_frame_update_type == KF_UPDATE ||
          first_frame_update_type == OVERLAY_UPDATE ||
          first_frame_update_type == GF_UPDATE);
+
   gf_group->update_type[frame_index] = first_frame_update_type;
   gf_group->arf_src_offset[frame_index] = 0;
-  gf_group->arf_pos_in_gf[frame_index] = 0;
-  gf_group->arf_update_idx[frame_index] = 0;
-  gf_group->pyramid_level[frame_index] = MIN_PYRAMID_LVL;
+  gf_group->cur_frame_idx[frame_index] = 0;
+  gf_group->layer_depth[frame_index] =
+      first_frame_update_type == OVERLAY_UPDATE ? MAX_ARF_LAYERS + 1 : 0;
+  gf_group->max_layer_depth = 0;
   ++frame_index;
 
   // ALTREF.
-  const int use_altref = (gf_group->pyramid_height > 0);
+  const int use_altref = gf_group->max_layer_depth_allowed > 0;
   if (use_altref) {
     gf_group->update_type[frame_index] = ARF_UPDATE;
     gf_group->arf_src_offset[frame_index] = gf_interval - 1;
-    gf_group->arf_pos_in_gf[frame_index] = 0;
-    gf_group->arf_update_idx[frame_index] = 0;
-    gf_group->pyramid_level[frame_index] = gf_group->pyramid_height;
+    gf_group->cur_frame_idx[frame_index] = 0;
+    gf_group->frame_disp_idx[frame_index] = gf_interval;
+    gf_group->layer_depth[frame_index] = 1;
+    gf_group->arf_boost[frame_index] = cpi->rc.gfu_boost;
+    gf_group->max_layer_depth = 1;
     ++frame_index;
   }
 
+  int cur_frame_index = 0;
   // Rest of the frames.
-  const int next_height =
-      use_altref ? gf_group->pyramid_height - 1 : gf_group->pyramid_height;
-  assert(next_height >= MIN_PYRAMID_LVL);
-  set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
-                         next_height);
+  set_multi_layer_params(twopass, gf_group, rc, frame_info, 0, gf_interval,
+                         &cur_frame_index, &frame_index, 0, use_altref + 1);
+
+  // The end frame will be Overlay frame for an ARF GOP; otherwise set it to
+  // be GF, for consistency, which will be updated in the next GOP.
+  gf_group->update_type[frame_index] = use_altref ? OVERLAY_UPDATE : GF_UPDATE;
+  gf_group->arf_src_offset[frame_index] = 0;
   return frame_index;
 }
 
@@ -125,7 +143,7 @@
   FILE *fid = fopen("GF_PARAMS.txt", "a");
 
   fprintf(fid, "\ngf_interval = {%d}\n", gf_interval);
-  for (int i = 0; i <= gf_group->size; ++i) {
+  for (int i = 0; i < gf_group->size; ++i) {
     fprintf(fid, "#%2d : %s %d %d %d %d\n", i,
             update_type_strings[gf_group->update_type[i]],
             gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
@@ -141,50 +159,151 @@
 }
 #endif  // CHECK_GF_PARAMETER
 
-static INLINE int max_pyramid_height_from_width(int pyramid_width) {
-  if (pyramid_width > 12) return 4;
-  if (pyramid_width > 6) return 3;
-  if (pyramid_width > 3) return 2;
-  if (pyramid_width > 1) return 1;
-  return 0;
+#define REF_IDX(ref) ((ref)-LAST_FRAME)
+
+static INLINE void reset_ref_frame_idx(int *ref_idx, int reset_value) {
+  for (int i = 0; i < REF_FRAMES; ++i) ref_idx[i] = reset_value;
 }
 
-static int get_pyramid_height(const AV1_COMP *const cpi) {
-  const RATE_CONTROL *const rc = &cpi->rc;
-  assert(IMPLIES(cpi->oxcf.gf_max_pyr_height == MIN_PYRAMID_LVL,
-                 !rc->source_alt_ref_pending));  // define_gf_group() enforced.
-  if (!rc->source_alt_ref_pending) {
-    return MIN_PYRAMID_LVL;
+static INLINE void set_ref_frame_disp_idx(GF_GROUP *const gf_group) {
+  for (int i = 0; i < gf_group->size; ++i) {
+    for (int ref = 0; ref < INTER_REFS_PER_FRAME + 1; ++ref) {
+      int ref_gop_idx = gf_group->ref_frame_gop_idx[i][ref];
+      if (ref_gop_idx == -1) {
+        gf_group->ref_frame_disp_idx[i][ref] = -1;
+      } else {
+        gf_group->ref_frame_disp_idx[i][ref] =
+            gf_group->frame_disp_idx[ref_gop_idx];
+      }
+    }
   }
-  assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
-  if (!cpi->internal_altref_allowed) {
-    assert(MIN_PYRAMID_LVL + 1 <= cpi->oxcf.gf_max_pyr_height);
-    return MIN_PYRAMID_LVL + 1;
+}
+
+static void set_gop_ref_frame_map(GF_GROUP *const gf_group) {
+  // Initialize the reference slots as all -1.
+  for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx)
+    reset_ref_frame_idx(gf_group->ref_frame_gop_idx[frame_idx], -1);
+
+  // Set the map for frames in the current gop
+  for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx) {
+    const FRAME_UPDATE_TYPE update_type = gf_group->update_type[frame_idx];
+    // TODO(yuec): need to figure out how to determine
+    // (1) whether a KEY_FRAME has show_frame on
+    // (2) whether a frame with INTNL_OVERLAY_UPDATE type has
+    //     show_existing_frame on
+    const int show_frame =
+        update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE;
+    const int show_existing_frame =
+        update_type == OVERLAY_UPDATE || update_type == INTNL_OVERLAY_UPDATE;
+
+    int this_ref_map[INTER_REFS_PER_FRAME + 1];
+    memcpy(this_ref_map, gf_group->ref_frame_gop_idx[frame_idx],
+           sizeof(this_ref_map));
+    int *next_ref_map = &gf_group->ref_frame_gop_idx[frame_idx + 1][0];
+
+    switch (update_type) {
+      case KF_UPDATE:
+        if (show_frame) {
+          reset_ref_frame_idx(this_ref_map, frame_idx);
+        } else {
+          this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
+          this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx;
+          this_ref_map[REF_IDX(ALTREF2_FRAME)] = frame_idx;
+          this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx;
+          this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx;
+        }
+        break;
+      case LF_UPDATE: this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; break;
+      case GF_UPDATE:
+        this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
+        this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx;
+        break;
+      case OVERLAY_UPDATE:
+        this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx;
+        break;
+      case ARF_UPDATE: this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; break;
+      case INTNL_OVERLAY_UPDATE:
+        if (!show_existing_frame)
+          this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
+        break;
+      case INTNL_ARF_UPDATE:
+        this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx;
+        break;
+      default: assert(0); break;
+    }
+
+    memcpy(next_ref_map, this_ref_map, sizeof(this_ref_map));
+
+    switch (update_type) {
+      case LF_UPDATE:
+      case GF_UPDATE:
+        next_ref_map[REF_IDX(LAST3_FRAME)] = this_ref_map[REF_IDX(LAST2_FRAME)];
+        next_ref_map[REF_IDX(LAST2_FRAME)] = this_ref_map[REF_IDX(LAST_FRAME)];
+        next_ref_map[REF_IDX(LAST_FRAME)] = this_ref_map[REF_IDX(LAST3_FRAME)];
+        break;
+      case INTNL_OVERLAY_UPDATE:
+        if (!show_existing_frame) {
+          next_ref_map[REF_IDX(LAST3_FRAME)] =
+              this_ref_map[REF_IDX(LAST2_FRAME)];
+          next_ref_map[REF_IDX(LAST2_FRAME)] =
+              this_ref_map[REF_IDX(LAST_FRAME)];
+          next_ref_map[REF_IDX(LAST_FRAME)] =
+              this_ref_map[REF_IDX(LAST3_FRAME)];
+        } else {
+          next_ref_map[REF_IDX(LAST_FRAME)] =
+              this_ref_map[REF_IDX(BWDREF_FRAME)];
+          next_ref_map[REF_IDX(LAST2_FRAME)] =
+              this_ref_map[REF_IDX(LAST_FRAME)];
+          next_ref_map[REF_IDX(LAST3_FRAME)] =
+              this_ref_map[REF_IDX(LAST2_FRAME)];
+          next_ref_map[REF_IDX(BWDREF_FRAME)] =
+              this_ref_map[REF_IDX(ALTREF2_FRAME)];
+          next_ref_map[REF_IDX(ALTREF2_FRAME)] =
+              this_ref_map[REF_IDX(EXTREF_FRAME)];
+          next_ref_map[REF_IDX(EXTREF_FRAME)] =
+              this_ref_map[REF_IDX(LAST3_FRAME)];
+        }
+        break;
+      case INTNL_ARF_UPDATE:
+        if (!show_existing_frame) {
+          next_ref_map[REF_IDX(BWDREF_FRAME)] =
+              this_ref_map[REF_IDX(EXTREF_FRAME)];
+          next_ref_map[REF_IDX(ALTREF2_FRAME)] =
+              this_ref_map[REF_IDX(BWDREF_FRAME)];
+          next_ref_map[REF_IDX(EXTREF_FRAME)] =
+              this_ref_map[REF_IDX(ALTREF2_FRAME)];
+        }
+        break;
+      case OVERLAY_UPDATE:
+        next_ref_map[REF_IDX(ALTREF_FRAME)] =
+            this_ref_map[REF_IDX(GOLDEN_FRAME)];
+        next_ref_map[REF_IDX(GOLDEN_FRAME)] =
+            this_ref_map[REF_IDX(ALTREF_FRAME)];
+        break;
+      default: break;
+    }
   }
-  return AOMMIN(max_pyramid_height_from_width(rc->baseline_gf_interval),
-                cpi->oxcf.gf_max_pyr_height);
+
+  // Set the map in display order index by converting from gop indices in the
+  // above map
+  set_ref_frame_disp_idx(gf_group);
 }
 
 void av1_gop_setup_structure(AV1_COMP *cpi,
                              const EncodeFrameParams *const frame_params) {
   RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->gf_group;
   TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
   const int key_frame = (frame_params->frame_type == KEY_FRAME);
   const FRAME_UPDATE_TYPE first_frame_update_type =
       key_frame ? KF_UPDATE
                 : rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE;
   gf_group->size = construct_multi_layer_gf_structure(
-      gf_group, rc->baseline_gf_interval, get_pyramid_height(cpi),
+      cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval,
       first_frame_update_type);
 
-  // We need to configure the frame at the end of the sequence + 1 that
-  // will be the start frame for the next group. Otherwise prior to the
-  // call to av1_get_second_pass_params(), the data will be undefined.
-  gf_group->update_type[gf_group->size] =
-      (rc->source_alt_ref_pending) ? OVERLAY_UPDATE : GF_UPDATE;
-  gf_group->arf_update_idx[gf_group->size] = 0;
-  gf_group->arf_pos_in_gf[gf_group->size] = 0;
+  set_gop_ref_frame_map(gf_group);
 
 #if CHECK_GF_PARAMETER
   check_frame_params(gf_group, rc->baseline_gf_interval);
diff --git a/libaom/av1/encoder/gop_structure.h b/libaom/av1/encoder/gop_structure.h
index d9d5ae7..0c775c7 100644
--- a/libaom/av1/encoder/gop_structure.h
+++ b/libaom/av1/encoder/gop_structure.h
@@ -12,7 +12,7 @@
 #ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_
 #define AOM_AV1_ENCODER_GOP_STRUCTURE_H_
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/encoder/ratectrl.h"
 
 #ifdef __cplusplus
@@ -22,6 +22,9 @@
 struct AV1_COMP;
 struct EncodeFrameParams;
 
+#define MIN_ARF_GF_BOOST 240
+#define NORMAL_BOOST 100
+
 // Set up the Group-Of-Pictures structure for this GF_GROUP.  This involves
 // deciding where to place the various FRAME_UPDATE_TYPEs in the group.  It does
 // this primarily by setting the contents of
@@ -29,6 +32,10 @@
 void av1_gop_setup_structure(
     struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
 
+int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
+                       FRAME_INFO *frame_info, int offset, int f_frames,
+                       int b_frames, int *num_fpstats_used,
+                       int *num_fpstats_required);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/hash.c b/libaom/av1/encoder/hash.c
index 180115d..3091037 100644
--- a/libaom/av1/encoder/hash.c
+++ b/libaom/av1/encoder/hash.c
@@ -14,9 +14,9 @@
 static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
                                         uint8_t *pData, uint32_t dataLength) {
   for (uint32_t i = 0; i < dataLength; i++) {
-    const uint8_t index =
+    const uint8_t index = (uint8_t)(
         (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^
-        pData[i];
+        pData[i]);
     p_crc_calculator->remainder <<= 8;
     p_crc_calculator->remainder ^= p_crc_calculator->table[index];
   }
@@ -61,8 +61,8 @@
   crc_calculator_init_table(p_crc_calculator);
 }
 
-uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length) {
-  CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator;
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+                           int length) {
   crc_calculator_reset(p_crc_calculator);
   crc_calculator_process_data(p_crc_calculator, p, length);
   return crc_calculator_get_crc(p_crc_calculator);
@@ -99,10 +99,10 @@
 /* Table-driven software version as a fall-back.  This is about 15 times slower
  than using the hardware instructions.  This assumes little-endian integers,
  as is the case on Intel processors that the assembler code here is for. */
-uint32_t av1_get_crc32c_value_c(CRC32C *p, uint8_t *buf, size_t len) {
+uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) {
   const uint8_t *next = (const uint8_t *)(buf);
   uint64_t crc;
-
+  CRC32C *p = (CRC32C *)c;
   crc = 0 ^ 0xffffffff;
   while (len && ((uintptr_t)next & 7) != 0) {
     crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
diff --git a/libaom/av1/encoder/hash.h b/libaom/av1/encoder/hash.h
index 826c004..d8e8cc3 100644
--- a/libaom/av1/encoder/hash.h
+++ b/libaom/av1/encoder/hash.h
@@ -32,7 +32,8 @@
 // calling av1_get_crc_value().
 void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
                              uint32_t truncPoly);
-uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length);
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+                           int length);
 
 // CRC32C: POLY = 0x82f63b78;
 typedef struct _CRC32C {
diff --git a/libaom/av1/encoder/hash_motion.c b/libaom/av1/encoder/hash_motion.c
index 00915e5..310cde8 100644
--- a/libaom/av1/encoder/hash_motion.c
+++ b/libaom/av1/encoder/hash_motion.c
@@ -17,28 +17,16 @@
 #include "av1/encoder/hash.h"
 #include "av1/encoder/hash_motion.h"
 
-static const int crc_bits = 16;
-static const int block_size_bits = 3;
-
-static void hash_table_clear_all(hash_table *p_hash_table) {
-  if (p_hash_table->p_lookup_table == NULL) {
-    return;
-  }
-  int max_addr = 1 << (crc_bits + block_size_bits);
-  for (int i = 0; i < max_addr; i++) {
-    if (p_hash_table->p_lookup_table[i] != NULL) {
-      aom_vector_destroy(p_hash_table->p_lookup_table[i]);
-      aom_free(p_hash_table->p_lookup_table[i]);
-      p_hash_table->p_lookup_table[i] = NULL;
-    }
-  }
-}
+#define kSrcBits 16
+#define kBlockSizeBits 3
+#define kMaxAddr (1 << (kSrcBits + kBlockSizeBits))
 
 // TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported?
 // If yes, fix this function
-static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride,
+static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src,
+                                                     int stride,
                                                      uint8_t *p_pixels_in1D) {
-  uint8_t *p_pel = y_src;
+  const uint8_t *p_pel = y_src;
   int index = 0;
   for (int i = 0; i < 2; i++) {
     for (int j = 0; j < 2; j++) {
@@ -48,10 +36,10 @@
   }
 }
 
-static void get_pixels_in_1D_short_array_by_block_2x2(uint16_t *y_src,
+static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src,
                                                       int stride,
                                                       uint16_t *p_pixels_in1D) {
-  uint16_t *p_pel = y_src;
+  const uint16_t *p_pel = y_src;
   int index = 0;
   for (int i = 0; i < 2; i++) {
     for (int j = 0; j < 2; j++) {
@@ -61,28 +49,28 @@
   }
 }
 
-static int is_block_2x2_row_same_value(uint8_t *p) {
+static int is_block_2x2_row_same_value(const uint8_t *p) {
   if (p[0] != p[1] || p[2] != p[3]) {
     return 0;
   }
   return 1;
 }
 
-static int is_block16_2x2_row_same_value(uint16_t *p) {
+static int is_block16_2x2_row_same_value(const uint16_t *p) {
   if (p[0] != p[1] || p[2] != p[3]) {
     return 0;
   }
   return 1;
 }
 
-static int is_block_2x2_col_same_value(uint8_t *p) {
+static int is_block_2x2_col_same_value(const uint8_t *p) {
   if ((p[0] != p[2]) || (p[1] != p[3])) {
     return 0;
   }
   return 1;
 }
 
-static int is_block16_2x2_col_same_value(uint16_t *p) {
+static int is_block16_2x2_col_same_value(const uint16_t *p) {
   if ((p[0] != p[2]) || (p[1] != p[3])) {
     return 0;
   }
@@ -104,31 +92,43 @@
   }
 }
 
-void av1_hash_table_init(hash_table *p_hash_table, MACROBLOCK *x) {
-  if (x->g_crc_initialized == 0) {
-    av1_crc_calculator_init(&x->crc_calculator1, 24, 0x5D6DCB);
-    av1_crc_calculator_init(&x->crc_calculator2, 24, 0x864CFB);
-    x->g_crc_initialized = 1;
+void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) {
+  if (!intrabc_hash_info->g_crc_initialized) {
+    av1_crc_calculator_init(&intrabc_hash_info->crc_calculator1, 24, 0x5D6DCB);
+    av1_crc_calculator_init(&intrabc_hash_info->crc_calculator2, 24, 0x864CFB);
+    intrabc_hash_info->g_crc_initialized = 1;
   }
-  p_hash_table->p_lookup_table = NULL;
+  intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL;
+}
+
+void av1_hash_table_clear_all(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table == NULL) {
+    return;
+  }
+  for (int i = 0; i < kMaxAddr; i++) {
+    if (p_hash_table->p_lookup_table[i] != NULL) {
+      aom_vector_destroy(p_hash_table->p_lookup_table[i]);
+      aom_free(p_hash_table->p_lookup_table[i]);
+      p_hash_table->p_lookup_table[i] = NULL;
+    }
+  }
 }
 
 void av1_hash_table_destroy(hash_table *p_hash_table) {
-  hash_table_clear_all(p_hash_table);
+  av1_hash_table_clear_all(p_hash_table);
   aom_free(p_hash_table->p_lookup_table);
   p_hash_table->p_lookup_table = NULL;
 }
 
 void av1_hash_table_create(hash_table *p_hash_table) {
   if (p_hash_table->p_lookup_table != NULL) {
-    hash_table_clear_all(p_hash_table);
+    av1_hash_table_clear_all(p_hash_table);
     return;
   }
-  const int max_addr = 1 << (crc_bits + block_size_bits);
   p_hash_table->p_lookup_table =
-      (Vector **)aom_malloc(sizeof(p_hash_table->p_lookup_table[0]) * max_addr);
+      (Vector **)aom_malloc(sizeof(p_hash_table->p_lookup_table[0]) * kMaxAddr);
   memset(p_hash_table->p_lookup_table, 0,
-         sizeof(p_hash_table->p_lookup_table[0]) * max_addr);
+         sizeof(p_hash_table->p_lookup_table[0]) * kMaxAddr);
 }
 
 static void hash_table_add_to_table(hash_table *p_hash_table,
@@ -170,22 +170,26 @@
   Iterator iterator =
       aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]);
   Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]);
-  for (; !iterator_equals(&iterator, &last); iterator_increment(&iterator)) {
-    if ((*(block_hash *)iterator_get(&iterator)).hash_value2 == hash_value2) {
+  for (; !aom_iterator_equals(&iterator, &last);
+       aom_iterator_increment(&iterator)) {
+    if ((*(block_hash *)aom_iterator_get(&iterator)).hash_value2 ==
+        hash_value2) {
       return 1;
     }
   }
   return 0;
 }
 
-void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info,
+                                       const YV12_BUFFER_CONFIG *picture,
                                        uint32_t *pic_block_hash[2],
-                                       int8_t *pic_block_same_info[3],
-                                       MACROBLOCK *x) {
+                                       int8_t *pic_block_same_info[3]) {
   const int width = 2;
   const int height = 2;
   const int x_end = picture->y_crop_width - width + 1;
   const int y_end = picture->y_crop_height - height + 1;
+  CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+  CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
 
   const int length = width * 2;
   if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -200,10 +204,10 @@
         pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p);
         pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
 
-        pic_block_hash[0][pos] = av1_get_crc_value(
-            &x->crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
-        pic_block_hash[1][pos] = av1_get_crc_value(
-            &x->crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
+        pic_block_hash[0][pos] =
+            av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0]));
+        pic_block_hash[1][pos] =
+            av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0]));
         pos++;
       }
       pos += width - 1;
@@ -220,9 +224,9 @@
         pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
 
         pic_block_hash[0][pos] =
-            av1_get_crc_value(&x->crc_calculator1, p, length * sizeof(p[0]));
+            av1_get_crc_value(calc_1, p, length * sizeof(p[0]));
         pic_block_hash[1][pos] =
-            av1_get_crc_value(&x->crc_calculator2, p, length * sizeof(p[0]));
+            av1_get_crc_value(calc_2, p, length * sizeof(p[0]));
         pos++;
       }
       pos += width - 1;
@@ -230,13 +234,16 @@
   }
 }
 
-void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
+void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+                                   const YV12_BUFFER_CONFIG *picture,
                                    int block_size,
                                    uint32_t *src_pic_block_hash[2],
                                    uint32_t *dst_pic_block_hash[2],
                                    int8_t *src_pic_block_same_info[3],
-                                   int8_t *dst_pic_block_same_info[3],
-                                   MACROBLOCK *x) {
+                                   int8_t *dst_pic_block_same_info[3]) {
+  CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+  CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+
   const int pic_width = picture->y_crop_width;
   const int x_end = picture->y_crop_width - block_size + 1;
   const int y_end = picture->y_crop_height - block_size + 1;
@@ -255,14 +262,14 @@
       p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
       p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
       dst_pic_block_hash[0][pos] =
-          av1_get_crc_value(&x->crc_calculator1, (uint8_t *)p, length);
+          av1_get_crc_value(calc_1, (uint8_t *)p, length);
 
       p[0] = src_pic_block_hash[1][pos];
       p[1] = src_pic_block_hash[1][pos + src_size];
       p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
       p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
       dst_pic_block_hash[1][pos] =
-          av1_get_crc_value(&x->crc_calculator2, (uint8_t *)p, length);
+          av1_get_crc_value(calc_2, (uint8_t *)p, length);
 
       dst_pic_block_same_info[0][pos] =
           src_pic_block_same_info[0][pos] &&
@@ -313,8 +320,8 @@
 
   int add_value = hash_block_size_to_index(block_size);
   assert(add_value >= 0);
-  add_value <<= crc_bits;
-  const int crc_mask = (1 << crc_bits) - 1;
+  add_value <<= kSrcBits;
+  const int crc_mask = (1 << kSrcBits) - 1;
 
   for (int x_pos = 0; x_pos < x_end; x_pos++) {
     for (int y_pos = 0; y_pos < y_end; y_pos++) {
@@ -389,14 +396,19 @@
   return 1;
 }
 
-void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+                              const uint8_t *y_src, int stride, int block_size,
                               uint32_t *hash_value1, uint32_t *hash_value2,
-                              int use_highbitdepth, MACROBLOCK *x) {
-  uint32_t to_hash[4];
+                              int use_highbitdepth) {
   int add_value = hash_block_size_to_index(block_size);
   assert(add_value >= 0);
-  add_value <<= crc_bits;
-  const int crc_mask = (1 << crc_bits) - 1;
+  add_value <<= kSrcBits;
+  const int crc_mask = (1 << kSrcBits) - 1;
+
+  CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+  CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+  uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0];
+  uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1];
 
   // 2x2 subblock hash values in current CU
   int sub_block_in_width = (block_size >> 1);
@@ -409,12 +421,10 @@
         get_pixels_in_1D_short_array_by_block_2x2(
             y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
         assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
-        x->hash_value_buffer[0][0][pos] =
-            av1_get_crc_value(&x->crc_calculator1, (uint8_t *)pixel_to_hash,
-                              sizeof(pixel_to_hash));
-        x->hash_value_buffer[1][0][pos] =
-            av1_get_crc_value(&x->crc_calculator2, (uint8_t *)pixel_to_hash,
-                              sizeof(pixel_to_hash));
+        buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash,
+                                          sizeof(pixel_to_hash));
+        buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash,
+                                          sizeof(pixel_to_hash));
       }
     }
   } else {
@@ -425,10 +435,10 @@
         get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
                                                  stride, pixel_to_hash);
         assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
-        x->hash_value_buffer[0][0][pos] = av1_get_crc_value(
-            &x->crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
-        x->hash_value_buffer[1][0][pos] = av1_get_crc_value(
-            &x->crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+        buf_1[0][pos] =
+            av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash));
+        buf_2[0][pos] =
+            av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash));
       }
     }
   }
@@ -440,6 +450,7 @@
   int dst_idx = 0;
 
   // 4x4 subblock hash values to current block hash values
+  uint32_t to_hash[4];
   for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) {
     src_idx = 1 - src_idx;
     dst_idx = 1 - dst_idx;
@@ -453,24 +464,20 @@
         assert(srcPos + src_sub_block_in_width + 1 <
                AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
         assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
-        to_hash[0] = x->hash_value_buffer[0][src_idx][srcPos];
-        to_hash[1] = x->hash_value_buffer[0][src_idx][srcPos + 1];
-        to_hash[2] =
-            x->hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
-        to_hash[3] = x->hash_value_buffer[0][src_idx]
-                                         [srcPos + src_sub_block_in_width + 1];
+        to_hash[0] = buf_1[src_idx][srcPos];
+        to_hash[1] = buf_1[src_idx][srcPos + 1];
+        to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1];
 
-        x->hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
-            &x->crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
+        buf_1[dst_idx][dst_pos] =
+            av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash));
 
-        to_hash[0] = x->hash_value_buffer[1][src_idx][srcPos];
-        to_hash[1] = x->hash_value_buffer[1][src_idx][srcPos + 1];
-        to_hash[2] =
-            x->hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
-        to_hash[3] = x->hash_value_buffer[1][src_idx]
-                                         [srcPos + src_sub_block_in_width + 1];
-        x->hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
-            &x->crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
+        to_hash[0] = buf_2[src_idx][srcPos];
+        to_hash[1] = buf_2[src_idx][srcPos + 1];
+        to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1];
+        buf_2[dst_idx][dst_pos] =
+            av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash));
         dst_pos++;
       }
     }
@@ -479,6 +486,6 @@
     sub_block_in_width >>= 1;
   }
 
-  *hash_value1 = (x->hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
-  *hash_value2 = x->hash_value_buffer[1][dst_idx][0];
+  *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value;
+  *hash_value2 = buf_2[dst_idx][0];
 }
diff --git a/libaom/av1/encoder/hash_motion.h b/libaom/av1/encoder/hash_motion.h
index ed9bb6e..e4ea1f3 100644
--- a/libaom/av1/encoder/hash_motion.h
+++ b/libaom/av1/encoder/hash_motion.h
@@ -16,11 +16,15 @@
 
 #include "aom/aom_integer.h"
 #include "aom_scale/yv12config.h"
+#include "av1/encoder/hash.h"
 #include "third_party/vector/vector.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+// Block size used for force_integer_mv decisions
+#define FORCE_INT_MV_DECISION_BLOCK_SIZE 8
+
 // store a block's hash info.
 // x and y are the position from the top left of the picture
 // hash_value2 is used to store the second hash value
@@ -34,7 +38,23 @@
   Vector **p_lookup_table;
 } hash_table;
 
-void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
+struct intrabc_hash_info;
+
+typedef struct intrabc_hash_info {
+  // buffer for hash value calculation of a block
+  // used only in av1_get_block_hash_value()
+  // [first hash/second hash]
+  // [two buffers used ping-pong]
+  uint32_t *hash_value_buffer[2][2];
+  hash_table intrabc_hash_table;
+
+  CRC_CALCULATOR crc_calculator1;
+  CRC_CALCULATOR crc_calculator2;
+  int g_crc_initialized;
+} IntraBCHashInfo;
+
+void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info);
+void av1_hash_table_clear_all(hash_table *p_hash_table);
 void av1_hash_table_destroy(hash_table *p_hash_table);
 void av1_hash_table_create(hash_table *p_hash_table);
 int32_t av1_hash_table_count(const hash_table *p_hash_table,
@@ -43,17 +63,17 @@
                                      uint32_t hash_value);
 int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
                             uint32_t hash_value2);
-void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+                                       const YV12_BUFFER_CONFIG *picture,
                                        uint32_t *pic_block_hash[2],
-                                       int8_t *pic_block_same_info[3],
-                                       struct macroblock *x);
-void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                       int8_t *pic_block_same_info[3]);
+void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+                                   const YV12_BUFFER_CONFIG *picture,
                                    int block_size,
                                    uint32_t *src_pic_block_hash[2],
                                    uint32_t *dst_pic_block_hash[2],
                                    int8_t *src_pic_block_same_info[3],
-                                   int8_t *dst_pic_block_same_info[3],
-                                   struct macroblock *x);
+                                   int8_t *dst_pic_block_same_info[3]);
 void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
                                                  uint32_t *pic_hash[2],
                                                  int8_t *pic_is_same,
@@ -68,9 +88,11 @@
 // block_size x block_size has the same color in all columns
 int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
                                  int block_size, int x_start, int y_start);
-void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
+
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+                              const uint8_t *y_src, int stride, int block_size,
                               uint32_t *hash_value1, uint32_t *hash_value2,
-                              int use_highbitdepth, struct macroblock *x);
+                              int use_highbitdepth);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/interp_search.c b/libaom/av1/encoder/interp_search.c
new file mode 100644
index 0000000..6b7317b
--- /dev/null
+++ b/libaom/av1/encoder/interp_search.c
@@ -0,0 +1,753 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+
+// return mv_diff
+static INLINE int is_interp_filter_good_match(
+    const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi,
+    int skip_level) {
+  const int is_comp = has_second_ref(mi);
+  int i;
+
+  for (i = 0; i < 1 + is_comp; ++i) {
+    if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX;
+  }
+
+  if (skip_level == 1 && is_comp) {
+    if (st->comp_type != mi->interinter_comp.type) return INT_MAX;
+    if (st->compound_idx != mi->compound_idx) return INT_MAX;
+  }
+
+  int mv_diff = 0;
+  for (i = 0; i < 1 + is_comp; ++i) {
+    mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) +
+               abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col);
+  }
+  return mv_diff;
+}
+
+static INLINE int save_interp_filter_search_stat(
+    MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse,
+    INTERPOLATION_FILTER_STATS *interp_filter_stats,
+    int interp_filter_stats_idx) {
+  if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) {
+    INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
+                                        { mbmi->mv[0], mbmi->mv[1] },
+                                        { mbmi->ref_frame[0],
+                                          mbmi->ref_frame[1] },
+                                        mbmi->interinter_comp.type,
+                                        mbmi->compound_idx,
+                                        rd,
+                                        pred_sse };
+    interp_filter_stats[interp_filter_stats_idx] = stat;
+    interp_filter_stats_idx++;
+  }
+  return interp_filter_stats_idx;
+}
+
+static INLINE int find_interp_filter_in_stats(
+    MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats,
+    int interp_filter_stats_idx, int skip_level) {
+  // [skip_levels][single or comp]
+  const int thr[2][2] = { { 0, 0 }, { 3, 7 } };
+  const int is_comp = has_second_ref(mbmi);
+
+  // Find good enough match.
+  // TODO(yunqing): Separate single-ref mode and comp mode stats for fast
+  // search.
+  int best = INT_MAX;
+  int match = -1;
+  for (int j = 0; j < interp_filter_stats_idx; ++j) {
+    const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j];
+    const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level);
+    // Exact match is found.
+    if (mv_diff == 0) {
+      match = j;
+      break;
+    } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) {
+      best = mv_diff;
+      match = j;
+    }
+  }
+
+  if (match != -1) {
+    mbmi->interp_filters = interp_filter_stats[match].filters;
+    return match;
+  }
+  return -1;  // no match result found
+}
+
+int av1_find_interp_filter_match(
+    MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi,
+    const InterpFilter assign_filter, const int need_search,
+    INTERPOLATION_FILTER_STATS *interp_filter_stats,
+    int interp_filter_stats_idx) {
+  int match_found_idx = -1;
+  if (cpi->sf.interp_sf.use_interp_filter && need_search)
+    match_found_idx = find_interp_filter_in_stats(
+        mbmi, interp_filter_stats, interp_filter_stats_idx,
+        cpi->sf.interp_sf.use_interp_filter);
+
+  if (!need_search || match_found_idx == -1)
+    set_default_interp_filters(mbmi, assign_filter);
+  return match_found_idx;
+}
+
+static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2],
+                                int num_planes) {
+  const BUFFER_SET *buf0 = dst_bufs[0];
+  dst_bufs[0] = dst_bufs[1];
+  dst_bufs[1] = buf0;
+  restore_dst_buf(xd, *dst_bufs[0], num_planes);
+}
+
+static INLINE int get_switchable_rate(MACROBLOCK *const x,
+                                      const int_interpfilters filters,
+                                      const int ctx[2]) {
+  int inter_filter_cost;
+  const InterpFilter filter0 = filters.as_filters.y_filter;
+  const InterpFilter filter1 = filters.as_filters.x_filter;
+  inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0];
+  inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1];
+  return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+}
+
+// Build inter predictor and calculate model rd
+// for a given plane.
+static INLINE void interp_model_rd_eval(
+    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int plane_from, int plane_to,
+    RD_STATS *rd_stats, int is_skip_build_pred) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  RD_STATS tmp_rd_stats;
+  av1_init_rd_stats(&tmp_rd_stats);
+
+  // Skip inter predictor if the predictor is already available.
+  if (!is_skip_build_pred) {
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                  plane_from, plane_to);
+  }
+
+  model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+                     ? MODELRD_LEGACY
+                     : MODELRD_TYPE_INTERP_FILTER](
+      cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate,
+      &tmp_rd_stats.dist, &tmp_rd_stats.skip, &tmp_rd_stats.sse, NULL, NULL,
+      NULL);
+
+  av1_merge_rd_stats(rd_stats, &tmp_rd_stats);
+}
+
+// calculate the rdcost of given interpolation_filter
+static INLINE int64_t interpolation_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd,
+    RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2],
+    const int skip_pred) {
+  const AV1_COMMON *cm = &cpi->common;
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  RD_STATS this_rd_stats_luma, this_rd_stats;
+
+  // Initialize rd_stats structures to default values.
+  av1_init_rd_stats(&this_rd_stats_luma);
+  this_rd_stats = *rd_stats_luma;
+  const int_interpfilters last_best = mbmi->interp_filters;
+  mbmi->interp_filters = filter_sets[filter_idx];
+  const int tmp_rs =
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+
+  int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0);
+  if (min_rd > *rd) {
+    mbmi->interp_filters = last_best;
+    return 0;
+  }
+
+  (void)tile_data;
+
+  assert(skip_pred != 2);
+  assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0));
+  assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0));
+  assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0));
+  assert((rd_stats_luma->skip == 0) || (rd_stats_luma->skip == 1));
+  assert((rd_stats->skip == 0) || (rd_stats->skip == 1));
+  assert((skip_pred >= 0) &&
+         (skip_pred <= interp_search_flags->default_interp_skip_flags));
+
+  // When skip pred is equal to default_interp_skip_flags,
+  // skip both luma and chroma MC.
+  // For mono-chrome images:
+  // num_planes = 1 and cpi->default_interp_skip_flags = 1,
+  // skip_pred = 1: skip both luma and chroma
+  // skip_pred = 0: Evaluate luma and as num_planes=1,
+  // skip chroma evaluation
+  int tmp_skip_pred =
+      (skip_pred == interp_search_flags->default_interp_skip_flags)
+          ? INTERP_SKIP_LUMA_SKIP_CHROMA
+          : skip_pred;
+
+  switch (tmp_skip_pred) {
+    case INTERP_EVAL_LUMA_EVAL_CHROMA:
+      // skip_pred = 0: Evaluate both luma and chroma.
+      // Luma MC
+      interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+                           &this_rd_stats_luma, 0);
+      this_rd_stats = this_rd_stats_luma;
+#if CONFIG_COLLECT_RD_STATS == 3
+      RD_STATS rd_stats_y;
+      av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+                                          INT64_MAX);
+      PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 3
+      AOM_FALLTHROUGH_INTENDED;
+    case INTERP_SKIP_LUMA_EVAL_CHROMA:
+      // skip_pred = 1: skip luma evaluation (retain previous best luma stats)
+      // and do chroma evaluation.
+      for (int plane = 1; plane < num_planes; ++plane) {
+        int64_t tmp_rd =
+            RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist);
+        if (tmp_rd >= *rd) {
+          mbmi->interp_filters = last_best;
+          return 0;
+        }
+        interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane,
+                             &this_rd_stats, 0);
+      }
+      break;
+    case INTERP_SKIP_LUMA_SKIP_CHROMA:
+      // both luma and chroma evaluation is skipped
+      this_rd_stats = *rd_stats;
+      break;
+    case INTERP_EVAL_INVALID:
+    default: assert(0); return 0;
+  }
+  int64_t tmp_rd =
+      RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist);
+
+  if (tmp_rd < *rd) {
+    *rd = tmp_rd;
+    *switchable_rate = tmp_rs;
+    if (skip_pred != interp_search_flags->default_interp_skip_flags) {
+      if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) {
+        // Overwrite the data as current filter is the best one
+        *rd_stats_luma = this_rd_stats_luma;
+        *rd_stats = this_rd_stats;
+        // As luma MC data is computed, no need to recompute after the search
+        x->recalc_luma_mc_data = 0;
+      } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) {
+        // As luma MC data is not computed, update of luma data can be skipped
+        *rd_stats = this_rd_stats;
+        // As luma MC data is not recomputed and current filter is the best,
+        // indicate the possibility of recomputing MC data
+        // If current buffer contains valid MC data, toggle to indicate that
+        // luma MC data needs to be recomputed
+        x->recalc_luma_mc_data ^= 1;
+      }
+      swap_dst_buf(xd, dst_bufs, num_planes);
+    }
+    return 1;
+  }
+  mbmi->interp_filters = last_best;
+  return 0;
+}
+
+static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed(
+    const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize,
+    int_interpfilters *af, int_interpfilters *lf) {
+  const AV1_COMMON *cm = &cpi->common;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int bsl = mi_size_wide_log2[bsize];  // log2 of block width in MI units
+  int is_horiz_eq = 0, is_vert_eq = 0;       // do neighbor filters match?
+
+  if (above_mbmi && is_inter_block(above_mbmi))
+    *af = above_mbmi->interp_filters;  // inherit above neighbor's filters
+
+  if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters;
+
+  if (af->as_filters.x_filter != INTERP_INVALID)
+    is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter;
+  if (af->as_filters.y_filter != INTERP_INVALID)
+    is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter;
+
+  INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  int pred_filter_enable =
+      cpi->sf.interp_sf.cb_pred_filter_search  // chessboard pattern gating
+          ? (((mi_row + mi_col) >> bsl) +
+             get_chessboard_index(cm->current_frame.frame_number)) &
+                0x1
+          : 0;
+  pred_filter_enable &= is_horiz_eq || is_vert_eq;  // need at least one match
+  // pred_filter_search = 0: pred_filter is disabled
+  // pred_filter_search = 1: pred_filter is enabled and only horz pred matching
+  // pred_filter_search = 2: pred_filter is enabled and only vert pred matching
+  // pred_filter_search = 3: pred_filter is enabled and
+  //                         both vert, horz pred matching
+  return pred_filter_enable * pred_filter_type;
+}
+
+static DUAL_FILTER_TYPE find_best_interp_rd_facade(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+    const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) {
+  int tmp_skip_pred = skip_pred;
+  DUAL_FILTER_TYPE best_filt_type = REG_REG;
+
+  // If no filters are set to be evaluated, return from the function
+  if (allow_interp_mask == 0x0) return best_filt_type;
+  // When block width or height is 4, skip the pred evaluation of SHARP_SHARP
+  tmp_skip_pred = is_w4_or_h4
+                      ? cpi->interp_search_flags.default_interp_skip_flags
+                      : skip_pred;
+
+  // Loop over all filter types and evaluate only the allowed filter types
+  for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) {
+    const int is_filter_allowed =
+        get_interp_filter_allowed_mask(allow_interp_mask, filt_type);
+    if (is_filter_allowed)
+      if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                  rd_stats_y, rd_stats, switchable_rate,
+                                  dst_bufs, filt_type, switchable_ctx,
+                                  tmp_skip_pred))
+        best_filt_type = filt_type;
+    tmp_skip_pred = skip_pred;  // default skip applies only to first iteration
+  }
+  return best_filt_type;
+}
+
+static INLINE void pred_dual_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+    const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af,
+    int_interpfilters *lf) {
+  (void)lf;  // when a direction matches, af already holds the shared filter
+  assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ);
+  assert(pred_filt_type < INTERP_PRED_TYPE_ALL);
+  uint16_t allowed_interp_mask = 0;
+
+  if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) {
+    // pred_filter_search = 1: Only horizontal filter is matching
+    allowed_interp_mask =
+        av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter];
+  } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) {
+    // pred_filter_search = 2: Only vertical filter is matching
+    allowed_interp_mask =
+        av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter];
+  } else {
+    // pred_filter_search = 3: Both horizontal and vertical filters match
+    int filt_type =
+        af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS;
+    set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type);
+  }
+  // REG_REG has already been evaluated in the beginning
+  reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+  find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y,
+                             rd_stats, switchable_rate, dst_bufs,
+                             switchable_ctx, skip_pred, allowed_interp_mask, 0);
+}
+// Evaluate dual filter type
+// a) Using above, left block interp filter
+// b) Find the best horizontal filter and
+//    then evaluate corresponding vertical filters.
+static INLINE void fast_dual_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+    const int skip_hor, const int skip_ver) {
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+  int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+  int_interpfilters lf = af;
+
+  if (!have_newmv_in_inter_mode(mbmi->mode)) {
+    pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+  }
+
+  if (pred_filter_type) {
+    pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                               rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                               switchable_ctx, (skip_hor & skip_ver),
+                               pred_filter_type, &af, &lf);
+  } else {
+    const int bw = block_size_wide[bsize];
+    const int bh = block_size_high[bsize];
+    int best_dual_mode = 0;  // best horizontal filter index
+    int skip_pred =
+        bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor;
+    // TODO(any): Make use of find_best_interp_rd_facade()
+    // if speed impact is negligible
+    for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {  // horz filters
+      if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                  rd_stats_y, rd_stats, switchable_rate,
+                                  dst_bufs, i, switchable_ctx, skip_pred)) {
+        best_dual_mode = i;
+      }
+      skip_pred = skip_hor;
+    }
+    // From the best horizontal mode, check the corresponding vertical modes
+    skip_pred =
+        bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver;
+    for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
+         i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+      interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                              rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                              i, switchable_ctx, skip_pred);
+      skip_pred = skip_ver;
+    }
+  }
+}
+
+// Find the best interp filter if dual_interp_filter = 0
+static INLINE void find_best_non_dual_interp_filter(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+    const int skip_ver, const int skip_hor) {
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  int8_t i;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  uint16_t interp_filter_search_mask =
+      interp_search_flags->interp_filter_search_mask;
+
+  if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0);
+    const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1);
+    const int *switchable_interp_p0 =
+        cpi->frame_probs.switchable_interp_probs[update_type][ctx0];
+    const int *switchable_interp_p1 =
+        cpi->frame_probs.switchable_interp_probs[update_type][ctx1];
+
+    static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 };
+    const int thresh = thr[update_type];
+    for (i = 0; i < SWITCHABLE_FILTERS; i++) {
+      // For non-dual case, the 2 dir's prob should be identical.
+      assert(switchable_interp_p0[i] == switchable_interp_p1[i]);
+      if (switchable_interp_p0[i] < thresh &&
+          switchable_interp_p1[i] < thresh) {
+        DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i;
+        reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type);
+      }
+    }
+  }
+
+  // Regular filter evaluation should have been done and hence the same should
+  // be the winner
+  assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int);
+  if ((skip_hor & skip_ver) != interp_search_flags->default_interp_skip_flags) {
+    INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+    int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+    int_interpfilters lf = af;
+
+    pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+    if (pred_filter_type) {
+      assert(af.as_filters.x_filter != INTERP_INVALID);
+      int filter_idx = SWITCHABLE * af.as_filters.x_filter;
+      // This assert tells that (filter_x == filter_y) for non-dual filter case
+      assert(filter_sets[filter_idx].as_filters.x_filter ==
+             filter_sets[filter_idx].as_filters.y_filter);
+      if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+          !(get_interp_filter_allowed_mask(interp_filter_search_mask,
+                                           filter_idx))) {
+        return;
+      }
+      if (filter_idx) {
+        interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                                filter_idx, switchable_ctx,
+                                (skip_hor & skip_ver));
+      }
+      return;
+    }
+  }
+  // Reuse regular filter's modeled rd data for sharp filter for following
+  // cases
+  // 1) When bsize is 4x4
+  // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and MV in vertical
+  // direction is full-pel
+  // 3) When block height is 4 (i.e. 8x4/16x4 blocks) and MV in horizontal
+  // direction is full-pel
+  // TODO(any): Optimize cases 2 and 3 further if luma MV in relavant direction
+  // alone is full-pel
+
+  if ((bsize == BLOCK_4X4) ||
+      (block_size_wide[bsize] == 4 &&
+       skip_ver == interp_search_flags->default_interp_skip_flags) ||
+      (block_size_high[bsize] == 4 &&
+       skip_hor == interp_search_flags->default_interp_skip_flags)) {
+    int skip_pred = skip_hor & skip_ver;
+    uint16_t allowed_interp_mask = 0;
+
+    // REG_REG filter type is evaluated beforehand, hence skip it
+    set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP);
+    set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH);
+    if (cpi->sf.interp_sf.adaptive_interp_filter_search)
+      allowed_interp_mask &= interp_filter_search_mask;
+
+    find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+                               rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                               switchable_ctx, skip_pred, allowed_interp_mask,
+                               1);
+  } else {
+    int skip_pred = (skip_hor & skip_ver);
+    for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE;
+         i += (SWITCHABLE_FILTERS + 1)) {
+      // This assert tells that (filter_x == filter_y) for non-dual filter case
+      assert(filter_sets[i].as_filters.x_filter ==
+             filter_sets[i].as_filters.y_filter);
+      if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+          !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) {
+        continue;
+      }
+      interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                              rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                              i, switchable_ctx, skip_pred);
+      // In first iteration, smooth filter is evaluated. If smooth filter
+      // (which is less sharper) is the winner among regular and smooth filters,
+      // sharp filter evaluation is skipped
+      // TODO(any): Refine this gating based on modelled rd only (i.e., by not
+      // accounting switchable filter rate)
+      if (cpi->sf.interp_sf.skip_sharp_interp_filter_search &&
+          skip_pred != interp_search_flags->default_interp_skip_flags) {
+        if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int)
+          break;
+      }
+    }
+  }
+}
+
+static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x,
+                                              const AV1_COMP *const cpi,
+                                              int *skip_hor, int *skip_ver) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+  const int is_compound = has_second_ref(mbmi);
+  assert(is_intrabc_block(mbmi) == 0);
+  for (int ref = 0; ref < 1 + is_compound; ++ref) {
+    const struct scale_factors *const sf =
+        get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]);
+    // TODO(any): Refine skip flag calculation considering scaling
+    if (av1_is_scaled(sf)) {
+      *skip_hor = 0;
+      *skip_ver = 0;
+      break;
+    }
+    const MV mv = mbmi->mv[ref].as_mv;
+    int skip_hor_plane = 0;
+    int skip_ver_plane = 0;
+    for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1));
+         ++plane_idx) {  // bit 0: luma, bit 1: chroma
+      struct macroblockd_plane *const pd = &xd->plane[plane_idx];
+      const int bw = pd->width;
+      const int bh = pd->height;
+      const MV mv_q4 = clamp_mv_to_umv_border_sb(
+          xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+      const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+      const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+      skip_hor_plane |= ((sub_x == 0) << plane_idx);
+      skip_ver_plane |= ((sub_y == 0) << plane_idx);
+    }
+    *skip_hor &= skip_hor_plane;
+    *skip_ver &= skip_ver_plane;
+    // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+    assert(*skip_hor != 2);
+    assert(*skip_ver != 2);
+  }
+  // When compound prediction type is compound segment wedge, luma MC and
+  // chroma MC need to go hand in hand as mask generated during luma MC is
+  // required for chroma MC. If skip_hor = 0 and skip_ver = 1, mask used for
+  // chroma MC during vertical filter decision may be incorrect as temporary
+  // MC evaluation overwrites the mask. Make skip_ver as 0 for this case so
+  // that mask is populated during luma MC
+  if (is_compound && mbmi->compound_idx == 1 &&
+      mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+    assert(mbmi->comp_group_idx == 1);
+    if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0;
+  }
+}
+
+int64_t av1_interpolation_filter_search(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+    int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+    HandleInterModeArgs *args, int64_t ref_best_rd) {
+  const AV1_COMMON *cm = &cpi->common;
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int need_search =
+      av1_is_interp_needed(xd) && !cpi->sf.rt_sf.skip_interp_filter_search;
+  const int ref_frame = xd->mi[0]->ref_frame[0];
+  RD_STATS rd_stats_luma, rd_stats;
+
+  // Initialization of rd_stats structures with default values
+  av1_init_rd_stats(&rd_stats_luma);
+  av1_init_rd_stats(&rd_stats);
+
+  int match_found_idx = -1;
+  const InterpFilter assign_filter = cm->features.interp_filter;
+
+  match_found_idx = av1_find_interp_filter_match(
+      mbmi, cpi, assign_filter, need_search, args->interp_filter_stats,
+      args->interp_filter_stats_idx);
+
+  if (match_found_idx != -1) {  // reuse previously saved search result
+    *rd = args->interp_filter_stats[match_found_idx].rd;
+    x->pred_sse[ref_frame] =
+        args->interp_filter_stats[match_found_idx].pred_sse;
+    return 0;
+  }
+
+  int switchable_ctx[2];
+  switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+  switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+  *switchable_rate =
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+
+  // Do MC evaluation for default filter_type.
+  // Luma MC
+  interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+                       &rd_stats_luma, *skip_build_pred);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+  RD_STATS rd_stats_y;
+  av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+  PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 3
+  // Chroma MC
+  if (num_planes > 1) {
+    interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V,
+                         &rd_stats, *skip_build_pred);
+  }
+  *skip_build_pred = 1;  // default-filter prediction has now been built
+
+  av1_merge_rd_stats(&rd_stats, &rd_stats_luma);
+
+  assert(rd_stats.rate >= 0);
+
+  *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist);
+  x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+  if (assign_filter != SWITCHABLE || match_found_idx != -1) {
+    return 0;
+  }
+  if (!need_search) {
+    int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+    assert(mbmi->interp_filters.as_int == filters.as_int);
+    (void)filters;
+    return 0;
+  }
+  if (args->modelled_rd != NULL) {
+    if (has_second_ref(mbmi)) {
+      const int ref_mv_idx = mbmi->ref_mv_idx;
+      MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+      const int mode0 = compound_ref0_mode(mbmi->mode);
+      const int mode1 = compound_ref1_mode(mbmi->mode);
+      const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                                 args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+      if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+        return INT64_MAX;  // prune: RD much worse than modelled single-ref RD
+      }
+    }
+  }
+
+  x->recalc_luma_mc_data = 0;
+  // skip_flag=xx (in binary form)
+  // Setting 0th flag corresponds to skipping luma MC and setting 1st bit
+  // corresponds to skipping chroma MC. skip_flag=0 corresponds to "Don't skip
+  // luma and chroma MC". Skip flag=1 corresponds to "Skip Luma MC only"
+  // Skip_flag=2 is not a valid case
+  // skip_flag=3 corresponds to "Skip both luma and chroma MC"
+  int skip_hor = interp_search_flags->default_interp_skip_flags;
+  int skip_ver = interp_search_flags->default_interp_skip_flags;
+  calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver);
+
+  // do interp_filter search
+  restore_dst_buf(xd, *tmp_dst, num_planes);
+  const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+  // Evaluate dual interp filters
+  if (cm->seq_params.enable_dual_filter) {
+    if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) {
+      fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                 &rd_stats_luma, &rd_stats, switchable_rate,
+                                 dst_bufs, switchable_ctx, skip_hor, skip_ver);
+    } else {
+      // Use full interpolation filter search
+      uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK;
+      // REG_REG filter type is evaluated beforehand, so loop is repeated over
+      // REG_SMOOTH to SHARP_SHARP for full interpolation filter search
+      reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+      find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+                                 &rd_stats_luma, &rd_stats, switchable_rate,
+                                 dst_bufs, switchable_ctx,
+                                 (skip_hor & skip_ver), allowed_interp_mask, 0);
+    }
+  } else {
+    // Evaluate non-dual interp filters
+    find_best_non_dual_interp_filter(
+        x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats,
+        switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor);
+  }
+  swap_dst_buf(xd, dst_bufs, num_planes);
+  // Recompute final MC data if required
+  if (x->recalc_luma_mc_data == 1) {
+    // Recomputing final luma MC data is required only if the same was skipped
+    // in either of the directions. Condition below is necessary, but not
+    // sufficient
+    assert((skip_hor == 1) || (skip_ver == 1));
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                  AOM_PLANE_Y, AOM_PLANE_Y);
+  }
+  x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+  // save search results
+  if (cpi->sf.interp_sf.use_interp_filter) {
+    assert(match_found_idx == -1);
+    args->interp_filter_stats_idx = save_interp_filter_search_stat(
+        mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats,
+        args->interp_filter_stats_idx);
+  }
+  return 0;
+}
diff --git a/libaom/av1/encoder/interp_search.h b/libaom/av1/encoder/interp_search.h
new file mode 100644
index 0000000..401e14f
--- /dev/null
+++ b/libaom/av1/encoder/interp_search.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
+#define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_INTERP_FILTER_STATS 128
+#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
+
+typedef struct {
+  int_interpfilters filters;  // dual (x, y) interp filters of the saved mode
+  int_mv mv[2];               // motion vectors of up to two references
+  int8_t ref_frames[2];       // reference frames of the saved mode
+  COMPOUND_TYPE comp_type;    // compound prediction type of the saved mode
+  int compound_idx;           // compound index of the saved mode
+  int64_t rd;                 // best RD cost found with these filters
+  unsigned int pred_sse;      // prediction SSE of the saved search result
+} INTERPOLATION_FILTER_STATS;
+
+typedef struct {
+  // OBMC secondary prediction buffers and respective strides
+  uint8_t *above_pred_buf[MAX_MB_PLANE];
+  int above_pred_stride[MAX_MB_PLANE];
+  uint8_t *left_pred_buf[MAX_MB_PLANE];
+  int left_pred_stride[MAX_MB_PLANE];
+  int_mv (*single_newmv)[REF_FRAMES];
+  // Pointer to array of motion vectors to use for each ref and their rates
+  // Should point to first of 2 arrays in 2D array
+  int (*single_newmv_rate)[REF_FRAMES];
+  int (*single_newmv_valid)[REF_FRAMES];
+  // Pointer to array of predicted rate-distortion
+  // Should point to first of 2 arrays in 2D array
+  int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+  int ref_frame_cost;  // NOTE(review): presumably ref-frame signaling rate
+  int single_comp_cost;
+  int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+  int skip_motion_mode;
+  INTERINTRA_MODE *inter_intra_mode;
+  int single_ref_first_pass;
+  SimpleRDState *simple_rd_state;
+  // [comp_idx][saved stat_idx]
+  INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS];
+  int interp_filter_stats_idx;  // number of valid entries saved above
+} HandleInterModeArgs;
+
+static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = {
+  { 0x00000000 }, { 0x00010000 }, { 0x00020000 },  // y = 0
+  { 0x00000001 }, { 0x00010001 }, { 0x00020001 },  // y = 1
+  { 0x00000002 }, { 0x00010002 }, { 0x00020002 },  // y = 2
+};  // entry i: x = i % SWITCHABLE_FILTERS, y = i / SWITCHABLE_FILTERS
+
+int av1_find_interp_filter_match(  // returns matching stat index, or -1
+    MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi,
+    const InterpFilter assign_filter, const int need_search,
+    INTERPOLATION_FILTER_STATS *interp_filter_stats,
+    int interp_filter_stats_idx);
+
+int64_t av1_interpolation_filter_search(  // returns INT64_MAX to prune, else 0
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+    int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+    HandleInterModeArgs *args, int64_t ref_best_rd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
diff --git a/libaom/av1/encoder/intra_mode_search.c b/libaom/av1/encoder/intra_mode_search.c
new file mode 100644
index 0000000..43192a9
--- /dev/null
+++ b/libaom/av1/encoder/intra_mode_search.c
@@ -0,0 +1,2132 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/palette.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/tx_search.h"
+
+// Fixed order in which the luma intra prediction modes are evaluated by the
+// RD mode search.
+static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
+  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, PAETH_PRED,
+  SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED,   D157_PRED,
+  D67_PRED,      D113_PRED,     D45_PRED,
+};
+
+// Fixed order in which the chroma intra prediction modes are evaluated by
+// the RD mode search (CFL is tried right after DC).
+static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
+  UV_DC_PRED,     UV_CFL_PRED,   UV_H_PRED,        UV_V_PRED,
+  UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
+  UV_D135_PRED,   UV_D203_PRED,  UV_D157_PRED,     UV_D67_PRED,
+  UV_D113_PRED,   UV_D45_PRED,
+};
+
+// Number of orientation bins used by the gradient histogram (HOG) feature.
+#define BINS 32
+// Per-mode bias terms of a linear model that maps a BINS-entry gradient
+// histogram to a score for each of the 8 directional intra modes; used by
+// prune_intra_mode_with_hog() below.
+static const float intra_hog_model_bias[DIRECTIONAL_MODES] = {
+  0.450578f,  0.695518f,  -0.717944f, -0.639894f,
+  -0.602019f, -0.453454f, 0.055857f,  -0.465480f,
+};
+
+// Weights of the linear model: BINS coefficients per directional mode,
+// indexed as [mode * BINS + bin].
+static const float intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
+  -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
+  -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
+  -0.434156f, 0.322868f,  2.260546f,  3.368715f,  3.989290f,  3.308487f,
+  2.277893f,  0.923793f,  0.026412f,  -0.385174f, -0.718622f, -1.408867f,
+  -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
+  -2.985709f, -3.447155f, 3.758139f,  3.204353f,  2.170998f,  0.826587f,
+  -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
+  -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
+  -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
+  -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
+  -0.088058f, 0.753494f,  2.092413f,  3.215266f,  -3.300277f, -2.748658f,
+  -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
+  -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
+  -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
+  0.813112f,  1.702213f,  2.653045f,  3.351749f,  3.243554f,  3.199409f,
+  2.437856f,  1.468854f,  0.533039f,  -0.099065f, -0.622643f, -2.200732f,
+  -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f,  1.975043f,
+  3.179528f,  3.939064f,  3.454379f,  3.689386f,  3.116411f,  1.970991f,
+  0.798406f,  -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
+  -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
+  -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
+  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
+  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
+  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
+  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
+  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
+  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+// Builds a normalized histogram of gradient orientations (HOG) over the
+// interior pixels of an 8-bit rows x cols block. Gradients are measured with
+// 3x3 Sobel filters; each pixel adds |dx| + |dy| to the bin of its gradient
+// angle, and the histogram is normalized by the total magnitude at the end.
+// Border pixels are skipped since the Sobel window would read outside the
+// block.
+static void generate_hog(const uint8_t *src, int stride, int rows, int cols,
+                         float *hist) {
+  const float step = (float)PI / BINS;
+  // Start slightly above zero so the final normalization never divides by 0.
+  float total = 0.1f;
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint8_t *above = &src[c - stride];
+      const uint8_t *below = &src[c + stride];
+      const uint8_t *left = &src[c - 1];
+      const uint8_t *right = &src[c + 1];
+      // Calculate gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      // Unreachable when dx and dy are not both zero; kept as a guard.
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        // Purely vertical gradient (angle = +/- pi/2): split the magnitude
+        // (integer-truncated halves) across the two extreme bins.
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const float angle = atanf(dy * 1.0f / dx);
+        int idx = (int)roundf(angle / step) + BINS / 2;
+        idx = AOMMIN(idx, BINS - 1);
+        idx = AOMMAX(idx, 0);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+// High-bitdepth variant of generate_hog(): identical logic, but the source
+// is first converted from the packed uint8_t pointer to uint16_t samples.
+static void generate_hog_hbd(const uint8_t *src8, int stride, int rows,
+                             int cols, float *hist) {
+  const float step = (float)PI / BINS;
+  // Start slightly above zero so the final normalization never divides by 0.
+  float total = 0.1f;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint16_t *above = &src[c - stride];
+      const uint16_t *below = &src[c + stride];
+      const uint16_t *left = &src[c - 1];
+      const uint16_t *right = &src[c + 1];
+      // Calculate gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      // Unreachable when dx and dy are not both zero; kept as a guard.
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        // Purely vertical gradient (angle = +/- pi/2): split the magnitude
+        // (integer-truncated halves) across the two extreme bins.
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const float angle = atanf(dy * 1.0f / dx);
+        int idx = (int)roundf(angle / step) + BINS / 2;
+        idx = AOMMIN(idx, BINS - 1);
+        idx = AOMMAX(idx, 0);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+// Computes the HOG feature for the current source block, scores each of the
+// DIRECTIONAL_MODES directional intra modes with the linear model
+// (bias + weights . hist), and sets directional_mode_skip_mask for modes
+// whose score falls below 'th' so the RD search can skip them.
+// The mask is written at [i + 1]; presumably index 0 corresponds to a
+// non-directional mode -- NOTE(review): confirm the mask indexing against
+// the caller.
+static void prune_intra_mode_with_hog(const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                      float th,
+                                      uint8_t *directional_mode_skip_mask) {
+  aom_clear_system_state();
+
+  const int bh = block_size_high[bsize];
+  const int bw = block_size_wide[bsize];
+  const MACROBLOCKD *xd = &x->e_mbd;
+  // If the block overhangs the bottom/right frame edge (mb_to_*_edge is
+  // negative there), restrict the HOG to the visible rows/cols.
+  const int rows =
+      (xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh;
+  const int cols =
+      (xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *src = x->plane[0].src.buf;
+  float hist[BINS] = { 0.0f };
+  if (is_cur_buf_hbd(xd)) {
+    generate_hog_hbd(src, src_stride, rows, cols, hist);
+  } else {
+    generate_hog(src, src_stride, rows, cols, hist);
+  }
+
+  for (int i = 0; i < DIRECTIONAL_MODES; ++i) {
+    float this_score = intra_hog_model_bias[i];
+    const float *weights = &intra_hog_model_weights[i * BINS];
+    for (int j = 0; j < BINS; ++j) {
+      this_score += weights[j] * hist[j];
+    }
+    if (this_score < th) directional_mode_skip_mask[i + 1] = 1;
+  }
+
+  aom_clear_system_state();
+}
+
+#undef BINS
+
+// Model based RD estimation for luma intra blocks.
+// Runs the intra predictor for every tx block in the partition, then uses
+// the configured model_rd_sb_fn (rather than a full transform search) to
+// estimate rate and distortion. The supplied mode_cost is augmented with
+// angle-delta and filter-intra signaling costs where applicable, and the
+// combined RDCOST is returned.
+static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               BLOCK_SIZE bsize, int mode_cost) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  RD_STATS this_rd_stats;
+  int row, col;
+  int64_t temp_sse, this_rd;
+  TX_SIZE tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
+  const int stepr = tx_size_high_unit[tx_size];
+  const int stepc = tx_size_wide_unit[tx_size];
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  mbmi->tx_size = tx_size;
+  // Prediction.
+  for (row = 0; row < max_blocks_high; row += stepr) {
+    for (col = 0; col < max_blocks_wide; col += stepc) {
+      av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
+    }
+  }
+  // RD estimation.
+  model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? MODELRD_LEGACY
+                                                   : MODELRD_TYPE_INTRA](
+      cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist,
+      &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
+  // Account for the angle-delta signaling cost of directional modes.
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    mode_cost +=
+        x->angle_delta_cost[mbmi->mode - V_PRED]
+                           [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]];
+  }
+  // Account for the filter-intra flag (and mode, if enabled) cost.
+  if (mbmi->mode == DC_PRED &&
+      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
+    if (mbmi->filter_intra_mode_info.use_filter_intra) {
+      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
+                   x->filter_intra_mode_cost[mode];
+    } else {
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
+    }
+  }
+  this_rd =
+      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
+  return this_rd;
+}
+
+// Update the intra model yrd and prune the current mode if the new estimate
+// y_rd > 1.5 * best_model_rd. Returns 1 when the mode should be pruned,
+// 0 otherwise; *best_model_rd is lowered when the new estimate improves it.
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int mode_info_cost,
+                                                int64_t *best_model_rd) {
+  const int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_info_cost);
+  // (rd + rd >> 1) == 1.5 * best_model_rd, computed in integer arithmetic.
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 1)) {
+    return 1;
+  } else if (this_model_rd < *best_model_rd) {
+    *best_model_rd = this_model_rd;
+  }
+  return 0;
+}
+
+// Run RD calculation with given luma intra prediction angle, and return
+// the RD cost. Update the best mode info. if the RD cost is the best so far.
+// Returns INT64_MAX when the mode is pruned by the model RD estimate or the
+// transform search fails to beat best_rd_in.
+static int64_t calc_rd_given_intra_angle(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
+    int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
+    RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
+    int64_t *best_rd, int64_t *best_model_rd, uint8_t *best_tx_type_map,
+    uint8_t *best_blk_skip, int skip_model_rd) {
+  RD_STATS tokenonly_rd_stats;
+  int64_t this_rd;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int n4 = bsize_to_num_blk(bsize);
+  assert(!is_inter_block(mbmi));
+  mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
+  // Optional early prune via the cheap model-based RD estimate.
+  if (!skip_model_rd) {
+    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
+      return INT64_MAX;
+    }
+  }
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                    best_rd_in);
+  if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
+
+  int this_rate =
+      mode_cost + tokenonly_rd_stats.rate +
+      x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
+  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+  // Record the winning configuration when this angle improves on best_rd.
+  if (this_rd < *best_rd) {
+    memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
+    av1_copy_array(best_tx_type_map, xd->tx_type_map, n4);
+    *best_rd = this_rd;
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
+    *best_tx_size = mbmi->tx_size;
+    *rate = this_rate;
+    rd_stats->rate = tokenonly_rd_stats.rate;
+    rd_stats->dist = tokenonly_rd_stats.dist;
+    rd_stats->skip = tokenonly_rd_stats.skip;
+  }
+  return this_rd;
+}
+
+// Returns the bit cost (in av1_cost_literal units) of signaling value 'v'
+// with a truncated-binary (near-uniform) code over 'n' symbols: the first
+// m = 2^l - n symbols take l - 1 bits, the remaining symbols take l bits.
+static INLINE int write_uniform_cost(int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return 0;
+  if (v < m)
+    return av1_cost_literal(l - 1);
+  else
+    return av1_cost_literal(l);
+}
+
+// Return the rate cost for luma prediction mode info. of intra blocks.
+// Starting from 'mode_cost', adds (where allowed for this block): the
+// palette on/off flag plus full palette signaling (size, colors with cache
+// reuse, color map), the filter-intra flag/mode, the angle delta of
+// directional modes, and the intrabc flag.
+static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                  const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                                  int mode_cost) {
+  int total_rate = mode_cost;
+  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
+  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+  const int use_intrabc = mbmi->use_intrabc;
+  // Can only activate one mode.
+  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+          use_filter_intra) <= 1);
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mbmi->mode == DC_PRED) {
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+    const int mode_ctx = av1_get_palette_mode_ctx(xd);
+    total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+    if (use_palette) {
+      const uint8_t *const color_map = xd->plane[0].color_index_map;
+      int block_width, block_height, rows, cols;
+      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                               &cols);
+      const int plt_size = mbmi->palette_mode_info.palette_size[0];
+      // Palette size plus the first color-map index, which is coded raw.
+      int palette_mode_cost =
+          x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+      palette_mode_cost +=
+          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
+                                   n_cache, cpi->common.seq_params.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+    total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra];
+    if (use_filter_intra) {
+      total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+                                                  .filter_intra_mode];
+    }
+  }
+  if (av1_is_directional_mode(mbmi->mode)) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate += x->angle_delta_cost[mbmi->mode - V_PRED]
+                                       [MAX_ANGLE_DELTA +
+                                        mbmi->angle_delta[PLANE_TYPE_Y]];
+    }
+  }
+  if (av1_allow_intrabc(&cpi->common))
+    total_rate += x->intrabc_cost[use_intrabc];
+  return total_rate;
+}
+
+// Return the rate cost for chroma prediction mode info. of intra blocks.
+// Starting from 'mode_cost', adds (where allowed for this block): the UV
+// palette flag and full palette signaling, and the angle delta of
+// directional chroma modes.
+static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                   const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                                   int mode_cost) {
+  int total_rate = mode_cost;
+  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
+  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+  // Can only activate one mode.
+  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mode == UV_DC_PRED) {
+    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+    // The UV palette flag is contexted on whether the Y palette is in use.
+    total_rate +=
+        x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+    if (use_palette) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int plt_size = pmi->palette_size[1];
+      const MACROBLOCKD *xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      // Palette size plus the first color-map index, which is coded raw.
+      int palette_mode_cost =
+          x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_mode_cost += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(mode))) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] +
+                                             MAX_ANGLE_DELTA];
+    }
+  }
+  return total_rate;
+}
+
+// Return 1 if a filter intra mode is selected; return 0 otherwise.
+// Tries every FILTER_INTRA_MODE on top of DC_PRED with a full transform
+// search; when a mode beats *best_rd, the winning mode info is written back
+// into mbmi/ctx and the rate/distortion outputs are updated.
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, int mode_cost,
+                                    int64_t *best_rd, int64_t *best_model_rd,
+                                    PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  int filter_intra_selected_flag = 0;
+  FILTER_INTRA_MODE mode;
+  TX_SIZE best_tx_size = TX_8X8;
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  // NOTE(review): this (void) cast is redundant -- ctx is used below.
+  (void)ctx;
+  av1_zero(filter_intra_mode_info);
+  mbmi->filter_intra_mode_info.use_filter_intra = 1;
+  mbmi->mode = DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    int64_t this_rd;
+    RD_STATS tokenonly_rd_stats;
+    mbmi->filter_intra_mode_info.filter_intra_mode = mode;
+
+    // Cheap model-based prune before the expensive transform search.
+    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
+      continue;
+    }
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                      *best_rd);
+    if (tokenonly_rd_stats.rate == INT_MAX) continue;
+    const int this_rate =
+        tokenonly_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+    this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+    // Collect mode stats for multiwinner mode processing
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+        txfm_search_done);
+    if (this_rd < *best_rd) {
+      *best_rd = this_rd;
+      best_tx_size = mbmi->tx_size;
+      filter_intra_mode_info = mbmi->filter_intra_mode_info;
+      av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      *rate = this_rate;
+      *rate_tokenonly = tokenonly_rd_stats.rate;
+      *distortion = tokenonly_rd_stats.dist;
+      *skippable = tokenonly_rd_stats.skip;
+      filter_intra_selected_flag = 1;
+    }
+  }
+
+  // Restore the winning configuration into mbmi/ctx for the caller.
+  if (filter_intra_selected_flag) {
+    mbmi->mode = DC_PRED;
+    mbmi->tx_size = best_tx_size;
+    mbmi->filter_intra_mode_info = filter_intra_mode_info;
+    av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+// Counts the number of distinct 8-bit sample values in the rows x cols
+// region of 'src'. val_count must hold 256 entries; on return it holds the
+// occurrence count of each value.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                     int *val_count) {
+  const int max_pix_val = 1 << 8;
+  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      const int this_val = src[r * stride + c];
+      assert(this_val < max_pix_val);
+      ++val_count[this_val];
+    }
+  }
+  int n = 0;
+  for (int i = 0; i < max_pix_val; ++i) {
+    if (val_count[i]) ++n;
+  }
+  return n;
+}
+
+// High-bitdepth variant of av1_count_colors(): counts distinct sample
+// values in the rows x cols region. val_count must hold 2^bit_depth
+// entries. Returns 0 if any sample is out of range for bit_depth.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+                            int bit_depth, int *val_count) {
+  assert(bit_depth <= 12);
+  const int max_pix_val = 1 << bit_depth;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      const int this_val = src[r * stride + c];
+      assert(this_val < max_pix_val);
+      // NOTE(review): the assert above already aborts on out-of-range values
+      // in debug builds, so this defensive return only takes effect in
+      // release builds.
+      if (this_val >= max_pix_val) return 0;
+      ++val_count[this_val];
+    }
+  }
+  int n = 0;
+  for (int i = 0; i < max_pix_val; ++i) {
+    if (val_count[i]) ++n;
+  }
+  return n;
+}
+
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height'. Extra rows and columns are filled in by copying last valid
+// row/column. The expansion is done in place, so rows are processed
+// bottom-up with memmove (source and destination rows overlap when
+// widening).
+static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
+                                                int orig_width, int orig_height,
+                                                int new_width, int new_height) {
+  int j;
+  assert(new_width >= orig_width);
+  assert(new_height >= orig_height);
+  if (new_width == orig_width && new_height == orig_height) return;
+
+  for (j = orig_height - 1; j >= 0; --j) {
+    memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
+    // Copy last column to extra columns.
+    memset(color_map + j * new_width + orig_width,
+           color_map[j * new_width + orig_width - 1], new_width - orig_width);
+  }
+  // Copy last row to extra rows.
+  for (j = orig_height; j < new_height; ++j) {
+    memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
+           new_width);
+  }
+}
+
+// Bias toward using colors in the cache: snap each centroid to its nearest
+// cached color when the difference is at most 1, since reusing a cached
+// color is cheaper to signal.
+// TODO(huisu): Try other schemes to improve compression.
+static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
+                                               int n_cache, int n_colors,
+                                               int stride, int *centroids) {
+  if (n_cache <= 0) return;
+  for (int i = 0; i < n_colors * stride; i += stride) {
+    int min_diff = abs(centroids[i] - (int)color_cache[0]);
+    int idx = 0;
+    for (int j = 1; j < n_cache; ++j) {
+      const int this_diff = abs(centroids[i] - color_cache[j]);
+      if (this_diff < min_diff) {
+        min_diff = this_diff;
+        idx = j;
+      }
+    }
+    if (min_diff <= 1) centroids[i] = color_cache[idx];
+  }
+}
+
+// Given the base colors as specified in centroids[], calculate the RD cost
+// of palette mode. Snaps centroids toward cached colors, removes
+// duplicates, builds the color index map, then runs a full transform
+// search; all best_* outputs are updated when the palette beats *best_rd.
+static AOM_INLINE void palette_rd_y(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
+    uint8_t *tx_type_map, int *beat_best_pallette_rd) {
+  optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
+  const int num_unique_colors = av1_remove_duplicates(centroids, n);
+  if (num_unique_colors < PALETTE_MIN_SIZE) {
+    // Too few unique colors to create a palette. And DC_PRED will work
+    // well for that case anyway. So skip.
+    return;
+  }
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  if (cpi->common.seq_params.use_highbitdepth) {
+    for (int i = 0; i < num_unique_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel_highbd(
+          (int)centroids[i], cpi->common.seq_params.bit_depth);
+    }
+  } else {
+    for (int i = 0; i < num_unique_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel(centroids[i]);
+    }
+  }
+  pmi->palette_size[0] = num_unique_colors;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  // Map each source sample to its nearest palette color, then pad the map
+  // out to the full block dimensions.
+  av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
+                   1);
+  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+
+  const int palette_mode_cost =
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
+  if (model_intra_yrd_and_prune(cpi, x, bsize, palette_mode_cost,
+                                best_model_rd)) {
+    return;
+  }
+
+  RD_STATS tokenonly_rd_stats;
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                    *best_rd);
+  if (tokenonly_rd_stats.rate == INT_MAX) return;
+  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
+  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  // Note: this adjustment happens after this_rd was computed, so it only
+  // affects the token-only rate stored/reported below, not the comparison.
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+    tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+  // Collect mode stats for multiwinner mode processing
+  const int txfm_search_done = 1;
+  store_winner_mode_stats(
+      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
+      this_rd, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+      txfm_search_done);
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    // Setting beat_best_rd flag because current mode rd is better than best_rd.
+    // This flag needs to be updated only for palette evaluation in key frames
+    if (beat_best_rd) *beat_best_rd = 1;
+    memcpy(best_palette_color_map, color_map,
+           block_width * block_height * sizeof(color_map[0]));
+    *best_mbmi = *mbmi;
+    memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    if (rate) *rate = this_rate;
+    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+    if (distortion) *distortion = tokenonly_rd_stats.dist;
+    if (skippable) *skippable = tokenonly_rd_stats.skip;
+    if (beat_best_pallette_rd) *beat_best_pallette_rd = 1;
+  }
+}
+
+// Coarse palette search over the most frequent colors: tries palette sizes
+// start_n, start_n + step_size, ... up to end_n, seeding the palette with
+// the top_colors. Returns the last size that improved the best palette RD,
+// or end_n + 1 if none did. The full range is always searched.
+static AOM_INLINE int perform_top_color_coarse_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data,
+    const int *const top_colors, int start_n, int end_n, int step_size,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  int top_color_winner = end_n + 1;
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Record the palette size whenever it improved the best RD; the loop
+    // itself always runs over the whole size range.
+    if (beat_best_pallette_rd) top_color_winner = n;
+    n += step_size;
+    if (n > end_n) break;
+  }
+  return top_color_winner;
+}
+
+// Coarse palette search using k-means: tries palette sizes start_n,
+// start_n + step_size, ... up to end_n, seeding centroids evenly across
+// [lb, ub] and refining them with av1_k_means. Returns the last size that
+// improved the best palette RD, or end_n + 1 if none did. The full range is
+// always searched.
+static AOM_INLINE int perform_k_means_coarse_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  int k_means_winner = end_n + 1;
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    // Seed centroids at the midpoints of n equal sub-intervals of [lb, ub].
+    for (int i = 0; i < n; ++i) {
+      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Record the palette size whenever it improved the best RD; the loop
+    // itself always runs over the whole size range.
+    if (beat_best_pallette_rd) k_means_winner = n;
+    n += step_size;
+    if (n > end_n) break;
+  }
+  return k_means_winner;
+}
+
+// Perform palette search for top colors from minimum palette colors (/maximum)
+// with a step-size of 1 (/-1). Returns the last palette size evaluated; with
+// aggressive pruning (prune_palette_search_level == 2) the search stops at
+// the first size that fails to improve the best palette RD.
+static AOM_INLINE int perform_top_color_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
+         (step_size == 2));
+  assert(IMPLIES(step_size == -1, start_n > end_n));
+  assert(IMPLIES(step_size == 1, start_n < end_n));
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Return early if the current palette size did not beat the best palette
+    // RD (only under the most aggressive pruning level).
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
+        !beat_best_pallette_rd)
+      return n;
+    n += step_size;
+    if (n == end_n) break;
+  }
+  return n;
+}
+// Performs a palette RD search using k-means clustering. Palette sizes
+// start_n, start_n + step_size, ... are evaluated until end_n is reached;
+// for each size n the centroids are seeded at the midpoints of n equal bins
+// of [lb, ub] and refined with up to max_itr k-means iterations over the
+// data_points samples in data[]. Returns the last palette size evaluated.
+// With prune_palette_search_level == 2, returns early (with the current
+// size) as soon as a size fails to improve the best RD cost so far.
+static AOM_INLINE int perform_k_means_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  // As in perform_top_color_palette_search, end_n must be reachable from
+  // start_n in multiples of step_size (the loop exits only when n == end_n).
+  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
+         (step_size == 2));
+  assert(IMPLIES(step_size == -1, start_n > end_n));
+  assert(IMPLIES(step_size == 1, start_n < end_n));
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    // Seed centroid i at the midpoint of the i-th of n equal [lb, ub] bins.
+    for (int i = 0; i < n; ++i) {
+      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Return early if the current palette size is not winning: at the
+    // highest pruning level no further sizes are evaluated.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
+        !beat_best_pallette_rd)
+      return n;
+    n += step_size;
+    if (n == end_n) break;
+  }
+  return n;
+}
+
+// Stage-2 (refinement) bounds around a coarse-search winner `x`:
+// START_N_STAGE2 yields the neighbor just below the winner and END_N_STAGE2
+// the neighbor just above, clamped to [PALETTE_MIN_SIZE, PALETTE_MAX_SIZE].
+// Note: no trailing semicolon in the expansions (CERT PRE11-C) so the macros
+// behave like ordinary expressions; the previous stray ';' only worked by
+// accident because every use site was a full statement.
+#define START_N_STAGE2(x)                         \
+  ((x == PALETTE_MIN_SIZE) ? PALETTE_MIN_SIZE + 1 \
+                           : AOMMAX(x - 1, PALETTE_MIN_SIZE))
+#define END_N_STAGE2(x, end_n) \
+  ((x == end_n) ? x - 1 : AOMMIN(x + 1, PALETTE_MAX_SIZE))
+
+// Computes start index, end index and step size for the stage-2 palette
+// search around `winner`. When the winner lies on a boundary the two clamped
+// neighbors coincide and the step size becomes 0 (a single evaluation);
+// otherwise the step skips over the already-evaluated winner.
+static AOM_INLINE void update_start_end_stage_2(int *start_n_stage2,
+                                                int *end_n_stage2,
+                                                int *step_size_stage2,
+                                                int winner, int end_n) {
+  *start_n_stage2 = START_N_STAGE2(winner);
+  *end_n_stage2 = END_N_STAGE2(winner, end_n);
+  *step_size_stage2 = *end_n_stage2 - *start_n_stage2;
+}
+
+// Start index and step size below are chosen to evaluate unique
+// candidates in neighbor search, in case a winner candidate is found in
+// coarse search. Example,
+// 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
+// size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
+// If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
+// (3) and 8 (7).
+// 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
+// as for 8 colors) then step size should also be 2, to cover all
+// candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
+// 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
+// coarse search will evaluate 3 and 6. For the winner, unique neighbors
+// (3: 2,4 or 6: 5,7) would be evaluated.
+
+// Start index for the coarse palette search (dominant colors and k-means),
+// indexed by end_n. Entries for end_n < 3 are unused placeholders: the
+// coarse path is only taken when colors > PALETTE_MIN_SIZE, i.e. end_n >= 3.
+static const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                    3, 3, 2,
+                                                                    3, 3, 2 };
+// Step size for the coarse palette search (dominant colors and k-means),
+// indexed by end_n; entries for end_n < 3 are unused placeholders.
+static const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                      3, 3, 3,
+                                                                      3, 3, 3 };
+
+// RD search for the best luma palette mode. Counts the distinct source
+// colors, gathers the samples (and their [lb, ub] bounds) into the k-means
+// buffer, then evaluates candidate palettes built both from the dominant
+// colors directly and from k-means clustering. With
+// prune_palette_search_level == 1 a coarse search over palette sizes is
+// followed by a refinement around the coarse winner; otherwise sizes are
+// searched exhaustively (descending from the maximum, then any remaining
+// smaller sizes ascending). Winning results are written through best_mbmi,
+// best_palette_color_map, best_rd and the remaining output pointers, and
+// *mbmi is left equal to *best_mbmi on return.
+static void rd_pick_palette_intra_sby(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+    int64_t *distortion, int *skippable, int *beat_best_rd,
+    PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           bsize));
+
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  const int is_hbd = seq_params->use_highbitdepth;
+  const int bit_depth = seq_params->bit_depth;
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+  int colors;
+  // Histogram the source; count_buf[v] is the occurrence count of value v.
+  if (is_hbd) {
+    colors = av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth,
+                                     count_buf);
+  } else {
+    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
+  }
+
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  // Palette is only considered for a moderate number of distinct colors.
+  if (colors > 1 && colors <= 64) {
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[PALETTE_MAX_SIZE];
+    int lb, ub;
+    // Copy the source block into data[] and track its min/max sample value.
+    if (is_hbd) {
+      int *data_pt = data;
+      const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
+      lb = ub = src_pt[0];
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          const int val = src_pt[c];
+          data_pt[c] = val;
+          lb = AOMMIN(lb, val);
+          ub = AOMMAX(ub, val);
+        }
+        src_pt += src_stride;
+        data_pt += cols;
+      }
+    } else {
+      int *data_pt = data;
+      const uint8_t *src_pt = src;
+      lb = ub = src[0];
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          const int val = src_pt[c];
+          data_pt[c] = val;
+          lb = AOMMIN(lb, val);
+          ub = AOMMAX(ub, val);
+        }
+        src_pt += src_stride;
+        data_pt += cols;
+      }
+    }
+
+    mbmi->mode = DC_PRED;
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+    // Find the dominant colors, stored in top_colors[]: repeatedly take the
+    // most frequent remaining histogram entry and zero it out.
+    int top_colors[PALETTE_MAX_SIZE] = { 0 };
+    for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
+      int max_count = 0;
+      for (int j = 0; j < (1 << bit_depth); ++j) {
+        if (count_buf[j] > max_count) {
+          max_count = count_buf[j];
+          top_colors[i] = j;
+        }
+      }
+      assert(max_count > 0);
+      count_buf[top_colors[i]] = 0;
+    }
+
+    // Try the dominant colors directly.
+    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+    // where the dominant colors and the k-means results are similar.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
+        (colors > PALETTE_MIN_SIZE)) {
+      const int end_n = AOMMIN(colors, PALETTE_MAX_SIZE);
+      assert(PALETTE_MAX_SIZE == 8);
+      assert(PALETTE_MIN_SIZE == 2);
+      // Choose the start index and step size for coarse search based on number
+      // of colors
+      const int start_n = start_n_lookup_table[end_n];
+      const int step_size = step_size_lookup_table[end_n];
+      // Perform top color coarse palette search to find the winner candidate
+      const int top_color_winner = perform_top_color_coarse_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, end_n,
+          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
+          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
+          beat_best_rd, ctx, best_blk_skip, tx_type_map);
+      // Evaluate neighbors for the winner color (if winner is found) in the
+      // above coarse search for dominant colors
+      if (top_color_winner <= end_n) {
+        int start_n_stage2, end_n_stage2, step_size_stage2;
+        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
+                                 &step_size_stage2, top_color_winner, end_n);
+        // Perform finer search for the winner candidate. end_n is passed as
+        // end_n_stage2 + step_size_stage2 because the search loop stops when
+        // n equals end_n, which ensures end_n_stage2 itself is evaluated.
+        perform_top_color_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n_stage2,
+            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
+            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map);
+      }
+      // K-means clustering.
+      // Perform k-means coarse palette search to find the winner candidate
+      const int k_means_winner = perform_k_means_coarse_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n,
+          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
+          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
+          beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+          rows * cols);
+      // Evaluate neighbors for the winner color (if winner is found) in the
+      // above coarse search for k-means
+      if (k_means_winner <= end_n) {
+        int start_n_stage2, end_n_stage2, step_size_stage2;
+        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
+                                 &step_size_stage2, k_means_winner, end_n);
+        // Perform finer search for the winner candidate (same end_n
+        // adjustment as in the top-color stage-2 search above).
+        perform_k_means_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n_stage2,
+            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
+            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map, color_map, rows * cols);
+      }
+    } else {
+      // Exhaustive path: search sizes descending from the maximum viable
+      // size down to PALETTE_MIN_SIZE.
+      const int start_n = AOMMIN(colors, PALETTE_MAX_SIZE),
+                end_n = PALETTE_MIN_SIZE;
+      // Perform top color palette search from start_n
+      const int top_color_winner = perform_top_color_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n,
+          end_n - 1, -1, color_cache, n_cache, best_mbmi,
+          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
+
+      if (top_color_winner > end_n) {
+        // The descending search returned early at size top_color_winner;
+        // evaluate the remaining smaller sizes in ascending order.
+        perform_top_color_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, end_n,
+            top_color_winner, 1, color_cache, n_cache, best_mbmi,
+            best_palette_color_map, best_rd, best_model_rd, rate,
+            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map);
+      }
+      // K-means clustering.
+      if (colors == PALETTE_MIN_SIZE) {
+        // Special case: These colors automatically become the centroids.
+        assert(colors == 2);
+        centroids[0] = lb;
+        centroids[1] = ub;
+        palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
+                     color_cache, n_cache, best_mbmi, best_palette_color_map,
+                     best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                     skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                     NULL);
+      } else {
+        // Perform k-means palette search from start_n
+        const int k_means_winner = perform_k_means_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n - 1,
+            -1, color_cache, n_cache, best_mbmi, best_palette_color_map,
+            best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
+            beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+            rows * cols);
+        if (k_means_winner > end_n) {
+          // The descending search returned early at size k_means_winner;
+          // evaluate the remaining smaller sizes in ascending order.
+          perform_k_means_palette_search(
+              cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, end_n,
+              k_means_winner, 1, color_cache, n_cache, best_mbmi,
+              best_palette_color_map, best_rd, best_model_rd, rate,
+              rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+              best_blk_skip, tx_type_map, color_map, rows * cols);
+        }
+      }
+    }
+  }
+
+  // Restore the winning color index map (if a palette mode won) and leave
+  // mbmi holding the overall best mode info.
+  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           block_width * block_height * sizeof(best_palette_color_map[0]));
+  }
+  *mbmi = *best_mbmi;
+}
+
+// RD search for the best chroma (U/V) palette. Gathers interleaved (U, V)
+// sample pairs, then for each palette size n from the maximum viable size
+// down to 2 runs a joint 2-D k-means, sorts the resulting centroid pairs by
+// ascending U, writes them into the palette, and RD-tests the block. The
+// winning configuration is stored in best_mbmi / best_palette_color_map and
+// the remaining output pointers.
+static AOM_INLINE void rd_pick_palette_intra_sbuv(
+    const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost,
+    uint8_t *best_palette_color_map, MB_MODE_INFO *const best_mbmi,
+    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           mbmi->sb_type));
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  int this_rate;
+  int64_t this_rd;
+  int colors_u, colors_v, colors;
+  const int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  RD_STATS tokenonly_rd_stats;
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  mbmi->uv_mode = UV_DC_PRED;
+
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+  if (seq_params->use_highbitdepth) {
+    colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
+                                       seq_params->bit_depth, count_buf);
+    colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
+                                       seq_params->bit_depth, count_buf);
+  } else {
+    colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
+    colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
+  }
+
+  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+  // Palette size is driven by the plane with more distinct colors.
+  colors = colors_u > colors_v ? colors_u : colors_v;
+  if (colors > 1 && colors <= 64) {
+    int r, c, n, i, j;
+    const int max_itr = 50;
+    int lb_u, ub_u, val_u;
+    int lb_v, ub_v, val_v;
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    // Centroids are interleaved (U, V) pairs, hence twice the palette size.
+    int centroids[2 * PALETTE_MAX_SIZE];
+
+    // 16-bit views are computed unconditionally but only dereferenced on the
+    // high-bitdepth branches below.
+    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+    if (seq_params->use_highbitdepth) {
+      lb_u = src_u16[0];
+      ub_u = src_u16[0];
+      lb_v = src_v16[0];
+      ub_v = src_v16[0];
+    } else {
+      lb_u = src_u[0];
+      ub_u = src_u[0];
+      lb_v = src_v[0];
+      ub_v = src_v[0];
+    }
+
+    // Gather interleaved (U, V) sample pairs and their per-plane bounds.
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+        if (seq_params->use_highbitdepth) {
+          val_u = src_u16[r * src_stride + c];
+          val_v = src_v16[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        } else {
+          val_u = src_u[r * src_stride + c];
+          val_v = src_v[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        }
+        if (val_u < lb_u)
+          lb_u = val_u;
+        else if (val_u > ub_u)
+          ub_u = val_u;
+        if (val_v < lb_v)
+          lb_v = val_v;
+        else if (val_v > ub_v)
+          ub_v = val_v;
+      }
+    }
+
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+         --n) {
+      // Seed centroids evenly across each plane's value range.
+      for (i = 0; i < n; ++i) {
+        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+      }
+      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+      optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
+      // Sort the U channel colors in ascending order (selection sort on the
+      // interleaved pairs; V follows its U partner).
+      for (i = 0; i < 2 * (n - 1); i += 2) {
+        int min_idx = i;
+        int min_val = centroids[i];
+        for (j = i + 2; j < 2 * n; j += 2)
+          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+        if (min_idx != i) {
+          int temp_u = centroids[i], temp_v = centroids[i + 1];
+          centroids[i] = centroids[min_idx];
+          centroids[i + 1] = centroids[min_idx + 1];
+          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+        }
+      }
+      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+      extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                               plane_block_height);
+      pmi->palette_size[1] = n;
+      // Store the clipped centroids: plane i (1 = U, 2 = V) occupies
+      // palette_colors[i * PALETTE_MAX_SIZE ..].
+      for (i = 1; i < 3; ++i) {
+        for (j = 0; j < n; ++j) {
+          if (seq_params->use_highbitdepth)
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+          else
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel((int)centroids[j * 2 + i - 1]);
+        }
+      }
+
+      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+      if (tokenonly_rd_stats.rate == INT_MAX) continue;
+      this_rate = tokenonly_rd_stats.rate +
+                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *best_mbmi = *mbmi;
+        memcpy(best_palette_color_map, color_map,
+               plane_block_width * plane_block_height *
+                   sizeof(best_palette_color_map[0]));
+        *rate = this_rate;
+        *distortion = tokenonly_rd_stats.dist;
+        *rate_tokenonly = tokenonly_rd_stats.rate;
+        *skippable = tokenonly_rd_stats.skip;
+      }
+    }
+  }
+  // Restore the winning color index map if a chroma palette won.
+  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           plane_block_width * plane_block_height *
+               sizeof(best_palette_color_map[0]));
+  }
+}
+
+// Rebuilds the chroma color_index_map for the current block from the palette
+// colors already stored in mbmi: re-gathers the (U, V) source sample pairs,
+// reconstructs the centroids from palette_colors, and assigns every sample
+// to its nearest centroid via av1_calc_indices.
+void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  int *const data = x->palette_buffer->kmeans_data_buf;
+  int centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  int r, c;
+  // 16-bit views; only dereferenced on the high-bitdepth path below.
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  // Gather interleaved (U, V) sample pairs, as in rd_pick_palette_intra_sbuv.
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      if (cpi->common.seq_params.use_highbitdepth) {
+        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+      } else {
+        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+      }
+    }
+  }
+
+  // Rebuild interleaved (U, V) centroids from the stored palette colors
+  // (plane r: 1 = U, 2 = V).
+  for (r = 1; r < 3; ++r) {
+    for (c = 0; c < pmi->palette_size[1]; ++c) {
+      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+    }
+  }
+
+  av1_calc_indices(data, centroids, color_map, rows * cols,
+                   pmi->palette_size[1], 2);
+  extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                           plane_block_height);
+}
+
+// Picks the best chroma intra mode for the block and reports its rate,
+// token-only rate, distortion and skip flag through the output pointers.
+// For blocks with no chroma reference the outputs are trivially zeroed and
+// the mode forced to UV_DC_PRED.
+static AOM_INLINE void choose_intra_uv_mode(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    TX_SIZE max_tx_size, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv,
+    int *skip_uv, UV_PREDICTION_MODE *mode_uv) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  // Use an estimated rd for uv_intra based on DC_PRED if the
+  // appropriate speed flag is set.
+  init_sbuv_mode(mbmi);
+  if (!xd->is_chroma_ref) {
+    *rate_uv = 0;
+    *rate_uv_tokenonly = 0;
+    *dist_uv = 0;
+    *skip_uv = 1;
+    *mode_uv = UV_DC_PRED;
+    return;
+  }
+
+  // Only store reconstructed luma when there's chroma RDO. When there's no
+  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+  if (xd->cfl.store_y) {
+    // Restore reconstructed luma values (needed by CfL prediction).
+    av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
+                                 DRY_RUN_NORMAL,
+                                 cpi->optimize_seg_arr[mbmi->segment_id]);
+    xd->cfl.store_y = 0;
+  }
+  av1_rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
+                              skip_uv, bsize, max_tx_size);
+  *mode_uv = mbmi->uv_mode;
+}
+
+// Run RD calculation with the given chroma intra prediction angle and return
+// the RD cost. Updates the best-mode info if the RD cost is the best so far.
+// Returns INT64_MAX when the transform search cannot beat best_rd_in.
+static int64_t pick_intra_angle_routine_sbuv(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+    int *best_angle_delta, int64_t *best_rd) {
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+  assert(!is_inter_block(mbmi));
+  int this_rate;
+  int64_t this_rd;
+  RD_STATS tokenonly_rd_stats;
+
+  if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+    return INT64_MAX;
+  this_rate = tokenonly_rd_stats.rate +
+              intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
+  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+    *rate = this_rate;
+    rd_stats->rate = tokenonly_rd_stats.rate;
+    rd_stats->dist = tokenonly_rd_stats.dist;
+    rd_stats->skip = tokenonly_rd_stats.skip;
+  }
+  return this_rd;
+}
+
+// With given chroma directional intra prediction mode, pick the best angle
+// delta. Returns nonzero if an RD cost smaller than the input one is found.
+// Two-pass search: even deltas first (both signs, costs recorded in
+// rd_cost[]), then odd deltas, each skipped when both even neighbors were
+// already worse than a small threshold above the current best.
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, int rate_overhead,
+                                    int64_t best_rd, int *rate,
+                                    RD_STATS *rd_stats) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  int i, angle_delta, best_angle_delta = 0;
+  // rd_cost[2 * delta + i] holds the cost of delta with sign index i.
+  int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+  rd_stats->rate = INT_MAX;
+  rd_stats->skip = 0;
+  rd_stats->dist = INT64_MAX;
+  for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+  // Pass 1: even angle deltas, positive and negative.
+  for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    for (i = 0; i < 2; ++i) {
+      // Allow a small slack over the current best (tighter for delta 0).
+      best_rd_in = (best_rd == INT64_MAX)
+                       ? INT64_MAX
+                       : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+      mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+      this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+                                              best_rd_in, rate, rd_stats,
+                                              &best_angle_delta, &best_rd);
+      rd_cost[2 * angle_delta + i] = this_rd;
+      if (angle_delta == 0) {
+        // Delta 0 has no sign: mirror its cost into the second slot and
+        // bail out entirely if even the zero delta fails.
+        if (this_rd == INT64_MAX) return 0;
+        rd_cost[1] = this_rd;
+        break;
+      }
+    }
+  }
+
+  assert(best_rd != INT64_MAX);
+  // Pass 2: odd angle deltas, pruned using the costs of their even neighbors.
+  for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    int64_t rd_thresh;
+    for (i = 0; i < 2; ++i) {
+      int skip_search = 0;
+      rd_thresh = best_rd + (best_rd >> 5);
+      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+        skip_search = 1;
+      if (!skip_search) {
+        mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+        pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+                                      rate, rd_stats, &best_angle_delta,
+                                      &best_rd);
+      }
+    }
+  }
+
+  mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+  return rd_stats->rate != INT_MAX;
+}
+
+// Maps one plane's sign (a for the plane being varied, b for the other) to
+// the jointly-coded CfL sign index.
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+  (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
+// Searches chroma-from-luma alpha magnitudes and joint signs for U and V.
+// On success, writes the winning cfl_alpha_idx / cfl_alpha_signs into mbmi
+// and returns the rate overhead of signalling that (sign, alpha) pair;
+// returns INT_MAX when no CfL configuration beats best_rd.
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+                             TX_SIZE tx_size, int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const MACROBLOCKD_PLANE *pd = &xd->plane[AOM_PLANE_U];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+
+  assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
+  assert(plane_bsize < BLOCK_SIZES_ALL);
+  if (!xd->lossless[mbmi->segment_id]) {
+    assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+    assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
+  }
+
+  // Cache the DC prediction during this search; reset before returning.
+  xd->cfl.use_dc_pred_cache = 1;
+  const int64_t mode_rd =
+      RDCOST(x->rdmult,
+             x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
+  int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+  int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#if CONFIG_DEBUG
+  int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#endif  // CONFIG_DEBUG
+
+  const int skip_trellis = 0;
+  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+    RD_STATS rd_stats;
+    av1_init_rd_stats(&rd_stats);
+    for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+      best_rd_uv[joint_sign][plane] = INT64_MAX;
+      best_c[joint_sign][plane] = 0;
+    }
+    // Collect RD stats for an alpha value of zero in this plane.
+    // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid. The transform search is
+    // run once (i == CFL_SIGN_NEG) and its stats reused for the other sign,
+    // since a zero alpha makes the prediction independent of this sign.
+    for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) {
+      const int8_t joint_sign =
+          PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i);
+      if (i == CFL_SIGN_NEG) {
+        mbmi->cfl_alpha_idx = 0;
+        mbmi->cfl_alpha_signs = joint_sign;
+        av1_txfm_rd_in_plane(
+            x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size,
+            cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis);
+        if (rd_stats.rate == INT_MAX) break;
+      }
+      const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
+      best_rd_uv[joint_sign][plane] =
+          RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+#if CONFIG_DEBUG
+      best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif  // CONFIG_DEBUG
+    }
+  }
+
+  int8_t best_joint_sign = -1;
+
+  // Search nonzero alpha magnitudes per plane and sign.
+  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+    for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) {
+      int progress = 0;
+      for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
+        int flag = 0;
+        RD_STATS rd_stats;
+        // Early-termination heuristic: `progress` advances by 2 for each
+        // improving magnitude; once c outruns it (for c > 2), stop scanning.
+        if (c > 2 && progress < c) break;
+        av1_init_rd_stats(&rd_stats);
+        for (int i = 0; i < CFL_SIGNS; i++) {
+          const int8_t joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i);
+          if (i == 0) {
+            // Trial uses magnitude c for both planes' index nibbles while
+            // this plane's contribution is being measured.
+            mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
+            mbmi->cfl_alpha_signs = joint_sign;
+            av1_txfm_rd_in_plane(
+                x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size,
+                cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis);
+            if (rd_stats.rate == INT_MAX) break;
+          }
+          const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
+          int64_t this_rd =
+              RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+          if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
+          best_rd_uv[joint_sign][plane] = this_rd;
+          best_c[joint_sign][plane] = c;
+#if CONFIG_DEBUG
+          best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif  // CONFIG_DEBUG
+          flag = 2;
+          if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue;
+          // Combine with the other plane's best to get a full-block cost.
+          this_rd += mode_rd + best_rd_uv[joint_sign][!plane];
+          if (this_rd >= best_rd) continue;
+          best_rd = this_rd;
+          best_joint_sign = joint_sign;
+        }
+        progress += flag;
+      }
+    }
+  }
+
+  int best_rate_overhead = INT_MAX;
+  uint8_t ind = 0;
+  if (best_joint_sign >= 0) {
+    // Pack the per-plane winning magnitudes into the joint alpha index.
+    const int u = best_c[best_joint_sign][CFL_PRED_U];
+    const int v = best_c[best_joint_sign][CFL_PRED_V];
+    ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
+    best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
+                         x->cfl_cost[best_joint_sign][CFL_PRED_V][v];
+#if CONFIG_DEBUG
+    xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
+                   best_rate_overhead +
+                   best_rate_uv[best_joint_sign][CFL_PRED_U] +
+                   best_rate_uv[best_joint_sign][CFL_PRED_V];
+#endif  // CONFIG_DEBUG
+  } else {
+    // No configuration beat best_rd; leave best_rate_overhead at INT_MAX so
+    // the caller skips CfL, but keep the coded fields in a valid state.
+    best_joint_sign = 0;
+  }
+
+  mbmi->cfl_alpha_idx = ind;
+  mbmi->cfl_alpha_signs = best_joint_sign;
+  xd->cfl.use_dc_pred_cache = 0;
+  xd->cfl.dc_pred_is_cached[0] = 0;
+  xd->cfl.dc_pred_is_cached[1] = 0;
+  return best_rate_overhead;
+}
+
+// Performs a rate-distortion search over all allowed chroma (UV) intra
+// prediction modes, including CFL and (optionally) palette, and selects
+// the best one for this block.
+//
+// On return, *mbmi holds the winning UV mode and its parameters, while
+// *rate, *rate_tokenonly, *distortion and *skippable are filled with the
+// stats of that mode. Returns the best RD cost found.
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  MB_MODE_INFO best_mbmi = *mbmi;
+  int64_t best_rd = INT64_MAX, this_rd;
+
+  // Try each UV intra mode in the predefined search order.
+  for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
+    int this_rate;
+    RD_STATS tokenonly_rd_stats;
+    UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
+    const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
+    // Skip modes pruned by the speed-feature mask for this tx size.
+    if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
+          (1 << mode)))
+      continue;
+    // Honor encoder configuration flags that disable mode families.
+    if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
+        mode <= UV_SMOOTH_H_PRED)
+      continue;
+
+    if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue;
+
+    mbmi->uv_mode = mode;
+    int cfl_alpha_rate = 0;
+    if (mode == UV_CFL_PRED) {
+      if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue;
+      assert(!is_directional_mode);
+      const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+      // Jointly pick the CFL alpha signs/index; INT_MAX means no alpha
+      // configuration beat best_rd, so the mode is skipped.
+      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
+      if (cfl_alpha_rate == INT_MAX) continue;
+    }
+    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+    if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) &&
+        cpi->oxcf.enable_angle_delta) {
+      // Directional modes additionally search over angle deltas.
+      const int rate_overhead =
+          x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
+      if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+                                    &this_rate, &tokenonly_rd_stats))
+        continue;
+    } else {
+      if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+        continue;
+      }
+    }
+    const int mode_cost =
+        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
+        cfl_alpha_rate;
+    this_rate = tokenonly_rd_stats.rate +
+                intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
+    if (mode == UV_CFL_PRED) {
+      assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
+#if CONFIG_DEBUG
+      if (!xd->lossless[mbmi->segment_id])
+        assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
+#endif  // CONFIG_DEBUG
+    }
+    this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+    if (this_rd < best_rd) {
+      best_mbmi = *mbmi;
+      best_rd = this_rd;
+      *rate = this_rate;
+      *rate_tokenonly = tokenonly_rd_stats.rate;
+      *distortion = tokenonly_rd_stats.dist;
+      *skippable = tokenonly_rd_stats.skip;
+    }
+  }
+
+  // Optionally try chroma palette coding; it may replace best_mbmi.
+  const int try_palette =
+      cpi->oxcf.enable_palette &&
+      av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                        mbmi->sb_type);
+  if (try_palette) {
+    uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
+    rd_pick_palette_intra_sbuv(
+        cpi, x,
+        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
+        best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
+        distortion, skippable);
+  }
+
+  *mbmi = best_mbmi;
+  // Make sure we actually chose a mode
+  assert(best_rd < INT64_MAX);
+  return best_rd;
+}
+
+// Searches the palette mode for the current block: picks the best luma
+// palette via rd_pick_palette_intra_sby(), reuses (or computes and caches)
+// the chroma intra results in intra_search_state, and accounts for the
+// skip-flag cost. Fills this_rd_cost with the combined rate/distortion/RD
+// of the palette mode (rdcost == INT64_MAX when no usable palette was
+// found) and returns whether the block is skippable.
+int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx,
+                            BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
+                            PALETTE_MODE_INFO *const pmi,
+                            unsigned int *ref_costs_single,
+                            IntraModeSearchState *intra_search_state,
+                            int64_t best_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int rate2 = 0;
+  int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
+          best_model_rd_palette = INT64_MAX;
+  int skippable = 0;
+  TX_SIZE uv_tx = TX_4X4;
+  uint8_t *const best_palette_color_map =
+      x->palette_buffer->best_palette_color_map;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  MB_MODE_INFO best_mbmi_palette = *mbmi;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
+
+  // Palette is signaled together with DC_PRED on an intra block.
+  mbmi->mode = DC_PRED;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  RD_STATS rd_stats_y;
+  av1_invalid_rd_stats(&rd_stats_y);
+  rd_pick_palette_intra_sby(
+      cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
+      best_palette_color_map, &best_rd_palette, &best_model_rd_palette,
+      &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip, NULL, ctx,
+      best_blk_skip, best_tx_type_map);
+  // Bail out when no luma palette beat the threshold.
+  if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
+    this_rd_cost->rdcost = INT64_MAX;
+    return skippable;
+  }
+
+  // Commit the winning palette's block-level buffers.
+  memcpy(x->blk_skip, best_blk_skip,
+         sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+  memcpy(color_map, best_palette_color_map,
+         rows * cols * sizeof(best_palette_color_map[0]));
+
+  skippable = rd_stats_y.skip;
+  distortion2 = rd_stats_y.dist;
+  rate2 = rd_stats_y.rate + ref_costs_single[INTRA_FRAME];
+  if (num_planes > 1) {
+    uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+    // Chroma intra results are computed once and cached for later modes.
+    if (intra_search_state->rate_uv_intra == INT_MAX) {
+      choose_intra_uv_mode(
+          cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra,
+          &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs,
+          &intra_search_state->skip_uvs, &intra_search_state->mode_uv);
+      intra_search_state->pmi_uv = *pmi;
+      intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+    }
+    mbmi->uv_mode = intra_search_state->mode_uv;
+    pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+    if (pmi->palette_size[1] > 0) {
+      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+             intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+    }
+    mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+    skippable = skippable && intra_search_state->skip_uvs;
+    distortion2 += intra_search_state->dist_uvs;
+    rate2 += intra_search_state->rate_uv_intra;
+  }
+
+  // When skippable, replace the token rate with the skip-flag cost.
+  if (skippable) {
+    rate2 -= rd_stats_y.rate;
+    if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly;
+    rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
+  } else {
+    rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
+  }
+  this_rd = RDCOST(x->rdmult, rate2, distortion2);
+  this_rd_cost->rate = rate2;
+  this_rd_cost->dist = distortion2;
+  this_rd_cost->rdcost = this_rd;
+  return skippable;
+}
+
+// Given the already-selected prediction mode in mbmi, search for the best
+// tx type and tx size. If the resulting RD cost beats *best_rd, the output
+// stats, *best_mbmi and the ctx buffers are updated and 1 is returned;
+// otherwise 0.
+static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, const int *bmode_costs,
+                                      int64_t *best_rd, int *rate,
+                                      int *rate_tokenonly, int64_t *distortion,
+                                      int *skippable, MB_MODE_INFO *best_mbmi,
+                                      PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  RD_STATS rd_stats;
+  // In order to improve txfm search avoid rd based breakouts during winner
+  // mode evaluation. Hence passing ref_best_rd as a maximum value
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
+  if (rd_stats.rate == INT_MAX) return 0;
+  int this_rate_tokenonly = rd_stats.rate;
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+    // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+    // in the tokenonly rate, but for intra blocks, tx_size is always coded
+    // (prediction granularity), so we account for it in the full rate,
+    // not the tokenonly rate.
+    this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+  const int this_rate =
+      rd_stats.rate +
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+  const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
+  if (this_rd < *best_rd) {
+    *best_mbmi = *mbmi;
+    *best_rd = this_rd;
+    *rate = this_rate;
+    *rate_tokenonly = this_rate_tokenonly;
+    *distortion = rd_stats.dist;
+    *skippable = rd_stats.skip;
+    av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk);
+    av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    return 1;
+  }
+  return 0;
+}
+
+// With given luma directional intra prediction mode, pick the best angle
+// delta. The search runs in two passes: first delta 0 and the even deltas
+// (both signs) with a relaxed RD threshold, then the odd deltas, skipping
+// those whose even neighbors both already exceed the running best RD by a
+// margin. Return the RD cost corresponding to the best angle delta.
+static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                       int *rate, RD_STATS *rd_stats,
+                                       BLOCK_SIZE bsize, int mode_cost,
+                                       int64_t best_rd, int64_t *best_model_rd,
+                                       int skip_model_rd_for_zero_deg) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+
+  int best_angle_delta = 0;
+  int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+  TX_SIZE best_tx_size = mbmi->tx_size;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+  // Pass 1: delta 0 and even deltas.
+  int first_try = 1;
+  for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    for (int i = 0; i < 2; ++i) {
+      const int64_t best_rd_in =
+          (best_rd == INT64_MAX) ? INT64_MAX
+                                 : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+      // (1 - 2 * i) selects the sign of the delta: +delta, then -delta.
+      const int64_t this_rd = calc_rd_given_intra_angle(
+          cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta,
+          MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
+          &best_rd, best_model_rd, best_tx_type_map, best_blk_skip,
+          (skip_model_rd_for_zero_deg & !angle_delta));
+      rd_cost[2 * angle_delta + i] = this_rd;
+      if (first_try && this_rd == INT64_MAX) return best_rd;
+      first_try = 0;
+      if (angle_delta == 0) {
+        // Delta 0 has no sign; record the result for both slots.
+        rd_cost[1] = this_rd;
+        break;
+      }
+    }
+  }
+
+  assert(best_rd != INT64_MAX);
+  // Pass 2: odd deltas, pruned when both even neighbors were clearly worse.
+  for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    for (int i = 0; i < 2; ++i) {
+      int skip_search = 0;
+      const int64_t rd_thresh = best_rd + (best_rd >> 5);
+      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+        skip_search = 1;
+      if (!skip_search) {
+        calc_rd_given_intra_angle(
+            cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta,
+            MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
+            &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, 0);
+      }
+    }
+  }
+
+  // Commit the winning configuration when the search produced a valid rate.
+  if (rd_stats->rate != INT_MAX) {
+    mbmi->tx_size = best_tx_size;
+    mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
+    const int n4 = bsize_to_num_blk(bsize);
+    memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+    av1_copy_array(xd->tx_type_map, best_tx_type_map, n4);
+  }
+  return best_rd;
+}
+
+// Evaluates the intra mode currently set in mbmi during inter-frame mode
+// search: searches luma (with optional angle-delta and filter-intra
+// refinement), computes or reuses the cached chroma results, and assembles
+// the final rate/distortion for the mode. Returns the mode's RD cost, or
+// INT64_MAX when the mode is pruned or fails. May also set
+// intra_search_state->skip_intra_modes to prune all remaining intra modes.
+int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state,
+                              const AV1_COMP *cpi, MACROBLOCK *x,
+                              BLOCK_SIZE bsize, int ref_frame_cost,
+                              const PICK_MODE_CONTEXT *ctx, int disable_skip,
+                              RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                              RD_STATS *rd_stats_uv, int64_t best_rd,
+                              int64_t *best_intra_rd, int8_t best_mbmode_skip) {
+  const AV1_COMMON *cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->ref_frame[0] == INTRA_FRAME);
+  const PREDICTION_MODE mode = mbmi->mode;
+  const int mode_cost =
+      x->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost;
+  const int intra_cost_penalty = av1_get_intra_cost_penalty(
+      cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
+      cm->seq_params.bit_depth);
+  const int skip_ctx = av1_get_skip_context(xd);
+
+  // Lower bound on the mode's rate; if even this exceeds best_rd, all
+  // remaining intra modes can be skipped.
+  int known_rate = mode_cost;
+  known_rate += ref_frame_cost;
+  if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty;
+  known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+  const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+  if (known_rd > best_rd) {
+    intra_search_state->skip_intra_modes = 1;
+    return INT64_MAX;
+  }
+
+  // Luma search: directional modes additionally search over angle deltas.
+  const int is_directional_mode = av1_is_directional_mode(mode);
+  if (is_directional_mode && av1_use_angle_delta(bsize) &&
+      cpi->oxcf.enable_angle_delta) {
+    if (sf->intra_sf.intra_pruning_with_hog &&
+        !intra_search_state->angle_stats_ready) {
+      // Gradient-histogram pruning of directional modes (computed once).
+      prune_intra_mode_with_hog(x, bsize,
+                                cpi->sf.intra_sf.intra_pruning_with_hog_thresh,
+                                intra_search_state->directional_mode_skip_mask);
+      intra_search_state->angle_stats_ready = 1;
+    }
+    if (intra_search_state->directional_mode_skip_mask[mode]) return INT64_MAX;
+    av1_init_rd_stats(rd_stats_y);
+    rd_stats_y->rate = INT_MAX;
+    int64_t model_rd = INT64_MAX;
+    int rate_dummy;
+    rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, mode_cost,
+                            best_rd, &model_rd, 0);
+
+  } else {
+    av1_init_rd_stats(rd_stats_y);
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
+  }
+
+  // Pick filter intra modes.
+  if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+    int try_filter_intra = 0;
+    int64_t best_rd_so_far = INT64_MAX;
+    if (rd_stats_y->rate != INT_MAX) {
+      const int tmp_rate =
+          rd_stats_y->rate + x->filter_intra_cost[bsize][0] + mode_cost;
+      best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+      try_filter_intra = (best_rd_so_far / 2) <= best_rd;
+    } else {
+      try_filter_intra = !best_mbmode_skip;
+    }
+
+    if (try_filter_intra) {
+      RD_STATS rd_stats_y_fi;
+      int filter_intra_selected_flag = 0;
+      TX_SIZE best_tx_size = mbmi->tx_size;
+      FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
+      uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+      memcpy(best_blk_skip, x->blk_skip,
+             sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+      uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+      av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+      mbmi->filter_intra_mode_info.use_filter_intra = 1;
+      for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
+           fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
+        mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+        av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize,
+                                          best_rd);
+        if (rd_stats_y_fi.rate == INT_MAX) continue;
+        const int this_rate_tmp =
+            rd_stats_y_fi.rate +
+            intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+        const int64_t this_rd_tmp =
+            RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
+        // Early termination: far worse than the best mode seen so far.
+        if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) {
+          break;
+        }
+        if (this_rd_tmp < best_rd_so_far) {
+          best_tx_size = mbmi->tx_size;
+          av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+          memcpy(best_blk_skip, x->blk_skip,
+                 sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+          best_fi_mode = fi_mode;
+          *rd_stats_y = rd_stats_y_fi;
+          filter_intra_selected_flag = 1;
+          best_rd_so_far = this_rd_tmp;
+        }
+      }
+
+      mbmi->tx_size = best_tx_size;
+      av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+      memcpy(x->blk_skip, best_blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+
+      if (filter_intra_selected_flag) {
+        mbmi->filter_intra_mode_info.use_filter_intra = 1;
+        mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+      } else {
+        mbmi->filter_intra_mode_info.use_filter_intra = 0;
+      }
+    }
+  }
+
+  if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
+
+  const int mode_cost_y =
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+  av1_init_rd_stats(rd_stats);
+  av1_init_rd_stats(rd_stats_uv);
+  const int num_planes = av1_num_planes(cm);
+  if (num_planes > 1) {
+    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+    const int try_palette =
+        cpi->oxcf.enable_palette &&
+        av1_allow_palette(cm->features.allow_screen_content_tools,
+                          mbmi->sb_type);
+    const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+    // Chroma results are computed once and cached across intra modes.
+    if (intra_search_state->rate_uv_intra == INT_MAX) {
+      const int rate_y =
+          rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
+      const int64_t rdy =
+          RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
+      if (best_rd < (INT64_MAX / 2) && rdy > (best_rd + (best_rd >> 2))) {
+        intra_search_state->skip_intra_modes = 1;
+        return INT64_MAX;
+      }
+      choose_intra_uv_mode(
+          cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra,
+          &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs,
+          &intra_search_state->skip_uvs, &intra_search_state->mode_uv);
+      if (try_palette) intra_search_state->pmi_uv = *pmi;
+      intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+
+      const int uv_rate = intra_search_state->rate_uv_tokenonly;
+      const int64_t uv_dist = intra_search_state->dist_uvs;
+      const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+      if (uv_rd > best_rd) {
+        intra_search_state->skip_intra_modes = 1;
+        return INT64_MAX;
+      }
+    }
+
+    rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly;
+    rd_stats_uv->dist = intra_search_state->dist_uvs;
+    rd_stats_uv->skip = intra_search_state->skip_uvs;
+    rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip;
+    mbmi->uv_mode = intra_search_state->mode_uv;
+    if (try_palette) {
+      pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+             intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+    }
+    mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+  }
+
+  rd_stats->rate = rd_stats_y->rate + mode_cost_y;
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+    // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+    // in the tokenonly rate, but for intra blocks, tx_size is always coded
+    // (prediction granularity), so we account for it in the full rate,
+    // not the tokenonly rate.
+    rd_stats_y->rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+  if (num_planes > 1 && xd->is_chroma_ref) {
+    const int uv_mode_cost =
+        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode];
+    rd_stats->rate +=
+        rd_stats_uv->rate +
+        intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+  }
+  if (mode != DC_PRED && mode != PAETH_PRED) {
+    rd_stats->rate += intra_cost_penalty;
+  }
+
+  // Intra block is always coded as non-skip
+  rd_stats->skip = 0;
+  rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist;
+  // Add in the cost of the no skip flag.
+  rd_stats->rate += x->skip_cost[skip_ctx][0];
+  // Calculate the final RD estimate for this mode.
+  const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+  // Keep record of best intra rd
+  if (this_rd < *best_intra_rd) {
+    *best_intra_rd = this_rd;
+    intra_search_state->best_intra_mode = mode;
+  }
+
+  if (sf->intra_sf.skip_intra_in_interframe) {
+    // If this intra mode is much worse than the best RD so far, prune the
+    // remaining intra modes.
+    if (best_rd < (INT64_MAX / 2) && this_rd > (best_rd + (best_rd >> 1)))
+      intra_search_state->skip_intra_modes = 1;
+  }
+
+  if (!disable_skip) {
+    for (int i = 0; i < REFERENCE_MODES; ++i) {
+      intra_search_state->best_pred_rd[i] =
+          AOMMIN(intra_search_state->best_pred_rd[i], this_rd);
+    }
+  }
+  return this_rd;
+}
+
+// Performs the full luma intra mode search; this function is used only for
+// intra_only frames. Evaluates every allowed intra mode (plus palette and
+// filter intra), optionally re-runs the best candidates with winner-mode
+// txfm search, and fills the output stats for the winner. Returns the best
+// RD cost, or INT64_MAX when no mode beat the incoming best_rd.
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   int *rate, int *rate_tokenonly,
+                                   int64_t *distortion, int *skippable,
+                                   BLOCK_SIZE bsize, int64_t best_rd,
+                                   PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  int64_t best_model_rd = INT64_MAX;
+  int is_directional_mode;
+  uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 };
+  // Flag to check rd of any intra mode is better than best_rd passed to this
+  // function
+  int beat_best_rd = 0;
+  const int *bmode_costs;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int try_palette =
+      cpi->oxcf.enable_palette &&
+      av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                        mbmi->sb_type);
+  uint8_t *best_palette_color_map =
+      try_palette ? x->palette_buffer->best_palette_color_map : NULL;
+  const MB_MODE_INFO *above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *left_mi = xd->left_mbmi;
+  const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+  // Luma mode costs are conditioned on the above and left neighbor modes.
+  const int above_ctx = intra_mode_context[A];
+  const int left_ctx = intra_mode_context[L];
+  bmode_costs = x->y_mode_costs[above_ctx][left_ctx];
+
+  mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+  if (cpi->sf.intra_sf.intra_pruning_with_hog) {
+    // Gradient-histogram based pruning of directional modes.
+    prune_intra_mode_with_hog(x, bsize,
+                              cpi->sf.intra_sf.intra_pruning_with_hog_thresh,
+                              directional_mode_skip_mask);
+  }
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  pmi->palette_size[0] = 0;
+
+  // Set params for mode evaluation
+  set_mode_eval_params(cpi, x, MODE_EVAL);
+
+  MB_MODE_INFO best_mbmi = *mbmi;
+  av1_zero(x->winner_mode_stats);
+  x->winner_mode_count = 0;
+
+  /* Y Search for intra prediction mode */
+  for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
+    RD_STATS this_rd_stats;
+    int this_rate, this_rate_tokenonly, s;
+    int64_t this_distortion, this_rd;
+    mbmi->mode = intra_rd_search_mode_order[mode_idx];
+    // Skip mode families disabled by configuration or speed features.
+    if ((!cpi->oxcf.enable_smooth_intra ||
+         cpi->sf.intra_sf.disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+         mbmi->mode == SMOOTH_V_PRED))
+      continue;
+    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+
+    // Cheap model-based pruning before the expensive txfm search.
+    if (model_intra_yrd_and_prune(cpi, x, bsize, bmode_costs[mbmi->mode],
+                                  &best_model_rd)) {
+      continue;
+    }
+
+    is_directional_mode = av1_is_directional_mode(mbmi->mode);
+    if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
+    if (is_directional_mode && av1_use_angle_delta(bsize) &&
+        cpi->oxcf.enable_angle_delta) {
+      this_rd_stats.rate = INT_MAX;
+      rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
+                              bmode_costs[mbmi->mode], best_rd, &best_model_rd,
+                              1);
+    } else {
+      av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+    }
+    this_rate_tokenonly = this_rd_stats.rate;
+    this_distortion = this_rd_stats.dist;
+    s = this_rd_stats.skip;
+
+    if (this_rate_tokenonly == INT_MAX) continue;
+
+    if (!xd->lossless[mbmi->segment_id] &&
+        block_signals_txsize(mbmi->sb_type)) {
+      // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
+      // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
+      // coded (prediction granularity), so we account for it in the full rate,
+      // not the tokenonly rate.
+      this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+    }
+    this_rate =
+        this_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+    this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+    // Collect mode stats for multiwinner mode processing
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+        txfm_search_done);
+    if (this_rd < best_rd) {
+      best_mbmi = *mbmi;
+      best_rd = this_rd;
+      // Setting beat_best_rd flag because current mode rd is better than
+      // best_rd passed to this function
+      beat_best_rd = 1;
+      *rate = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion = this_distortion;
+      *skippable = s;
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    }
+  }
+
+  if (try_palette) {
+    rd_pick_palette_intra_sby(
+        cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
+        &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable,
+        &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map);
+  }
+
+  // Filter intra is only worth trying when some mode already beat best_rd.
+  if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+    if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+                                 skippable, bsize, bmode_costs[DC_PRED],
+                                 &best_rd, &best_model_rd, ctx)) {
+      best_mbmi = *mbmi;
+    }
+  }
+  // No mode is identified with less rd value than best_rd passed to this
+  // function. In such cases winner mode processing is not necessary and return
+  // best_rd as INT64_MAX to indicate best mode is not identified
+  if (!beat_best_rd) return INT64_MAX;
+
+  // In multi-winner mode processing, perform tx search for few best modes
+  // identified during mode evaluation. Winner mode processing uses best tx
+  // configuration for tx search.
+  if (cpi->sf.winner_mode_sf.enable_multiwinner_mode_process) {
+    int best_mode_idx = 0;
+    int block_width, block_height;
+    uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map;
+    av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+                             &block_height, NULL, NULL);
+
+    for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) {
+      *mbmi = x->winner_mode_stats[mode_idx].mbmi;
+      if (is_winner_mode_processing_enabled(cpi, mbmi, mbmi->mode)) {
+        // Restore color_map of palette mode before winner mode processing
+        if (mbmi->palette_mode_info.palette_size[0] > 0) {
+          uint8_t *color_map_src =
+              x->winner_mode_stats[mode_idx].color_index_map;
+          memcpy(color_map_dst, color_map_src,
+                 block_width * block_height * sizeof(*color_map_src));
+        }
+        // Set params for winner mode evaluation
+        set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+        // Winner mode processing
+        // If previous searches use only the default tx type/no R-D optimization
+        // of quantized coeffs, do an extra search for the best tx type/better
+        // R-D optimization of quantized coeffs
+        if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+                            rate_tokenonly, distortion, skippable, &best_mbmi,
+                            ctx))
+          best_mode_idx = mode_idx;
+      }
+    }
+    // Copy color_map of palette mode for final winner mode
+    if (best_mbmi.palette_mode_info.palette_size[0] > 0) {
+      uint8_t *color_map_src =
+          x->winner_mode_stats[best_mode_idx].color_index_map;
+      memcpy(color_map_dst, color_map_src,
+             block_width * block_height * sizeof(*color_map_src));
+    }
+  } else {
+    // If previous searches use only the default tx type/no R-D optimization of
+    // quantized coeffs, do an extra search for the best tx type/better R-D
+    // optimization of quantized coeffs
+    if (is_winner_mode_processing_enabled(cpi, mbmi, best_mbmi.mode)) {
+      // Set params for winner mode evaluation
+      set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+      *mbmi = best_mbmi;
+      intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+                      rate_tokenonly, distortion, skippable, &best_mbmi, ctx);
+    }
+  }
+  *mbmi = best_mbmi;
+  av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+  return best_rd;
+}
diff --git a/libaom/av1/encoder/intra_mode_search.h b/libaom/av1/encoder/intra_mode_search.h
new file mode 100644
index 0000000..4b5d31c
--- /dev/null
+++ b/libaom/av1/encoder/intra_mode_search.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct IntraModeSearchState {
+  int skip_intra_modes;
+  PREDICTION_MODE best_intra_mode;
+  int angle_stats_ready;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+  int rate_uv_intra;
+  int rate_uv_tokenonly;
+  int64_t dist_uvs;
+  int skip_uvs;
+  UV_PREDICTION_MODE mode_uv;
+  PALETTE_MODE_INFO pmi_uv;
+  int8_t uv_angle_delta;
+  int64_t best_pred_rd[REFERENCE_MODES];
+} IntraModeSearchState;
+
+void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x);
+int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx,
+                            BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
+                            PALETTE_MODE_INFO *const pmi,
+                            unsigned int *ref_costs_single,
+                            IntraModeSearchState *intra_search_state,
+                            int64_t best_rd);
+
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state,
+                              const AV1_COMP *cpi, MACROBLOCK *x,
+                              BLOCK_SIZE bsize, int ref_frame_cost,
+                              const PICK_MODE_CONTEXT *ctx, int disable_skip,
+                              RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                              RD_STATS *rd_stats_uv, int64_t best_rd,
+                              int64_t *best_intra_rd, int8_t best_mbmode_skip);
+
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   int *rate, int *rate_tokenonly,
+                                   int64_t *distortion, int *skippable,
+                                   BLOCK_SIZE bsize, int64_t best_rd,
+                                   PICK_MODE_CONTEXT *ctx);
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/libaom/av1/encoder/level.c b/libaom/av1/encoder/level.c
index 1668bdf..3403a3a 100644
--- a/libaom/av1/encoder/level.c
+++ b/libaom/av1/encoder/level.c
@@ -235,6 +235,9 @@
   DISPLAY_RATE_TOO_HIGH,
   DECODE_RATE_TOO_HIGH,
   CR_TOO_SMALL,
+  TILE_SIZE_HEADER_RATE_TOO_HIGH,
+  BITRATE_TOO_HIGH,
+  DECODER_MODEL_FAIL,
 
   TARGET_LEVEL_FAIL_IDS,
   TARGET_LEVEL_OK,
@@ -258,25 +261,528 @@
   "The display luma sample rate is too high.",
   "The decoded luma sample rate is too high.",
   "The compression ratio is too small.",
+  "The product of max tile size and header rate is too high.",
+  "The bitrate is too high.",
+  "The decoder model fails.",
 };
 
+static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier,
+                              BITSTREAM_PROFILE profile) {
+  if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+  const double bitrate_basis =
+      (tier ? level_spec->high_mbps : level_spec->main_mbps) * 1e6;
+  const double bitrate_profile_factor =
+      profile == PROFILE_0 ? 1.0 : (profile == PROFILE_1 ? 2.0 : 3.0);
+  return bitrate_basis * bitrate_profile_factor;
+}
+
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+                                     BITSTREAM_PROFILE profile) {
+  assert(is_valid_seq_level_idx(level_index));
+  return get_max_bitrate(&av1_level_defs[level_index], tier, profile);
+}
+
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+                                 int *const max_tile_cols) {
+  assert(is_valid_seq_level_idx(level_index));
+  const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+  *max_tiles = level_spec->max_tiles;
+  *max_tile_cols = level_spec->max_tile_cols;
+}
+
+// We assume time t to be valid if and only if t >= 0.0.
+// So INVALID_TIME can be defined as anything less than 0.
+#define INVALID_TIME (-1.0)
+
+// This corresponds to "free_buffer" in the spec.
+static void release_buffer(DECODER_MODEL *const decoder_model, int idx) {
+  assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE);
+  FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+  this_buffer->decoder_ref_count = 0;
+  this_buffer->player_ref_count = 0;
+  this_buffer->display_index = -1;
+  this_buffer->presentation_time = INVALID_TIME;
+}
+
+static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    release_buffer(decoder_model, i);
+  }
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    decoder_model->vbi[i] = -1;
+  }
+}
+
+static int get_free_buffer(DECODER_MODEL *const decoder_model) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count == 0 &&
+        this_buffer->player_ref_count == 0)
+      return i;
+  }
+  return -1;
+}
+
+static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx,
+                               int refresh_frame_flags) {
+  FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    if (refresh_frame_flags & (1 << i)) {
+      const int pre_idx = decoder_model->vbi[i];
+      if (pre_idx != -1) {
+        --decoder_model->frame_buffer_pool[pre_idx].decoder_ref_count;
+      }
+      decoder_model->vbi[i] = idx;
+      ++this_buffer->decoder_ref_count;
+    }
+  }
+}
+
+// The time (in seconds) required to decode a frame.
+static double time_to_decode_frame(const AV1_COMMON *const cm,
+                                   int64_t max_decode_rate) {
+  if (cm->show_existing_frame) return 0.0;
+
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+  int luma_samples = 0;
+  if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) {
+    luma_samples = cm->superres_upscaled_width * cm->height;
+  } else {
+    const int spatial_layer_dimensions_present_flag = 0;
+    if (spatial_layer_dimensions_present_flag) {
+      assert(0 && "Spatial layer dimensions not supported yet.");
+    } else {
+      const SequenceHeader *const seq_params = &cm->seq_params;
+      const int max_frame_width = seq_params->max_frame_width;
+      const int max_frame_height = seq_params->max_frame_height;
+      luma_samples = max_frame_width * max_frame_height;
+    }
+  }
+
+  return luma_samples / (double)max_decode_rate;
+}
+
+// Release frame buffers that are no longer needed for decode or display.
+// It corresponds to "start_decode_at_removal_time" in the spec.
+static void release_processed_frames(DECODER_MODEL *const decoder_model,
+                                     double removal_time) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->player_ref_count > 0) {
+      if (this_buffer->presentation_time >= 0.0 &&
+          this_buffer->presentation_time <= removal_time) {
+        this_buffer->player_ref_count = 0;
+        if (this_buffer->decoder_ref_count == 0) {
+          release_buffer(decoder_model, i);
+        }
+      }
+    }
+  }
+}
+
+static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) {
+  int frames_in_pool = 0;
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count > 0 ||
+        this_buffer->player_ref_count > 0) {
+      ++frames_in_pool;
+    }
+  }
+  return frames_in_pool;
+}
+
+static double get_presentation_time(const DECODER_MODEL *const decoder_model,
+                                    int display_index) {
+  if (decoder_model->mode == SCHEDULE_MODE) {
+    assert(0 && "SCHEDULE_MODE NOT SUPPORTED");
+    return INVALID_TIME;
+  } else {
+    const double initial_presentation_delay =
+        decoder_model->initial_presentation_delay;
+    // Can't decide presentation time until the initial presentation delay is
+    // known.
+    if (initial_presentation_delay < 0.0) return INVALID_TIME;
+
+    return initial_presentation_delay +
+           display_index * decoder_model->num_ticks_per_picture *
+               decoder_model->display_clock_tick;
+  }
+}
+
+#define MAX_TIME 1e16
+double time_next_buffer_is_free(const DECODER_MODEL *const decoder_model) {
+  if (decoder_model->num_decoded_frame == 0) {
+    return (double)decoder_model->decoder_buffer_delay / 90000.0;
+  }
+
+  double buf_free_time = MAX_TIME;
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count == 0) {
+      if (this_buffer->player_ref_count == 0) {
+        return decoder_model->current_time;
+      }
+      const double presentation_time = this_buffer->presentation_time;
+      if (presentation_time >= 0.0 && presentation_time < buf_free_time) {
+        buf_free_time = presentation_time;
+      }
+    }
+  }
+  return buf_free_time < MAX_TIME ? buf_free_time : INVALID_TIME;
+}
+#undef MAX_TIME
+
+static double get_removal_time(const DECODER_MODEL *const decoder_model) {
+  if (decoder_model->mode == SCHEDULE_MODE) {
+    assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET");
+    return INVALID_TIME;
+  } else {
+    return time_next_buffer_is_free(decoder_model);
+  }
+}
+
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) {
+  printf(
+      "\n status %d, num_frame %3d, num_decoded_frame %3d, "
+      "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, "
+      "presentation delay %6.2f, total interval %6.2f\n",
+      decoder_model->status, decoder_model->num_frame,
+      decoder_model->num_decoded_frame, decoder_model->num_shown_frame,
+      decoder_model->current_time, frames_in_buffer_pool(decoder_model),
+      decoder_model->initial_presentation_delay,
+      decoder_model->dfg_interval_queue.total_interval);
+  for (int i = 0; i < 10; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    printf("buffer %d, decode count %d, display count %d, present time %6.4f\n",
+           i, this_buffer->decoder_ref_count, this_buffer->player_ref_count,
+           this_buffer->presentation_time);
+  }
+}
+
+// op_index is the operating point index.
+void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
+                            int op_index, DECODER_MODEL *const decoder_model) {
+  aom_clear_system_state();
+
+  decoder_model->status = DECODER_MODEL_OK;
+  decoder_model->level = level;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  decoder_model->bit_rate = get_max_bitrate(
+      av1_level_defs + level, seq_params->tier[op_index], seq_params->profile);
+
+  // TODO(huisu or anyone): implement SCHEDULE_MODE.
+  decoder_model->mode = RESOURCE_MODE;
+  decoder_model->encoder_buffer_delay = 20000;
+  decoder_model->decoder_buffer_delay = 70000;
+  decoder_model->is_low_delay_mode = false;
+
+  decoder_model->first_bit_arrival_time = 0.0;
+  decoder_model->last_bit_arrival_time = 0.0;
+  decoder_model->coded_bits = 0;
+
+  decoder_model->removal_time = INVALID_TIME;
+  decoder_model->presentation_time = INVALID_TIME;
+  decoder_model->decode_samples = 0;
+  decoder_model->display_samples = 0;
+  decoder_model->max_decode_rate = 0.0;
+  decoder_model->max_display_rate = 0.0;
+
+  decoder_model->num_frame = -1;
+  decoder_model->num_decoded_frame = -1;
+  decoder_model->num_shown_frame = -1;
+  decoder_model->current_time = 0.0;
+
+  initialize_buffer_pool(decoder_model);
+
+  DFG_INTERVAL_QUEUE *const dfg_interval_queue =
+      &decoder_model->dfg_interval_queue;
+  dfg_interval_queue->total_interval = 0.0;
+  dfg_interval_queue->head = 0;
+  dfg_interval_queue->size = 0;
+
+  if (seq_params->timing_info_present) {
+    decoder_model->num_ticks_per_picture =
+        seq_params->timing_info.num_ticks_per_picture;
+    decoder_model->display_clock_tick =
+        seq_params->timing_info.num_units_in_display_tick /
+        seq_params->timing_info.time_scale;
+  } else {
+    decoder_model->num_ticks_per_picture = 1;
+    decoder_model->display_clock_tick = 1.0 / cpi->framerate;
+  }
+
+  decoder_model->initial_display_delay =
+      seq_params->op_params[op_index].initial_display_delay;
+  decoder_model->initial_presentation_delay = INVALID_TIME;
+  decoder_model->decode_rate = av1_level_defs[level].max_decode_rate;
+}
+
+void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
+                                     size_t coded_bits,
+                                     DECODER_MODEL *const decoder_model) {
+  if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return;
+
+  aom_clear_system_state();
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int luma_pic_size = cm->superres_upscaled_width * cm->height;
+  const int show_existing_frame = cm->show_existing_frame;
+  const int show_frame = cm->show_frame || show_existing_frame;
+  ++decoder_model->num_frame;
+  if (!show_existing_frame) ++decoder_model->num_decoded_frame;
+  if (show_frame) ++decoder_model->num_shown_frame;
+  decoder_model->coded_bits += coded_bits;
+
+  int display_idx = -1;
+  if (show_existing_frame) {
+    display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show];
+    if (display_idx < 0) {
+      decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY;
+      return;
+    }
+    if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) {
+      update_ref_buffers(decoder_model, display_idx, 0xFF);
+    }
+  } else {
+    const double removal_time = get_removal_time(decoder_model);
+    if (removal_time < 0.0) {
+      decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return;
+    }
+
+    const int previous_decode_samples = decoder_model->decode_samples;
+    const double previous_removal_time = decoder_model->removal_time;
+    assert(previous_removal_time < removal_time);
+    decoder_model->removal_time = removal_time;
+    decoder_model->decode_samples = luma_pic_size;
+    const double this_decode_rate =
+        previous_decode_samples / (removal_time - previous_removal_time);
+    decoder_model->max_decode_rate =
+        AOMMAX(decoder_model->max_decode_rate, this_decode_rate);
+
+    // A frame with show_existing_frame being false indicates the end of a DFG.
+    // Update the bits arrival time of this DFG.
+    const double buffer_delay = (decoder_model->encoder_buffer_delay +
+                                 decoder_model->decoder_buffer_delay) /
+                                90000.0;
+    const double latest_arrival_time = removal_time - buffer_delay;
+    decoder_model->first_bit_arrival_time =
+        AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+    decoder_model->last_bit_arrival_time =
+        decoder_model->first_bit_arrival_time +
+        (double)decoder_model->coded_bits / decoder_model->bit_rate;
+    // Smoothing buffer underflows if the last bit arrives after the removal
+    // time.
+    if (decoder_model->last_bit_arrival_time > removal_time &&
+        !decoder_model->is_low_delay_mode) {
+      decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW;
+      return;
+    }
+    // Reset the coded bits for the next DFG.
+    decoder_model->coded_bits = 0;
+
+    // Check if the smoothing buffer overflows.
+    DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+    if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+      assert(0);
+    }
+    const double first_bit_arrival_time = decoder_model->first_bit_arrival_time;
+    const double last_bit_arrival_time = decoder_model->last_bit_arrival_time;
+    // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+    while (queue->buf[queue->head].removal_time <= last_bit_arrival_time &&
+           queue->size > 0) {
+      if (queue->buf[queue->head].removal_time - first_bit_arrival_time +
+              queue->total_interval >
+          1.0) {
+        decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+        return;
+      }
+      queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time -
+                               queue->buf[queue->head].first_bit_arrival_time;
+      queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE;
+      --queue->size;
+    }
+    // Push current DFG into the queue.
+    const int queue_index =
+        (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE;
+    queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time;
+    queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time;
+    queue->buf[queue_index].removal_time = removal_time;
+    queue->total_interval += last_bit_arrival_time - first_bit_arrival_time;
+    // The smoothing buffer can hold at most "bit_rate" bits, which is
+    // equivalent to 1 second of total interval.
+    if (queue->total_interval > 1.0) {
+      decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+      return;
+    }
+
+    release_processed_frames(decoder_model, removal_time);
+    decoder_model->current_time =
+        removal_time + time_to_decode_frame(cm, decoder_model->decode_rate);
+
+    const int cfbi = get_free_buffer(decoder_model);
+    if (cfbi < 0) {
+      decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return;
+    }
+    const CurrentFrame *const current_frame = &cm->current_frame;
+    decoder_model->frame_buffer_pool[cfbi].frame_type =
+        cm->current_frame.frame_type;
+    display_idx = cfbi;
+    update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags);
+
+    if (decoder_model->initial_presentation_delay < 0.0) {
+      // Display can begin after required number of frames have been buffered.
+      if (frames_in_buffer_pool(decoder_model) >=
+          decoder_model->initial_display_delay) {
+        decoder_model->initial_presentation_delay = decoder_model->current_time;
+        // Update presentation time for each shown frame in the frame buffer.
+        for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+          FRAME_BUFFER *const this_buffer =
+              &decoder_model->frame_buffer_pool[i];
+          if (this_buffer->player_ref_count == 0) continue;
+          assert(this_buffer->display_index >= 0);
+          this_buffer->presentation_time =
+              get_presentation_time(decoder_model, this_buffer->display_index);
+        }
+      }
+    }
+  }
+
+  // Display.
+  if (show_frame) {
+    assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE);
+    FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[display_idx];
+    ++this_buffer->player_ref_count;
+    this_buffer->display_index = decoder_model->num_shown_frame;
+    const double presentation_time =
+        get_presentation_time(decoder_model, this_buffer->display_index);
+    this_buffer->presentation_time = presentation_time;
+    if (presentation_time >= 0.0 &&
+        decoder_model->current_time > presentation_time) {
+      decoder_model->status = DISPLAY_FRAME_LATE;
+      return;
+    }
+
+    const int previous_display_samples = decoder_model->display_samples;
+    const double previous_presentation_time = decoder_model->presentation_time;
+    decoder_model->display_samples = luma_pic_size;
+    decoder_model->presentation_time = presentation_time;
+    if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) {
+      assert(previous_presentation_time < presentation_time);
+      const double this_display_rate =
+          previous_display_samples /
+          (presentation_time - previous_presentation_time);
+      decoder_model->max_display_rate =
+          AOMMAX(decoder_model->max_display_rate, this_display_rate);
+    }
+  }
+}
+
+void av1_init_level_info(AV1_COMP *cpi) {
+  for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) {
+    AV1LevelInfo *const this_level_info =
+        cpi->level_params.level_info[op_index];
+    if (!this_level_info) continue;
+    memset(this_level_info, 0, sizeof(*this_level_info));
+    AV1LevelSpec *const level_spec = &this_level_info->level_spec;
+    level_spec->level = SEQ_LEVEL_MAX;
+    AV1LevelStats *const level_stats = &this_level_info->level_stats;
+    level_stats->min_cropped_tile_width = INT_MAX;
+    level_stats->min_cropped_tile_height = INT_MAX;
+    level_stats->min_frame_width = INT_MAX;
+    level_stats->min_frame_height = INT_MAX;
+    level_stats->tile_width_is_valid = 1;
+    level_stats->min_cr = 1e8;
+
+    FrameWindowBuffer *const frame_window_buffer =
+        &this_level_info->frame_window_buffer;
+    frame_window_buffer->num = 0;
+    frame_window_buffer->start = 0;
+
+    const AV1_COMMON *const cm = &cpi->common;
+    const int upscaled_width = cm->superres_upscaled_width;
+    const int height = cm->height;
+    const int pic_size = upscaled_width * height;
+    for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+      DECODER_MODEL *const this_model = &this_level_info->decoder_models[level];
+      const AV1LevelSpec *const spec = &av1_level_defs[level];
+      if (upscaled_width > spec->max_h_size || height > spec->max_v_size ||
+          pic_size > spec->max_picture_size) {
+        // Turn off decoder model for this level as the frame size already
+        // exceeds level constraints.
+        this_model->status = DECODER_MODEL_DISABLED;
+      } else {
+        av1_decoder_model_init(cpi, level, op_index, this_model);
+      }
+    }
+  }
+}
+
 static double get_min_cr(const AV1LevelSpec *const level_spec, int tier,
                          int is_still_picture, int64_t decoded_sample_rate) {
   if (is_still_picture) return 0.8;
+  if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
   const double min_cr_basis = tier ? level_spec->high_cr : level_spec->main_cr;
   const double speed_adj =
       (double)decoded_sample_rate / level_spec->max_display_rate;
   return AOMMAX(min_cr_basis * speed_adj, 0.8);
 }
 
-static TARGET_LEVEL_FAIL_ID check_level_constraints(
-    const AV1LevelSpec *const target_level_spec,
-    const AV1LevelSpec *const level_spec,
-    const AV1LevelStats *const level_stats, int tier, int is_still_picture) {
-  const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
-                                   level_spec->max_decode_rate);
-  TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+                                int is_still_picture) {
+  assert(is_valid_seq_level_idx(level_index));
+  const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+  return get_min_cr(level_spec, tier, is_still_picture,
+                    level_spec->max_decode_rate);
+}
 
+static void get_temporal_parallel_params(int scalability_mode_idc,
+                                         int *temporal_parallel_num,
+                                         int *temporal_parallel_denom) {
+  if (scalability_mode_idc < 0) {
+    *temporal_parallel_num = 1;
+    *temporal_parallel_denom = 1;
+    return;
+  }
+
+  // TODO(huisu@): handle scalability cases.
+  if (scalability_mode_idc == SCALABILITY_SS) {
+    (void)scalability_mode_idc;
+  } else {
+    (void)scalability_mode_idc;
+  }
+}
+
+#define MAX_TILE_SIZE (4096 * 2304)
+#define MIN_CROPPED_TILE_WIDTH 8
+#define MIN_CROPPED_TILE_HEIGHT 8
+#define MIN_FRAME_WIDTH 16
+#define MIN_FRAME_HEIGHT 16
+#define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136
+
+static TARGET_LEVEL_FAIL_ID check_level_constraints(
+    const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier,
+    int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) {
+  const DECODER_MODEL *const decoder_model = &level_info->decoder_models[level];
+  const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status;
+  if (decoder_model_status != DECODER_MODEL_OK &&
+      decoder_model_status != DECODER_MODEL_DISABLED) {
+    return DECODER_MODEL_FAIL;
+  }
+
+  const AV1LevelSpec *const level_spec = &level_info->level_spec;
+  const AV1LevelSpec *const target_level_spec = &av1_level_defs[level];
+  const AV1LevelStats *const level_stats = &level_info->level_stats;
+  TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
   do {
     if (level_spec->max_picture_size > target_level_spec->max_picture_size) {
       fail_id = LUMA_PIC_SIZE_TOO_LARGE;
@@ -308,11 +814,15 @@
       break;
     }
 
-    if (level_spec->max_display_rate > target_level_spec->max_display_rate) {
+    if (decoder_model->max_display_rate >
+        (double)target_level_spec->max_display_rate) {
       fail_id = DISPLAY_RATE_TOO_HIGH;
       break;
     }
 
+    // TODO(huisu): we are not using the max decode rate calculated by the
+    // decoder model, because in resource availability mode the model always
+    // returns MaxDecodeRate (as in the level definitions) as the max decode rate.
     if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) {
       fail_id = DECODE_RATE_TOO_HIGH;
       break;
@@ -323,7 +833,7 @@
       break;
     }
 
-    if (level_stats->max_tile_size > 4096 * 2304) {
+    if (level_stats->max_tile_size > MAX_TILE_SIZE) {
       fail_id = TILE_TOO_LARGE;
       break;
     }
@@ -333,22 +843,22 @@
       break;
     }
 
-    if (level_stats->min_cropped_tile_width < 8) {
+    if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) {
       fail_id = CROPPED_TILE_WIDTH_TOO_SMALL;
       break;
     }
 
-    if (level_stats->min_cropped_tile_height < 8) {
+    if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) {
       fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL;
       break;
     }
 
-    if (level_stats->min_frame_width < 16) {
+    if (level_stats->min_frame_width < MIN_FRAME_WIDTH) {
       fail_id = LUMA_PIC_H_SIZE_TOO_SMALL;
       break;
     }
 
-    if (level_stats->min_frame_height < 16) {
+    if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) {
       fail_id = LUMA_PIC_V_SIZE_TOO_SMALL;
       break;
     }
@@ -358,32 +868,51 @@
       break;
     }
 
+    const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
+                                     level_spec->max_decode_rate);
     if (level_stats->min_cr < min_cr) {
       fail_id = CR_TOO_SMALL;
       break;
     }
+
+    if (check_bitrate) {
+      // Check average bitrate instead of max_bitrate.
+      const double bitrate_limit =
+          get_max_bitrate(target_level_spec, tier, profile);
+      const double avg_bitrate = level_stats->total_compressed_size * 8.0 /
+                                 level_stats->total_time_encoded;
+      if (avg_bitrate > bitrate_limit) {
+        fail_id = BITRATE_TOO_HIGH;
+        break;
+      }
+    }
+
+    if (target_level_spec->level > SEQ_LEVEL_5_1) {
+      int temporal_parallel_num;
+      int temporal_parallel_denom;
+      const int scalability_mode_idc = -1;
+      get_temporal_parallel_params(scalability_mode_idc, &temporal_parallel_num,
+                                   &temporal_parallel_denom);
+      const int val = level_stats->max_tile_size * level_spec->max_header_rate *
+                      temporal_parallel_denom / temporal_parallel_num;
+      if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) {
+        fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH;
+        break;
+      }
+    }
   } while (0);
 
   return fail_id;
 }
 
-static INLINE int is_in_operating_point(int operating_point,
-                                        int temporal_layer_id,
-                                        int spatial_layer_id) {
-  if (!operating_point) return 1;
-
-  return ((operating_point >> temporal_layer_id) & 1) &&
-         ((operating_point >> (spatial_layer_id + 8)) & 1);
-}
-
-static void get_tile_stats(const AV1_COMP *const cpi, int *max_tile_size,
-                           int *max_superres_tile_width,
+static void get_tile_stats(const AV1_COMMON *const cm,
+                           const TileDataEnc *const tile_data,
+                           int *max_tile_size, int *max_superres_tile_width,
                            int *min_cropped_tile_width,
                            int *min_cropped_tile_height,
                            int *tile_width_valid) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   const int superres_scale_denominator = cm->superres_scale_denominator;
 
   *max_tile_size = 0;
@@ -395,7 +924,7 @@
   for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
       const TileInfo *const tile_info =
-          &cpi->tile_data[tile_row * cm->tile_cols + tile_col].tile_info;
+          &tile_data[tile_row * cm->tiles.cols + tile_col].tile_info;
       const int tile_width =
           (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE;
       const int tile_height =
@@ -417,7 +946,8 @@
       *min_cropped_tile_height =
           AOMMIN(*min_cropped_tile_height, cropped_tile_height);
 
-      const int is_right_most_tile = tile_info->mi_col_end == cm->mi_cols;
+      const int is_right_most_tile =
+          tile_info->mi_col_end == cm->mi_params.mi_cols;
       if (!is_right_most_tile) {
         if (av1_superres_scaled(cm))
           *tile_width_valid &= tile_width >= 128;
@@ -428,7 +958,8 @@
   }
 }
 
-static int store_frame_record(int64_t ts_start, int64_t ts_end, int pic_size,
+static int store_frame_record(int64_t ts_start, int64_t ts_end,
+                              size_t encoded_size, int pic_size,
                               int frame_header_count, int tiles, int show_frame,
                               int show_existing_frame,
                               FrameWindowBuffer *const buffer) {
@@ -441,6 +972,7 @@
   FrameRecord *const record = &buffer->buf[new_idx];
   record->ts_start = ts_start;
   record->ts_end = ts_end;
+  record->encoded_size_in_bytes = encoded_size;
   record->pic_size = pic_size;
   record->frame_header_count = frame_header_count;
   record->tiles = tiles;
@@ -476,13 +1008,15 @@
 // Scan previously encoded frames and update level metrics accordingly.
 static void scan_past_frames(const FrameWindowBuffer *const buffer,
                              int num_frames_to_scan,
-                             AV1LevelSpec *const level_spec) {
+                             AV1LevelSpec *const level_spec,
+                             AV1LevelStats *const level_stats) {
   const int num_frames_in_buffer = buffer->num;
   int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE;
   int frame_headers = 0;
   int tiles = 0;
   int64_t display_samples = 0;
   int64_t decoded_samples = 0;
+  size_t encoded_size_in_bytes = 0;
   for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) {
     const FrameRecord *const record = &buffer->buf[index];
     if (!record->show_existing_frame) {
@@ -493,76 +1027,71 @@
       display_samples += record->pic_size;
     }
     tiles += record->tiles;
+    encoded_size_in_bytes += record->encoded_size_in_bytes;
     --index;
     if (index < 0) index = FRAME_WINDOW_SIZE - 1;
   }
   level_spec->max_header_rate =
       AOMMAX(level_spec->max_header_rate, frame_headers);
+  // TODO(huisu): we can now compute the max display rate with the decoder
+  // model, so these two statements can be removed. Keep them here for a while
+  // for debugging purposes.
   level_spec->max_display_rate =
       AOMMAX(level_spec->max_display_rate, display_samples);
   level_spec->max_decode_rate =
       AOMMAX(level_spec->max_decode_rate, decoded_samples);
   level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles);
+  level_stats->max_bitrate =
+      AOMMAX(level_stats->max_bitrate, (int)encoded_size_in_bytes * 8);
 }
 
 void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
                            int64_t ts_end) {
   AV1_COMMON *const cm = &cpi->common;
+  const AV1LevelParams *const level_params = &cpi->level_params;
+
   const int upscaled_width = cm->superres_upscaled_width;
   const int width = cm->width;
   const int height = cm->height;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
   const int tiles = tile_cols * tile_rows;
   const int luma_pic_size = upscaled_width * height;
-  const int frame_header_count = cpi->frame_header_count;
+  const int frame_header_count = level_params->frame_header_count;
   const int show_frame = cm->show_frame;
   const int show_existing_frame = cm->show_existing_frame;
 
-  // Store info. of current frame into FrameWindowBuffer.
-  FrameWindowBuffer *const buffer = &cpi->frame_window_buffer;
-  store_frame_record(ts_start, ts_end, luma_pic_size, frame_header_count, tiles,
-                     show_frame, show_existing_frame, buffer);
-  // Count the number of frames encoded in the past 1 second.
-  const int encoded_frames_in_last_second =
-      show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0;
-
   int max_tile_size;
   int min_cropped_tile_width;
   int min_cropped_tile_height;
   int max_superres_tile_width;
   int tile_width_is_valid;
-  get_tile_stats(cpi, &max_tile_size, &max_superres_tile_width,
+  get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width,
                  &min_cropped_tile_width, &min_cropped_tile_height,
                  &tile_width_is_valid);
 
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  const BITSTREAM_PROFILE profile = seq_params->profile;
-  const int pic_size_profile_factor =
-      profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
-  const size_t frame_compressed_size = (size > 129 ? size - 128 : 1);
-  const size_t frame_uncompressed_size =
-      (luma_pic_size * pic_size_profile_factor) >> 3;
-
   aom_clear_system_state();
-  const double compression_ratio =
-      frame_uncompressed_size / (double)frame_compressed_size;
+  const double compression_ratio = av1_get_compression_ratio(cm, size);
   const double total_time_encoded =
-      (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+      (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
       (double)TICKS_PER_SEC;
 
   const int temporal_layer_id = cm->temporal_layer_id;
   const int spatial_layer_id = cm->spatial_layer_id;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
   const int is_still_picture = seq_params->still_picture;
   // update level_stats
   // TODO(kyslov@) fix the implementation according to buffer model
   for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) {
     if (!is_in_operating_point(seq_params->operating_point_idc[i],
-                               temporal_layer_id, spatial_layer_id)) {
+                               temporal_layer_id, spatial_layer_id) ||
+        !((level_params->keep_level_stats >> i) & 1)) {
       continue;
     }
 
-    AV1LevelInfo *const level_info = &cpi->level_info[i];
+    AV1LevelInfo *const level_info = level_params->level_info[i];
+    assert(level_info != NULL);
     AV1LevelStats *const level_stats = &level_info->level_stats;
 
     level_stats->max_tile_size =
@@ -577,9 +1106,8 @@
     level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width);
     level_stats->min_frame_height =
         AOMMIN(level_stats->min_frame_height, height);
-    level_stats->total_compressed_size += frame_compressed_size;
-    if (show_frame) level_stats->total_time_encoded = total_time_encoded;
     level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio);
+    level_stats->total_compressed_size += (double)size;
 
     // update level_spec
     // TODO(kyslov@) update all spec fields
@@ -592,21 +1120,35 @@
     level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols);
     level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles);
 
+    // Store info. of current frame into FrameWindowBuffer.
+    FrameWindowBuffer *const buffer = &level_info->frame_window_buffer;
+    store_frame_record(ts_start, ts_end, size, luma_pic_size,
+                       frame_header_count, tiles, show_frame,
+                       show_existing_frame, buffer);
     if (show_frame) {
-      scan_past_frames(buffer, encoded_frames_in_last_second, level_spec);
+      // Count the number of frames encoded in the past 1 second.
+      const int encoded_frames_in_last_second =
+          show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0;
+      scan_past_frames(buffer, encoded_frames_in_last_second, level_spec,
+                       level_stats);
+      level_stats->total_time_encoded = total_time_encoded;
+    }
+
+    DECODER_MODEL *const decoder_models = level_info->decoder_models;
+    for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+      av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]);
     }
 
     // Check whether target level is met.
-    const AV1_LEVEL target_seq_level_idx = cpi->target_seq_level_idx[i];
-    if (target_seq_level_idx < SEQ_LEVELS) {
-      const AV1LevelSpec *const target_level_spec =
-          av1_level_defs + target_seq_level_idx;
+    const AV1_LEVEL target_level = level_params->target_seq_level_idx[i];
+    if (target_level < SEQ_LEVELS) {
+      assert(is_valid_seq_level_idx(target_level));
       const int tier = seq_params->tier[i];
       const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
-          target_level_spec, level_spec, level_stats, tier, is_still_picture);
+          level_info, target_level, tier, is_still_picture, profile, 0);
       if (fail_id != TARGET_LEVEL_OK) {
-        const int target_level_major = 2 + (target_seq_level_idx >> 2);
-        const int target_level_minor = target_seq_level_idx & 3;
+        const int target_level_major = 2 + (target_level >> 2);
+        const int target_level_minor = target_level & 3;
         aom_internal_error(&cm->error, AOM_CODEC_ERROR,
                            "Failed to encode to the target level %d_%d. %s",
                            target_level_major, target_level_minor,
@@ -616,26 +1158,21 @@
   }
 }
 
-aom_codec_err_t av1_get_seq_level_idx(const AV1_COMP *cpi, int *seq_level_idx) {
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  if (!cpi->keep_level_stats) {
-    for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
-      seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
-    }
-    return AOM_CODEC_OK;
-  }
-
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+                                      const AV1LevelParams *level_params,
+                                      int *seq_level_idx) {
   const int is_still_picture = seq_params->still_picture;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
   for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
     seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+    if (!((level_params->keep_level_stats >> op) & 1)) continue;
     const int tier = seq_params->tier[op];
-    const AV1LevelInfo *const level_info = &cpi->level_info[op];
-    const AV1LevelStats *const level_stats = &level_info->level_stats;
-    const AV1LevelSpec *const level_spec = &level_info->level_spec;
+    const AV1LevelInfo *const level_info = level_params->level_info[op];
+    assert(level_info != NULL);
     for (int level = 0; level < SEQ_LEVELS; ++level) {
-      const AV1LevelSpec *const target_level_spec = av1_level_defs + level;
+      if (!is_valid_seq_level_idx(level)) continue;
       const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
-          target_level_spec, level_spec, level_stats, tier, is_still_picture);
+          level_info, level, tier, is_still_picture, profile, 1);
       if (fail_id == TARGET_LEVEL_OK) {
         seq_level_idx[op] = level;
         break;
diff --git a/libaom/av1/encoder/level.h b/libaom/av1/encoder/level.h
index 9f1664d..5e0cce2 100644
--- a/libaom/av1/encoder/level.h
+++ b/libaom/av1/encoder/level.h
@@ -37,6 +37,7 @@
 typedef struct {
   int64_t ts_start;
   int64_t ts_end;
+  size_t encoded_size_in_bytes;
   int pic_size;
   int frame_header_count;
   int tiles;
@@ -52,9 +53,8 @@
   int start;  // Buffer index of the first FrameRecord.
 } FrameWindowBuffer;
 
-// Used to keep track of AV1 Level Stats. Currently unimplemented.
 typedef struct {
-  uint64_t total_compressed_size;
+  int max_bitrate;  // Max bitrate in any 1-second window, in bps.
   int max_tile_size;
   int max_superres_tile_width;
   int min_cropped_tile_width;
@@ -62,20 +62,150 @@
   int tile_width_is_valid;
   int min_frame_width;
   int min_frame_height;
-  double total_time_encoded;
+  double total_compressed_size;  // In bytes.
+  double total_time_encoded;     // In seconds.
   double min_cr;
 } AV1LevelStats;
 
+// The following data structures are for the decoder model.
+typedef struct {
+  int decoder_ref_count;
+  int player_ref_count;
+  int display_index;
+  FRAME_TYPE frame_type;
+  double presentation_time;
+} FRAME_BUFFER;
+
+// Interval of bits transmission for a DFG(Decodable Frame Group).
+typedef struct {
+  double first_bit_arrival_time;  // Time when the first bit arrives.
+  double last_bit_arrival_time;   // Time when the last bit arrives.
+  // Removal time means the time when the bits to be decoded are removed from
+  // the smoothing buffer. Removal time is essentially the time when the
+  // decoding of the frame starts.
+  double removal_time;
+} DFG_INTERVAL;
+
+#define DFG_INTERVAL_QUEUE_SIZE 64
+typedef struct {
+  int head;
+  int size;
+  double total_interval;
+  DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE];
+} DFG_INTERVAL_QUEUE;
+
+enum {
+  RESOURCE_MODE = 0,  // Resource availability mode.
+  SCHEDULE_MODE       // Decoding schedule mode.
+} UENUM1BYTE(DECODER_MODEL_MODE);
+
+enum {
+  DECODER_MODEL_OK = 0,
+  DECODE_BUFFER_AVAILABLE_LATE,
+  DECODE_FRAME_BUF_UNAVAILABLE,
+  DECODE_EXISTING_FRAME_BUF_EMPTY,
+  DISPLAY_FRAME_LATE,
+  SMOOTHING_BUFFER_UNDERFLOW,
+  SMOOTHING_BUFFER_OVERFLOW,
+  DECODER_MODEL_DISABLED
+} UENUM1BYTE(DECODER_MODEL_STATUS);
+
+#define BUFFER_POOL_MAX_SIZE 10
+typedef struct {
+  DECODER_MODEL_STATUS status;
+  DECODER_MODEL_MODE mode;
+  bool is_low_delay_mode;
+  AV1_LEVEL level;
+  int encoder_buffer_delay;  // In units of 1/90000 seconds.
+  int decoder_buffer_delay;  // In units of 1/90000 seconds.
+  int num_ticks_per_picture;
+  int initial_display_delay;  // In units of frames.
+  int64_t decode_rate;
+  double display_clock_tick;          // In units of seconds.
+  double current_time;                // In units of seconds.
+  double initial_presentation_delay;  // In units of seconds.
+  double bit_rate;                    // Bits per second.
+
+  int num_frame;
+  int num_decoded_frame;
+  int num_shown_frame;
+  int vbi[REF_FRAMES];  // Virtual buffer index.
+  FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE];
+  DFG_INTERVAL_QUEUE dfg_interval_queue;
+
+  // Information for the DFG(Decodable Frame Group) being processed.
+  double first_bit_arrival_time;
+  double last_bit_arrival_time;
+  size_t coded_bits;
+
+  // Information for the frame being processed.
+  double removal_time;
+  double presentation_time;
+  int decode_samples;
+  int display_samples;
+
+  double max_display_rate;
+  double max_decode_rate;
+} DECODER_MODEL;
+
 typedef struct {
   AV1LevelStats level_stats;
   AV1LevelSpec level_spec;
+  FrameWindowBuffer frame_window_buffer;
+  DECODER_MODEL decoder_models[SEQ_LEVELS];
 } AV1LevelInfo;
 
+typedef struct AV1LevelParams {
+  // Specifies the level that the coded video sequence conforms to for each
+  // operating point.
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  // Bit mask to indicate whether to keep level stats for corresponding
+  // operating points.
+  uint32_t keep_level_stats;
+  // Level information for each operating point.
+  AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS];
+  // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+  int frame_header_count;
+} AV1LevelParams;
+
+static INLINE int is_in_operating_point(int operating_point,
+                                        int temporal_layer_id,
+                                        int spatial_layer_id) {
+  if (!operating_point) return 1;
+
+  return ((operating_point >> temporal_layer_id) & 1) &&
+         ((operating_point >> (spatial_layer_id + 8)) & 1);
+}
+
+void av1_init_level_info(struct AV1_COMP *cpi);
+
 void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start,
                            int64_t ts_end);
 
 // Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS].
-aom_codec_err_t av1_get_seq_level_idx(const struct AV1_COMP *cpi,
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+                                      const AV1LevelParams *level_params,
                                       int *seq_level_idx);
 
+// Print the status of the decoder model(for debugging).
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level,
+                            int op_index, DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi,
+                                     size_t coded_bits,
+                                     DECODER_MODEL *const decoder_model);
+
+// Return max bitrate(bps) for given level.
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+                                     BITSTREAM_PROFILE profile);
+
+// Get max number of tiles and tile columns for given level.
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+                                 int *const max_tile_cols);
+
+// Return minimum compression ratio for given level.
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+                                int is_still_picture);
 #endif  // AOM_AV1_ENCODER_LEVEL_H_
diff --git a/libaom/av1/encoder/lookahead.c b/libaom/av1/encoder/lookahead.c
index f5298f7..0f7c819 100644
--- a/libaom/av1/encoder/lookahead.c
+++ b/libaom/av1/encoder/lookahead.c
@@ -13,6 +13,7 @@
 
 #include "config/aom_config.h"
 
+#include "aom_scale/yv12config.h"
 #include "av1/common/common.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/extend.h"
@@ -44,11 +45,13 @@
 struct lookahead_ctx *av1_lookahead_init(
     unsigned int width, unsigned int height, unsigned int subsampling_x,
     unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
-    const int border_in_pixels, int is_scale) {
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers) {
   struct lookahead_ctx *ctx = NULL;
+  int lag_in_frames = AOMMAX(1, depth);
 
-  // Clamp the lookahead queue depth
-  depth = clamp(depth, 1, MAX_LAG_BUFFERS);
+  // Add the lags to depth and clamp
+  depth += num_lap_buffers;
+  depth = clamp(depth, 1, MAX_TOTAL_BUFFERS);
 
   // Allocate memory to keep previous source frames available.
   depth += MAX_PRE_FRAMES;
@@ -56,43 +59,35 @@
   // Allocate the lookahead structures
   ctx = calloc(1, sizeof(*ctx));
   if (ctx) {
-    const int legacy_byte_alignment = 0;
     unsigned int i;
     ctx->max_sz = depth;
+    ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - MAX_PRE_FRAMES;
+    ctx->read_ctxs[ENCODE_STAGE].valid = 1;
+    if (num_lap_buffers) {
+      ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames;
+      ctx->read_ctxs[LAP_STAGE].valid = 1;
+    }
     ctx->buf = calloc(depth, sizeof(*ctx->buf));
-    if (!ctx->buf) goto bail;
-    for (i = 0; i < depth; i++)
-      if (is_scale) {
-        if (aom_alloc_frame_buffer(
-                &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
-                use_highbitdepth, border_in_pixels, legacy_byte_alignment))
-          goto bail;
-      } else {
-        aom_free_frame_buffer(&ctx->buf[i].img);
-        if (aom_realloc_lookahead_buffer(
-                &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
-                use_highbitdepth, AOM_ENC_LOOKAHEAD_BORDER,
-                legacy_byte_alignment, NULL, NULL, NULL))
-          goto bail;
-      }
+    if (!ctx->buf) goto fail;
+    for (i = 0; i < depth; i++) {
+      aom_free_frame_buffer(&ctx->buf[i].img);
+      if (aom_realloc_frame_buffer(&ctx->buf[i].img, width, height,
+                                   subsampling_x, subsampling_y,
+                                   use_highbitdepth, border_in_pixels,
+                                   byte_alignment, NULL, NULL, NULL))
+        goto fail;
+    }
   }
   return ctx;
-bail:
+fail:
   av1_lookahead_destroy(ctx);
   return NULL;
 }
 
-#define USE_PARTIAL_COPY 0
-
 int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        aom_enc_frame_flags_t flags) {
   struct lookahead_entry *buf;
-#if USE_PARTIAL_COPY
-  int row, col, active_end;
-  int mb_rows = (src->y_height + 15) >> 4;
-  int mb_cols = (src->y_width + 15) >> 4;
-#endif
   int width = src->y_crop_width;
   int height = src->y_crop_height;
   int uv_width = src->uv_crop_width;
@@ -101,8 +96,13 @@
   int subsampling_y = src->subsampling_y;
   int larger_dimensions, new_dimensions;
 
-  if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1;
-  ctx->sz++;
+  assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1);
+  if (ctx->read_ctxs[ENCODE_STAGE].sz + 1 + MAX_PRE_FRAMES > ctx->max_sz)
+    return 1;
+  ctx->read_ctxs[ENCODE_STAGE].sz++;
+  if (ctx->read_ctxs[LAP_STAGE].valid) {
+    ctx->read_ctxs[LAP_STAGE].sz++;
+  }
   buf = pop(ctx, &ctx->write_idx);
 
   new_dimensions = width != buf->img.y_crop_width ||
@@ -114,101 +114,69 @@
                       uv_height > buf->img.uv_height;
   assert(!larger_dimensions || new_dimensions);
 
-#if USE_PARTIAL_COPY
-  // TODO(jkoleszar): This is disabled for now, as
-  // av1_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
-
-  // Only do this partial copy if the following conditions are all met:
-  // 1. Lookahead queue has has size of 1.
-  // 2. Active map is provided.
-  // 3. This is not a key frame, golden nor altref frame.
-  if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
-    for (row = 0; row < mb_rows; ++row) {
-      col = 0;
-
-      while (1) {
-        // Find the first active macroblock in this row.
-        for (; col < mb_cols; ++col) {
-          if (active_map[col]) break;
-        }
-
-        // No more active macroblock in this row.
-        if (col == mb_cols) break;
-
-        // Find the end of active region in this row.
-        active_end = col;
-
-        for (; active_end < mb_cols; ++active_end) {
-          if (!active_map[active_end]) break;
-        }
-
-        // Only copy this active region.
-        av1_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
-                                            16, (active_end - col) << 4);
-
-        // Start again from the end of this active region.
-        col = active_end;
-      }
-
-      active_map += mb_cols;
-    }
-  } else {
-#endif
-    if (larger_dimensions) {
-      YV12_BUFFER_CONFIG new_img;
-      memset(&new_img, 0, sizeof(new_img));
-      if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
-                                 subsampling_y, use_highbitdepth,
-                                 AOM_BORDER_IN_PIXELS, 0))
-        return 1;
-      aom_free_frame_buffer(&buf->img);
-      buf->img = new_img;
-    } else if (new_dimensions) {
-      buf->img.y_crop_width = src->y_crop_width;
-      buf->img.y_crop_height = src->y_crop_height;
-      buf->img.uv_crop_width = src->uv_crop_width;
-      buf->img.uv_crop_height = src->uv_crop_height;
-      buf->img.subsampling_x = src->subsampling_x;
-      buf->img.subsampling_y = src->subsampling_y;
-    }
-    // Partial copy not implemented yet
-    av1_copy_and_extend_frame(src, &buf->img);
-#if USE_PARTIAL_COPY
+  if (larger_dimensions) {
+    YV12_BUFFER_CONFIG new_img;
+    memset(&new_img, 0, sizeof(new_img));
+    if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+                               subsampling_y, use_highbitdepth,
+                               AOM_BORDER_IN_PIXELS, 0))
+      return 1;
+    aom_free_frame_buffer(&buf->img);
+    buf->img = new_img;
+  } else if (new_dimensions) {
+    buf->img.y_crop_width = src->y_crop_width;
+    buf->img.y_crop_height = src->y_crop_height;
+    buf->img.uv_crop_width = src->uv_crop_width;
+    buf->img.uv_crop_height = src->uv_crop_height;
+    buf->img.subsampling_x = src->subsampling_x;
+    buf->img.subsampling_y = src->subsampling_y;
   }
-#endif
+  // Partial copy not implemented yet
+  av1_copy_and_extend_frame(src, &buf->img);
 
   buf->ts_start = ts_start;
   buf->ts_end = ts_end;
   buf->flags = flags;
+  aom_remove_metadata_from_frame_buffer(&buf->img);
+  aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata);
   return 0;
 }
 
-struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx,
-                                          int drain) {
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+                                          COMPRESSOR_STAGE stage) {
   struct lookahead_entry *buf = NULL;
-
-  if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
-    buf = pop(ctx, &ctx->read_idx);
-    ctx->sz--;
+  if (ctx) {
+    struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+    assert(read_ctx->valid == 1);
+    if (read_ctx->sz && (drain || read_ctx->sz == read_ctx->pop_sz)) {
+      buf = pop(ctx, &read_ctx->read_idx);
+      read_ctx->sz--;
+    }
   }
   return buf;
 }
 
-struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
-                                           int index) {
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+                                           COMPRESSOR_STAGE stage) {
   struct lookahead_entry *buf = NULL;
+  struct read_ctx *read_ctx = NULL;
+  if (ctx == NULL) {
+    return buf;
+  }
 
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
   if (index >= 0) {
     // Forward peek
-    if (index < ctx->sz) {
-      index += ctx->read_idx;
+    if (index < read_ctx->sz) {
+      index += read_ctx->read_idx;
       if (index >= ctx->max_sz) index -= ctx->max_sz;
       buf = ctx->buf + index;
     }
   } else if (index < 0) {
     // Backward peek
     if (-index <= MAX_PRE_FRAMES) {
-      index += (int)(ctx->read_idx);
+      index += (int)(read_ctx->read_idx);
       if (index < 0) index += (int)(ctx->max_sz);
       buf = ctx->buf + index;
     }
@@ -217,4 +185,21 @@
   return buf;
 }
 
-unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; }
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+                                 COMPRESSOR_STAGE stage) {
+  struct read_ctx *read_ctx = NULL;
+  assert(ctx != NULL);
+
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
+  return read_ctx->sz;
+}
+
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) {
+  struct read_ctx *read_ctx = NULL;
+  assert(ctx != NULL);
+
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
+  return read_ctx->pop_sz;
+}
diff --git a/libaom/av1/encoder/lookahead.h b/libaom/av1/encoder/lookahead.h
index 3b2d94b..03693d3 100644
--- a/libaom/av1/encoder/lookahead.h
+++ b/libaom/av1/encoder/lookahead.h
@@ -19,7 +19,10 @@
 extern "C" {
 #endif
 
-#define MAX_LAG_BUFFERS 25
+#define MAX_LAG_BUFFERS 35
+#define MAX_LAP_BUFFERS 35
+#define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
+#define LAP_LAG_IN_FRAMES 17
 
 struct lookahead_entry {
   YV12_BUFFER_CONFIG img;
@@ -31,12 +34,20 @@
 // The max of past frames we want to keep in the queue.
 #define MAX_PRE_FRAMES 1
 
+enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE);
+
+struct read_ctx {
+  int sz;       /* Number of buffers currently in the queue */
+  int read_idx; /* Read index */
+  int pop_sz;   /* Size to check for pop condition */
+  int valid;    /* Is this ctx valid? */
+};
+
 struct lookahead_ctx {
-  int max_sz;                  /* Absolute size of the queue */
-  int sz;                      /* Number of buffers currently in the queue */
-  int read_idx;                /* Read index */
-  int write_idx;               /* Write index */
-  struct lookahead_entry *buf; /* Buffer list */
+  int max_sz;                            /* Absolute size of the queue */
+  int write_idx;                         /* Write index */
+  struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */
+  struct lookahead_entry *buf;           /* Buffer list */
 };
 
 /**\brief Initializes the lookahead stage
@@ -47,7 +58,7 @@
 struct lookahead_ctx *av1_lookahead_init(
     unsigned int width, unsigned int height, unsigned int subsampling_x,
     unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
-    const int border_in_pixels, int is_scale);
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers);
 
 /**\brief Destroys the lookahead stage
  */
@@ -82,7 +93,8 @@
  * \retval NULL, if drain set and queue is empty
  * \retval NULL, if drain not set and queue not of the configured depth
  */
-struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain);
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+                                          COMPRESSOR_STAGE stage);
 
 /**\brief Get a future source buffer to encode
  *
@@ -91,14 +103,17 @@
  *
  * \retval NULL, if no buffer exists at the specified index
  */
-struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
-                                           int index);
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+                                           COMPRESSOR_STAGE stage);
 
 /**\brief Get the number of frames currently in the lookahead queue
  *
  * \param[in] ctx       Pointer to the lookahead context
  */
-unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx);
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+                                 COMPRESSOR_STAGE stage);
+
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/mbgraph.c b/libaom/av1/encoder/mbgraph.c
deleted file mode 100644
index 0cb6286..0000000
--- a/libaom/av1/encoder/mbgraph.c
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <limits.h>
-
-#include "config/av1_rtcd.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/system_state.h"
-#include "av1/common/blockd.h"
-#include "av1/common/reconinter.h"
-#include "av1/common/reconintra.h"
-#include "av1/encoder/mcomp.h"
-#include "av1/encoder/reconinter_enc.h"
-#include "av1/encoder/segmentation.h"
-
-static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
-                                              int mb_row, int mb_col) {
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
-
-  const MvLimits tmp_mv_limits = x->mv_limits;
-  MV ref_full;
-  int cost_list[5];
-
-  // Further step/diamond searches as necessary
-  int step_param = mv_sf->reduce_first_step_size;
-  step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
-  av1_set_mv_search_range(&x->mv_limits, ref_mv);
-
-  ref_full.col = ref_mv->col >> 3;
-  ref_full.row = ref_mv->row >> 3;
-
-  /*cpi->sf.search_method == HEX*/
-  av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
-                 cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
-
-  // Try sub-pixel MC
-  // if (bestsme > error_thresh && bestsme < INT_MAX)
-  if (cpi->common.cur_frame_force_integer_mv == 1) {
-    x->best_mv.as_mv.row *= 8;
-    x->best_mv.as_mv.col *= 8;
-  } else {
-    int distortion;
-    unsigned int sse;
-    cpi->find_fractional_mv_step(
-        x, &cpi->common, mb_row, mb_col, ref_mv,
-        cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0,
-        mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL,
-        NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0, 1);
-  }
-
-  if (has_second_ref(xd->mi[0]))
-    xd->mi[0]->mode = NEW_NEWMV;
-  else
-    xd->mi[0]->mode = NEWMV;
-
-  xd->mi[0]->mv[0] = x->best_mv;
-  xd->mi[0]->ref_frame[1] = NONE_FRAME;
-
-  av1_enc_build_inter_predictor(&cpi->common, xd, mb_row, mb_col, NULL,
-                                BLOCK_16X16, AOM_PLANE_Y, AOM_PLANE_Y);
-
-  /* restore UMV window */
-  x->mv_limits = tmp_mv_limits;
-
-  return aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                      xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-}
-
-static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row,
-                                  int mb_col) {
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  unsigned int err, tmp_err;
-  MV best_mv;
-
-  // Try zero MV first
-  // FIXME should really use something like near/nearest MV and/or MV prediction
-  err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
-  best_mv.col = best_mv.row = 0;
-
-  // Test last reference frame using the previous best mv as the
-  // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
-  if (tmp_err < err) {
-    err = tmp_err;
-    best_mv = x->best_mv.as_mv;
-  }
-
-  // If the current best reference mv is not centered on 0,0 then do a 0,0
-  // based search as well.
-  if (ref_mv->row != 0 || ref_mv->col != 0) {
-    MV zero_ref_mv = kZeroMv;
-
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col);
-    if (tmp_err < err) {
-      err = tmp_err;
-      best_mv = x->best_mv.as_mv;
-    }
-  }
-
-  x->best_mv.as_mv = best_mv;
-  return err;
-}
-
-static int do_16x16_zerozero_search(AV1_COMP *cpi, int_mv *dst_mv) {
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  unsigned int err;
-
-  // Try zero MV first
-  // FIXME should really use something like near/nearest MV and/or MV prediction
-  err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
-
-  dst_mv->as_int = 0;
-
-  return err;
-}
-static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  PREDICTION_MODE best_mode = -1, mode;
-  unsigned int best_err = INT_MAX;
-
-  // calculate SATD for each intra prediction mode;
-  // we're intentionally not doing 4x4, we just want a rough estimate
-  for (mode = INTRA_MODE_START; mode < INTRA_MODE_END; mode++) {
-    unsigned int err;
-
-    xd->mi[0]->mode = mode;
-    av1_predict_intra_block(cm, xd, 16, 16, TX_16X16, mode, 0, 0,
-                            FILTER_INTRA_MODES, x->plane[0].src.buf,
-                            x->plane[0].src.stride, xd->plane[0].dst.buf,
-                            xd->plane[0].dst.stride, 0, 0, 0);
-    err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-
-    // find best
-    if (err < best_err) {
-      best_err = err;
-      best_mode = mode;
-    }
-  }
-
-  if (pbest_mode) *pbest_mode = best_mode;
-
-  return best_err;
-}
-
-static void update_mbgraph_mb_stats(AV1_COMP *cpi, MBGRAPH_MB_STATS *stats,
-                                    YV12_BUFFER_CONFIG *buf, int mb_y_offset,
-                                    YV12_BUFFER_CONFIG *golden_ref,
-                                    const MV *prev_golden_ref_mv,
-                                    YV12_BUFFER_CONFIG *alt_ref, int mb_row,
-                                    int mb_col) {
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int intra_error;
-  AV1_COMMON *cm = &cpi->common;
-
-  // FIXME in practice we're completely ignoring chroma here
-  x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
-  x->plane[0].src.stride = buf->y_stride;
-
-  xd->plane[0].dst.buf = cm->cur_frame->buf.y_buffer + mb_y_offset;
-  xd->plane[0].dst.stride = cm->cur_frame->buf.y_stride;
-
-  // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode);
-  if (intra_error <= 0) intra_error = 1;
-  stats->ref[INTRA_FRAME].err = intra_error;
-
-  // Golden frame MV search, if it exists and is different than last frame
-  if (golden_ref) {
-    int g_motion_error;
-    xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
-    xd->plane[0].pre[0].stride = golden_ref->y_stride;
-    g_motion_error =
-        do_16x16_motion_search(cpi, prev_golden_ref_mv, mb_row, mb_col);
-    stats->ref[GOLDEN_FRAME].m.mv = x->best_mv;
-    stats->ref[GOLDEN_FRAME].err = g_motion_error;
-  } else {
-    stats->ref[GOLDEN_FRAME].err = INT_MAX;
-    stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
-  }
-
-  // Do an Alt-ref frame MV search, if it exists and is different than
-  // last/golden frame.
-  if (alt_ref) {
-    int a_motion_error;
-    xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
-    xd->plane[0].pre[0].stride = alt_ref->y_stride;
-    a_motion_error =
-        do_16x16_zerozero_search(cpi, &stats->ref[ALTREF_FRAME].m.mv);
-
-    stats->ref[ALTREF_FRAME].err = a_motion_error;
-  } else {
-    stats->ref[ALTREF_FRAME].err = INT_MAX;
-    stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
-  }
-}
-
-static void update_mbgraph_frame_stats(AV1_COMP *cpi,
-                                       MBGRAPH_FRAME_STATS *stats,
-                                       YV12_BUFFER_CONFIG *buf,
-                                       YV12_BUFFER_CONFIG *golden_ref,
-                                       YV12_BUFFER_CONFIG *alt_ref) {
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  AV1_COMMON *const cm = &cpi->common;
-
-  int mb_col, mb_row, offset = 0;
-  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
-  MV gld_top_mv = kZeroMv;
-  MB_MODE_INFO mi_local;
-
-  av1_zero(mi_local);
-  // Set up limit values for motion vectors to prevent them extending outside
-  // the UMV borders.
-  x->mv_limits.row_min = -BORDER_MV_PIXELS_B16;
-  x->mv_limits.row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
-  xd->up_available = 0;
-  xd->plane[0].dst.stride = buf->y_stride;
-  xd->plane[0].pre[0].stride = buf->y_stride;
-  xd->plane[1].dst.stride = buf->uv_stride;
-  xd->mi[0] = &mi_local;
-  mi_local.sb_type = BLOCK_16X16;
-  mi_local.ref_frame[0] = LAST_FRAME;
-  mi_local.ref_frame[1] = NONE_FRAME;
-
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    MV gld_left_mv = gld_top_mv;
-    int mb_y_in_offset = mb_y_offset;
-    int arf_y_in_offset = arf_y_offset;
-    int gld_y_in_offset = gld_y_offset;
-
-    // Set up limit values for motion vectors to prevent them extending outside
-    // the UMV borders.
-    x->mv_limits.col_min = -BORDER_MV_PIXELS_B16;
-    x->mv_limits.col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
-    xd->left_available = 0;
-
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
-
-      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref,
-                              &gld_left_mv, alt_ref, mb_row, mb_col);
-      gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
-      if (mb_col == 0) {
-        gld_top_mv = gld_left_mv;
-      }
-      xd->left_available = 1;
-      mb_y_in_offset += 16;
-      gld_y_in_offset += 16;
-      arf_y_in_offset += 16;
-      x->mv_limits.col_min -= 16;
-      x->mv_limits.col_max -= 16;
-    }
-    xd->up_available = 1;
-    mb_y_offset += buf->y_stride * 16;
-    gld_y_offset += golden_ref->y_stride * 16;
-    if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
-    x->mv_limits.row_min -= 16;
-    x->mv_limits.row_max -= 16;
-    offset += cm->mb_cols;
-  }
-}
-
-// void separate_arf_mbs_byzz
-static void separate_arf_mbs(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  int mb_col, mb_row, offset, i;
-  int mi_row, mi_col;
-  int ncnt[4] = { 0 };
-  int n_frames = cpi->mbgraph_n_frames;
-
-  int *arf_not_zz;
-
-  CHECK_MEM_ERROR(
-      cm, arf_not_zz,
-      aom_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
-
-  // We are not interested in results beyond the alt ref itself.
-  if (n_frames > cpi->rc.frames_till_gf_update_due)
-    n_frames = cpi->rc.frames_till_gf_update_due;
-
-  // defer cost to reference frames
-  for (i = n_frames - 1; i >= 0; i--) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-
-    for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
-         offset += cm->mb_cols, mb_row++) {
-      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-        MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
-
-        int altref_err = mb_stats->ref[ALTREF_FRAME].err;
-        int intra_err = mb_stats->ref[INTRA_FRAME].err;
-        int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
-
-        // Test for altref vs intra and gf and that its mv was 0,0.
-        if (altref_err > 1000 || altref_err > intra_err ||
-            altref_err > golden_err) {
-          arf_not_zz[offset + mb_col]++;
-        }
-      }
-    }
-  }
-
-  // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out
-  // of bound access in segmentation_map
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
-      // If any of the blocks in the sequence failed then the MB
-      // goes in segment 0
-      if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
-        ncnt[0]++;
-        cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
-      } else {
-        cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
-        ncnt[1]++;
-      }
-    }
-  }
-
-  // Only bother with segmentation if over 10% of the MBs in static segment
-  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
-  if (1) {
-    // Note % of blocks that are marked as static
-    if (cm->MBs)
-      cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
-
-    // This error case should not be reachable as this function should
-    // never be called with the common data structure uninitialized.
-    else
-      cpi->static_mb_pct = 0;
-
-    av1_enable_segmentation(&cm->seg);
-  } else {
-    cpi->static_mb_pct = 0;
-    av1_disable_segmentation(&cm->seg);
-  }
-
-  // Free localy allocated storage
-  aom_free(arf_not_zz);
-}
-
-void av1_update_mbgraph_stats(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  int i, n_frames = av1_lookahead_depth(cpi->lookahead);
-  YV12_BUFFER_CONFIG *golden_ref = &get_ref_frame_buf(cm, GOLDEN_FRAME)->buf;
-
-  assert(golden_ref != NULL);
-
-  // we need to look ahead beyond where the ARF transitions into
-  // being a GF - so exit if we don't look ahead beyond that
-  if (n_frames <= cpi->rc.frames_till_gf_update_due) return;
-
-  if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS;
-
-  cpi->mbgraph_n_frames = n_frames;
-  for (i = 0; i < n_frames; i++) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    memset(frame_stats->mb_stats, 0,
-           cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
-  }
-
-  // do motion search to find contribution of each reference to data
-  // later on in this GF group
-  // FIXME really, the GF/last MC search should be done forward, and
-  // the ARF MC search backwards, to get optimal results for MV caching
-  for (i = 0; i < n_frames; i++) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    struct lookahead_entry *q_cur = av1_lookahead_peek(cpi->lookahead, i);
-
-    assert(q_cur != NULL);
-
-    update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img, golden_ref,
-                               cpi->source);
-  }
-
-  aom_clear_system_state();
-
-  separate_arf_mbs(cpi);
-}
diff --git a/libaom/av1/encoder/mbgraph.h b/libaom/av1/encoder/mbgraph.h
deleted file mode 100644
index ba08476..0000000
--- a/libaom/av1/encoder/mbgraph.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AV1_ENCODER_MBGRAPH_H_
-#define AOM_AV1_ENCODER_MBGRAPH_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-  struct {
-    int err;
-    union {
-      int_mv mv;
-      PREDICTION_MODE mode;
-    } m;
-  } ref[REF_FRAMES];
-} MBGRAPH_MB_STATS;
-
-typedef struct {
-  MBGRAPH_MB_STATS *mb_stats;
-} MBGRAPH_FRAME_STATS;
-
-struct AV1_COMP;
-
-void av1_update_mbgraph_stats(struct AV1_COMP *cpi);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AV1_ENCODER_MBGRAPH_H_
diff --git a/libaom/av1/encoder/mcomp.c b/libaom/av1/encoder/mcomp.c
index f077a4e..43f7f5c 100644
--- a/libaom/av1/encoder/mcomp.c
+++ b/libaom/av1/encoder/mcomp.c
@@ -19,37 +19,120 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/common.h"
+#include "av1/common/filter.h"
 #include "av1/common/mvref_common.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
 
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/mcomp.h"
-#include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
 
-// #define NEW_DIAMOND_SEARCH
-
-static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
-                                             const MV *mv) {
-  return &buf->buf[mv->row * buf->stride + mv->col];
+static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
+                                       const MACROBLOCK *x, const MV *ref_mv) {
+  mv_cost_params->ref_mv = ref_mv;
+  mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
+  mv_cost_params->error_per_bit = x->errorperbit;
+  mv_cost_params->sad_per_bit = x->sadperbit;
+  mv_cost_params->mvjcost = x->nmv_vec_cost;
+  mv_cost_params->mvcost[0] = x->mv_cost_stack[0];
+  mv_cost_params->mvcost[1] = x->mv_cost_stack[1];
+  mv_cost_params->mv_cost_type = x->mv_cost_type;
 }
 
-void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
-  int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
-  int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
-  int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
-  int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
+  ms_buffers->ref = &x->e_mbd.plane[0].pre[0];
+  ms_buffers->src = &x->plane[0].src;
 
-  col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1);
-  row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1);
-  col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1);
-  row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1);
+  av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
+
+  ms_buffers->wsrc = x->wsrc_buf;
+  ms_buffers->obmc_mask = x->mask_buf;
+}
+
+void av1_make_default_fullpel_ms_params(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+    const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+    const search_site_config *search_sites) {
+  // High level params
+  ms_params->bsize = bsize;
+  ms_params->vfp = &cpi->fn_ptr[bsize];
+
+  init_ms_buffers(&ms_params->ms_buffers, x);
+
+  ms_params->search_method = cpi->sf.mv_sf.search_method;
+  ms_params->search_sites = search_sites;
+
+  ms_params->mesh_patterns[0] = cpi->sf.mv_sf.mesh_patterns;
+  ms_params->mesh_patterns[1] = cpi->sf.mv_sf.intrabc_mesh_patterns;
+  ms_params->force_mesh_thresh = cpi->sf.mv_sf.exhaustive_searches_thresh;
+  ms_params->prune_mesh_search = cpi->sf.mv_sf.prune_mesh_search;
+  ms_params->run_mesh_search = 0;
+
+  ms_params->is_intra_mode = 0;
+
+  ms_params->fast_obmc_search = cpi->sf.mv_sf.obmc_full_pixel_search_level;
+
+  ms_params->mv_limits = x->mv_limits;
+  av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
+
+  // Mvcost params
+  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+}
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                       const struct AV1_COMP *cpi,
+                                       const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                       const MV *ref_mv, const int *cost_list) {
+  const AV1_COMMON *cm = &cpi->common;
+  // High level params
+  ms_params->allow_hp = cm->features.allow_high_precision_mv;
+  ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop;
+  ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step;
+  ms_params->cost_list = cond_cost_list_const(cpi, cost_list);
+
+  av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
+
+  // Mvcost params
+  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+
+  // Subpel variance params
+  ms_params->var_params.vfp = &cpi->fn_ptr[bsize];
+  ms_params->var_params.subpel_search_type =
+      cpi->sf.mv_sf.use_accurate_subpel_search;
+  ms_params->var_params.w = block_size_wide[bsize];
+  ms_params->var_params.h = block_size_high[bsize];
+
+  // Ref and src buffers
+  MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers;
+  init_ms_buffers(ms_buffers, x);
+}
+
+static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) {
+  return mv->row * stride + mv->col;
+}
+
+static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf,
+                                                 const FULLPEL_MV *mv) {
+  return &buf->buf[get_offset_from_fullmv(mv, buf->stride)];
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) {
+  int col_min =
+      GET_MV_RAWPEL(mv->col) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+  int row_min =
+      GET_MV_RAWPEL(mv->row) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+  int col_max = GET_MV_RAWPEL(mv->col) + MAX_FULL_PEL_VAL;
+  int row_max = GET_MV_RAWPEL(mv->row) + MAX_FULL_PEL_VAL;
+
+  col_min = AOMMAX(col_min, GET_MV_RAWPEL(MV_LOW) + 1);
+  row_min = AOMMAX(row_min, GET_MV_RAWPEL(MV_LOW) + 1);
+  col_max = AOMMIN(col_max, GET_MV_RAWPEL(MV_UPP) - 1);
+  row_max = AOMMIN(row_max, GET_MV_RAWPEL(MV_UPP) - 1);
 
   // Get intersection of UMV window and valid MV window to reduce # of checks
   // in diamond search.
@@ -59,21 +142,6 @@
   if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
 }
 
-static void set_subpel_mv_search_range(const MvLimits *mv_limits, int *col_min,
-                                       int *col_max, int *row_min, int *row_max,
-                                       const MV *ref_mv) {
-  const int max_mv = MAX_FULL_PEL_VAL * 8;
-  const int minc = AOMMAX(mv_limits->col_min * 8, ref_mv->col - max_mv);
-  const int maxc = AOMMIN(mv_limits->col_max * 8, ref_mv->col + max_mv);
-  const int minr = AOMMAX(mv_limits->row_min * 8, ref_mv->row - max_mv);
-  const int maxr = AOMMIN(mv_limits->row_max * 8, ref_mv->row + max_mv);
-
-  *col_min = AOMMAX(MV_LOW + 1, minc);
-  *col_max = AOMMIN(MV_UPP - 1, maxc);
-  *row_min = AOMMAX(MV_LOW + 1, minr);
-  *row_max = AOMMIN(MV_UPP - 1, maxr);
-}
-
 int av1_init_search_range(int size) {
   int sr = 0;
   // Minimum search size no matter what the passed in value.
@@ -85,912 +153,242 @@
   return sr;
 }
 
+// ============================================================================
+//  Cost of motion vectors
+// ============================================================================
+// TODO(any): Adaptively adjust the regularization strength based on image size
+// and motion activity instead of using hard-coded values. It seems like we
+// roughly half the lambda for each increase in resolution
+// These are multiplier used to perform regularization in motion compensation
+// when x->mv_cost_type is set to MV_COST_L1.
+// LOWRES
+#define SSE_LAMBDA_LOWRES 2   // Used by mv_cost_err_fn
+#define SAD_LAMBDA_LOWRES 32  // Used by mvsad_err_cost during full pixel search
+// MIDRES
+#define SSE_LAMBDA_MIDRES 0   // Used by mv_cost_err_fn
+#define SAD_LAMBDA_MIDRES 15  // Used by mvsad_err_cost during full pixel search
+// HDRES
+#define SSE_LAMBDA_HDRES 1  // Used by mv_cost_err_fn
+#define SAD_LAMBDA_HDRES 8  // Used by mvsad_err_cost during full pixel search
+
+// Returns the rate of encoding the current motion vector based on the
+// joint_cost and comp_cost. joint_costs covers the cost of transmitting
+// JOINT_MV, and comp_cost covers the cost of transmitting the actual motion
+// vector.
 static INLINE int mv_cost(const MV *mv, const int *joint_cost,
-                          int *const comp_cost[2]) {
+                          const int *const comp_cost[2]) {
   return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
          comp_cost[1][mv->col];
 }
 
-int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+#define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr))
+// Returns the cost of encoding the motion vector diff := *mv - *ref. The cost
+// is defined as the rate required to encode diff * weight, rounded to the
+// nearest 2 ** 7.
+// This is NOT used during motion compensation.
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
                     int *mvcost[2], int weight) {
-  const MV diff = { mv->row - ref->row, mv->col - ref->col };
-  return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
-}
-
-#define PIXEL_TRANSFORM_ERROR_SCALE 4
-static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
-                       int *mvcost[2], int error_per_bit) {
-  if (mvcost) {
-    const MV diff = { mv->row - ref->row, mv->col - ref->col };
-    return (int)ROUND_POWER_OF_TWO_64(
-        (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
-        RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
-            PIXEL_TRANSFORM_ERROR_SCALE);
-  }
-  return 0;
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
-                          int sad_per_bit) {
-  const MV diff = { (mv->row - ref->row) * 8, (mv->col - ref->col) * 8 };
+  const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
   return ROUND_POWER_OF_TWO(
-      (unsigned)mv_cost(&diff, x->nmv_vec_cost, x->mv_cost_stack) * sad_per_bit,
-      AV1_PROB_COST_SHIFT);
+      mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7);
 }
 
+// Returns the cost of using the current mv during the motion search. This is
+// used when var is used as the error metric.
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv,
+                              const int *mvjcost, const int *const mvcost[2],
+                              int error_per_bit, MV_COST_TYPE mv_cost_type) {
+  const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
+  const MV abs_diff = { abs(diff.row), abs(diff.col) };
+
+  switch (mv_cost_type) {
+    case MV_COST_ENTROPY:
+      if (mvcost) {
+        return (int)ROUND_POWER_OF_TWO_64(
+            (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+            RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
+                PIXEL_TRANSFORM_ERROR_SCALE);
+      }
+      return 0;
+    case MV_COST_L1_LOWRES:
+      return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3;
+    case MV_COST_L1_MIDRES:
+      return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3;
+    case MV_COST_L1_HDRES:
+      return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3;
+    case MV_COST_NONE: return 0;
+    default: assert(0 && "Invalid rd_cost_type"); return 0;
+  }
+}
+
+static INLINE int mv_err_cost_(const MV *mv,
+                               const MV_COST_PARAMS *mv_cost_params) {
+  return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost,
+                     mv_cost_params->mvcost, mv_cost_params->error_per_bit,
+                     mv_cost_params->mv_cost_type);
+}
+
+// Returns the cost of using the current mv during the motion search. This is
+// only used during full pixel motion search when sad is used as the error
+// metric
+static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv,
+                                 const int *mvjcost, const int *const mvcost[2],
+                                 int sad_per_bit, MV_COST_TYPE mv_cost_type) {
+  const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row),
+                    GET_MV_SUBPEL(mv->col - ref_mv->col) };
+
+  switch (mv_cost_type) {
+    case MV_COST_ENTROPY:
+      return ROUND_POWER_OF_TWO(
+          (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) *
+              sad_per_bit,
+          AV1_PROB_COST_SHIFT);
+    case MV_COST_L1_LOWRES:
+      return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3;
+    case MV_COST_L1_MIDRES:
+      return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3;
+    case MV_COST_L1_HDRES:
+      return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3;
+    case MV_COST_NONE: return 0;
+    default: assert(0 && "Invalid rd_cost_type"); return 0;
+  }
+}
+
+static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv,
+                                  const MV_COST_PARAMS *mv_cost_params) {
+  return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv,
+                        mv_cost_params->mvjcost, mv_cost_params->mvcost,
+                        mv_cost_params->sad_per_bit,
+                        mv_cost_params->mv_cost_type);
+}
+
+// =============================================================================
+//  Fullpixel Motion Search: Translational
+// =============================================================================
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
+
 void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
-  int len, ss_count = 1;
+  int ss_count = 0;
+  int stage_index = MAX_MVSEARCH_STEPS - 1;
 
-  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
-  cfg->ss[0].offset = 0;
+  cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0;
+  cfg->ss[stage_index][0].offset = 0;
+  cfg->stride = stride;
 
-  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
-    // Generate offsets for 4 search sites per step.
-    const MV ss_mvs[] = { { -len, 0 }, { len, 0 }, { 0, -len }, { 0, len } };
+  for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
+    int num_search_pts = 8;
+
+    const FULLPEL_MV ss_mvs[13] = {
+      { 0, 0 },           { -radius, 0 },      { radius, 0 },
+      { 0, -radius },     { 0, radius },       { -radius, -radius },
+      { radius, radius }, { -radius, radius }, { radius, -radius },
+    };
+
     int i;
-    for (i = 0; i < 4; ++i) {
-      search_site *const ss = &cfg->ss[ss_count++];
+    for (i = 0; i <= num_search_pts; ++i) {
+      search_site *const ss = &cfg->ss[stage_index][i];
       ss->mv = ss_mvs[i];
-      ss->offset = ss->mv.row * stride + ss->mv.col;
+      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
     }
+    cfg->searches_per_step[stage_index] = num_search_pts;
+    cfg->radius[stage_index] = radius;
+    --stage_index;
+    ++ss_count;
   }
-
   cfg->ss_count = ss_count;
-  cfg->searches_per_step = 4;
+}
+
+void av1_init_motion_fpf(search_site_config *cfg, int stride) {
+  int ss_count = 0;
+  int stage_index = MAX_MVSEARCH_STEPS - 1;
+
+  cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0;
+  cfg->ss[stage_index][0].offset = 0;
+  cfg->stride = stride;
+
+  for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
+    // Generate offsets for 8 search sites per step.
+    int tan_radius = AOMMAX((int)(0.41 * radius), 1);
+    int num_search_pts = 12;
+    if (radius == 1) num_search_pts = 8;
+
+    const FULLPEL_MV ss_mvs[13] = {
+      { 0, 0 },
+      { -radius, 0 },
+      { radius, 0 },
+      { 0, -radius },
+      { 0, radius },
+      { -radius, -tan_radius },
+      { radius, tan_radius },
+      { -tan_radius, radius },
+      { tan_radius, -radius },
+      { -radius, tan_radius },
+      { radius, -tan_radius },
+      { tan_radius, radius },
+      { -tan_radius, -radius },
+    };
+
+    int i;
+    for (i = 0; i <= num_search_pts; ++i) {
+      search_site *const ss = &cfg->ss[stage_index][i];
+      ss->mv = ss_mvs[i];
+      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
+    }
+    cfg->searches_per_step[stage_index] = num_search_pts;
+    cfg->radius[stage_index] = radius;
+    --stage_index;
+    ++ss_count;
+  }
+  cfg->ss_count = ss_count;
 }
 
 void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
-  int len, ss_count = 1;
+  int ss_count = 0;
+  int stage_index = 0;
+  cfg->stride = stride;
+  int radius = 1;
+  for (stage_index = 0; stage_index < 15; ++stage_index) {
+    int tan_radius = AOMMAX((int)(0.41 * radius), 1);
+    int num_search_pts = 12;
+    if (radius <= 5) {
+      tan_radius = radius;
+      num_search_pts = 8;
+    }
+    const FULLPEL_MV ss_mvs[13] = {
+      { 0, 0 },
+      { -radius, 0 },
+      { radius, 0 },
+      { 0, -radius },
+      { 0, radius },
+      { -radius, -tan_radius },
+      { radius, tan_radius },
+      { -tan_radius, radius },
+      { tan_radius, -radius },
+      { -radius, tan_radius },
+      { radius, -tan_radius },
+      { tan_radius, radius },
+      { -tan_radius, -radius },
+    };
 
-  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
-  cfg->ss[0].offset = 0;
-
-  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
-    // Generate offsets for 8 search sites per step.
-    const MV ss_mvs[8] = { { -len, 0 },   { len, 0 },     { 0, -len },
-                           { 0, len },    { -len, -len }, { -len, len },
-                           { len, -len }, { len, len } };
-    int i;
-    for (i = 0; i < 8; ++i) {
-      search_site *const ss = &cfg->ss[ss_count++];
+    for (int i = 0; i <= num_search_pts; ++i) {
+      search_site *const ss = &cfg->ss[stage_index][i];
       ss->mv = ss_mvs[i];
-      ss->offset = ss->mv.row * stride + ss->mv.col;
+      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
     }
+    cfg->searches_per_step[stage_index] = num_search_pts;
+    cfg->radius[stage_index] = radius;
+    ++ss_count;
+    if (stage_index < 12)
+      radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1);
   }
-
   cfg->ss_count = ss_count;
-  cfg->searches_per_step = 8;
 }
 
-/*
- * To avoid the penalty for crossing cache-line read, preload the reference
- * area in a small buffer, which is aligned to make sure there won't be crossing
- * cache-line read while reading from this buffer. This reduced the cpu
- * cycles spent on reading ref data in sub-pixel filter functions.
- * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
- * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
- * could reduce the area.
- */
-
-// convert motion vector component to offset for sv[a]f calc
-static INLINE int sp(int x) { return x & 7; }
-
-static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
-  const int offset = (r >> 3) * stride + (c >> 3);
-  return buf + offset;
-}
-
-/* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                                             \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
-    MV this_mv = { r, c };                                                \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);    \
-    if (second_pred == NULL) {                                            \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r),  \
-                         src_address, src_stride, &sse);                  \
-    } else if (mask) {                                                    \
-      thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
-                          src_address, src_stride, second_pred, mask,     \
-                          mask_stride, invert_mask, &sse);                \
-    } else {                                                              \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
-                          src_address, src_stride, &sse, second_pred);    \
-    }                                                                     \
-    v += thismse;                                                         \
-    if (v < besterr) {                                                    \
-      besterr = v;                                                        \
-      br = r;                                                             \
-      bc = c;                                                             \
-      *distortion = thismse;                                              \
-      *sse1 = sse;                                                        \
-    }                                                                     \
-  } else {                                                                \
-    v = INT_MAX;                                                          \
-  }
-
-#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
-
-/* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER1(v, r, c)                                             \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                  \
-    MV this_mv = { r, c };                                                 \
-    thismse = upsampled_pref_error(                                        \
-        xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,    \
-        pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \
-        mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);     \
-    v += thismse;                                                          \
-    if (v < besterr) {                                                     \
-      besterr = v;                                                         \
-      br = r;                                                              \
-      bc = c;                                                              \
-      *distortion = thismse;                                               \
-      *sse1 = sse;                                                         \
-    }                                                                      \
-  } else {                                                                 \
-    v = INT_MAX;                                                           \
-  }
-
-#define FIRST_LEVEL_CHECKS                                       \
-  {                                                              \
-    unsigned int left, right, up, down, diag;                    \
-    CHECK_BETTER(left, tr, tc - hstep);                          \
-    CHECK_BETTER(right, tr, tc + hstep);                         \
-    CHECK_BETTER(up, tr - hstep, tc);                            \
-    CHECK_BETTER(down, tr + hstep, tc);                          \
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);     \
-    switch (whichdir) {                                          \
-      case 0: CHECK_BETTER(diag, tr - hstep, tc - hstep); break; \
-      case 1: CHECK_BETTER(diag, tr - hstep, tc + hstep); break; \
-      case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
-      case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
-    }                                                            \
-  }
-
-#define SECOND_LEVEL_CHECKS                                       \
-  {                                                               \
-    int kr, kc;                                                   \
-    unsigned int second;                                          \
-    if (tr != br && tc != bc) {                                   \
-      kr = br - tr;                                               \
-      kc = bc - tc;                                               \
-      CHECK_BETTER(second, tr + kr, tc + 2 * kc);                 \
-      CHECK_BETTER(second, tr + 2 * kr, tc + kc);                 \
-    } else if (tr == br && tc != bc) {                            \
-      kc = bc - tc;                                               \
-      CHECK_BETTER(second, tr + hstep, tc + 2 * kc);              \
-      CHECK_BETTER(second, tr - hstep, tc + 2 * kc);              \
-      switch (whichdir) {                                         \
-        case 0:                                                   \
-        case 1: CHECK_BETTER(second, tr + hstep, tc + kc); break; \
-        case 2:                                                   \
-        case 3: CHECK_BETTER(second, tr - hstep, tc + kc); break; \
-      }                                                           \
-    } else if (tr != br && tc == bc) {                            \
-      kr = br - tr;                                               \
-      CHECK_BETTER(second, tr + 2 * kr, tc + hstep);              \
-      CHECK_BETTER(second, tr + 2 * kr, tc - hstep);              \
-      switch (whichdir) {                                         \
-        case 0:                                                   \
-        case 2: CHECK_BETTER(second, tr + kr, tc + hstep); break; \
-        case 1:                                                   \
-        case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
-      }                                                           \
-    }                                                             \
-  }
-
-// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of
-// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
-// later in the same way.
-#define SECOND_LEVEL_CHECKS_BEST(k)                \
-  {                                                \
-    unsigned int second;                           \
-    int br0 = br;                                  \
-    int bc0 = bc;                                  \
-    assert(tr == br || tc == bc);                  \
-    if (tr == br && tc != bc) {                    \
-      kc = bc - tc;                                \
-    } else if (tr != br && tc == bc) {             \
-      kr = br - tr;                                \
-    }                                              \
-    CHECK_BETTER##k(second, br0 + kr, bc0);        \
-    CHECK_BETTER##k(second, br0, bc0 + kc);        \
-    if (br0 != br || bc0 != bc) {                  \
-      CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \
-    }                                              \
-  }
-
-#define SETUP_SUBPEL_SEARCH                                             \
-  const uint8_t *const src_address = x->plane[0].src.buf;               \
-  const int src_stride = x->plane[0].src.stride;                        \
-  const MACROBLOCKD *xd = &x->e_mbd;                                    \
-  unsigned int besterr = INT_MAX;                                       \
-  unsigned int sse;                                                     \
-  unsigned int whichdir;                                                \
-  int thismse;                                                          \
-  MV *bestmv = &x->best_mv.as_mv;                                       \
-  const unsigned int halfiters = iters_per_step;                        \
-  const unsigned int quarteriters = iters_per_step;                     \
-  const unsigned int eighthiters = iters_per_step;                      \
-  const int y_stride = xd->plane[0].pre[0].stride;                      \
-  const int offset = bestmv->row * y_stride + bestmv->col;              \
-  const uint8_t *const y = xd->plane[0].pre[0].buf;                     \
-                                                                        \
-  int br = bestmv->row * 8;                                             \
-  int bc = bestmv->col * 8;                                             \
-  int hstep = 4;                                                        \
-  int minc, maxc, minr, maxr;                                           \
-  int tr = br;                                                          \
-  int tc = bc;                                                          \
-                                                                        \
-  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \
-                             ref_mv);                                   \
-                                                                        \
-  bestmv->row *= 8;                                                     \
-  bestmv->col *= 8;
-
-static unsigned int setup_center_error(
-    const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
-    int error_per_bit, const aom_variance_fn_ptr_t *vfp,
-    const uint8_t *const src, const int src_stride, const uint8_t *const y,
-    int y_stride, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost,
-    int *mvcost[2], unsigned int *sse1, int *distortion) {
-  unsigned int besterr;
-  if (second_pred != NULL) {
-    if (is_cur_buf_hbd(xd)) {
-      DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
-      uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
-      if (mask) {
-        aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y + offset,
-                                  y_stride, mask, mask_stride, invert_mask);
-      } else {
-        aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
-                                 y_stride);
-      }
-      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
-    } else {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
-      if (mask) {
-        aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride,
-                           mask, mask_stride, invert_mask);
-      } else {
-        aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-      }
-      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
-    }
-  } else {
-    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
-  }
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-  return besterr;
-}
-
-static INLINE int divide_and_round(int n, int d) {
-  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
-}
-
-static INLINE int is_cost_list_wellbehaved(int *cost_list) {
-  return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
-         cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
-}
-
-// Returns surface minima estimate at given precision in 1/2^n bits.
-// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
-// For a given set of costs S0, S1, S2, S3, S4 at points
-// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
-// the solution for the location of the minima (x0, y0) is given by:
-// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
-// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
-// The code below is an integerized version of that.
-static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
-  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
-                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
-  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
-                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
-}
-
-int av1_find_best_sub_pixel_tree_pruned_evenmore(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
-  SETUP_SUBPEL_SEARCH;
-  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               src_address, src_stride, y, y_stride,
-                               second_pred, mask, mask_stride, invert_mask, w,
-                               h, offset, mvjcost, mvcost, sse1, distortion);
-  (void)halfiters;
-  (void)quarteriters;
-  (void)eighthiters;
-  (void)whichdir;
-  (void)allow_hp;
-  (void)forced_stop;
-  (void)hstep;
-  (void)use_accurate_subpel_search;
-  (void)cm;
-  (void)mi_row;
-  (void)mi_col;
-  (void)do_reset_fractional_mv;
-
-  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
-      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
-      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
-    int ir, ic;
-    unsigned int minpt;
-    get_cost_surf_min(cost_list, &ir, &ic, 2);
-    if (ir != 0 || ic != 0) {
-      CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
-    }
-  } else {
-    FIRST_LEVEL_CHECKS;
-    if (halfiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-
-    tr = br;
-    tc = bc;
-
-    // Each subsequent iteration checks at least one point in common with
-    // the last iteration could be 2 ( if diag selected) 1/4 pel
-    // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-    if (forced_stop != 2) {
-      hstep >>= 1;
-      FIRST_LEVEL_CHECKS;
-      if (quarteriters > 1) {
-        SECOND_LEVEL_CHECKS;
-      }
-    }
-  }
-
-  tr = br;
-  tc = bc;
-
-  if (allow_hp && forced_stop == 0) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (eighthiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-  }
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  return besterr;
-}
-
-int av1_find_best_sub_pixel_tree_pruned_more(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
-  SETUP_SUBPEL_SEARCH;
-  (void)use_accurate_subpel_search;
-  (void)cm;
-  (void)mi_row;
-  (void)mi_col;
-  (void)do_reset_fractional_mv;
-
-  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               src_address, src_stride, y, y_stride,
-                               second_pred, mask, mask_stride, invert_mask, w,
-                               h, offset, mvjcost, mvcost, sse1, distortion);
-  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
-      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
-      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
-    unsigned int minpt;
-    int ir, ic;
-    get_cost_surf_min(cost_list, &ir, &ic, 1);
-    if (ir != 0 || ic != 0) {
-      CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
-    }
-  } else {
-    FIRST_LEVEL_CHECKS;
-    if (halfiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-  }
-
-  // Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
-
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    tr = br;
-    tc = bc;
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (quarteriters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-  }
-
-  if (allow_hp && forced_stop == 0) {
-    tr = br;
-    tc = bc;
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (eighthiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-  }
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void)tr;
-  (void)tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  return besterr;
-}
-
-int av1_find_best_sub_pixel_tree_pruned(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
-  SETUP_SUBPEL_SEARCH;
-  (void)use_accurate_subpel_search;
-  (void)cm;
-  (void)mi_row;
-  (void)mi_col;
-  (void)do_reset_fractional_mv;
-
-  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               src_address, src_stride, y, y_stride,
-                               second_pred, mask, mask_stride, invert_mask, w,
-                               h, offset, mvjcost, mvcost, sse1, distortion);
-  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
-      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
-      cost_list[4] != INT_MAX) {
-    unsigned int left, right, up, down, diag;
-    whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
-               (cost_list[2] < cost_list[4] ? 0 : 2);
-    switch (whichdir) {
-      case 0:
-        CHECK_BETTER(left, tr, tc - hstep);
-        CHECK_BETTER(down, tr + hstep, tc);
-        CHECK_BETTER(diag, tr + hstep, tc - hstep);
-        break;
-      case 1:
-        CHECK_BETTER(right, tr, tc + hstep);
-        CHECK_BETTER(down, tr + hstep, tc);
-        CHECK_BETTER(diag, tr + hstep, tc + hstep);
-        break;
-      case 2:
-        CHECK_BETTER(left, tr, tc - hstep);
-        CHECK_BETTER(up, tr - hstep, tc);
-        CHECK_BETTER(diag, tr - hstep, tc - hstep);
-        break;
-      case 3:
-        CHECK_BETTER(right, tr, tc + hstep);
-        CHECK_BETTER(up, tr - hstep, tc);
-        CHECK_BETTER(diag, tr - hstep, tc + hstep);
-        break;
-    }
-  } else {
-    FIRST_LEVEL_CHECKS;
-    if (halfiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-  }
-
-  tr = br;
-  tc = bc;
-
-  // Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
-
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (quarteriters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-    tr = br;
-    tc = bc;
-  }
-
-  if (allow_hp && forced_stop == 0) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (eighthiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-    tr = br;
-    tc = bc;
-  }
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void)tr;
-  (void)tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  return besterr;
-}
-
-/* clang-format off */
-static const MV search_step_table[12] = {
-  // left, right, up, down
-  { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 },
-  { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 },
-  { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
-};
-/* clang-format on */
-
-static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                int mi_row, int mi_col, const MV *const mv,
-                                const aom_variance_fn_ptr_t *vfp,
-                                const uint8_t *const src, const int src_stride,
-                                const uint8_t *const y, int y_stride,
-                                int subpel_x_q3, int subpel_y_q3,
-                                const uint8_t *second_pred, const uint8_t *mask,
-                                int mask_stride, int invert_mask, int w, int h,
-                                unsigned int *sse, int subpel_search) {
-  unsigned int besterr;
-  if (is_cur_buf_hbd(xd)) {
-    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
-    if (second_pred != NULL) {
-      if (mask) {
-        aom_highbd_comp_mask_upsampled_pred(
-            xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
-            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd,
-            subpel_search);
-      } else {
-        aom_highbd_comp_avg_upsampled_pred(
-            xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
-            subpel_y_q3, y, y_stride, xd->bd, subpel_search);
-      }
-    } else {
-      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
-                                subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
-                                subpel_search);
-    }
-    besterr = vfp->vf(pred8, w, src, src_stride, sse);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-    if (second_pred != NULL) {
-      if (mask) {
-        aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
-                                     second_pred, w, h, subpel_x_q3,
-                                     subpel_y_q3, y, y_stride, mask,
-                                     mask_stride, invert_mask, subpel_search);
-      } else {
-        aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
-                                    second_pred, w, h, subpel_x_q3, subpel_y_q3,
-                                    y, y_stride, subpel_search);
-      }
-    } else {
-      aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
-                         subpel_y_q3, y, y_stride, subpel_search);
-    }
-
-    besterr = vfp->vf(pred, w, src, src_stride, sse);
-  }
-  return besterr;
-}
-
-static unsigned int upsampled_setup_center_error(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *bestmv, const MV *ref_mv, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
-    const int src_stride, const uint8_t *const y, int y_stride,
-    const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
-    int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2],
-    unsigned int *sse1, int *distortion, int subpel_search) {
-  unsigned int besterr =
-      upsampled_pref_error(xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride,
-                           y + offset, y_stride, 0, 0, second_pred, mask,
-                           mask_stride, invert_mask, w, h, sse1, subpel_search);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-  return besterr;
-}
-
-// when use_accurate_subpel_search == 0
-static INLINE unsigned int estimate_upsampled_pref_error(
-    const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
-    const int src_stride, const uint8_t *const pre, int y_stride,
-    int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred,
-    const uint8_t *mask, int mask_stride, int invert_mask, unsigned int *sse) {
-  if (second_pred == NULL) {
-    return vfp->svf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
-                    sse);
-  } else if (mask) {
-    return vfp->msvf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
-                     second_pred, mask, mask_stride, invert_mask, sse);
-  } else {
-    return vfp->svaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
-                     sse, second_pred);
-  }
-}
-
-int av1_find_best_sub_pixel_tree(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
-  const uint8_t *const src_address = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int thismse;
-  const int y_stride = xd->plane[0].pre[0].stride;
-  MV *bestmv = &x->best_mv.as_mv;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  int iter, round = 3 - forced_stop;
-  int tr = br;
-  int tc = bc;
-  const MV *search_step = search_step_table;
-  int idx, best_idx = -1;
-  unsigned int cost_array[5];
-  int kr, kc;
-  int minc, maxc, minr, maxr;
-
-  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv);
-
-  if (!allow_hp)
-    if (round == 3) round = 2;
-
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-
-  if (use_accurate_subpel_search)
-    besterr = upsampled_setup_center_error(
-        xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address,
-        src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w,
-        h, offset, mvjcost, mvcost, sse1, distortion,
-        use_accurate_subpel_search);
-  else
-    besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                                 src_address, src_stride, y, y_stride,
-                                 second_pred, mask, mask_stride, invert_mask, w,
-                                 h, offset, mvjcost, mvcost, sse1, distortion);
-
-  (void)cost_list;  // to silence compiler warning
-
-  if (do_reset_fractional_mv) {
-    av1_set_fractional_mv(x->fractional_best_mv);
-  }
-
-  for (iter = 0; iter < round; ++iter) {
-    if ((x->fractional_best_mv[iter].as_mv.row == br) &&
-        (x->fractional_best_mv[iter].as_mv.col == bc))
-      return INT_MAX;
-    x->fractional_best_mv[iter].as_mv.row = br;
-    x->fractional_best_mv[iter].as_mv.col = bc;
-    // Check vertical and horizontal sub-pixel positions.
-    for (idx = 0; idx < 4; ++idx) {
-      tr = br + search_step[idx].row;
-      tc = bc + search_step[idx].col;
-      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-        MV this_mv = { tr, tc };
-
-        if (use_accurate_subpel_search) {
-          thismse = upsampled_pref_error(
-              xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
-              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
-              mask, mask_stride, invert_mask, w, h, &sse,
-              use_accurate_subpel_search);
-        } else {
-          thismse = estimate_upsampled_pref_error(
-              vfp, src_address, src_stride, pre(y, y_stride, tr, tc), y_stride,
-              sp(tc), sp(tr), second_pred, mask, mask_stride, invert_mask,
-              &sse);
-        }
-
-        cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
-                                                mvcost, error_per_bit);
-
-        if (cost_array[idx] < besterr) {
-          best_idx = idx;
-          besterr = cost_array[idx];
-          *distortion = thismse;
-          *sse1 = sse;
-        }
-      } else {
-        cost_array[idx] = INT_MAX;
-      }
-    }
-
-    // Check diagonal sub-pixel position
-    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
-    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
-
-    tc = bc + kc;
-    tr = br + kr;
-    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-      MV this_mv = { tr, tc };
-
-      if (use_accurate_subpel_search) {
-        thismse = upsampled_pref_error(
-            xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
-            pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
-            mask, mask_stride, invert_mask, w, h, &sse,
-            use_accurate_subpel_search);
-      } else {
-        thismse = estimate_upsampled_pref_error(
-            vfp, src_address, src_stride, pre(y, y_stride, tr, tc), y_stride,
-            sp(tc), sp(tr), second_pred, mask, mask_stride, invert_mask, &sse);
-      }
-
-      cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
-                                            error_per_bit);
-
-      if (cost_array[4] < besterr) {
-        best_idx = 4;
-        besterr = cost_array[4];
-        *distortion = thismse;
-        *sse1 = sse;
-      }
-    } else {
-      cost_array[idx] = INT_MAX;
-    }
-
-    if (best_idx < 4 && best_idx >= 0) {
-      br += search_step[best_idx].row;
-      bc += search_step[best_idx].col;
-    } else if (best_idx == 4) {
-      br = tr;
-      bc = tc;
-    }
-
-    if (iters_per_step > 1 && best_idx != -1) {
-      if (use_accurate_subpel_search) {
-        SECOND_LEVEL_CHECKS_BEST(1);
-      } else {
-        SECOND_LEVEL_CHECKS_BEST(0);
-      }
-    }
-
-    search_step += 4;
-    hstep >>= 1;
-    best_idx = -1;
-  }
-
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void)tr;
-  (void)tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  return besterr;
-}
-
-#undef PRE
-#undef CHECK_BETTER
-
-unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
-                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                     const MV *this_mv) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const uint8_t *const src = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  uint8_t *const dst = xd->plane[0].dst.buf;
-  const int dst_stride = xd->plane[0].dst.stride;
-  const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize];
-  const int_mv ref_mv = av1_get_ref_mv(x, 0);
-  unsigned int mse;
-  unsigned int sse;
-
-  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                AOM_PLANE_Y, AOM_PLANE_Y);
-  mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
-  mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost, x->mv_cost_stack,
-                     x->errorperbit);
-  return mse;
-}
-
-// Refine MV in a small range
-unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
-                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                  int *pts0, int *pts_inref0,
-                                  int total_samples) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
-                            { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } };
-  const int_mv ref_mv = av1_get_ref_mv(x, 0);
-  int16_t br = mbmi->mv[0].as_mv.row;
-  int16_t bc = mbmi->mv[0].as_mv.col;
-  int16_t *tr = &mbmi->mv[0].as_mv.row;
-  int16_t *tc = &mbmi->mv[0].as_mv.col;
-  WarpedMotionParams best_wm_params = mbmi->wm_params;
-  int best_num_proj_ref = mbmi->num_proj_ref;
-  unsigned int bestmse;
-  int minc, maxc, minr, maxr;
-  const int start = cm->allow_high_precision_mv ? 0 : 4;
-  int ite;
-
-  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
-                             &ref_mv.as_mv);
-
-  // Calculate the center position's error
-  assert(bc >= minc && bc <= maxc && br >= minr && br <= maxr);
-  bestmse = av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col,
-                                    &mbmi->mv[0].as_mv);
-
-  // MV search
-  for (ite = 0; ite < 2; ++ite) {
-    int best_idx = -1;
-    int idx;
-
-    for (idx = start; idx < start + 4; ++idx) {
-      unsigned int thismse;
-
-      *tr = br + neighbors[idx].row;
-      *tc = bc + neighbors[idx].col;
-
-      if (*tc >= minc && *tc <= maxc && *tr >= minr && *tr <= maxr) {
-        MV this_mv = { *tr, *tc };
-        int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-
-        memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
-        memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
-        if (total_samples > 1)
-          mbmi->num_proj_ref =
-              selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
-
-        if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr,
-                             *tc, &mbmi->wm_params, mi_row, mi_col)) {
-          thismse =
-              av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv);
-
-          if (thismse < bestmse) {
-            best_idx = idx;
-            best_wm_params = mbmi->wm_params;
-            best_num_proj_ref = mbmi->num_proj_ref;
-            bestmse = thismse;
-          }
-        }
-      }
-    }
-
-    if (best_idx == -1) break;
-
-    if (best_idx >= 0) {
-      br += neighbors[best_idx].row;
-      bc += neighbors[best_idx].col;
-    }
-  }
-
-  *tr = br;
-  *tc = bc;
-  mbmi->wm_params = best_wm_params;
-  mbmi->num_proj_ref = best_num_proj_ref;
-  return bestmse;
-}
-
-static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
+// Checks whether the mv is within range of the mv_limits
+static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col,
                                int range) {
   return ((row - range) >= mv_limits->row_min) &
          ((row + range) <= mv_limits->row_max) &
@@ -998,158 +396,240 @@
          ((col + range) <= mv_limits->col_max);
 }
 
-static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
-  return (mv->col >= mv_limits->col_min) && (mv->col <= mv_limits->col_max) &&
-         (mv->row >= mv_limits->row_min) && (mv->row <= mv_limits->row_max);
+static INLINE int get_mvpred_var_cost(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const MV sub_this_mv = get_mv_from_fullmv(this_mv);
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const uint8_t *src_buf = src->buf;
+  const int src_stride = src->stride;
+  const int ref_stride = ref->stride;
+
+  unsigned unused;
+  int bestsme;
+
+  bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
+                    ref_stride, &unused);
+
+  bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+
+  return bestsme;
 }
 
-#define CHECK_BETTER                                                      \
-  {                                                                       \
-    if (thissad < bestsad) {                                              \
-      if (use_mvcost)                                                     \
-        thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); \
-      if (thissad < bestsad) {                                            \
-        bestsad = thissad;                                                \
-        best_site = i;                                                    \
-      }                                                                   \
-    }                                                                     \
+static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 const struct buf_2d *const src,
+                                 const uint8_t *const ref_address,
+                                 const int ref_stride) {
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const uint8_t *src_buf = src->buf;
+  const int src_stride = src->stride;
+
+  return vfp->sdf(src_buf, src_stride, ref_address, ref_stride);
+}
+
+static INLINE int get_mvpred_compound_var_cost(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const uint8_t *src_buf = src->buf;
+  const int src_stride = src->stride;
+  const int ref_stride = ref->stride;
+
+  const uint8_t *mask = ms_params->ms_buffers.mask;
+  const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
+  const int mask_stride = ms_params->ms_buffers.mask_stride;
+  const int invert_mask = ms_params->ms_buffers.inv_mask;
+  unsigned unused;
+  int bestsme;
+
+  if (mask) {
+    bestsme = vfp->msvf(src_buf, src_stride, 0, 0,
+                        get_buf_from_fullmv(ref, this_mv), ref_stride,
+                        second_pred, mask, mask_stride, invert_mask, &unused);
+  } else if (second_pred) {
+    bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+                        src_buf, src_stride, &unused, second_pred);
+  } else {
+    bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
+                      ref_stride, &unused);
   }
 
-#define MAX_PATTERN_SCALES 11
-#define MAX_PATTERN_CANDIDATES 8  // max number of canddiates per scale
-#define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
+  const MV sub_this_mv = get_mv_from_fullmv(this_mv);
+  bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
 
-// Calculate and return a sad+mvcost list around an integer best pel.
-static INLINE void calc_int_cost_list(const MACROBLOCK *x,
-                                      const MV *const ref_mv, int sadpb,
-                                      const aom_variance_fn_ptr_t *fn_ptr,
-                                      const MV *best_mv, int *cost_list) {
-  static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
-  const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
-  const int br = best_mv->row;
-  const int bc = best_mv->col;
-  int i;
-  unsigned int sse;
-  const MV this_mv = { br, bc };
+  return bestsme;
+}
 
-  cost_list[0] =
-      fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv),
-                 in_what->stride, &sse) +
-      mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
-  if (check_bounds(&x->mv_limits, br, bc, 1)) {
-    for (i = 0; i < 4; i++) {
-      const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col };
-      cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
-                                    get_buf_from_mv(in_what, &neighbor_mv),
-                                    in_what->stride, &sse) +
-                         mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmv_vec_cost,
-                                     x->mv_cost_stack, x->errorperbit);
+static INLINE int get_mvpred_compound_sad(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const struct buf_2d *const src, const uint8_t *const ref_address,
+    const int ref_stride) {
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const uint8_t *src_buf = src->buf;
+  const int src_stride = src->stride;
+
+  const uint8_t *mask = ms_params->ms_buffers.mask;
+  const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
+  const int mask_stride = ms_params->ms_buffers.mask_stride;
+  const int invert_mask = ms_params->ms_buffers.inv_mask;
+
+  if (mask) {
+    return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred,
+                     mask, mask_stride, invert_mask);
+  } else if (second_pred) {
+    return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred);
+  } else {
+    return vfp->sdf(src_buf, src_stride, ref_address, ref_stride);
+  }
+}
+
+// Calculates and returns a sad+mvcost list around an integer best pel during
+// fullpixel motion search. The resulting list can be used to speed up subpel
+// motion search later.
+#define USE_SAD_COSTLIST 1
+
+// calc_int_cost_list uses var to populate the costlist, which is more accurate
+// than sad but slightly slower.
+static AOM_FORCE_INLINE void calc_int_cost_list(
+    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    int *cost_list) {
+  static const FULLPEL_MV neighbors[4] = {
+    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+  };
+  const int br = best_mv.row;
+  const int bc = best_mv.col;
+
+  cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv);
+
+  if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+    for (int i = 0; i < 4; i++) {
+      const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+                                       bc + neighbors[i].col };
+      cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
     }
   } else {
-    for (i = 0; i < 4; i++) {
-      const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col };
-      if (!is_mv_in(&x->mv_limits, &neighbor_mv))
+    for (int i = 0; i < 4; i++) {
+      const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+                                       bc + neighbors[i].col };
+      if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
         cost_list[i + 1] = INT_MAX;
-      else
-        cost_list[i + 1] =
-            fn_ptr->vf(what->buf, what->stride,
-                       get_buf_from_mv(in_what, &neighbor_mv), in_what->stride,
-                       &sse) +
-            mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmv_vec_cost,
-                        x->mv_cost_stack, x->errorperbit);
+      } else {
+        cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+      }
     }
   }
 }
 
-static INLINE void calc_int_sad_list(const MACROBLOCK *x,
-                                     const MV *const ref_mv, int sadpb,
-                                     const aom_variance_fn_ptr_t *fn_ptr,
-                                     const MV *best_mv, int *cost_list,
-                                     const int use_mvcost, const int bestsad) {
-  static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
-  const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
-  int i;
-  const int br = best_mv->row;
-  const int bc = best_mv->col;
+// calc_int_sad_list uses sad to populate the costlist, which is less accurate
+// than var but faster.
+static AOM_FORCE_INLINE void calc_int_sad_list(
+    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    int *cost_list, int costlist_has_sad) {
+  static const FULLPEL_MV neighbors[4] = {
+    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+  };
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const int ref_stride = ref->stride;
+  const int br = best_mv.row;
+  const int bc = best_mv.col;
 
-  if (cost_list[0] == INT_MAX) {
-    cost_list[0] = bestsad;
-    if (check_bounds(&x->mv_limits, br, bc, 1)) {
-      for (i = 0; i < 4; i++) {
-        const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
-        cost_list[i + 1] =
-            fn_ptr->sdf(what->buf, what->stride,
-                        get_buf_from_mv(in_what, &this_mv), in_what->stride);
+  assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv));
+
+  // Refresh the costlist if it does not contain valid sad
+  if (!costlist_has_sad) {
+    cost_list[0] = get_mvpred_sad(
+        ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride);
+
+    if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+      for (int i = 0; i < 4; i++) {
+        const FULLPEL_MV this_mv = { br + neighbors[i].row,
+                                     bc + neighbors[i].col };
+        cost_list[i + 1] = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
       }
     } else {
-      for (i = 0; i < 4; i++) {
-        const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
-        if (!is_mv_in(&x->mv_limits, &this_mv))
+      for (int i = 0; i < 4; i++) {
+        const FULLPEL_MV this_mv = { br + neighbors[i].row,
+                                     bc + neighbors[i].col };
+        if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
           cost_list[i + 1] = INT_MAX;
-        else
-          cost_list[i + 1] =
-              fn_ptr->sdf(what->buf, what->stride,
-                          get_buf_from_mv(in_what, &this_mv), in_what->stride);
-      }
-    }
-  } else {
-    if (use_mvcost) {
-      for (i = 0; i < 4; i++) {
-        const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
-        if (cost_list[i + 1] != INT_MAX) {
-          cost_list[i + 1] += mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+        } else {
+          cost_list[i + 1] = get_mvpred_sad(
+              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
         }
       }
     }
   }
+
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params);
+
+  for (int idx = 0; idx < 4; idx++) {
+    if (cost_list[idx + 1] != INT_MAX) {
+      const FULLPEL_MV this_mv = { br + neighbors[idx].row,
+                                   bc + neighbors[idx].col };
+      cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params);
+    }
+  }
 }
 
+#define CHECK_BETTER                                                      \
+  if (thissad < bestsad) {                                                \
+    int tmp_thissad = thissad;                                            \
+    if (use_mvcost) thissad += mvsad_err_cost_(&this_mv, mv_cost_params); \
+    if (thissad < bestsad) {                                              \
+      raw_bestsad = tmp_thissad;                                          \
+      bestsad = thissad;                                                  \
+      best_site = i;                                                      \
+    }                                                                     \
+  }
+
 // Generic pattern search function that searches over multiple scales.
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
-//
 static int pattern_search(
-    MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit,
-    int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp,
-    int use_mvcost, const MV *center_mv,
+    FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const int search_param, const int do_init_search,
     const int num_candidates[MAX_PATTERN_SCALES],
-    const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
+    const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES],
+    int *cost_list, FULLPEL_MV *best_mv) {
   static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
   int i, s, t;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const int ref_stride = ref->stride;
   const int last_is_4 = num_candidates[0] == 4;
   int br, bc;
-  int bestsad = INT_MAX;
+  int bestsad = INT_MAX, raw_bestsad = INT_MAX;
   int thissad;
   int k = -1;
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+  const int use_mvcost = ms_params->mv_cost_params.mv_cost_type != MV_COST_NONE;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   assert(search_param < MAX_MVSEARCH_STEPS);
   int best_init_s = search_param_to_steps[search_param];
   // adjust ref_mv to make sure it is within MV range
-  clamp_mv(start_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-           x->mv_limits.row_min, x->mv_limits.row_max);
-  br = start_mv->row;
-  bc = start_mv->col;
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+  br = start_mv.row;
+  bc = start_mv.col;
   if (cost_list != NULL) {
     cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
         INT_MAX;
   }
+  int costlist_has_sad = 0;
 
   // Work out the start point for the search
-  bestsad = vfp->sdf(what->buf, what->stride,
-                     get_buf_from_mv(in_what, start_mv), in_what->stride) +
-            mvsad_err_cost(x, start_mv, &fcenter_mv, sad_per_bit);
+  raw_bestsad = get_mvpred_sad(ms_params, src,
+                               get_buf_from_fullmv(ref, &start_mv), ref_stride);
+  bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params);
 
-  // Search all possible scales upto the search param around the center point
+  // Search all possible scales up to the search param around the center point
   // pick the scale of the point that is best as the starting scale of
   // further steps around it.
   if (do_init_search) {
@@ -1157,23 +637,21 @@
     best_init_s = -1;
     for (t = 0; t <= s; ++t) {
       int best_site = -1;
-      if (check_bounds(&x->mv_limits, br, bc, 1 << t)) {
+      if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) {
         for (i = 0; i < num_candidates[t]; i++) {
-          const MV this_mv = { br + candidates[t][i].row,
-                               bc + candidates[t][i].col };
-          thissad =
-              vfp->sdf(what->buf, what->stride,
-                       get_buf_from_mv(in_what, &this_mv), in_what->stride);
+          const FULLPEL_MV this_mv = { br + candidates[t][i].row,
+                                       bc + candidates[t][i].col };
+          thissad = get_mvpred_sad(
+              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
           CHECK_BETTER
         }
       } else {
         for (i = 0; i < num_candidates[t]; i++) {
-          const MV this_mv = { br + candidates[t][i].row,
-                               bc + candidates[t][i].col };
-          if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
-          thissad =
-              vfp->sdf(what->buf, what->stride,
-                       get_buf_from_mv(in_what, &this_mv), in_what->stride);
+          const FULLPEL_MV this_mv = { br + candidates[t][i].row,
+                                       bc + candidates[t][i].col };
+          if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
+          thissad = get_mvpred_sad(
+              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
           CHECK_BETTER
         }
       }
@@ -1200,23 +678,22 @@
     for (; s >= last_s; s--) {
       // No need to search all points the 1st time if initial search was used
       if (!do_init_search || s != best_init_s) {
-        if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < num_candidates[s]; i++) {
-            const MV this_mv = { br + candidates[s][i].row,
-                                 bc + candidates[s][i].col };
-            thissad =
-                vfp->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, &this_mv), in_what->stride);
+            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
+                                         bc + candidates[s][i].col };
+            thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < num_candidates[s]; i++) {
-            const MV this_mv = { br + candidates[s][i].row,
-                                 bc + candidates[s][i].col };
-            if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
-            thissad =
-                vfp->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, &this_mv), in_what->stride);
+            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
+                                         bc + candidates[s][i].col };
+            if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
+              continue;
+            thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
             CHECK_BETTER
           }
         }
@@ -1237,27 +714,26 @@
         next_chkpts_indices[1] = k;
         next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
 
-        if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            const MV this_mv = {
+            const FULLPEL_MV this_mv = {
               br + candidates[s][next_chkpts_indices[i]].row,
               bc + candidates[s][next_chkpts_indices[i]].col
             };
-            thissad =
-                vfp->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, &this_mv), in_what->stride);
+            thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            const MV this_mv = {
+            const FULLPEL_MV this_mv = {
               br + candidates[s][next_chkpts_indices[i]].row,
               bc + candidates[s][next_chkpts_indices[i]].col
             };
-            if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
-            thissad =
-                vfp->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, &this_mv), in_what->stride);
+            if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
+              continue;
+            thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
             CHECK_BETTER
           }
         }
@@ -1272,25 +748,25 @@
 
     // Note: If we enter the if below, then cost_list must be non-NULL.
     if (s == 0) {
-      cost_list[0] = bestsad;
+      cost_list[0] = raw_bestsad;
+      costlist_has_sad = 1;
       if (!do_init_search || s != best_init_s) {
-        if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < num_candidates[s]; i++) {
-            const MV this_mv = { br + candidates[s][i].row,
-                                 bc + candidates[s][i].col };
-            cost_list[i + 1] = thissad =
-                vfp->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, &this_mv), in_what->stride);
+            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
+                                         bc + candidates[s][i].col };
+            cost_list[i + 1] = thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < num_candidates[s]; i++) {
-            const MV this_mv = { br + candidates[s][i].row,
-                                 bc + candidates[s][i].col };
-            if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
-            cost_list[i + 1] = thissad =
-                vfp->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, &this_mv), in_what->stride);
+            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
+                                         bc + candidates[s][i].col };
+            if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
+              continue;
+            cost_list[i + 1] = thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
             CHECK_BETTER
           }
         }
@@ -1309,32 +785,30 @@
         next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
         cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
         cost_list[((k + 2) % 4) + 1] = cost_list[0];
-        cost_list[0] = bestsad;
+        cost_list[0] = raw_bestsad;
 
-        if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            const MV this_mv = {
+            const FULLPEL_MV this_mv = {
               br + candidates[s][next_chkpts_indices[i]].row,
               bc + candidates[s][next_chkpts_indices[i]].col
             };
-            cost_list[next_chkpts_indices[i] + 1] = thissad =
-                vfp->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, &this_mv), in_what->stride);
+            cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            const MV this_mv = {
+            const FULLPEL_MV this_mv = {
               br + candidates[s][next_chkpts_indices[i]].row,
               bc + candidates[s][next_chkpts_indices[i]].col
             };
-            if (!is_mv_in(&x->mv_limits, &this_mv)) {
+            if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
               cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
               continue;
             }
-            cost_list[next_chkpts_indices[i] + 1] = thissad =
-                vfp->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, &this_mv), in_what->stride);
+            cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
             CHECK_BETTER
           }
         }
@@ -1348,6 +822,9 @@
     }
   }
 
+  best_mv->row = br;
+  best_mv->col = bc;
+
   // Returns the one-away integer pel cost/sad around the best as follows:
   // cost_list[0]: cost/sad at the best integer pel
   // cost_list[1]: cost/sad at delta {0, -1} (left)   from the best integer pel
@@ -1355,75 +832,38 @@
   // cost_list[3]: cost/sad at delta { 0, 1} (right)  from the best integer pel
   // cost_list[4]: cost/sad at delta {-1, 0} (top)    from the best integer pel
   if (cost_list) {
-    const MV best_int_mv = { br, bc };
-    if (last_is_4) {
-      calc_int_sad_list(x, center_mv, sad_per_bit, vfp, &best_int_mv, cost_list,
-                        use_mvcost, bestsad);
+    if (USE_SAD_COSTLIST) {
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
     } else {
-      calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_int_mv,
-                         cost_list);
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
     }
   }
-  x->best_mv.as_mv.row = br;
-  x->best_mv.as_mv.col = bc;
-  return bestsad;
+  best_mv->row = br;
+  best_mv->col = bc;
+
+  const int var_cost = get_mvpred_var_cost(ms_params, best_mv);
+  return var_cost;
 }
+#undef CHECK_BETTER
 
-int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
-                       const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
-                       int use_mvcost) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV mv = { best_mv->row * 8, best_mv->col * 8 };
-  unsigned int unused;
-
-  return vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
-                 in_what->stride, &unused) +
-         (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmv_vec_cost,
-                                   x->mv_cost_stack, x->errorperbit)
-                     : 0);
-}
-
-int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
-                          const MV *center_mv, const uint8_t *second_pred,
-                          const aom_variance_fn_ptr_t *vfp, int use_mvcost) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV mv = { best_mv->row * 8, best_mv->col * 8 };
-  unsigned int unused;
-
-  return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
-                   what->buf, what->stride, &unused, second_pred) +
-         (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmv_vec_cost,
-                                   x->mv_cost_stack, x->errorperbit)
-                     : 0);
-}
-
-int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
-                            const MV *center_mv, const uint8_t *second_pred,
-                            const uint8_t *mask, int mask_stride,
-                            int invert_mask, const aom_variance_fn_ptr_t *vfp,
-                            int use_mvcost) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV mv = { best_mv->row * 8, best_mv->col * 8 };
-  unsigned int unused;
-
-  return vfp->msvf(what->buf, what->stride, 0, 0,
-                   get_buf_from_mv(in_what, best_mv), in_what->stride,
-                   second_pred, mask, mask_stride, invert_mask, &unused) +
-         (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmv_vec_cost,
-                                   x->mv_cost_stack, x->errorperbit)
-                     : 0);
-}
-
-int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
-                   int sad_per_bit, int do_init_search, int *cost_list,
-                   const aom_variance_fn_ptr_t *vfp, int use_mvcost,
-                   const MV *center_mv) {
+// For the following foo_search, the input arguments are:
+// x: The struct used to hold a bunch of random configs.
+// start_mv: where we are starting our motion search
+// search_param: how many steps to skip in our motion search. For example,
+//   a value 3 suggests that 3 search steps have already taken place prior to
+//   this function call, so we jump directly to step 4 of the search process
+// sad_per_bit: a multiplier used to convert rate to sad cost
+// do_init_search: if on, do an initial search of all possible scales around the
+//   start_mv, and then pick the best scale.
+// cost_list: used to hold the cost around the best full mv so we can use it to
+//   speed up subpel search later.
+// vfp: a function pointer to the simd function so we can compute the cost
+//   efficiently
+// ref_mv: the reference mv used to compute the mv cost
+static int hex_search(const FULLPEL_MV start_mv,
+                      const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                      const int search_param, const int do_init_search,
+                      int *cost_list, FULLPEL_MV *best_mv) {
   // First scale has 8-closest points, the rest have 6 points in hex shape
   // at increasing scales
   static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
@@ -1451,15 +891,14 @@
       { -512, 1024 }, { -1024, 0 } },
   };
   /* clang-format on */
-  return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
-                        cost_list, vfp, use_mvcost, center_mv,
-                        hex_num_candidates, hex_candidates);
+  return pattern_search(start_mv, ms_params, search_param, do_init_search,
+                        hex_num_candidates, hex_candidates, cost_list, best_mv);
 }
 
-static int bigdia_search(MACROBLOCK *x, MV *start_mv, int search_param,
-                         int sad_per_bit, int do_init_search, int *cost_list,
-                         const aom_variance_fn_ptr_t *vfp, int use_mvcost,
-                         const MV *center_mv) {
+static int bigdia_search(const FULLPEL_MV start_mv,
+                         const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                         const int search_param, const int do_init_search,
+                         int *cost_list, FULLPEL_MV *best_mv) {
   // First scale has 4-closest points, the rest have 8 points in diamond
   // shape at increasing scales
   static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1492,15 +931,15 @@
           { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
       };
   /* clang-format on */
-  return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
-                        cost_list, vfp, use_mvcost, center_mv,
-                        bigdia_num_candidates, bigdia_candidates);
+  return pattern_search(start_mv, ms_params, search_param, do_init_search,
+                        bigdia_num_candidates, bigdia_candidates, cost_list,
+                        best_mv);
 }
 
-static int square_search(MACROBLOCK *x, MV *start_mv, int search_param,
-                         int sad_per_bit, int do_init_search, int *cost_list,
-                         const aom_variance_fn_ptr_t *vfp, int use_mvcost,
-                         const MV *center_mv) {
+static int square_search(const FULLPEL_MV start_mv,
+                         const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                         const int search_param, const int do_init_search,
+                         int *cost_list, FULLPEL_MV *best_mv) {
   // All scales have 8 closest points in square shape
   static const int square_num_candidates[MAX_PATTERN_SCALES] = {
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
@@ -1533,42 +972,215 @@
           { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
       };
   /* clang-format on */
-  return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
-                        cost_list, vfp, use_mvcost, center_mv,
-                        square_num_candidates, square_candidates);
+  return pattern_search(start_mv, ms_params, search_param, do_init_search,
+                        square_num_candidates, square_candidates, cost_list,
+                        best_mv);
 }
 
-static int fast_hex_search(MACROBLOCK *x, MV *ref_mv, int search_param,
-                           int sad_per_bit,
-                           int do_init_search,  // must be zero for fast_hex
-                           int *cost_list, const aom_variance_fn_ptr_t *vfp,
-                           int use_mvcost, const MV *center_mv) {
-  return av1_hex_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                        sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
-                        center_mv);
+static int fast_hex_search(const FULLPEL_MV start_mv,
+                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                           const int search_param, const int do_init_search,
+                           int *cost_list, FULLPEL_MV *best_mv) {
+  return hex_search(start_mv, ms_params,
+                    AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                    do_init_search, cost_list, best_mv);
 }
 
-static int fast_dia_search(MACROBLOCK *x, MV *ref_mv, int search_param,
-                           int sad_per_bit, int do_init_search, int *cost_list,
-                           const aom_variance_fn_ptr_t *vfp, int use_mvcost,
-                           const MV *center_mv) {
-  return bigdia_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                       sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
-                       center_mv);
+static int fast_dia_search(const FULLPEL_MV start_mv,
+                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                           const int search_param, const int do_init_search,
+                           int *cost_list, FULLPEL_MV *best_mv) {
+  return bigdia_search(start_mv, ms_params,
+                       AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                       do_init_search, cost_list, best_mv);
 }
 
-#undef CHECK_BETTER
+static int diamond_search_sad(FULLPEL_MV start_mv,
+                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const int search_param, int *num00,
+                              FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
 
-// Exhuastive motion search around a given centre position with a given
+  const int ref_stride = ref->stride;
+  const uint8_t *best_address;
+
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const uint8_t *mask = ms_params->ms_buffers.mask;
+  const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+
+  const search_site_config *cfg = ms_params->search_sites;
+
+  unsigned int bestsad = INT_MAX;
+  int best_site = 0;
+  int is_off_center = 0;
+
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  const int tot_steps = cfg->ss_count - search_param;
+
+  *num00 = 0;
+  *best_mv = start_mv;
+
+  // Check the starting position
+  best_address = get_buf_from_fullmv(ref, &start_mv);
+  bestsad = get_mvpred_compound_sad(ms_params, src, best_address, ref_stride);
+  bestsad += mvsad_err_cost_(best_mv, &ms_params->mv_cost_params);
+
+  int next_step_size = tot_steps > 2 ? cfg->radius[tot_steps - 2] : 1;
+  for (int step = tot_steps - 1; step >= 0; --step) {
+    const search_site *ss = cfg->ss[step];
+    best_site = 0;
+    if (step > 0) next_step_size = cfg->radius[step - 1];
+
+    int all_in = 1, j;
+    // Trap illegal vectors
+    all_in &= best_mv->row + ss[1].mv.row >= ms_params->mv_limits.row_min;
+    all_in &= best_mv->row + ss[2].mv.row <= ms_params->mv_limits.row_max;
+    all_in &= best_mv->col + ss[3].mv.col >= ms_params->mv_limits.col_min;
+    all_in &= best_mv->col + ss[4].mv.col <= ms_params->mv_limits.col_max;
+
+    // TODO(anyone): Implement 4 points search for msdf&sdaf
+    if (all_in && !mask && !second_pred) {
+      const uint8_t *src_buf = src->buf;
+      const int src_stride = src->stride;
+      for (int idx = 1; idx <= cfg->searches_per_step[step]; idx += 4) {
+        unsigned char const *block_offset[4];
+        unsigned int sads[4];
+
+        for (j = 0; j < 4; j++)
+          block_offset[j] = ss[idx + j].offset + best_address;
+
+        vfp->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads);
+        for (j = 0; j < 4; j++) {
+          if (sads[j] < bestsad) {
+            const FULLPEL_MV this_mv = { best_mv->row + ss[idx + j].mv.row,
+                                         best_mv->col + ss[idx + j].mv.col };
+            unsigned int thissad =
+                sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params);
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = idx + j;
+            }
+          }
+        }
+      }
+    } else {
+      for (int idx = 1; idx <= cfg->searches_per_step[step]; idx++) {
+        const FULLPEL_MV this_mv = { best_mv->row + ss[idx].mv.row,
+                                     best_mv->col + ss[idx].mv.col };
+
+        if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+          const uint8_t *const check_here = ss[idx].offset + best_address;
+          unsigned int thissad;
+
+          thissad =
+              get_mvpred_compound_sad(ms_params, src, check_here, ref_stride);
+
+          if (thissad < bestsad) {
+            thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = idx;
+            }
+          }
+        }
+      }
+    }
+
+    if (best_site != 0) {
+      if (second_best_mv) {
+        *second_best_mv = *best_mv;
+      }
+      best_mv->row += ss[best_site].mv.row;
+      best_mv->col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      is_off_center = 1;
+    }
+
+    if (is_off_center == 0) (*num00)++;
+
+    if (best_site == 0) {
+      while (next_step_size == cfg->radius[step] && step > 2) {
+        ++(*num00);
+        --step;
+        next_step_size = cfg->radius[step - 1];
+      }
+    }
+  }
+
+  return bestsad;
+}
+
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+              point as the best match, we will do a final 1-away diamond
+              refining search  */
+static int full_pixel_diamond(const FULLPEL_MV start_mv,
+                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const int step_param, int *cost_list,
+                              FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+  const search_site_config *cfg = ms_params->search_sites;
+  int thissme, n, num00 = 0;
+  int bestsme = diamond_search_sad(start_mv, ms_params, step_param, &n, best_mv,
+                                   second_best_mv);
+
+  if (bestsme < INT_MAX) {
+    bestsme = get_mvpred_compound_var_cost(ms_params, best_mv);
+  }
+
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
+  const int further_steps = cfg->ss_count - 1 - step_param;
+  while (n < further_steps) {
+    ++n;
+
+    if (num00) {
+      num00--;
+    } else {
+      // TODO(chiyotsai@google.com): There is another bug here where the second
+      // best mv gets incorrectly overwritten. Fix it later.
+      FULLPEL_MV tmp_best_mv;
+      thissme = diamond_search_sad(start_mv, ms_params, step_param + n, &num00,
+                                   &tmp_best_mv, second_best_mv);
+
+      if (thissme < INT_MAX) {
+        thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv);
+      }
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        *best_mv = tmp_best_mv;
+      }
+    }
+  }
+
+  // Return cost list.
+  if (cost_list) {
+    if (USE_SAD_COSTLIST) {
+      const int costlist_has_sad = 0;
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+    } else {
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
+    }
+  }
+  return bestsme;
+}
+
+// Exhaustive motion search around a given centre position with a given
 // step size.
-static int exhuastive_mesh_search(MACROBLOCK *x, MV *ref_mv, MV *best_mv,
-                                  int range, int step, int sad_per_bit,
-                                  const aom_variance_fn_ptr_t *fn_ptr,
-                                  const MV *center_mv) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  MV fcenter_mv = { center_mv->row, center_mv->col };
+static int exhaustive_mesh_search(FULLPEL_MV start_mv,
+                                  const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                  const int range, const int step,
+                                  FULLPEL_MV *best_mv,
+                                  FULLPEL_MV *second_best_mv) {
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const int ref_stride = ref->stride;
   unsigned int best_sad = INT_MAX;
   int r, c, i;
   int start_col, end_col, start_row, end_row;
@@ -1576,31 +1188,30 @@
 
   assert(step >= 1);
 
-  clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-           x->mv_limits.row_min, x->mv_limits.row_max);
-  *best_mv = fcenter_mv;
-  best_sad =
-      fn_ptr->sdf(what->buf, what->stride,
-                  get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
-      mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
-  start_row = AOMMAX(-range, x->mv_limits.row_min - fcenter_mv.row);
-  start_col = AOMMAX(-range, x->mv_limits.col_min - fcenter_mv.col);
-  end_row = AOMMIN(range, x->mv_limits.row_max - fcenter_mv.row);
-  end_col = AOMMIN(range, x->mv_limits.col_max - fcenter_mv.col);
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+  *best_mv = start_mv;
+  best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv),
+                            ref_stride);
+  best_sad += mvsad_err_cost_(&start_mv, mv_cost_params);
+  start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row);
+  start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col);
+  end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row);
+  end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col);
 
   for (r = start_row; r <= end_row; r += step) {
     for (c = start_col; c <= end_col; c += col_step) {
       // Step > 1 means we are not checking every location in this pass.
       if (step > 1) {
-        const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c };
-        unsigned int sad =
-            fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
-                        in_what->stride);
+        const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
+        unsigned int sad = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
         if (sad < best_sad) {
-          sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+          sad += mvsad_err_cost_(&mv, mv_cost_params);
           if (sad < best_sad) {
             best_sad = sad;
-            x->second_best_mv.as_mv = *best_mv;
+            if (second_best_mv) {
+              *second_best_mv = *best_mv;
+            }
             *best_mv = mv;
           }
         }
@@ -1610,34 +1221,37 @@
           unsigned int sads[4];
           const uint8_t *addrs[4];
           for (i = 0; i < 4; ++i) {
-            const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
-            addrs[i] = get_buf_from_mv(in_what, &mv);
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            addrs[i] = get_buf_from_fullmv(ref, &mv);
           }
-          fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
+          vfp->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
 
           for (i = 0; i < 4; ++i) {
             if (sads[i] < best_sad) {
-              const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+              const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
               const unsigned int sad =
-                  sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+                  sads[i] + mvsad_err_cost_(&mv, mv_cost_params);
               if (sad < best_sad) {
                 best_sad = sad;
-                x->second_best_mv.as_mv = *best_mv;
+                if (second_best_mv) {
+                  *second_best_mv = *best_mv;
+                }
                 *best_mv = mv;
               }
             }
           }
         } else {
           for (i = 0; i < end_col - c; ++i) {
-            const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
-            unsigned int sad =
-                fn_ptr->sdf(what->buf, what->stride,
-                            get_buf_from_mv(in_what, &mv), in_what->stride);
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            unsigned int sad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
             if (sad < best_sad) {
-              sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              sad += mvsad_err_cost_(&mv, mv_cost_params);
               if (sad < best_sad) {
                 best_sad = sad;
-                x->second_best_mv.as_mv = *best_mv;
+                if (second_best_mv) {
+                  *second_best_mv = *best_mv;
+                }
                 *best_mv = mv;
               }
             }
@@ -1650,236 +1264,27 @@
   return best_sad;
 }
 
-int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
-                             MV *ref_mv, MV *best_mv, int search_param,
-                             int sad_per_bit, int *num00,
-                             const aom_variance_fn_ptr_t *fn_ptr,
-                             const MV *center_mv) {
-  int i, j, step;
-
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *in_what;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *best_address;
-
-  unsigned int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
-
-  int ref_row;
-  int ref_col;
-
-  // search_param determines the length of the initial step and hence the number
-  // of iterations.
-  // 0 = initial step (MAX_FIRST_STEP) pel
-  // 1 = (MAX_FIRST_STEP/2) pel,
-  // 2 = (MAX_FIRST_STEP/4) pel...
-  const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
-  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
-
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-           x->mv_limits.row_min, x->mv_limits.row_max);
-  ref_row = ref_mv->row;
-  ref_col = ref_mv->col;
-  *num00 = 0;
-  best_mv->row = ref_row;
-  best_mv->col = ref_col;
-
-  // Work out the start point for the search
-  in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
-  best_address = in_what;
-
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
-            mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
-
-  i = 1;
-
-  for (step = 0; step < tot_steps; step++) {
-    int all_in = 1, t;
-
-    // All_in is true if every one of the points we are checking are within
-    // the bounds of the image.
-    all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_limits.row_min);
-    all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_limits.row_max);
-    all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_limits.col_min);
-    all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_limits.col_max);
-
-    // If all the pixels are within the bounds we don't check whether the
-    // search point is valid in this loop,  otherwise we check each point
-    // for validity..
-    if (all_in) {
-      unsigned int sad_array[4];
-
-      for (j = 0; j < cfg->searches_per_step; j += 4) {
-        unsigned char const *block_offset[4];
-
-        for (t = 0; t < 4; t++)
-          block_offset[t] = ss[i + t].offset + best_address;
-
-        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
-                       sad_array);
-
-        for (t = 0; t < 4; t++, i++) {
-          if (sad_array[t] < bestsad) {
-            const MV this_mv = { best_mv->row + ss[i].mv.row,
-                                 best_mv->col + ss[i].mv.col };
-            sad_array[t] +=
-                mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-            if (sad_array[t] < bestsad) {
-              bestsad = sad_array[t];
-              best_site = i;
-            }
-          }
-        }
-      }
-    } else {
-      for (j = 0; j < cfg->searches_per_step; j++) {
-        // Trap illegal vectors
-        const MV this_mv = { best_mv->row + ss[i].mv.row,
-                             best_mv->col + ss[i].mv.col };
-
-        if (is_mv_in(&x->mv_limits, &this_mv)) {
-          const uint8_t *const check_here = ss[i].offset + best_address;
-          unsigned int thissad =
-              fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
-          if (thissad < bestsad) {
-            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_site = i;
-            }
-          }
-        }
-        i++;
-      }
-    }
-    if (best_site != last_site) {
-      x->second_best_mv.as_mv = *best_mv;
-      best_mv->row += ss[best_site].mv.row;
-      best_mv->col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
-      last_site = best_site;
-#if defined(NEW_DIAMOND_SEARCH)
-      while (1) {
-        const MV this_mv = { best_mv->row + ss[best_site].mv.row,
-                             best_mv->col + ss[best_site].mv.col };
-        if (is_mv_in(&x->mv_limits, &this_mv)) {
-          const uint8_t *const check_here = ss[best_site].offset + best_address;
-          unsigned int thissad =
-              fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-          if (thissad < bestsad) {
-            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_mv->row += ss[best_site].mv.row;
-              best_mv->col += ss[best_site].mv.col;
-              best_address += ss[best_site].offset;
-              continue;
-            }
-          }
-        }
-        break;
-      }
-#endif
-    } else if (best_address == in_what) {
-      (*num00)++;
-    }
-  }
-  return bestsad;
-}
-
-/* do_refine: If last step (1-away) of n-step search doesn't pick the center
-              point as the best match, we will do a final 1-away diamond
-              refining search  */
-static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
-                              MV *mvp_full, int step_param, int sadpb,
-                              int further_steps, int do_refine, int *cost_list,
-                              const aom_variance_fn_ptr_t *fn_ptr,
-                              const MV *ref_mv, const search_site_config *cfg) {
-  MV temp_mv;
-  int thissme, n, num00 = 0;
-  int bestsme = cpi->diamond_search_sad(x, cfg, mvp_full, &temp_mv, step_param,
-                                        sadpb, &n, fn_ptr, ref_mv);
-  if (bestsme < INT_MAX)
-    bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
-  x->best_mv.as_mv = temp_mv;
-
-  // If there won't be more n-step search, check to see if refining search is
-  // needed.
-  if (n > further_steps) do_refine = 0;
-
-  while (n < further_steps) {
-    ++n;
-
-    if (num00) {
-      num00--;
-    } else {
-      thissme =
-          cpi->diamond_search_sad(x, cfg, mvp_full, &temp_mv, step_param + n,
-                                  sadpb, &num00, fn_ptr, ref_mv);
-      if (thissme < INT_MAX)
-        thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
-
-      // check to see if refining search is needed.
-      if (num00 > further_steps - n) do_refine = 0;
-
-      if (thissme < bestsme) {
-        bestsme = thissme;
-        x->best_mv.as_mv = temp_mv;
-      }
-    }
-  }
-
-  // final 1-away diamond refining search
-  if (do_refine) {
-    const int search_range = 8;
-    MV best_mv = x->best_mv.as_mv;
-    thissme = av1_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
-                                      ref_mv);
-    if (thissme < INT_MAX)
-      thissme = av1_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
-    if (thissme < bestsme) {
-      bestsme = thissme;
-      x->best_mv.as_mv = best_mv;
-    }
-  }
-
-  // Return cost list.
-  if (cost_list) {
-    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list);
-  }
-  return bestsme;
-}
-
-#define MIN_RANGE 7
-#define MAX_RANGE 256
-#define MIN_INTERVAL 1
 // Runs an limited range exhaustive mesh search using a pattern set
 // according to the encode speed profile.
-static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                 const MV *centre_mv_full, int sadpb,
-                                 int *cost_list,
-                                 const aom_variance_fn_ptr_t *fn_ptr,
-                                 const MV *ref_mv, MV *dst_mv) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
-  MV f_ref_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
+                                 const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 const struct MESH_PATTERN *const mesh_patterns,
+                                 int *cost_list, FULLPEL_MV *best_mv,
+                                 FULLPEL_MV *second_best_mv) {
+  const int kMinRange = 7;
+  const int kMaxRange = 256;
+  const int kMinInterval = 1;
+
   int bestsme;
   int i;
-  int interval = sf->mesh_patterns[0].interval;
-  int range = sf->mesh_patterns[0].range;
+  int interval = mesh_patterns[0].interval;
+  int range = mesh_patterns[0].range;
   int baseline_interval_divisor;
 
-  // Keep track of number of exhaustive calls (this frame in this thread).
-  if (x->ex_search_count_ptr != NULL) ++(*x->ex_search_count_ptr);
+  *best_mv = start_mv;
 
   // Trap illegal values for interval and range for this function.
-  if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
+  if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) ||
       (interval > range))
     return INT_MAX;
 
@@ -1887,119 +1292,47 @@
 
   // Check size of proposed first range against magnitude of the centre
   // value used as a starting point.
-  range = AOMMAX(range, (5 * AOMMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
-  range = AOMMIN(range, MAX_RANGE);
+  range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
+  range = AOMMIN(range, kMaxRange);
   interval = AOMMAX(interval, range / baseline_interval_divisor);
 
   // initial search
-  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
-                                   sadpb, fn_ptr, &temp_mv);
+  bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
+                                   best_mv, second_best_mv);
 
-  if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+  if ((interval > kMinInterval) && (range > kMinRange)) {
     // Progressive searches with range and step size decreasing each time
     // till we reach a step size of 1. Then break out.
     for (i = 1; i < MAX_MESH_STEP; ++i) {
       // First pass with coarser step and longer range
-      bestsme = exhuastive_mesh_search(
-          x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range,
-          sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv);
+      bestsme = exhaustive_mesh_search(
+          *best_mv, ms_params, mesh_patterns[i].range,
+          mesh_patterns[i].interval, best_mv, second_best_mv);
 
-      if (sf->mesh_patterns[i].interval == 1) break;
+      if (mesh_patterns[i].interval == 1) break;
     }
   }
 
-  if (bestsme < INT_MAX)
-    bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
-  *dst_mv = temp_mv;
+  if (bestsme < INT_MAX) {
+    bestsme = get_mvpred_var_cost(ms_params, best_mv);
+  }
 
   // Return cost list.
   if (cost_list) {
-    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+    if (USE_SAD_COSTLIST) {
+      const int costlist_has_sad = 0;
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+    } else {
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
+    }
   }
   return bestsme;
 }
 
-int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit,
-                            int search_range,
-                            const aom_variance_fn_ptr_t *fn_ptr,
-                            const MV *center_mv) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
-  unsigned int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
-  int i, j;
-
-  for (i = 0; i < search_range; i++) {
-    int best_site = -1;
-    const int all_in = ((ref_mv->row - 1) > x->mv_limits.row_min) &
-                       ((ref_mv->row + 1) < x->mv_limits.row_max) &
-                       ((ref_mv->col - 1) > x->mv_limits.col_min) &
-                       ((ref_mv->col + 1) < x->mv_limits.col_max);
-
-    if (all_in) {
-      unsigned int sads[4];
-      const uint8_t *const positions[4] = { best_address - in_what->stride,
-                                            best_address - 1, best_address + 1,
-                                            best_address + in_what->stride };
-
-      fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
-
-      for (j = 0; j < 4; ++j) {
-        if (sads[j] < best_sad) {
-          const MV mv = { ref_mv->row + neighbors[j].row,
-                          ref_mv->col + neighbors[j].col };
-          sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
-          if (sads[j] < best_sad) {
-            best_sad = sads[j];
-            best_site = j;
-          }
-        }
-      }
-    } else {
-      for (j = 0; j < 4; ++j) {
-        const MV mv = { ref_mv->row + neighbors[j].row,
-                        ref_mv->col + neighbors[j].col };
-
-        if (is_mv_in(&x->mv_limits, &mv)) {
-          unsigned int sad =
-              fn_ptr->sdf(what->buf, what->stride,
-                          get_buf_from_mv(in_what, &mv), in_what->stride);
-          if (sad < best_sad) {
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              best_site = j;
-            }
-          }
-        }
-      }
-    }
-
-    if (best_site == -1) {
-      break;
-    } else {
-      x->second_best_mv.as_mv = *ref_mv;
-      ref_mv->row += neighbors[best_site].row;
-      ref_mv->col += neighbors[best_site].col;
-      best_address = get_buf_from_mv(in_what, ref_mv);
-    }
-  }
-
-  return best_sad;
-}
-
 // This function is called when we do joint motion search in comp_inter_inter
 // mode, or when searching for one component of an ext-inter compound mode.
-int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
-                             const aom_variance_fn_ptr_t *fn_ptr,
-                             const uint8_t *mask, int mask_stride,
-                             int invert_mask, const MV *center_mv,
-                             const uint8_t *second_pred) {
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                             const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) {
   static const search_neighbors neighbors[8] = {
     { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
     { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
@@ -2010,59 +1343,47 @@
     { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
     { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
   };
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  MV *best_mv = &x->best_mv.as_mv;
-  unsigned int best_sad = INT_MAX;
-  int i, j;
-  uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] =
-      { 0 };
+
+  uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P *
+                                SEARCH_GRID_STRIDE_8P] = { 0 };
   int grid_center = SEARCH_GRID_CENTER_8P;
   int grid_coord = grid_center;
 
-  clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-           x->mv_limits.row_min, x->mv_limits.row_max);
-  if (mask) {
-    best_sad = fn_ptr->msdf(what->buf, what->stride,
-                            get_buf_from_mv(in_what, best_mv), in_what->stride,
-                            second_pred, mask, mask_stride, invert_mask) +
-               mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
-  } else {
-    best_sad =
-        fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
-                     in_what->stride, second_pred) +
-        mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
-  }
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const FullMvLimits *mv_limits = &ms_params->mv_limits;
+  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+  const struct buf_2d *src = ms_buffers->src;
+  const struct buf_2d *ref = ms_buffers->ref;
+  const int ref_stride = ref->stride;
+
+  *best_mv = start_mv;
+  clamp_fullmv(best_mv, mv_limits);
+
+  unsigned int best_sad = get_mvpred_compound_sad(
+      ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride);
+  best_sad += mvsad_err_cost_(best_mv, mv_cost_params);
 
   do_refine_search_grid[grid_coord] = 1;
 
-  for (i = 0; i < search_range; ++i) {
+  for (int i = 0; i < SEARCH_RANGE_8P; ++i) {
     int best_site = -1;
 
-    for (j = 0; j < 8; ++j) {
+    for (int j = 0; j < 8; ++j) {
       grid_coord = grid_center + neighbors[j].coord_offset;
       if (do_refine_search_grid[grid_coord] == 1) {
         continue;
       }
-      const MV mv = { best_mv->row + neighbors[j].coord.row,
-                      best_mv->col + neighbors[j].coord.col };
+      const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row,
+                              best_mv->col + neighbors[j].coord.col };
 
       do_refine_search_grid[grid_coord] = 1;
-      if (is_mv_in(&x->mv_limits, &mv)) {
+      if (av1_is_fullmv_in_range(mv_limits, mv)) {
         unsigned int sad;
-        if (mask) {
-          sad = fn_ptr->msdf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &mv), in_what->stride,
-                             second_pred, mask, mask_stride, invert_mask);
-        } else {
-          sad = fn_ptr->sdaf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &mv), in_what->stride,
-                             second_pred);
-        }
+        sad = get_mvpred_compound_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
         if (sad < best_sad) {
-          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          sad += mvsad_err_cost_(&mv, mv_cost_params);
+
           if (sad < best_sad) {
             best_sad = sad;
             best_site = j;
@@ -2082,19 +1403,164 @@
   return best_sad;
 }
 
-#define MIN_EX_SEARCH_LIMIT 128
-static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  int is_allowed = sf->allow_exhaustive_searches &&
-                   (sf->exhaustive_searches_thresh < INT_MAX) &&
-                   !cpi->rc.is_src_frame_alt_ref;
-  if (x->m_search_count_ptr != NULL && x->ex_search_count_ptr != NULL) {
-    const int max_ex =
-        AOMMAX(MIN_EX_SEARCH_LIMIT,
-               (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
-    is_allowed = *x->ex_search_count_ptr <= max_ex && is_allowed;
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                          const int step_param, int *cost_list,
+                          FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+  const BLOCK_SIZE bsize = ms_params->bsize;
+  const SEARCH_METHODS search_method = ms_params->search_method;
+
+  const int is_intra_mode = ms_params->is_intra_mode;
+  int run_mesh_search = ms_params->run_mesh_search;
+
+  int var = 0;
+  MARK_MV_INVALID(best_mv);
+  if (second_best_mv) {
+    MARK_MV_INVALID(second_best_mv);
   }
-  return is_allowed;
+
+  assert(ms_params->ms_buffers.second_pred == NULL &&
+         ms_params->ms_buffers.mask == NULL &&
+         "av1_full_pixel_search does not support compound pred");
+
+  if (cost_list) {
+    cost_list[0] = INT_MAX;
+    cost_list[1] = INT_MAX;
+    cost_list[2] = INT_MAX;
+    cost_list[3] = INT_MAX;
+    cost_list[4] = INT_MAX;
+  }
+
+  switch (search_method) {
+    case FAST_DIAMOND:
+      var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+                            best_mv);
+      break;
+    case FAST_HEX:
+      var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
+                            best_mv);
+      break;
+    case HEX:
+      var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case SQUARE:
+      var =
+          square_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case BIGDIA:
+      var =
+          bigdia_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case NSTEP:
+    case DIAMOND:
+      var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
+                               best_mv, second_best_mv);
+      break;
+    default: assert(0 && "Invalid search method.");
+  }
+
+  // Should we allow a follow on exhaustive search?
+  if (!run_mesh_search && search_method == NSTEP) {
+    int exhaustive_thr = ms_params->force_mesh_thresh;
+    exhaustive_thr >>=
+        10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+    // Threshold variance for an exhaustive full search.
+    if (var > exhaustive_thr) run_mesh_search = 1;
+  }
+
+  // TODO(yunqing): the following is used to reduce mesh search in temporal
+  // filtering. Can extend it to intrabc.
+  if (!is_intra_mode && ms_params->prune_mesh_search) {
+    const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row),
+                                        abs(start_mv.col - best_mv->col));
+    if (full_pel_mv_diff <= 4) {
+      run_mesh_search = 0;
+    }
+  }
+
+  if (run_mesh_search) {
+    int var_ex;
+    FULLPEL_MV tmp_mv_ex;
+    // Pick the mesh pattern for exhaustive search based on the toolset (intraBC
+    // or non-intraBC)
+    // TODO(chiyotsai@google.com):  There is a bug here where the second best mv
+    // gets overwritten without actually comparing the rdcost.
+    const MESH_PATTERN *const mesh_patterns =
+        ms_params->mesh_patterns[is_intra_mode];
+    // TODO(chiyotsai@google.com): the second best mv is not set correctly by
+    // full_pixel_exhaustive, which can incorrectly override it.
+    var_ex = full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns,
+                                   cost_list, &tmp_mv_ex, second_best_mv);
+    if (var_ex < var) {
+      var = var_ex;
+      *best_mv = tmp_mv_ex;
+    }
+  }
+
+  return var;
+}
+
+int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                            IntraBCHashInfo *intrabc_hash_info,
+                            FULLPEL_MV *best_mv) {
+  if (!av1_use_hash_me(cpi)) return INT_MAX;
+
+  const BLOCK_SIZE bsize = ms_params->bsize;
+  const int block_width = block_size_wide[bsize];
+  const int block_height = block_size_high[bsize];
+
+  if (block_width != block_height) return INT_MAX;
+
+  const FullMvLimits *mv_limits = &ms_params->mv_limits;
+  const MSBuffers *ms_buffer = &ms_params->ms_buffers;
+
+  const uint8_t *src = ms_buffer->src->buf;
+  const int src_stride = ms_buffer->src->stride;
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int x_pos = mi_col * MI_SIZE;
+  const int y_pos = mi_row * MI_SIZE;
+
+  uint32_t hash_value1, hash_value2;
+  int best_hash_cost = INT_MAX;
+
+  // for the hashMap
+  hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table;
+
+  av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width,
+                           &hash_value1, &hash_value2, is_cur_buf_hbd(xd));
+
+  const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
+  if (count <= 1) {
+    return INT_MAX;
+  }
+
+  Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
+  for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) {
+    block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator));
+    if (hash_value2 == ref_block_hash.hash_value2) {
+      // Make sure the prediction is from a valid area.
+      const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos),
+                      GET_MV_SUBPEL(ref_block_hash.x - x_pos) };
+      if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize,
+                           cpi->common.seq_params.mib_size_log2))
+        continue;
+
+      FULLPEL_MV hash_mv;
+      hash_mv.col = ref_block_hash.x - x_pos;
+      hash_mv.row = ref_block_hash.y - y_pos;
+      if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue;
+      const int refCost = get_mvpred_var_cost(ms_params, &hash_mv);
+      if (refCost < best_hash_cost) {
+        best_hash_cost = refCost;
+        *best_mv = hash_mv;
+      }
+    }
+  }
+
+  return best_hash_cost;
 }
 
 static int vector_match(int16_t *ref, int16_t *src, int bwl) {
@@ -2162,13 +1628,7 @@
   return (center - (bw >> 1));
 }
 
-static const MV search_pos[4] = {
-  { -1, 0 },
-  { 0, -1 },
-  { 0, 1 },
-  { 1, 0 },
-};
-
+// A special fast version of motion search used in rt mode
 unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
                                            BLOCK_SIZE bsize, int mi_row,
                                            int mi_col, const MV *ref_mv) {
@@ -2187,13 +1647,17 @@
   const int src_stride = x->plane[0].src.stride;
   const int ref_stride = xd->plane[0].pre[0].stride;
   uint8_t const *ref_buf, *src_buf;
-  MV *tmp_mv = &xd->mi[0]->mv[0].as_mv;
+  int_mv *best_int_mv = &xd->mi[0]->mv[0];
   unsigned int best_sad, tmp_sad, this_sad[4];
-  MV this_mv;
   const int norm_factor = 3 + (bw >> 5);
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
       av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
-  MvLimits subpel_mv_limits;
+  static const MV search_pos[4] = {
+    { -1, 0 },
+    { 0, -1 },
+    { 0, 1 },
+    { 1, 0 },
+  };
 
   if (scaled_ref_frame) {
     int i;
@@ -2207,8 +1671,7 @@
 
   if (xd->bd != 8) {
     unsigned int sad;
-    tmp_mv->row = 0;
-    tmp_mv->col = 0;
+    best_int_mv->as_fullmv = kZeroFullMv;
     sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
                                  xd->plane[0].pre[0].buf, ref_stride);
 
@@ -2245,12 +1708,14 @@
   }
 
   // Find the best match per 1-D search
-  tmp_mv->col = vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
-  tmp_mv->row = vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
+  best_int_mv->as_fullmv.col =
+      vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
+  best_int_mv->as_fullmv.row =
+      vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
 
-  this_mv = *tmp_mv;
+  FULLPEL_MV this_mv = best_int_mv->as_fullmv;
   src_buf = x->plane[0].src.buf;
-  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+  ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
   best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
 
   {
@@ -2267,8 +1732,8 @@
   for (idx = 0; idx < 4; ++idx) {
     if (this_sad[idx] < best_sad) {
       best_sad = this_sad[idx];
-      tmp_mv->row = search_pos[idx].row + this_mv.row;
-      tmp_mv->col = search_pos[idx].col + this_mv.col;
+      best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
+      best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
     }
   }
 
@@ -2282,22 +1747,19 @@
   else
     this_mv.col += 1;
 
-  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+  ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
 
   tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
   if (best_sad > tmp_sad) {
-    *tmp_mv = this_mv;
+    best_int_mv->as_fullmv = this_mv;
     best_sad = tmp_sad;
   }
 
-  tmp_mv->row *= 8;
-  tmp_mv->col *= 8;
+  convert_fullmv_to_mv(best_int_mv);
 
-  set_subpel_mv_search_range(
-      &x->mv_limits, &subpel_mv_limits.col_min, &subpel_mv_limits.col_max,
-      &subpel_mv_limits.row_min, &subpel_mv_limits.row_max, ref_mv);
-  clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max,
-           subpel_mv_limits.row_min, subpel_mv_limits.row_max);
+  SubpelMvLimits subpel_mv_limits;
+  av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
+  clamp_mv(&best_int_mv->as_mv, &subpel_mv_limits);
 
   if (scaled_ref_frame) {
     int i;
@@ -2307,479 +1769,53 @@
   return best_sad;
 }
 
-int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                          MV *mvp_full, int step_param, int method,
-                          int run_mesh_search, int error_per_bit,
-                          int *cost_list, const MV *ref_mv, int var_max, int rd,
-                          int x_pos, int y_pos, int intra,
-                          const search_site_config *cfg) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
-  int var = 0;
+// =============================================================================
+//  Fullpixel Motion Search: OBMC
+// =============================================================================
+static INLINE int get_obmc_mvpred_var(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const struct buf_2d *ref_buf = ms_buffers->ref;
 
-  if (cost_list) {
-    cost_list[0] = INT_MAX;
-    cost_list[1] = INT_MAX;
-    cost_list[2] = INT_MAX;
-    cost_list[3] = INT_MAX;
-    cost_list[4] = INT_MAX;
-  }
-
-  // Keep track of number of searches (this frame in this thread).
-  if (x->m_search_count_ptr != NULL) ++(*x->m_search_count_ptr);
-
-  switch (method) {
-    case FAST_DIAMOND:
-      var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                            cost_list, fn_ptr, 1, ref_mv);
-      break;
-    case FAST_HEX:
-      var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                            cost_list, fn_ptr, 1, ref_mv);
-      break;
-    case HEX:
-      var = av1_hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
-                           fn_ptr, 1, ref_mv);
-      break;
-    case SQUARE:
-      var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
-                          fn_ptr, 1, ref_mv);
-      break;
-    case BIGDIA:
-      var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
-                          fn_ptr, 1, ref_mv);
-      break;
-    case NSTEP:
-      var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
-                               MAX_MVSEARCH_STEPS - 1 - step_param, 1,
-                               cost_list, fn_ptr, ref_mv, cfg);
-
-      // Should we allow a follow on exhaustive search?
-      if (is_exhaustive_allowed(cpi, x)) {
-        int exhuastive_thr = sf->exhaustive_searches_thresh;
-        exhuastive_thr >>=
-            10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
-
-        // Threshold variance for an exhaustive full search.
-        if (var > exhuastive_thr) {
-          int var_ex;
-          MV tmp_mv_ex;
-          var_ex =
-              full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
-                                    cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
-
-          if (var_ex < var) {
-            var = var_ex;
-            x->best_mv.as_mv = tmp_mv_ex;
-          }
-        }
-      }
-      break;
-    default: assert(0 && "Invalid search method.");
-  }
-
-  // Should we allow a follow on exhaustive search?
-  if (!run_mesh_search) {
-    if (method == NSTEP) {
-      if (is_exhaustive_allowed(cpi, x)) {
-        int exhuastive_thr = sf->exhaustive_searches_thresh;
-        exhuastive_thr >>=
-            10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
-        // Threshold variance for an exhaustive full search.
-        if (var > exhuastive_thr) run_mesh_search = 1;
-      }
-    }
-  }
-
-  if (run_mesh_search) {
-    int var_ex;
-    MV tmp_mv_ex;
-    var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
-                                   cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
-    if (var_ex < var) {
-      var = var_ex;
-      x->best_mv.as_mv = tmp_mv_ex;
-    }
-  }
-
-  if (method != NSTEP && rd && var < var_max)
-    var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
-
-  do {
-    if (!intra || !av1_use_hash_me(&cpi->common)) break;
-
-    // already single ME
-    // get block size and original buffer of current block
-    const int block_height = block_size_high[bsize];
-    const int block_width = block_size_wide[bsize];
-    if (block_height == block_width && x_pos >= 0 && y_pos >= 0) {
-      if (block_width == 4 || block_width == 8 || block_width == 16 ||
-          block_width == 32 || block_width == 64 || block_width == 128) {
-        uint8_t *what = x->plane[0].src.buf;
-        const int what_stride = x->plane[0].src.stride;
-        uint32_t hash_value1, hash_value2;
-        MV best_hash_mv;
-        int best_hash_cost = INT_MAX;
-
-        // for the hashMap
-        hash_table *ref_frame_hash =
-            intra ? &cpi->common.cur_frame->hash_table
-                  : av1_get_ref_frame_hash_map(&cpi->common,
-                                               x->e_mbd.mi[0]->ref_frame[0]);
-
-        av1_get_block_hash_value(what, what_stride, block_width, &hash_value1,
-                                 &hash_value2, is_cur_buf_hbd(&x->e_mbd), x);
-
-        const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
-        // for intra, at lest one matching can be found, itself.
-        if (count <= (intra ? 1 : 0)) {
-          break;
-        }
-
-        Iterator iterator =
-            av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
-        for (int i = 0; i < count; i++, iterator_increment(&iterator)) {
-          block_hash ref_block_hash = *(block_hash *)(iterator_get(&iterator));
-          if (hash_value2 == ref_block_hash.hash_value2) {
-            // For intra, make sure the prediction is from valid area.
-            if (intra) {
-              const int mi_col = x_pos / MI_SIZE;
-              const int mi_row = y_pos / MI_SIZE;
-              const MV dv = { 8 * (ref_block_hash.y - y_pos),
-                              8 * (ref_block_hash.x - x_pos) };
-              if (!av1_is_dv_valid(dv, &cpi->common, &x->e_mbd, mi_row, mi_col,
-                                   bsize, cpi->common.seq_params.mib_size_log2))
-                continue;
-            }
-            MV hash_mv;
-            hash_mv.col = ref_block_hash.x - x_pos;
-            hash_mv.row = ref_block_hash.y - y_pos;
-            if (!is_mv_in(&x->mv_limits, &hash_mv)) continue;
-            const int refCost =
-                av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1);
-            if (refCost < best_hash_cost) {
-              best_hash_cost = refCost;
-              best_hash_mv = hash_mv;
-            }
-          }
-        }
-        if (best_hash_cost < var) {
-          x->second_best_mv = x->best_mv;
-          x->best_mv.as_mv = best_hash_mv;
-          var = best_hash_cost;
-        }
-      }
-    }
-  } while (0);
-
-  return var;
-}
-
-/* returns subpixel variance error function */
-#define DIST(r, c) \
-  vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse)
-
-/* checks if (r, c) has better score than previous best */
-#define MVC(r, c)                                                              \
-  (unsigned int)(mvcost                                                        \
-                     ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +              \
-                         mvcost[0][((r)-rr)] + (int64_t)mvcost[1][((c)-rc)]) * \
-                            error_per_bit +                                    \
-                        4096) >>                                               \
-                           13                                                  \
-                     : 0)
-
-#define CHECK_BETTER(v, r, c)                             \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
-    thismse = (DIST(r, c));                               \
-    if ((v = MVC(r, c) + thismse) < besterr) {            \
-      besterr = v;                                        \
-      br = r;                                             \
-      bc = c;                                             \
-      *distortion = thismse;                              \
-      *sse1 = sse;                                        \
-    }                                                     \
-  } else {                                                \
-    v = INT_MAX;                                          \
-  }
-
-#undef CHECK_BETTER0
-#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
-
-#undef CHECK_BETTER1
-#define CHECK_BETTER1(v, r, c)                                                \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                     \
-    MV this_mv = { r, c };                                                    \
-    thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv,     \
-                                        mask, vfp, z, pre(y, y_stride, r, c), \
-                                        y_stride, sp(c), sp(r), w, h, &sse,   \
-                                        use_accurate_subpel_search);          \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);        \
-    if ((v + thismse) < besterr) {                                            \
-      besterr = v + thismse;                                                  \
-      br = r;                                                                 \
-      bc = c;                                                                 \
-      *distortion = thismse;                                                  \
-      *sse1 = sse;                                                            \
-    }                                                                         \
-  } else {                                                                    \
-    v = INT_MAX;                                                              \
-  }
-
-static unsigned int setup_obmc_center_error(
-    const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
-    const uint8_t *const y, int y_stride, int offset, int *mvjcost,
-    int *mvcost[2], unsigned int *sse1, int *distortion) {
-  unsigned int besterr;
-  besterr = vfp->ovf(y + offset, y_stride, wsrc, mask, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-  return besterr;
-}
-
-static int upsampled_obmc_pref_error(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
-    const int32_t *const wsrc, const uint8_t *const y, int y_stride,
-    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
-    int subpel_search) {
-  unsigned int besterr;
-
-  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
-  if (is_cur_buf_hbd(xd)) {
-    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
-    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
-                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
-                              subpel_search);
-    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
-  } else {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
-                       subpel_y_q3, y, y_stride, subpel_search);
-
-    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
-  }
-  return besterr;
-}
-
-static unsigned int upsampled_setup_obmc_center_error(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
-    const uint8_t *const y, int y_stride, int w, int h, int offset,
-    int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion,
-    int subpel_search) {
-  unsigned int besterr = upsampled_obmc_pref_error(
-      xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0,
-      0, w, h, sse1, subpel_search);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-  return besterr;
-}
-
-int av1_find_best_obmc_sub_pixel_tree_up(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
-    int is_second, int use_accurate_subpel_search) {
-  const int32_t *wsrc = x->wsrc_buf;
-  const int32_t *mask = x->mask_buf;
-  const int *const z = wsrc;
-  const int *const src_address = z;
-  MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int thismse;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  int iter;
-  int round = 3 - forced_stop;
-  int tr = br;
-  int tc = bc;
-  const MV *search_step = search_step_table;
-  int idx, best_idx = -1;
-  unsigned int cost_array[5];
-  int kr, kc;
-  const int w = block_size_wide[mbmi->sb_type];
-  const int h = block_size_high[mbmi->sb_type];
-  int offset;
-  int y_stride;
-  const uint8_t *y;
-
-  int minc, maxc, minr, maxr;
-
-  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv);
-
-  y = pd->pre[is_second].buf;
-  y_stride = pd->pre[is_second].stride;
-  offset = bestmv->row * y_stride + bestmv->col;
-
-  if (!allow_hp)
-    if (round == 3) round = 2;
-
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-  // use_accurate_subpel_search can be 0 or 1 or 2
-  if (use_accurate_subpel_search)
-    besterr = upsampled_setup_obmc_center_error(
-        xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y,
-        y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion,
-        use_accurate_subpel_search);
-  else
-    besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
-                                      z, y, y_stride, offset, mvjcost, mvcost,
-                                      sse1, distortion);
-
-  for (iter = 0; iter < round; ++iter) {
-    // Check vertical and horizontal sub-pixel positions.
-    for (idx = 0; idx < 4; ++idx) {
-      tr = br + search_step[idx].row;
-      tc = bc + search_step[idx].col;
-      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-        MV this_mv = { tr, tc };
-        if (use_accurate_subpel_search) {
-          thismse = upsampled_obmc_pref_error(
-              xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
-              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
-              use_accurate_subpel_search);
-        } else {
-          thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
-                              sp(tr), src_address, mask, &sse);
-        }
-
-        cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
-                                                mvcost, error_per_bit);
-        if (cost_array[idx] < besterr) {
-          best_idx = idx;
-          besterr = cost_array[idx];
-          *distortion = thismse;
-          *sse1 = sse;
-        }
-      } else {
-        cost_array[idx] = INT_MAX;
-      }
-    }
-
-    // Check diagonal sub-pixel position
-    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
-    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
-
-    tc = bc + kc;
-    tr = br + kr;
-    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-      MV this_mv = { tr, tc };
-
-      if (use_accurate_subpel_search) {
-        thismse = upsampled_obmc_pref_error(
-            xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
-            pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
-            use_accurate_subpel_search);
-      } else {
-        thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
-                            src_address, mask, &sse);
-      }
-
-      cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
-                                            error_per_bit);
-
-      if (cost_array[4] < besterr) {
-        best_idx = 4;
-        besterr = cost_array[4];
-        *distortion = thismse;
-        *sse1 = sse;
-      }
-    } else {
-      cost_array[idx] = INT_MAX;
-    }
-
-    if (best_idx < 4 && best_idx >= 0) {
-      br += search_step[best_idx].row;
-      bc += search_step[best_idx].col;
-    } else if (best_idx == 4) {
-      br = tr;
-      bc = tc;
-    }
-
-    if (iters_per_step > 1 && best_idx != -1) {
-      if (use_accurate_subpel_search) {
-        SECOND_LEVEL_CHECKS_BEST(1);
-      } else {
-        SECOND_LEVEL_CHECKS_BEST(0);
-      }
-    }
-
-    tr = br;
-    tc = bc;
-
-    search_step += 4;
-    hstep >>= 1;
-    best_idx = -1;
-  }
-
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void)tr;
-  (void)tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  return besterr;
-}
-
-#undef DIST
-#undef MVC
-#undef CHECK_BETTER
-
-static int get_obmc_mvpred_var(const MACROBLOCK *x, const int32_t *wsrc,
-                               const int32_t *mask, const MV *best_mv,
-                               const MV *center_mv,
-                               const aom_variance_fn_ptr_t *vfp, int use_mvcost,
-                               int is_second) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
-  const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+  const MV mv = get_mv_from_fullmv(this_mv);
   unsigned int unused;
 
-  return vfp->ovf(get_buf_from_mv(in_what, best_mv), in_what->stride, wsrc,
+  return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc,
                   mask, &unused) +
-         (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmv_vec_cost,
-                                   x->mv_cost_stack, x->errorperbit)
-                     : 0);
+         mv_err_cost_(&mv, mv_cost_params);
 }
 
-static int obmc_refining_search_sad(const MACROBLOCK *x, const int32_t *wsrc,
-                                    const int32_t *mask, MV *ref_mv,
-                                    int error_per_bit, int search_range,
-                                    const aom_variance_fn_ptr_t *fn_ptr,
-                                    const MV *center_mv, int is_second) {
-  const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  unsigned int best_sad = fn_ptr->osdf(get_buf_from_mv(in_what, ref_mv),
-                                       in_what->stride, wsrc, mask) +
-                          mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
-  int i, j;
+static int obmc_refining_search_sad(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) {
+  const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const struct buf_2d *ref_buf = ms_buffers->ref;
+  const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+  const int kSearchRange = 8;
 
-  for (i = 0; i < search_range; i++) {
+  unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv),
+                                       ref_buf->stride, wsrc, mask) +
+                          mvsad_err_cost_(best_mv, mv_cost_params);
+
+  for (int i = 0; i < kSearchRange; i++) {
     int best_site = -1;
 
-    for (j = 0; j < 4; j++) {
-      const MV mv = { ref_mv->row + neighbors[j].row,
-                      ref_mv->col + neighbors[j].col };
-      if (is_mv_in(&x->mv_limits, &mv)) {
-        unsigned int sad = fn_ptr->osdf(get_buf_from_mv(in_what, &mv),
-                                        in_what->stride, wsrc, mask);
+    for (int j = 0; j < 4; j++) {
+      const FULLPEL_MV mv = { best_mv->row + neighbors[j].row,
+                              best_mv->col + neighbors[j].col };
+      if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
+        unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv),
+                                        ref_buf->stride, wsrc, mask);
         if (sad < best_sad) {
-          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          sad += mvsad_err_cost_(&mv, mv_cost_params);
+
           if (sad < best_sad) {
             best_sad = sad;
             best_site = j;
@@ -2791,121 +1827,88 @@
     if (best_site == -1) {
       break;
     } else {
-      ref_mv->row += neighbors[best_site].row;
-      ref_mv->col += neighbors[best_site].col;
+      best_mv->row += neighbors[best_site].row;
+      best_mv->col += neighbors[best_site].col;
     }
   }
   return best_sad;
 }
 
-static int obmc_diamond_search_sad(const MACROBLOCK *x,
-                                   const search_site_config *cfg,
-                                   const int32_t *wsrc, const int32_t *mask,
-                                   MV *ref_mv, MV *best_mv, int search_param,
-                                   int sad_per_bit, int *num00,
-                                   const aom_variance_fn_ptr_t *fn_ptr,
-                                   const MV *center_mv, int is_second) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+static int obmc_diamond_search_sad(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv,
+    FULLPEL_MV *best_mv, int search_param, int *num00) {
+  const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
+  const search_site_config *cfg = ms_params->search_sites;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const struct buf_2d *const ref_buf = ms_buffers->ref;
   // search_param determines the length of the initial step and hence the number
   // of iterations
   // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
   // (MAX_FIRST_STEP/4) pel... etc.
-  const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
-  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  const uint8_t *best_address, *in_what_ref;
+
+  const int tot_steps = MAX_MVSEARCH_STEPS - 1 - search_param;
+  const uint8_t *best_address, *init_ref;
   int best_sad = INT_MAX;
   int best_site = 0;
-  int last_site = 0;
-  int i, j, step;
+  int step;
 
-  clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-           x->mv_limits.row_min, x->mv_limits.row_max);
-  in_what_ref = in_what->buf + ref_mv->row * in_what->stride + ref_mv->col;
-  best_address = in_what_ref;
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+  best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv);
   *num00 = 0;
-  *best_mv = *ref_mv;
+  *best_mv = start_mv;
 
   // Check the starting position
-  best_sad = fn_ptr->osdf(best_address, in_what->stride, wsrc, mask) +
-             mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+  best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) +
+             mvsad_err_cost_(best_mv, mv_cost_params);
 
-  i = 1;
-
-  for (step = 0; step < tot_steps; step++) {
-    for (j = 0; j < cfg->searches_per_step; j++) {
-      const MV mv = { best_mv->row + ss[i].mv.row,
-                      best_mv->col + ss[i].mv.col };
-      if (is_mv_in(&x->mv_limits, &mv)) {
-        int sad = fn_ptr->osdf(best_address + ss[i].offset, in_what->stride,
+  for (step = tot_steps; step >= 0; --step) {
+    const search_site *const ss = cfg->ss[step];
+    best_site = 0;
+    for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) {
+      const FULLPEL_MV mv = { best_mv->row + ss[idx].mv.row,
+                              best_mv->col + ss[idx].mv.col };
+      if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
+        int sad = fn_ptr->osdf(best_address + ss[idx].offset, ref_buf->stride,
                                wsrc, mask);
         if (sad < best_sad) {
-          sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+          sad += mvsad_err_cost_(&mv, mv_cost_params);
+
           if (sad < best_sad) {
             best_sad = sad;
-            best_site = i;
+            best_site = idx;
           }
         }
       }
-
-      i++;
     }
 
-    if (best_site != last_site) {
+    if (best_site != 0) {
       best_mv->row += ss[best_site].mv.row;
       best_mv->col += ss[best_site].mv.col;
       best_address += ss[best_site].offset;
-      last_site = best_site;
-#if defined(NEW_DIAMOND_SEARCH)
-      while (1) {
-        const MV this_mv = { best_mv->row + ss[best_site].mv.row,
-                             best_mv->col + ss[best_site].mv.col };
-        if (is_mv_in(&x->mv_limits, &this_mv)) {
-          int sad = fn_ptr->osdf(best_address + ss[best_site].offset,
-                                 in_what->stride, wsrc, mask);
-          if (sad < best_sad) {
-            sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              best_mv->row += ss[best_site].mv.row;
-              best_mv->col += ss[best_site].mv.col;
-              best_address += ss[best_site].offset;
-              continue;
-            }
-          }
-        }
-        break;
-      }
-#endif
-    } else if (best_address == in_what_ref) {
+    } else if (best_address == init_ref) {
       (*num00)++;
     }
   }
   return best_sad;
 }
 
-static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
-                                   MV *mvp_full, int step_param, int sadpb,
-                                   int further_steps, int do_refine,
-                                   const aom_variance_fn_ptr_t *fn_ptr,
-                                   const MV *ref_mv, MV *dst_mv, int is_second,
-                                   const search_site_config *cfg) {
-  (void)cpi;  // to silence compiler warning
-  const int32_t *wsrc = x->wsrc_buf;
-  const int32_t *mask = x->mask_buf;
-  MV temp_mv;
+static int obmc_full_pixel_diamond(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv,
+    int step_param, int do_refine, FULLPEL_MV *best_mv) {
+  const search_site_config *cfg = ms_params->search_sites;
+  FULLPEL_MV tmp_mv;
   int thissme, n, num00 = 0;
   int bestsme =
-      obmc_diamond_search_sad(x, cfg, wsrc, mask, mvp_full, &temp_mv,
-                              step_param, sadpb, &n, fn_ptr, ref_mv, is_second);
-  if (bestsme < INT_MAX)
-    bestsme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1,
-                                  is_second);
-  *dst_mv = temp_mv;
+      obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n);
+  if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv);
+  *best_mv = tmp_mv;
 
   // If there won't be more n-step search, check to see if refining search is
   // needed.
+  const int further_steps = cfg->ss_count - 1 - step_param;
   if (n > further_steps) do_refine = 0;
 
   while (n < further_steps) {
@@ -2914,271 +1917,1475 @@
     if (num00) {
       num00--;
     } else {
-      thissme = obmc_diamond_search_sad(x, cfg, wsrc, mask, mvp_full, &temp_mv,
-                                        step_param + n, sadpb, &num00, fn_ptr,
-                                        ref_mv, is_second);
-      if (thissme < INT_MAX)
-        thissme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr,
-                                      1, is_second);
+      thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv,
+                                        step_param + n, &num00);
+      if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv);
 
       // check to see if refining search is needed.
       if (num00 > further_steps - n) do_refine = 0;
 
       if (thissme < bestsme) {
         bestsme = thissme;
-        *dst_mv = temp_mv;
+        *best_mv = tmp_mv;
       }
     }
   }
 
   // final 1-away diamond refining search
   if (do_refine) {
-    const int search_range = 8;
-    MV best_mv = *dst_mv;
-    thissme = obmc_refining_search_sad(x, wsrc, mask, &best_mv, sadpb,
-                                       search_range, fn_ptr, ref_mv, is_second);
-    if (thissme < INT_MAX)
-      thissme = get_obmc_mvpred_var(x, wsrc, mask, &best_mv, ref_mv, fn_ptr, 1,
-                                    is_second);
+    tmp_mv = *best_mv;
+    thissme = obmc_refining_search_sad(ms_params, &tmp_mv);
+    if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv);
     if (thissme < bestsme) {
       bestsme = thissme;
-      *dst_mv = best_mv;
+      *best_mv = tmp_mv;
     }
   }
   return bestsme;
 }
 
-int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
-                               int step_param, int sadpb, int further_steps,
-                               int do_refine,
-                               const aom_variance_fn_ptr_t *fn_ptr,
-                               const MV *ref_mv, MV *dst_mv, int is_second,
-                               const search_site_config *cfg) {
-  if (cpi->sf.obmc_full_pixel_search_level == 0) {
-    return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb,
-                                   further_steps, do_refine, fn_ptr, ref_mv,
-                                   dst_mv, is_second, cfg);
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                               const int step_param, FULLPEL_MV *best_mv) {
+  if (!ms_params->fast_obmc_search) {
+    const int do_refine = 1;
+    const int bestsme = obmc_full_pixel_diamond(ms_params, start_mv, step_param,
+                                                do_refine, best_mv);
+    return bestsme;
   } else {
-    const int32_t *wsrc = x->wsrc_buf;
-    const int32_t *mask = x->mask_buf;
-    const int search_range = 8;
-    *dst_mv = *mvp_full;
-    clamp_mv(dst_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-             x->mv_limits.row_min, x->mv_limits.row_max);
-    int thissme = obmc_refining_search_sad(
-        x, wsrc, mask, dst_mv, sadpb, search_range, fn_ptr, ref_mv, is_second);
-    if (thissme < INT_MAX)
-      thissme = get_obmc_mvpred_var(x, wsrc, mask, dst_mv, ref_mv, fn_ptr, 1,
-                                    is_second);
+    *best_mv = start_mv;
+    clamp_fullmv(best_mv, &ms_params->mv_limits);
+    int thissme = obmc_refining_search_sad(ms_params, best_mv);
+    if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv);
     return thissme;
   }
 }
 
+// =============================================================================
+//  Subpixel Motion Search: Translational
+// =============================================================================
+#define INIT_SUBPEL_STEP_SIZE (4)
+/*
+ * To avoid the penalty for crossing cache-line read, preload the reference
+ * area in a small buffer, which is aligned to make sure there won't be crossing
+ * cache-line read while reading from this buffer. This reduces the CPU
+ * cycles spent on reading ref data in sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 to 3, copy 22 rows x
+ * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+// Returns the subpel offset used by various subpel variance functions [m]sv[a]f
+static INLINE int get_subpel_part(int x) { return x & 7; }
+
+// Gets the address of the ref buffer at subpel location (r, c), rounded to the
+// nearest fullpel precision, rounding toward negative infinity
+
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV mv) {
+  const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3);
+  return &buf->buf[offset];
+}
+
+// Estimates the variance of prediction residue using bilinear filter for fast
+// search.
+static INLINE int estimated_pref_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const uint8_t *src = ms_buffers->src->buf;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int src_stride = ms_buffers->src->stride;
+  const int ref_stride = ms_buffers->ref->stride;
+  const uint8_t *second_pred = ms_buffers->second_pred;
+  const uint8_t *mask = ms_buffers->mask;
+  const int mask_stride = ms_buffers->mask_stride;
+  const int invert_mask = ms_buffers->inv_mask;
+
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  if (second_pred == NULL) {
+    return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                    sse);
+  } else if (mask) {
+    return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                     second_pred, mask, mask_stride, invert_mask, sse);
+  } else {
+    return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                     sse, second_pred);
+  }
+}
+
+// Calculates the variance of prediction residue.
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+                                const MV *this_mv,
+                                const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+                                unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const uint8_t *src = ms_buffers->src->buf;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int src_stride = ms_buffers->src->stride;
+  const int ref_stride = ms_buffers->ref->stride;
+  const uint8_t *second_pred = ms_buffers->second_pred;
+  const uint8_t *mask = ms_buffers->mask;
+  const int mask_stride = ms_buffers->mask_stride;
+  const int invert_mask = ms_buffers->inv_mask;
+  const int w = var_params->w;
+  const int h = var_params->h;
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  unsigned int besterr;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+    if (second_pred != NULL) {
+      if (mask) {
+        aom_highbd_comp_mask_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+            invert_mask, xd->bd, subpel_search_type);
+      } else {
+        aom_highbd_comp_avg_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+            subpel_search_type);
+      }
+    } else {
+      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                                subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                                xd->bd, subpel_search_type);
+    }
+    besterr = vfp->vf(pred8, w, src, src_stride, sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    if (second_pred != NULL) {
+      if (mask) {
+        aom_comp_mask_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+            invert_mask, subpel_search_type);
+      } else {
+        aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                    second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                    ref, ref_stride, subpel_search_type);
+      }
+    } else {
+      aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
+                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                         subpel_search_type);
+    }
+
+    besterr = vfp->vf(pred, w, src, src_stride, sse);
+  }
+#else
+  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+  if (second_pred != NULL) {
+    if (mask) {
+      aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                   second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                   ref, ref_stride, mask, mask_stride,
+                                   invert_mask, subpel_search_type);
+    } else {
+      aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                  second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                  ref, ref_stride, subpel_search_type);
+    }
+  } else {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                       subpel_y_q3, ref, ref_stride, subpel_search_type);
+  }
+
+  besterr = vfp->vf(pred, w, src, src_stride, sse);
+#endif
+  return besterr;
+}
+
+// Estimates whether this_mv is better than best_mv. This function incorporates
+// both prediction error and residue into account. It is suffixed "fast" because
+// it uses bilinear filter to estimate the prediction.
+static INLINE unsigned int check_better_fast(
+    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    int thismse = estimated_pref_error(this_mv, var_params, &sse);
+    cost = mv_err_cost_(this_mv, mv_cost_params);
+    cost += thismse;
+
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *has_better_mv |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+// Checks whether this_mv is better than best_mv. This function incorporates
+// both prediction error and residue into account.
+static AOM_FORCE_INLINE unsigned int check_better(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *is_better) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    int thismse;
+    thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+    cost = mv_err_cost_(this_mv, mv_cost_params);
+    cost += thismse;
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *is_better |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost,
+                                    unsigned int right_cost,
+                                    unsigned int up_cost,
+                                    unsigned int down_cost) {
+  const MV diag_step = { up_cost <= down_cost ? -step_size : step_size,
+                         left_cost <= right_cost ? -step_size : step_size };
+
+  return diag_step;
+}
+
+// Searches the four cardinal direction for a better mv, then follows up with a
+// search in the best quadrant. This uses bilinear filter to speed up the
+// calculation.
+static AOM_FORCE_INLINE MV first_level_check_fast(
+    const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  // Check the four cardinal directions
+  const MV left_mv = { this_mv.row, this_mv.col - hstep };
+  int dummy = 0;
+  const unsigned int left =
+      check_better_fast(&left_mv, best_mv, mv_limits, var_params,
+                        mv_cost_params, besterr, sse1, distortion, &dummy);
+
+  const MV right_mv = { this_mv.row, this_mv.col + hstep };
+  const unsigned int right =
+      check_better_fast(&right_mv, best_mv, mv_limits, var_params,
+                        mv_cost_params, besterr, sse1, distortion, &dummy);
+
+  const MV top_mv = { this_mv.row - hstep, this_mv.col };
+  const unsigned int up =
+      check_better_fast(&top_mv, best_mv, mv_limits, var_params, mv_cost_params,
+                        besterr, sse1, distortion, &dummy);
+
+  const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+  const unsigned int down =
+      check_better_fast(&bottom_mv, best_mv, mv_limits, var_params,
+                        mv_cost_params, besterr, sse1, distortion, &dummy);
+
+  const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+  const MV diag_mv = { this_mv.row + diag_step.row,
+                       this_mv.col + diag_step.col };
+
+  // Check the diagonal direction with the best mv
+  check_better_fast(&diag_mv, best_mv, mv_limits, var_params, mv_cost_params,
+                    besterr, sse1, distortion, &dummy);
+
+  return diag_step;
+}
+
+// Performs a following up search after first_level_check_fast is called. This
+// performs two extra chess pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void second_level_check_fast(
+    const MV this_mv, const MV diag_step, MV *best_mv, int hstep,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  assert(diag_step.row == hstep || diag_step.row == -hstep);
+  assert(diag_step.col == hstep || diag_step.col == -hstep);
+  const int tr = this_mv.row;
+  const int tc = this_mv.col;
+  const int br = best_mv->row;
+  const int bc = best_mv->col;
+  int dummy = 0;
+  if (tr != br && tc != bc) {
+    assert(diag_step.col == bc - tc);
+    assert(diag_step.row == br - tr);
+    const MV chess_mv_1 = { br, bc + diag_step.col };
+    const MV chess_mv_2 = { br + diag_step.row, bc };
+    check_better_fast(&chess_mv_1, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy);
+
+    check_better_fast(&chess_mv_2, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy);
+  } else if (tr == br && tc != bc) {
+    assert(diag_step.col == bc - tc);
+    // Continue searching in the best direction
+    const MV bottom_long_mv = { br + hstep, bc + diag_step.col };
+    const MV top_long_mv = { br - hstep, bc + diag_step.col };
+    check_better_fast(&bottom_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(&top_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy);
+
+    // Search in the direction opposite of the best quadrant
+    const MV rev_mv = { br - diag_step.row, bc };
+    check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params,
+                      besterr, sse1, distortion, &dummy);
+  } else if (tr != br && tc == bc) {
+    assert(diag_step.row == br - tr);
+    // Continue searching in the best direction
+    const MV right_long_mv = { br + diag_step.row, bc + hstep };
+    const MV left_long_mv = { br + diag_step.row, bc - hstep };
+    check_better_fast(&right_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(&left_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy);
+
+    // Search in the direction opposite of the best quadrant
+    const MV rev_mv = { br, bc - diag_step.col };
+    check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params,
+                      besterr, sse1, distortion, &dummy);
+  }
+}
+
+// Combines first level check and second level check when applicable. This first
+// searches the four cardinal directions, and perform several
+// diagonal/chess-pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void two_level_checks_fast(
+    const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int iters) {
+  const MV diag_step =
+      first_level_check_fast(this_mv, best_mv, hstep, mv_limits, var_params,
+                             mv_cost_params, besterr, sse1, distortion);
+  if (iters > 1) {
+    second_level_check_fast(this_mv, diag_step, best_mv, hstep, mv_limits,
+                            var_params, mv_cost_params, besterr, sse1,
+                            distortion);
+  }
+}
+
+static AOM_FORCE_INLINE MV
+first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv,
+                  MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits,
+                  const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+                  const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+                  unsigned int *sse1, int *distortion) {
+  int dummy = 0;
+  const MV left_mv = { this_mv.row, this_mv.col - hstep };
+  const MV right_mv = { this_mv.row, this_mv.col + hstep };
+  const MV top_mv = { this_mv.row - hstep, this_mv.col };
+  const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+
+  const unsigned int left =
+      check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
+                   mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int right =
+      check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
+                   mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int up =
+      check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
+                   mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int down =
+      check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
+                   mv_cost_params, besterr, sse1, distortion, &dummy);
+
+  const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+  const MV diag_mv = { this_mv.row + diag_step.row,
+                       this_mv.col + diag_step.col };
+
+  // Check the diagonal direction with the best mv
+  check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params,
+               besterr, sse1, distortion, &dummy);
+
+  return diag_step;
+}
+
+// A newer version of second level check that gives better quality.
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different
+// from av1_find_best_sub_pixel_tree
+static AOM_FORCE_INLINE void second_level_check_v2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
+    MV *best_mv, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  assert(best_mv->row == this_mv.row + diag_step.row ||
+         best_mv->col == this_mv.col + diag_step.col);
+  if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
+    return;
+  } else if (this_mv.row == best_mv->row) {
+    // Search away from diagonal step since diagonal search did not provide any
+    // improvement
+    diag_step.row *= -1;
+  } else if (this_mv.col == best_mv->col) {
+    diag_step.col *= -1;
+  }
+
+  const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
+  const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
+  const MV diag_bias_mv = { best_mv->row + diag_step.row,
+                            best_mv->col + diag_step.col };
+  int has_better_mv = 0;
+
+  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+    check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+                 mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+    check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+                 mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+
+    // Do an additional search if the second iteration gives a better mv
+    if (has_better_mv) {
+      check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+                   mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+    }
+  } else {
+    check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion,
+                      &has_better_mv);
+    check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion,
+                      &has_better_mv);
+
+    // Do an additional search if the second iteration gives a better mv
+    if (has_better_mv) {
+      check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params,
+                        mv_cost_params, besterr, sse1, distortion,
+                        &has_better_mv);
+    }
+  }
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int setup_center_error(
+    const MACROBLOCKD *xd, const MV *bestmv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const int w = var_params->w;
+  const int h = var_params->h;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const uint8_t *src = ms_buffers->src->buf;
+  const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv);
+  const int src_stride = ms_buffers->src->stride;
+  const int y_stride = ms_buffers->ref->stride;
+  const uint8_t *second_pred = ms_buffers->second_pred;
+  const uint8_t *mask = ms_buffers->mask;
+  const int mask_stride = ms_buffers->mask_stride;
+  const int invert_mask = ms_buffers->inv_mask;
+
+  unsigned int besterr;
+
+  if (second_pred != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_cur_buf_hbd(xd)) {
+      DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+      uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
+      if (mask) {
+        aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride,
+                                  mask, mask_stride, invert_mask);
+      } else {
+        aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+      }
+      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+    } else {
+      DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+      if (mask) {
+        aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+                           mask_stride, invert_mask);
+      } else {
+        aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+      }
+      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+    }
+#else
+    (void)xd;
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+    if (mask) {
+      aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+                         mask_stride, invert_mask);
+    } else {
+      aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+    }
+    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+#endif
+  } else {
+    besterr = vfp->vf(y, y_stride, src, src_stride, sse1);
+  }
+  *distortion = besterr;
+  besterr += mv_err_cost_(bestmv, mv_cost_params);
+  return besterr;
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int upsampled_setup_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost_(bestmv, mv_cost_params);
+  return besterr;
+}
+
+static INLINE int divide_and_round(int n, int d) {
+  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_cost_list_wellbehaved(const int *cost_list) {
+  return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
+         cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
+}
+
+// Returns surface minima estimate at given precision in 1/2^n bits.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic,
+                                         int bits) {
+  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
+                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
+  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
+                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+}
+
+// Checks the list of mvs searched in the last iteration and sees if we are
+// repeating it. If so, return 1. Otherwise we update the last_mv_search_list
+// with current_mv and return 0.
+static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list,
+                                               const MV current_mv, int iter) {
+  if (last_mv_search_list) {
+    if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) {
+      return 1;
+    }
+
+    last_mv_search_list[iter].as_mv = current_mv;
+  }
+  return 0;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_evenmore(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching in. Iter 0 corresponds to fullpel
+  // mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    int dummy = 0;
+    get_cost_surf_min(cost_list, &ir, &ic, 2);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + 2 * ir, start_mv.col + 2 * ic };
+      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
+                        &besterr, sse1, distortion, &dummy);
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+
+    // Each subsequent iteration checks at least one point in common with the
+    // last iteration, and could check 2 (if diag selected), at 1/4-pel.
+    if (forced_stop != HALF_PEL) {
+      if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+        return INT_MAX;
+      }
+      iter++;
+
+      hstep >>= 1;
+      start_mv = *bestmv;
+      two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                            mv_cost_params, &besterr, sse1, distortion,
+                            iters_per_step);
+    }
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching in. Iter 0 corresponds to fullpel
+  // mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + ir * hstep,
+                           start_mv.col + ic * hstep };
+      int dummy = 0;
+      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
+                        &besterr, sse1, distortion, &dummy);
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration, and could check 2 (if diag selected), at 1/4-pel.
+  if (forced_stop != HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching in. Iter 0 corresponds to fullpel
+  // mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
+    const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+                                  (cost_list[2] < cost_list[4] ? 0 : 2);
+
+    const MV left_mv = { start_mv.row, start_mv.col - hstep };
+    const MV right_mv = { start_mv.row, start_mv.col + hstep };
+    const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
+    const MV top_mv = { start_mv.row - hstep, start_mv.col };
+
+    const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
+    const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
+    const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
+    const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
+
+    int dummy = 0;
+
+    switch (whichdir) {
+      case 0:  // bottom left quadrant
+        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 1:  // bottom right quadrant
+        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 2:  // top left quadrant
+        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 3:  // top right quadrant
+        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration, and could check 2 (if diag selected), at 1/4-pel.
+  if (forced_stop != HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 MV start_mv, MV *bestmv, int *distortion,
+                                 unsigned int *sse1,
+                                 int_mv *last_mv_search_list) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+  const SUBPEL_SEARCH_TYPE subpel_search_type =
+      ms_params->var_params.subpel_search_type;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+  // How many steps to take. A round of 0 means fullpel search only, 1 means
+  // half-pel, and so on.
+  const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+
+  unsigned int besterr = INT_MAX;
+
+  *bestmv = start_mv;
+
+  if (subpel_search_type != USE_2_TAPS_ORIG) {
+    besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
+                                           mv_cost_params, sse1, distortion);
+  } else {
+    besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                                 distortion);
+  }
+
+  for (int iter = 0; iter < round; ++iter) {
+    MV iter_center_mv = *bestmv;
+    if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv,
+                                     iter)) {
+      return INT_MAX;
+    }
+
+    MV diag_step;
+    if (subpel_search_type != USE_2_TAPS_ORIG) {
+      diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
+                                    mv_limits, var_params, mv_cost_params,
+                                    &besterr, sse1, distortion);
+    } else {
+      diag_step = first_level_check_fast(iter_center_mv, bestmv, hstep,
+                                         mv_limits, var_params, mv_cost_params,
+                                         &besterr, sse1, distortion);
+    }
+
+    // Check diagonal sub-pixel position
+    if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
+      second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
+                            mv_limits, var_params, mv_cost_params, &besterr,
+                            sse1, distortion);
+    }
+
+    hstep >>= 1;
+  }
+
+  return besterr;
+}
+
 // Note(yunqingwang): The following 2 functions are only used in the motion
 // vector unit test, which return extreme motion vectors allowed by the MV
 // limits.
-#define COMMON_MV_TEST              \
-  SETUP_SUBPEL_SEARCH;              \
-                                    \
-  (void)error_per_bit;              \
-  (void)vfp;                        \
-  (void)src_address;                \
-  (void)src_stride;                 \
-  (void)y;                          \
-  (void)y_stride;                   \
-  (void)second_pred;                \
-  (void)w;                          \
-  (void)h;                          \
-  (void)use_accurate_subpel_search; \
-  (void)offset;                     \
-  (void)mvjcost;                    \
-  (void)mvcost;                     \
-  (void)sse1;                       \
-  (void)distortion;                 \
-                                    \
-  (void)halfiters;                  \
-  (void)quarteriters;               \
-  (void)eighthiters;                \
-  (void)whichdir;                   \
-  (void)forced_stop;                \
-  (void)hstep;                      \
-                                    \
-  (void)tr;                         \
-  (void)tc;                         \
-  (void)sse;                        \
-  (void)thismse;                    \
-  (void)cost_list;
-// Return the maximum MV.
-int av1_return_max_sub_pixel_mv(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
-  COMMON_MV_TEST;
-  (void)mask;
-  (void)mask_stride;
-  (void)invert_mask;
-  (void)minr;
-  (void)minc;
-
+// Returns the maximum MV.
+int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                MV start_mv, MV *bestmv, int *distortion,
+                                unsigned int *sse1,
+                                int_mv *last_mv_search_list) {
+  (void)xd;
   (void)cm;
-  (void)mi_row;
-  (void)mi_col;
-  (void)do_reset_fractional_mv;
+  (void)start_mv;
+  (void)sse1;
+  (void)distortion;
+  (void)last_mv_search_list;
 
-  bestmv->row = maxr;
-  bestmv->col = maxc;
-  besterr = 0;
-  // In the sub-pel motion search, if hp is not used, then the last bit of mv
-  // has to be 0.
-  lower_mv_precision(bestmv, allow_hp, 0);
-  return besterr;
-}
-// Return the minimum MV.
-int av1_return_min_sub_pixel_mv(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
-  COMMON_MV_TEST;
-  (void)maxr;
-  (void)maxc;
-  (void)mask;
-  (void)mask_stride;
-  (void)invert_mask;
+  const int allow_hp = ms_params->allow_hp;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
 
-  (void)cm;
-  (void)mi_row;
-  (void)mi_col;
-  (void)do_reset_fractional_mv;
+  bestmv->row = mv_limits->row_max;
+  bestmv->col = mv_limits->col_max;
 
-  bestmv->row = minr;
-  bestmv->col = minc;
-  besterr = 0;
+  unsigned int besterr = 0;
+
   // In the sub-pel motion search, if hp is not used, then the last bit of mv
   // has to be 0.
   lower_mv_precision(bestmv, allow_hp, 0);
   return besterr;
 }
 
-void av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
-                              int mi_col, BLOCK_SIZE bsize, int ref,
-                              MV ref_mv_full, int num_planes,
-                              int use_subpixel) {
-  assert(num_planes == 1 &&
-         "Currently simple_motion_search only supports luma plane");
-  assert(!frame_is_intra_only(&cpi->common) &&
-         "Simple motion search only enabled for non-key frames");
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
+// Returns the minimum MV.
+int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                MV start_mv, MV *bestmv, int *distortion,
+                                unsigned int *sse1,
+                                int_mv *last_mv_search_list) {
+  (void)xd;
+  (void)cm;
+  (void)start_mv;
+  (void)sse1;
+  (void)distortion;
+  (void)last_mv_search_list;
 
-  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+  const int allow_hp = ms_params->allow_hp;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
 
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  mbmi->sb_type = bsize;
-  mbmi->ref_frame[0] = ref;
-  mbmi->ref_frame[1] = NONE_FRAME;
-  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  bestmv->row = mv_limits->row_min;
+  bestmv->col = mv_limits->col_min;
 
-  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
-  const YV12_BUFFER_CONFIG *scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, ref);
-  struct buf_2d backup_yv12;
-  // ref_mv is used to code the motion vector. ref_mv_full is the initial point.
-  // ref_mv is in units of 1/8 pel whereas ref_mv_full is in units of pel.
-  MV ref_mv = { 0, 0 };
-  const int step_param = cpi->mv_step_param;
-  const MvLimits tmp_mv_limits = x->mv_limits;
-  const SEARCH_METHODS search_methods = NSTEP;
-  const int do_mesh_search = 0;
-  const int sadpb = x->sadperbit16;
-  int cost_list[5];
-  const int ref_idx = 0;
-  int var;
+  unsigned int besterr = 0;
+  // In the sub-pel motion search, if hp is not used, then the last bit of mv
+  // has to be 0.
+  lower_mv_precision(bestmv, allow_hp, 0);
+  return besterr;
+}
 
-  av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
-                       get_ref_scale_factors(cm, ref), num_planes);
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-  if (scaled_ref_frame) {
-    backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
-    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
-  }
+// Computes the cost of the current predictor by going through the whole
+// av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv
+// during motion_mode_rd. We are going through the whole
+// av1_enc_build_inter_predictor because we might have changed the interpolation
+// filter, etc before motion_mode_rd is called.
+static INLINE unsigned int compute_motion_cost(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize,
+    const MV *this_mv) {
+  unsigned int mse;
+  unsigned int sse;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
 
-  // This overwrites the mv_limits so we will need to restore it later.
-  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
-  var = av1_full_pixel_search(
-      cpi, x, bsize, &ref_mv_full, step_param, search_methods, do_mesh_search,
-      sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
-      mi_col * MI_SIZE, mi_row * MI_SIZE, 0, &cpi->ss_cfg[SS_CFG_SRC]);
-  // Restore
-  x->mv_limits = tmp_mv_limits;
-
-  const int use_subpel_search =
-      var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel;
-  if (scaled_ref_frame) {
-    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
-  }
-  if (use_subpel_search) {
-    int not_used = 0;
-    if (cpi->sf.use_accurate_subpel_search) {
-      const int pw = block_size_wide[bsize];
-      const int ph = block_size_high[bsize];
-      cpi->find_fractional_mv_step(
-          x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
-          x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-          cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-          x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
-          NULL, 0, 0, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
-    } else {
-      cpi->find_fractional_mv_step(
-          x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
-          x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-          cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-          x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
-          NULL, 0, 0, 0, 0, 0, 1);
-    }
-  } else {
-    // Manually convert from units of pixel to 1/8-pixels if we are not doing
-    // subpel search
-    x->best_mv.as_mv.row *= 8;
-    x->best_mv.as_mv.col *= 8;
-  }
-
-  mbmi->mv[0].as_mv = x->best_mv.as_mv;
-
-  // Get a copy of the prediction output
   av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
                                 AOM_PLANE_Y, AOM_PLANE_Y);
 
-  aom_clear_system_state();
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
 
-  if (scaled_ref_frame) {
-    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  const uint8_t *const src = ms_buffers->src->buf;
+  const int src_stride = ms_buffers->src->stride;
+  const uint8_t *const dst = xd->plane[0].dst.buf;
+  const int dst_stride = xd->plane[0].dst.stride;
+  const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp;
+
+  mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
+  mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params);
+  return mse;
+}
+
+// Refines MV in a small range
+unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                  const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                  BLOCK_SIZE bsize, const int *pts0,
+                                  const int *pts_inref0, int total_samples) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  static const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
+                                   { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } };
+  MV *best_mv = &mbmi->mv[0].as_mv;
+
+  WarpedMotionParams best_wm_params = mbmi->wm_params;
+  int best_num_proj_ref = mbmi->num_proj_ref;
+  unsigned int bestmse;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+  const int start = ms_params->allow_hp ? 0 : 4;
+
+  // Calculate the center position's error
+  assert(av1_is_subpelmv_in_range(mv_limits, *best_mv));
+  bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv);
+
+  // MV search
+  int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  for (int ite = 0; ite < 2; ++ite) {
+    int best_idx = -1;
+
+    for (int idx = start; idx < start + 4; ++idx) {
+      unsigned int thismse;
+
+      MV this_mv = { best_mv->row + neighbors[idx].row,
+                     best_mv->col + neighbors[idx].col };
+      if (av1_is_subpelmv_in_range(mv_limits, this_mv)) {
+        memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+        memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+        if (total_samples > 1)
+          mbmi->num_proj_ref =
+              av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+
+        if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+                                 this_mv.row, this_mv.col, &mbmi->wm_params,
+                                 mi_row, mi_col)) {
+          thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv);
+
+          if (thismse < bestmse) {
+            best_idx = idx;
+            best_wm_params = mbmi->wm_params;
+            best_num_proj_ref = mbmi->num_proj_ref;
+            bestmse = thismse;
+          }
+        }
+      }
+    }
+
+    if (best_idx == -1) break;
+
+    if (best_idx >= 0) {
+      best_mv->row += neighbors[best_idx].row;
+      best_mv->col += neighbors[best_idx].col;
+    }
+  }
+
+  mbmi->wm_params = best_wm_params;
+  mbmi->num_proj_ref = best_num_proj_ref;
+  return bestmse;
+}
+// =============================================================================
+//  Subpixel Motion Search: OBMC
+// =============================================================================
+// Estimates the variance of prediction residue
+static INLINE int estimate_obmc_pref_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const int32_t *src = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int ref_stride = ms_buffers->ref->stride;
+
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse);
+}
+
+// Calculates the variance of prediction residue
+static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+                                     const MV *this_mv,
+                                     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+                                     unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+  const int w = var_params->w;
+  const int h = var_params->h;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int ref_stride = ms_buffers->ref->stride;
+
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  unsigned int besterr;
+  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                              subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+                              subpel_search_type);
+    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
+  } else {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                       subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+  }
+#else
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                     subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+  besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+#endif
+  return besterr;
+}
+
+static unsigned int setup_obmc_center_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  // TODO(chiyotsai@google.com): There might be a bug here where we didn't use
+  // get_buf_from_mv(ref, *this_mv).
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const uint8_t *ref = ms_buffers->ref->buf;
+  const int ref_stride = ms_buffers->ref->stride;
+  unsigned int besterr =
+      var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost_(this_mv, mv_cost_params);
+  return besterr;
+}
+
+static unsigned int upsampled_setup_obmc_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  unsigned int besterr =
+      upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost_(this_mv, mv_cost_params);
+  return besterr;
+}
+
+// Estimates the variance of prediction residue
+// TODO(chiyotsai@google.com): the cost does not match the cost in
+// mv_cost_. Investigate this later.
+static INLINE int estimate_obmc_mvcost(const MV *this_mv,
+                                       const MV_COST_PARAMS *mv_cost_params) {
+  const MV *ref_mv = mv_cost_params->ref_mv;
+  const int *mvjcost = mv_cost_params->mvjcost;
+  const int *const *mvcost = mv_cost_params->mvcost;
+  const int error_per_bit = mv_cost_params->error_per_bit;
+  const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type;
+  const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row),
+                       GET_MV_SUBPEL(this_mv->col - ref_mv->col) };
+
+  switch (mv_cost_type) {
+    case MV_COST_ENTROPY:
+      return (unsigned)((mv_cost(&diff_mv, mvjcost,
+                                 CONVERT_TO_CONST_MVCOST(mvcost)) *
+                             error_per_bit +
+                         4096) >>
+                        13);
+    case MV_COST_NONE: return 0;
+    default:
+      assert(0 && "L1 norm is not tuned for estimated obmc mvcost");
+      return 0;
   }
 }
 
-void av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                               int mi_col, BLOCK_SIZE bsize,
-                               const MV ref_mv_full, int use_subpixel,
-                               unsigned int *sse, unsigned int *var) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  const MV_REFERENCE_FRAME ref =
-      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+// Estimates whether this_mv is better than best_mv. This function takes both
+// prediction error and residue into account.
+static INLINE unsigned int obmc_check_better_fast(
+    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse);
 
-  av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, ref_mv_full, 1,
-                           use_subpixel);
+    cost = estimate_obmc_mvcost(this_mv, mv_cost_params);
+    cost += thismse;
 
-  const uint8_t *src = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *dst = xd->plane[0].dst.buf;
-  const int dst_stride = xd->plane[0].dst.stride;
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *has_better_mv |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
 
-  *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
+// Estimates whether this_mv is better than best_mv. This function takes both
+// prediction error and residue into account.
+static INLINE unsigned int obmc_check_better(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    const int thismse =
+        upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse);
+    cost = mv_err_cost_(this_mv, mv_cost_params);
+
+    cost += thismse;
+
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *has_better_mv |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+static AOM_FORCE_INLINE MV obmc_first_level_check(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv,
+    const int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  int dummy = 0;
+  const MV left_mv = { this_mv.row, this_mv.col - hstep };
+  const MV right_mv = { this_mv.row, this_mv.col + hstep };
+  const MV top_mv = { this_mv.row - hstep, this_mv.col };
+  const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+
+  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+    const unsigned int left =
+        obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
+                          mv_cost_params, besterr, sse1, distortion, &dummy);
+    const unsigned int right =
+        obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
+                          mv_cost_params, besterr, sse1, distortion, &dummy);
+    const unsigned int up =
+        obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
+                          mv_cost_params, besterr, sse1, distortion, &dummy);
+    const unsigned int down =
+        obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
+                          mv_cost_params, besterr, sse1, distortion, &dummy);
+
+    const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+    const MV diag_mv = { this_mv.row + diag_step.row,
+                         this_mv.col + diag_step.col };
+
+    // Check the diagonal direction with the best mv
+    obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy);
+
+    return diag_step;
+  } else {
+    const unsigned int left = obmc_check_better_fast(
+        &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1,
+        distortion, &dummy);
+    const unsigned int right = obmc_check_better_fast(
+        &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+        sse1, distortion, &dummy);
+
+    const unsigned int up = obmc_check_better_fast(
+        &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1,
+        distortion, &dummy);
+
+    const unsigned int down = obmc_check_better_fast(
+        &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+        sse1, distortion, &dummy);
+
+    const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+    const MV diag_mv = { this_mv.row + diag_step.row,
+                         this_mv.col + diag_step.col };
+
+    // Check the diagonal direction with the best mv
+    obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params,
+                           mv_cost_params, besterr, sse1, distortion, &dummy);
+
+    return diag_step;
+  }
+}
+
+// A newer version of second level check for obmc that gives better quality.
+static AOM_FORCE_INLINE void obmc_second_level_check_v2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
+    MV *best_mv, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  assert(best_mv->row == this_mv.row + diag_step.row ||
+         best_mv->col == this_mv.col + diag_step.col);
+  if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
+    return;
+  } else if (this_mv.row == best_mv->row) {
+    // Search away from diagonal step since diagonal search did not provide any
+    // improvement
+    diag_step.row *= -1;
+  } else if (this_mv.col == best_mv->col) {
+    diag_step.col *= -1;
+  }
+
+  const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
+  const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
+  const MV diag_bias_mv = { best_mv->row + diag_step.row,
+                            best_mv->col + diag_step.col };
+  int has_better_mv = 0;
+
+  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+    obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion,
+                      &has_better_mv);
+    obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion,
+                      &has_better_mv);
+
+    // Do an additional search if the second iteration gives a better mv
+    if (has_better_mv) {
+      obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+                        mv_cost_params, besterr, sse1, distortion,
+                        &has_better_mv);
+    }
+  } else {
+    obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params,
+                           mv_cost_params, besterr, sse1, distortion,
+                           &has_better_mv);
+    obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params,
+                           mv_cost_params, besterr, sse1, distortion,
+                           &has_better_mv);
+
+    // Do an additional search if the second iteration gives a better mv
+    if (has_better_mv) {
+      obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params,
+                             mv_cost_params, besterr, sse1, distortion,
+                             &has_better_mv);
+    }
+  }
+}
+
+int av1_find_best_obmc_sub_pixel_tree_up(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)last_mv_search_list;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+  const SUBPEL_SEARCH_TYPE subpel_search_type =
+      ms_params->var_params.subpel_search_type;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+  int hstep = INIT_SUBPEL_STEP_SIZE;
+  const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  if (subpel_search_type != USE_2_TAPS_ORIG)
+    besterr = upsampled_setup_obmc_center_error(
+        xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion);
+  else
+    besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1,
+                                      distortion);
+
+  for (int iter = 0; iter < round; ++iter) {
+    MV iter_center_mv = *bestmv;
+    MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
+                                          mv_limits, var_params, mv_cost_params,
+                                          &besterr, sse1, distortion);
+
+    if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
+      obmc_second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
+                                 mv_limits, var_params, mv_cost_params,
+                                 &besterr, sse1, distortion);
+    }
+    hstep >>= 1;
+  }
+
+  return besterr;
+}
+
+// =============================================================================
+//  Public cost function: mv_cost + pred error
+// =============================================================================
+int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
+                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV mv = get_mv_from_fullmv(best_mv);
+  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
+  unsigned int sse, var;
+
+  var = vfp->vf(what->buf, what->stride, get_buf_from_fullmv(in_what, best_mv),
+                in_what->stride, &sse);
+  (void)var;
+
+  return sse + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
+                           CONVERT_TO_CONST_MVCOST(x->mv_cost_stack),
+                           x->errorperbit, mv_cost_type);
+}
+
+static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params,
+                                    const FULLPEL_MV best_mv,
+                                    const uint8_t *second_pred,
+                                    const aom_variance_fn_ptr_t *vfp,
+                                    const struct buf_2d *src,
+                                    const struct buf_2d *pre) {
+  const struct buf_2d *const what = src;
+  const struct buf_2d *const in_what = pre;
+  const MV mv = get_mv_from_fullmv(&best_mv);
+  unsigned int unused;
+
+  return vfp->svaf(get_buf_from_fullmv(in_what, &best_mv), in_what->stride, 0,
+                   0, what->buf, what->stride, &unused, second_pred) +
+         mv_err_cost_(&mv, mv_cost_params);
+}
+
+static INLINE int get_mvpred_mask_var(
+    const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv,
+    const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
+    int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src,
+    const struct buf_2d *pre) {
+  const struct buf_2d *const what = src;
+  const struct buf_2d *const in_what = pre;
+  const MV mv = get_mv_from_fullmv(&best_mv);
+  unsigned int unused;
+
+  return vfp->msvf(what->buf, what->stride, 0, 0,
+                   get_buf_from_fullmv(in_what, &best_mv), in_what->stride,
+                   second_pred, mask, mask_stride, invert_mask, &unused) +
+         mv_err_cost_(&mv, mv_cost_params);
+}
+
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params,
+                                const FULLPEL_MV best_mv,
+                                const uint8_t *second_pred, const uint8_t *mask,
+                                int mask_stride, int invert_mask,
+                                const aom_variance_fn_ptr_t *vfp,
+                                const struct buf_2d *src,
+                                const struct buf_2d *pre) {
+  if (mask) {
+    return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask,
+                               mask_stride, invert_mask, vfp, src, pre);
+  } else {
+    return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src,
+                             pre);
+  }
 }
diff --git a/libaom/av1/encoder/mcomp.h b/libaom/av1/encoder/mcomp.h
index 71547da..73135d8 100644
--- a/libaom/av1/encoder/mcomp.h
+++ b/libaom/av1/encoder/mcomp.h
@@ -12,6 +12,7 @@
 #ifndef AOM_AV1_ENCODER_MCOMP_H_
 #define AOM_AV1_ENCODER_MCOMP_H_
 
+#include "av1/common/mv.h"
 #include "av1/encoder/block.h"
 
 #include "aom_dsp/variance.h"
@@ -28,9 +29,6 @@
 #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
 // Maximum size of the first step in full pel units
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
-// Allowed motion vector pixel distance outside image border
-// for Block_16x16
-#define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND)
 
 #define SEARCH_RANGE_8P 3
 #define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
@@ -39,77 +37,246 @@
 
 // motion search site
 typedef struct search_site {
-  MV mv;
+  FULLPEL_MV mv;
   int offset;
 } search_site;
 
 typedef struct search_site_config {
-  search_site ss[8 * MAX_MVSEARCH_STEPS + 1];
+  search_site ss[MAX_MVSEARCH_STEPS * 2][16 + 1];
   int ss_count;
-  int searches_per_step;
+  int searches_per_step[MAX_MVSEARCH_STEPS * 2];
+  int radius[MAX_MVSEARCH_STEPS * 2];
+  int stride;
 } search_site_config;
 
 typedef struct {
-  MV coord;
+  FULLPEL_MV coord;
   int coord_offset;
 } search_neighbors;
 
-void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
-void av1_init3smotion_compensation(search_site_config *cfg, int stride);
-
-void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv);
-
-int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
-                    int *mvcost[2], int weight);
-
-// Utility to compute variance + MV rate cost for a given MV
-int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
-                       const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
-                       int use_mvcost);
-int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
-                          const MV *center_mv, const uint8_t *second_pred,
-                          const aom_variance_fn_ptr_t *vfp, int use_mvcost);
-int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
-                            const MV *center_mv, const uint8_t *second_pred,
-                            const uint8_t *mask, int mask_stride,
-                            int invert_mask, const aom_variance_fn_ptr_t *vfp,
-                            int use_mvcost);
-
 struct AV1_COMP;
 struct SPEED_FEATURES;
 
-int av1_init_search_range(int size);
+// =============================================================================
+//  Cost functions
+// =============================================================================
+typedef struct {
+  const MV *ref_mv;
+  FULLPEL_MV full_ref_mv;
+  const int *mvjcost;
+  const int *mvcost[2];
+  int error_per_bit;
+  int sad_per_bit;
+  MV_COST_TYPE mv_cost_type;
+} MV_COST_PARAMS;
 
-int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit,
-                            int distance, const aom_variance_fn_ptr_t *fn_ptr,
-                            const MV *center_mv);
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+                    int *mvcost[2], int weight);
+
+int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
+                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp);
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params,
+                                const FULLPEL_MV best_mv,
+                                const uint8_t *second_pred, const uint8_t *mask,
+                                int mask_stride, int invert_mask,
+                                const aom_variance_fn_ptr_t *vfp,
+                                const struct buf_2d *src,
+                                const struct buf_2d *pre);
+
+// =============================================================================
+//  Motion Search
+// =============================================================================
+typedef struct {
+  // The reference buffer
+  const struct buf_2d *ref;
+
+  // The source and predictors/mask used by translational search
+  const struct buf_2d *src;
+  const uint8_t *second_pred;
+  const uint8_t *mask;
+  int mask_stride;
+  int inv_mask;
+
+  // The weighted source and mask used by OBMC
+  const int32_t *wsrc;
+  const int32_t *obmc_mask;
+} MSBuffers;
+
+static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers,
+                                            const uint8_t *second_pred,
+                                            const uint8_t *mask,
+                                            int mask_stride, int invert_mask) {
+  ms_buffers->second_pred = second_pred;
+  ms_buffers->mask = mask;
+  ms_buffers->mask_stride = mask_stride;
+  ms_buffers->inv_mask = invert_mask;
+}
+
+// =============================================================================
+//  Fullpixel Motion Search
+// =============================================================================
+enum {
+  DIAMOND = 0,
+  NSTEP = 1,
+  HEX = 2,
+  BIGDIA = 3,
+  SQUARE = 4,
+  FAST_HEX = 5,
+  FAST_DIAMOND = 6
+} UENUM1BYTE(SEARCH_METHODS);
+
+// This struct holds fullpixel motion search parameters that should be constant
+// during the search
+typedef struct {
+  BLOCK_SIZE bsize;
+  const aom_variance_fn_ptr_t *vfp;
+
+  MSBuffers ms_buffers;
+
+  SEARCH_METHODS search_method;
+  const search_site_config *search_sites;
+  FullMvLimits mv_limits;
+
+  int run_mesh_search;    // Sets mesh search unless it got pruned by
+                          // prune_mesh_search.
+  int prune_mesh_search;  // Disables mesh search if the best_mv after a normal
+                          // search if close to the start_mv.
+  int force_mesh_thresh;  // Forces mesh search if the residue variance is
+                          // higher than the threshold.
+  const struct MESH_PATTERN *mesh_patterns[2];
+
+  int is_intra_mode;
+
+  int fast_obmc_search;
+
+  // For calculating mv cost
+  MV_COST_PARAMS mv_cost_params;
+} FULLPEL_MOTION_SEARCH_PARAMS;
+
+void av1_make_default_fullpel_ms_params(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                        const struct AV1_COMP *cpi,
+                                        const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                        const MV *ref_mv,
+                                        const search_site_config *search_sites);
+
+// Sets up configs for fullpixel diamond search
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
+// Sets up configs for firstpass motion search
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for all other types of motion search
+void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+
+// Set up limit values for MV components.
+// Mv beyond the range do not produce new/different prediction block.
+static INLINE void av1_set_mv_row_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_height, int border) {
+  const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int min2 = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->row_min = AOMMAX(min1, min2);
+  const int max1 = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE +
+                   border - 2 * AOM_INTERP_EXTEND;
+  const int max2 =
+      (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->row_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_col_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_col, int mi_width, int border) {
+  const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int min2 = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->col_min = AOMMAX(min1, min2);
+  const int max1 = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE + border -
+                   2 * AOM_INTERP_EXTEND;
+  const int max2 =
+      (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->col_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_col, int mi_height, int mi_width, int border) {
+  av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border);
+  av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border);
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
+
+int av1_init_search_range(int size);
 
 unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
                                            MACROBLOCK *x, BLOCK_SIZE bsize,
                                            int mi_row, int mi_col,
                                            const MV *ref_mv);
 
-// Runs sequence of diamond searches in smaller steps for RD.
-int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                           MV *mvp_full, int step_param, int sadpb,
-                           int further_steps, int do_refine, int *cost_list,
-                           const aom_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, MV *dst_mv);
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                             const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
 
-int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
-                   int sad_per_bit, int do_init_search, int *cost_list,
-                   const aom_variance_fn_ptr_t *vfp, int use_mvcost,
-                   const MV *center_mv);
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                          const int step_param, int *cost_list,
+                          FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv);
 
-typedef int(fractional_mv_step_fp)(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp,
-    int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
-    int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
-    int *distortion, unsigned int *sse1, const uint8_t *second_pred,
-    const uint8_t *mask, int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search, const int do_reset_fractional_mv);
+int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd,
+                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                            IntraBCHashInfo *intrabc_hash_info,
+                            FULLPEL_MV *best_mv);
+
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                               const int step_param, FULLPEL_MV *best_mv);
+
+static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits,
+                                         FULLPEL_MV mv) {
+  return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
+         (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+// =============================================================================
+//  Subpixel Motion Search
+// =============================================================================
+enum {
+  EIGHTH_PEL,
+  QUARTER_PEL,
+  HALF_PEL,
+  FULL_PEL
+} UENUM1BYTE(SUBPEL_FORCE_STOP);
+
+typedef struct {
+  const aom_variance_fn_ptr_t *vfp;
+  SUBPEL_SEARCH_TYPE subpel_search_type;
+  // Source and reference buffers
+  MSBuffers ms_buffers;
+  int w, h;
+} SUBPEL_SEARCH_VAR_PARAMS;
+
+// This struct holds subpixel motion search parameters that should be constant
+// during the search
+typedef struct {
+  // High level motion search settings
+  int allow_hp;
+  const int *cost_list;
+  SUBPEL_FORCE_STOP forced_stop;
+  int iters_per_step;
+  SubpelMvLimits mv_limits;
+
+  // For calculating mv cost
+  MV_COST_PARAMS mv_cost_params;
+
+  // Distortion calculation params
+  SUBPEL_SEARCH_VAR_PARAMS var_params;
+} SUBPEL_MOTION_SEARCH_PARAMS;
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                       const struct AV1_COMP *cpi,
+                                       const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                       const MV *ref_mv, const int *cost_list);
+
+typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                   const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                   MV start_mv, MV *bestmv, int *distortion,
+                                   unsigned int *sse1,
+                                   int_mv *last_mv_search_list);
 
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
@@ -117,63 +284,12 @@
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore;
 extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
 extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
+extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up;
 
-typedef int (*av1_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
-                                    int sad_per_bit, int distance,
-                                    const aom_variance_fn_ptr_t *fn_ptr,
-                                    const MV *center_mv, MV *best_mv);
-
-typedef int (*av1_diamond_search_fn_t)(
-    MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
-    int search_param, int sad_per_bit, int *num00,
-    const aom_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
-
-int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
-                             const aom_variance_fn_ptr_t *fn_ptr,
-                             const uint8_t *mask, int mask_stride,
-                             int invert_mask, const MV *center_mv,
-                             const uint8_t *second_pred);
-
-int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                          BLOCK_SIZE bsize, MV *mvp_full, int step_param,
-                          int method, int run_mesh_search, int error_per_bit,
-                          int *cost_list, const MV *ref_mv, int var_max, int rd,
-                          int x_pos, int y_pos, int intra,
-                          const search_site_config *cfg);
-
-int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                               MV *mvp_full, int step_param, int sadpb,
-                               int further_steps, int do_refine,
-                               const aom_variance_fn_ptr_t *fn_ptr,
-                               const MV *ref_mv, MV *dst_mv, int is_second,
-                               const search_site_config *cfg);
-int av1_find_best_obmc_sub_pixel_tree_up(
-    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
-    int is_second, int use_accurate_subpel_search);
-
-unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi,
-                                     MACROBLOCK *const x, BLOCK_SIZE bsize,
-                                     int mi_row, int mi_col, const MV *this_mv);
-unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
-                                  MACROBLOCK *const x, BLOCK_SIZE bsize,
-                                  int mi_row, int mi_col, int *pts0,
-                                  int *pts_inref0, int total_samples);
-
-// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
-// ref. Note that this sets the offset of mbmi, so we will need to reset it
-// after calling this function.
-void av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize, int ref,
-                              MV ref_mv_full, int num_planes, int use_subpixel);
-
-// Performs a simple motion search to calculate the sse and var of the residue
-void av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                               int mi_col, BLOCK_SIZE bsize,
-                               const MV ref_mv_full, int use_subpixel,
-                               unsigned int *sse, unsigned int *var);
+unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                  const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                  BLOCK_SIZE bsize, const int *pts0,
+                                  const int *pts_inref0, int total_samples);
 
 static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) {
   for (int z = 0; z < 3; z++) {
@@ -181,6 +297,31 @@
   }
 }
 
+static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits,
+                                                  const FullMvLimits *mv_limits,
+                                                  const MV *ref_mv) {
+  const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL);
+  const int minc =
+      AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - max_mv);
+  const int maxc =
+      AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + max_mv);
+  const int minr =
+      AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - max_mv);
+  const int maxr =
+      AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + max_mv);
+
+  subpel_limits->col_min = AOMMAX(MV_LOW + 1, minc);
+  subpel_limits->col_max = AOMMIN(MV_UPP - 1, maxc);
+  subpel_limits->row_min = AOMMAX(MV_LOW + 1, minr);
+  subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr);
+}
+
+static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits,
+                                           MV mv) {
+  return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
+         (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/misc_model_weights.h b/libaom/av1/encoder/misc_model_weights.h
new file mode 100644
index 0000000..f00aeab
--- /dev/null
+++ b/libaom/av1/encoder/misc_model_weights.h
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define MV_PREC_FEATURE_SIZE 18
+
+#define NUM_DNN_LAYERS 1
+#define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE
+#define MV_PREC_LAYER_SIZE_0 32
+#define NUM_LOGITS 1
+
+const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f,
+                                                       141.6251917346238f,
+                                                       0.36313633945679064f,
+                                                       0.0028162791958822085f,
+                                                       0.000484820537626698f,
+                                                       0.002769969388939025f,
+                                                       0.0f,
+                                                       0.00031274626720947577f,
+                                                       0.00020578555375160075f,
+                                                       0.0007075246732697733f,
+                                                       0.000539641029909925f,
+                                                       0.0013939401375906984f,
+                                                       4.985394760423499f,
+                                                       4.985394760423499f,
+                                                       4.9992148717283085f,
+                                                       5.143739822380163f,
+                                                       5.518483124004564f,
+                                                       87.63597847427077f };
+
+const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f,
+                                                      68.04472572607503f,
+                                                      13.23247674430399f,
+                                                      0.0029123438396921955f,
+                                                      0.0015331406169374737f,
+                                                      0.0029149813096313775f,
+                                                      1.0f,
+                                                      0.00047501102871357813f,
+                                                      0.00030025962993117947f,
+                                                      0.0009861163580391207f,
+                                                      0.0012157593528004055f,
+                                                      0.002004954948490521f,
+                                                      6.539447500484038f,
+                                                      6.539447500484038f,
+                                                      6.396589058279465f,
+                                                      3.4870155874262516f,
+                                                      3.8911353973740535f,
+                                                      112.07985259573601f };
+
+const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f,
+                                                 -0.1483527373474774f,
+                                                 0.08112076098858864f,
+                                                 -0.9582568679627453f,
+                                                 -0.34794757171071206f,
+                                                 0.6465225723304947f,
+                                                 0.0f,
+                                                 0.06754171885839604f,
+                                                 0.27156803620541214f,
+                                                 0.10635231245664407f,
+                                                 -0.031183926995968583f,
+                                                 0.048122572260291f,
+                                                 -0.19498534230045128f,
+                                                 -0.2614116319273316f,
+                                                 -0.3223762845136331f,
+                                                 -1.2063368350609205f,
+                                                 -0.523333556911706f,
+                                                 1.075632260890728f,
+                                                 0.48989726814387946f,
+                                                 -0.34816466111070477f,
+                                                 0.41668357610256473f,
+                                                 -1.0973562848791671f,
+                                                 0.04183921854389494f,
+                                                 -0.9123815389260476f,
+                                                 0.0f,
+                                                 0.859965047744027f,
+                                                 0.1962095804679813f,
+                                                 0.2606564339077058f,
+                                                 0.26695868715184895f,
+                                                 0.5319308568326692f,
+                                                 -0.23717505799723165f,
+                                                 -0.43127224481782567f,
+                                                 -0.3214545776203726f,
+                                                 0.5850852241402176f,
+                                                 -0.26705531612587813f,
+                                                 -0.5786016766610093f,
+                                                 0.9360519909983003f,
+                                                 0.20771329289016555f,
+                                                 -0.027614159544811823f,
+                                                 -1.175022807046164f,
+                                                 -0.07578967497693835f,
+                                                 0.6890172485324256f,
+                                                 0.0f,
+                                                 -0.008008338164988263f,
+                                                 -0.08064800010158935f,
+                                                 -0.22606910981666667f,
+                                                 0.4541586669210879f,
+                                                 0.07731527661370792f,
+                                                 -0.6744475941247964f,
+                                                 -0.2625842448396184f,
+                                                 1.7018613444303785f,
+                                                 -0.08622229073162656f,
+                                                 0.041858142814941275f,
+                                                 -0.24575964090386415f,
+                                                 -0.046626044730994964f,
+                                                 0.7608713064175202f,
+                                                 -0.23330119070907146f,
+                                                 -0.10115510984500826f,
+                                                 0.9722537349192069f,
+                                                 0.11718554254290829f,
+                                                 0.0f,
+                                                 0.2075123446014759f,
+                                                 0.09465167310768637f,
+                                                 0.7609896851963016f,
+                                                 0.4441038581385328f,
+                                                 0.26064144727430955f,
+                                                 -0.14678625366485035f,
+                                                 -0.03597014452200524f,
+                                                 0.3128680867196166f,
+                                                 1.102496797385966f,
+                                                 0.06642253233084111f,
+                                                 -1.2665494483407629f,
+                                                 0.09049412632000911f,
+                                                 -1.1160621999565095f,
+                                                 0.043420275255913035f,
+                                                 -0.8811412259978966f,
+                                                 0.21076234632287777f,
+                                                 0.16571534463543866f,
+                                                 0.0f,
+                                                 -0.7324075176473275f,
+                                                 -0.3677622514459495f,
+                                                 0.3273532243056415f,
+                                                 0.22922161936797775f,
+                                                 0.8204766691058087f,
+                                                 0.02982161033720488f,
+                                                 0.5266419954188112f,
+                                                 -1.0032154963302191f,
+                                                 0.7007602969763729f,
+                                                 0.37196355167990885f,
+                                                 -0.7608579453228548f,
+                                                 0.08568111584781847f,
+                                                 0.07011061059123677f,
+                                                 0.3233263598082507f,
+                                                 -0.08249928295410253f,
+                                                 0.08220165761319252f,
+                                                 0.22148722752246794f,
+                                                 0.0f,
+                                                 0.6122392701743506f,
+                                                 -0.26429838296378333f,
+                                                 0.31958081620005463f,
+                                                 -0.006027177397853826f,
+                                                 -0.3088310785887994f,
+                                                 -0.5436192046707807f,
+                                                 -0.011080356757423306f,
+                                                 0.12632650770008413f,
+                                                 -0.45097913215234525f,
+                                                 1.8008072867127298f,
+                                                 -0.7630029654575501f,
+                                                 -0.4054774329826579f,
+                                                 0.40386074452544535f,
+                                                 -0.18541426257453025f,
+                                                 0.2444879765079863f,
+                                                 -0.6216724756115081f,
+                                                 0.27030299321302f,
+                                                 0.0f,
+                                                 -0.6835848952967989f,
+                                                 -0.7914184320964815f,
+                                                 -0.6761595019582928f,
+                                                 -1.009565565604081f,
+                                                 -0.1904242439353305f,
+                                                 0.4463417126318631f,
+                                                 0.6025503823452971f,
+                                                 0.5149990860115566f,
+                                                 1.0242970663937634f,
+                                                 0.037947306826401385f,
+                                                 0.07039339786212848f,
+                                                 0.14273796789711987f,
+                                                 0.168103961425691f,
+                                                 1.6596066376811978f,
+                                                 0.19321092229384657f,
+                                                 -0.3710750388148514f,
+                                                 -0.01717015559410288f,
+                                                 0.0f,
+                                                 0.3005688477942597f,
+                                                 0.23877080653829577f,
+                                                 0.2718594552971173f,
+                                                 0.3885402571589898f,
+                                                 0.32999531945669247f,
+                                                 -0.6134460954213243f,
+                                                 -0.13972265462799183f,
+                                                 -0.07180089575716991f,
+                                                 -1.014572598188105f,
+                                                 0.0717207322809836f,
+                                                 0.34896157745155615f,
+                                                 -0.27127687591403f,
+                                                 -0.5058651212773623f,
+                                                 -1.5442435628306925f,
+                                                 -0.6399784724734707f,
+                                                 0.6274301429074947f,
+                                                 -0.4645750072767051f,
+                                                 0.0f,
+                                                 -0.2406726815244178f,
+                                                 -0.06321214115916597f,
+                                                 0.312856714253404f,
+                                                 0.16459514124116134f,
+                                                 0.3993579604809623f,
+                                                 -0.15232044351561913f,
+                                                 -0.5613743948568469f,
+                                                 0.7219801372223262f,
+                                                 0.2936857469624009f,
+                                                 0.7823466656034087f,
+                                                 -0.12416947814098349f,
+                                                 -0.36413756654028345f,
+                                                 -0.07992098796866462f,
+                                                 -0.7395722879842416f,
+                                                 0.8639913543220514f,
+                                                 -0.311931773757945f,
+                                                 -1.7308240470400613f,
+                                                 0.0f,
+                                                 0.394499716712104f,
+                                                 0.6511462819539963f,
+                                                 -0.0722425275974144f,
+                                                 0.13490818194661386f,
+                                                 0.055319135836378035f,
+                                                 0.15389577508097013f,
+                                                 0.28958598328870605f,
+                                                 -0.14608429470539772f,
+                                                 0.09488817462478298f,
+                                                 -0.17231294096622088f,
+                                                 0.6721115415911466f,
+                                                 -0.05664621150536103f,
+                                                 0.03291799673669331f,
+                                                 0.02845382711057482f,
+                                                 -0.9953563446999164f,
+                                                 -0.17994298220605923f,
+                                                 0.6560824519337476f,
+                                                 0.0f,
+                                                 -0.30990646375917935f,
+                                                 0.17215517202874f,
+                                                 0.2026816225170481f,
+                                                 0.22011958747715601f,
+                                                 0.3562520768889686f,
+                                                 -0.18436559057189175f,
+                                                 0.1733377147302066f,
+                                                 0.02818276995640877f,
+                                                 -0.29703005574859076f,
+                                                 -0.3310652639215064f,
+                                                 -1.6091173258529277f,
+                                                 0.45461585790028003f,
+                                                 -0.5078643334592593f,
+                                                 -0.338997374732338f,
+                                                 0.4688619590359733f,
+                                                 0.627099126828289f,
+                                                 -0.5249801376494249f,
+                                                 0.0f,
+                                                 0.34465498218272883f,
+                                                 0.009891680630908135f,
+                                                 -0.27244020967349f,
+                                                 0.05404589867626979f,
+                                                 -0.06220329325739666f,
+                                                 -0.13365376464759104f,
+                                                 -0.13098573553512366f,
+                                                 0.11434198976289106f,
+                                                 0.6740951247574676f,
+                                                 1.3381727185724581f,
+                                                 -1.4865773213251936f,
+                                                 0.05809898701966341f,
+                                                 0.25380780261023456f,
+                                                 1.2716367496512722f,
+                                                 0.1768290070780598f,
+                                                 -0.07554828135356352f,
+                                                 0.8180570085344856f,
+                                                 0.0f,
+                                                 1.0788448980077463f,
+                                                 0.0651938742459459f,
+                                                 0.3807672030015587f,
+                                                 0.6144792680268445f,
+                                                 0.011660612214908059f,
+                                                 -0.018306023765580288f,
+                                                 0.44140813809926516f,
+                                                 -0.13411994195502386f,
+                                                 0.15920368955127778f,
+                                                 -0.19382358417849888f,
+                                                 -0.08802147969690055f,
+                                                 -0.019731052733814477f,
+                                                 0.1104744229169665f,
+                                                 -0.195834419735958f,
+                                                 -0.5005295046454347f,
+                                                 -0.17041241868229032f,
+                                                 -0.471942117351489f,
+                                                 0.0f,
+                                                 -0.3599073304761372f,
+                                                 -0.2745532782968519f,
+                                                 -0.8323064841106417f,
+                                                 -0.88355885384943f,
+                                                 -0.02826466859020679f,
+                                                 0.06977870308805256f,
+                                                 0.11926112095374196f,
+                                                 1.367382707959643f,
+                                                 -0.06119843162964051f,
+                                                 -0.5331395268889569f,
+                                                 -1.2155531584240624f,
+                                                 -0.01896651779524327f,
+                                                 0.10591845408571081f,
+                                                 -0.010632842156504733f,
+                                                 0.6150787968629282f,
+                                                 -0.4191690185896091f,
+                                                 -0.9961718918346271f,
+                                                 0.0f,
+                                                 0.23370364516013867f,
+                                                 0.4156033072362998f,
+                                                 0.1261005546633433f,
+                                                 0.0812413884532226f,
+                                                 -0.008894337353937203f,
+                                                 0.07984447025056046f,
+                                                 -0.1258098052766725f,
+                                                 -0.40245475467767916f,
+                                                 1.78188906675019f,
+                                                 -1.1544387954232302f,
+                                                 -0.41768781481273387f,
+                                                 0.6791211165341995f,
+                                                 -0.4175127856183446f,
+                                                 -0.07353219159767788f,
+                                                 -0.2888813577574072f,
+                                                 -0.7107767892597061f,
+                                                 -1.0450031091195449f,
+                                                 0.0f,
+                                                 -0.9221599545079143f,
+                                                 -0.6747876356740621f,
+                                                 0.30241454354872105f,
+                                                 0.4924965303373908f,
+                                                 -0.14042722740054084f,
+                                                 0.27744210409350445f,
+                                                 -0.14788270997426836f,
+                                                 -0.9081467469237995f,
+                                                 -0.04513115674995093f,
+                                                 -0.5254168669125793f,
+                                                 -0.6999012037974789f,
+                                                 0.434661246306547f,
+                                                 -0.7193303957246092f,
+                                                 -0.9117952623409744f,
+                                                 -1.5097267865916142f,
+                                                 -0.20779888103770922f,
+                                                 0.4935562480901218f,
+                                                 0.0f,
+                                                 0.18303393908923593f,
+                                                 0.34753722677570037f,
+                                                 0.29291001533177663f,
+                                                 0.3832351878354224f,
+                                                 0.3295194956120599f,
+                                                 -0.32398033003617527f,
+                                                 -0.31570906736433746f,
+                                                 0.23657779050372962f,
+                                                 0.9510794465234161f,
+                                                 -0.5122243902568278f,
+                                                 0.08652112725315658f,
+                                                 0.2246634353717998f,
+                                                 -0.9032595595582497f,
+                                                 -0.8936484034533545f,
+                                                 0.6012969720865752f,
+                                                 -0.6454216646117924f,
+                                                 -1.1753786049658332f,
+                                                 0.0f,
+                                                 -0.4360545677728656f,
+                                                 -0.6586237455328507f,
+                                                 -0.34347301697886656f,
+                                                 -0.8909724651992144f,
+                                                 -0.24378721818350263f,
+                                                 0.6179733359297576f,
+                                                 0.0661661181742234f,
+                                                 -0.14120142044993794f,
+                                                 -0.07732699885498932f,
+                                                 1.0221355882357506f,
+                                                 0.44514798994115284f,
+                                                 -0.7371569579959046f,
+                                                 -0.7212499572378936f,
+                                                 0.7453626921081045f,
+                                                 0.5478757761345768f,
+                                                 -0.39411232789985384f,
+                                                 0.7200542656743857f,
+                                                 0.0f,
+                                                 -0.11790869453118827f,
+                                                 -0.12317030713581928f,
+                                                 -0.4207902738133338f,
+                                                 0.15895105878327986f,
+                                                 0.304261777102111f,
+                                                 0.11450744587017621f,
+                                                 -0.11470709991317944f,
+                                                 0.5949222371739038f,
+                                                 0.6549518619412444f,
+                                                 -0.24390606570422838f,
+                                                 -0.4212796009440803f,
+                                                 -0.6269666206320964f,
+                                                 -0.5421193969807078f,
+                                                 -0.12297772128652287f,
+                                                 0.021517257619930424f,
+                                                 0.25462855095544523f,
+                                                 -0.22107798187348246f,
+                                                 0.0f,
+                                                 0.5204516300095662f,
+                                                 0.2837402841862462f,
+                                                 0.11310823283285916f,
+                                                 0.8944351685018025f,
+                                                 0.17487203235834015f,
+                                                 -0.5271221928634433f,
+                                                 -0.19516594503423199f,
+                                                 0.452456617580365f,
+                                                 1.2456272242706414f,
+                                                 0.24166615894862817f,
+                                                 0.09411429305204502f,
+                                                 -0.2730072283327243f,
+                                                 -0.8129383770918172f,
+                                                 -0.24093254193486136f,
+                                                 0.5696499174142177f,
+                                                 -0.11110805836073044f,
+                                                 -0.3968204166235694f,
+                                                 0.0f,
+                                                 -0.04388165369378549f,
+                                                 -0.005631266017272595f,
+                                                 -0.02574211858479705f,
+                                                 0.06230399626660669f,
+                                                 0.17677671232932785f,
+                                                 0.5172871274400965f,
+                                                 0.4919150085620063f,
+                                                 -1.597656637582941f,
+                                                 0.02415185715719143f,
+                                                 -0.17945446376668306f,
+                                                 -0.39340600199798886f,
+                                                 0.25013205256886845f,
+                                                 0.05972330340308685f,
+                                                 0.1359911505596489f,
+                                                 -0.02341033271820833f,
+                                                 0.15726074644063684f,
+                                                 0.47512625913020357f,
+                                                 0.0f,
+                                                 0.7327341664835779f,
+                                                 -0.3689092312320013f,
+                                                 0.4571824787436036f,
+                                                 0.6215465537945456f,
+                                                 0.0944111296842023f,
+                                                 -0.12571956176607574f,
+                                                 -0.2507235674395462f,
+                                                 -0.09579602654351593f,
+                                                 1.4463357293728496f,
+                                                 0.749153535856049f,
+                                                 -0.5553955120807588f,
+                                                 -0.09622771929369946f,
+                                                 -0.2598697420394813f,
+                                                 -0.964691815299676f,
+                                                 -0.8289963178173902f,
+                                                 0.7112949291983329f,
+                                                 -0.8667009730492162f,
+                                                 0.0f,
+                                                 -0.48698304169042794f,
+                                                 -0.18786095669893707f,
+                                                 -0.11425249263203247f,
+                                                 -0.3693391011684809f,
+                                                 0.09933145842585253f,
+                                                 0.2568559685298844f,
+                                                 0.7048512233651738f,
+                                                 0.6056238412407038f,
+                                                 -0.4355558119826642f,
+                                                 0.17318931883915484f,
+                                                 0.6481333496429564f,
+                                                 -0.45728823054344486f,
+                                                 -0.006325004538589701f,
+                                                 0.45609864075494927f,
+                                                 -0.6199385981116988f,
+                                                 0.035105808783046165f,
+                                                 0.1203147963894839f,
+                                                 0.0f,
+                                                 0.383402190836527f,
+                                                 0.048429009055370106f,
+                                                 0.5887186439275204f,
+                                                 -0.20538767641607814f,
+                                                 -0.031237879611002117f,
+                                                 0.3140759860883231f,
+                                                 0.24447070584999556f,
+                                                 0.7271263905705878f,
+                                                 0.8432799162434237f,
+                                                 -0.11530577554199217f,
+                                                 -0.7781023892314718f,
+                                                 0.05359488822710336f,
+                                                 0.5624870388700809f,
+                                                 0.5134656523208906f,
+                                                 0.18304041423438375f,
+                                                 -0.04237421156328257f,
+                                                 -0.20759809886942207f,
+                                                 0.0f,
+                                                 -0.06249337454975615f,
+                                                 0.10081284533873777f,
+                                                 0.3894374350259183f,
+                                                 1.518217777528342f,
+                                                 -0.9100037950171563f,
+                                                 0.17796906121831477f,
+                                                 -0.2892167255357892f,
+                                                 0.6117902467884032f,
+                                                 0.13332120964959573f,
+                                                 -0.3487155932849374f,
+                                                 -0.32920583745734694f,
+                                                 0.08242631209809854f,
+                                                 -0.24920225708110588f,
+                                                 0.8401757259392635f,
+                                                 0.11729108681358365f,
+                                                 0.11222925752499184f,
+                                                 -0.027078490721459958f,
+                                                 0.0f,
+                                                 0.726132375517389f,
+                                                 0.72220359881096f,
+                                                 0.5721582611845177f,
+                                                 0.15139162075524315f,
+                                                 0.6676549461551197f,
+                                                 -0.321449586554697f,
+                                                 -0.10141104515219895f,
+                                                 -0.09711123988777906f,
+                                                 0.9623356184776928f,
+                                                 -0.7941822373167173f,
+                                                 -0.9373923554119346f,
+                                                 0.4573241832354059f,
+                                                 -0.42029139056126147f,
+                                                 0.2675223459380999f,
+                                                 -0.5487300191551386f,
+                                                 0.2236621891916084f,
+                                                 0.11692039230044018f,
+                                                 0.0f,
+                                                 0.1758399202780961f,
+                                                 0.676447587678781f,
+                                                 0.5945412815881029f,
+                                                 0.5669863357359594f,
+                                                 0.8433565415303922f,
+                                                 -0.30300550790708036f,
+                                                 -0.43332881999693673f,
+                                                 -0.4996522695731392f,
+                                                 -0.2084930815451962f,
+                                                 0.27765278702463786f,
+                                                 1.0886848763946915f,
+                                                 -0.0739433655813831f,
+                                                 -0.4762801579229192f,
+                                                 -0.2490825339320731f,
+                                                 -1.8820479350439439f,
+                                                 -0.4251592225775914f,
+                                                 -0.3992922365484464f,
+                                                 0.0f,
+                                                 0.19598917760218867f,
+                                                 0.4860238022746914f,
+                                                 0.3364528828641281f,
+                                                 0.3350950865226741f,
+                                                 0.2773654548632006f,
+                                                 -0.30547262140782566f,
+                                                 0.028649620490728344f,
+                                                 -0.11763407628280315f,
+                                                 0.6237318502627169f,
+                                                 -0.3958952632477945f,
+                                                 0.14797171297835243f,
+                                                 0.45821729624747465f,
+                                                 -0.8687137170773626f,
+                                                 0.06989667196937126f,
+                                                 -0.5752606929478727f,
+                                                 0.16986945686358412f,
+                                                 0.6925071596817824f,
+                                                 0.0f,
+                                                 0.4991250796183003f,
+                                                 0.03424654896322111f,
+                                                 0.6153698611882319f,
+                                                 0.5070872444849457f,
+                                                 0.43615747516328135f,
+                                                 -0.7870352838659244f,
+                                                 -0.6424101231965247f,
+                                                 -0.7005774876651399f,
+                                                 0.79983115431488f,
+                                                 0.15720357955596242f,
+                                                 -1.408372612176309f,
+                                                 -0.039294695217213765f,
+                                                 0.6979415372962309f,
+                                                 0.27403316751965656f,
+                                                 1.2844596102619275f,
+                                                 -0.2781534150257364f,
+                                                 0.3248437714908865f,
+                                                 0.0f,
+                                                 0.4364362371752831f,
+                                                 -0.2548580911485434f,
+                                                 -0.19578001373349452f,
+                                                 -0.04597194387828005f,
+                                                 -0.010035156855533233f,
+                                                 0.0415941475251266f,
+                                                 0.07929549739797387f,
+                                                 -0.060629652912508866f,
+                                                 0.5977303008711333f,
+                                                 -1.4404008068066554f,
+                                                 0.8555694790197376f,
+                                                 -0.03693438534401856f,
+                                                 0.17761411164512408f,
+                                                 -0.11858304304109235f,
+                                                 -1.4241324353471327f,
+                                                 0.1533849765389186f,
+                                                 0.7650643783126995f,
+                                                 0.0f,
+                                                 -0.0639949379280401f,
+                                                 0.4288617817939563f,
+                                                 0.4235508646885404f,
+                                                 0.3419843254383798f,
+                                                 -0.015992360660098768f,
+                                                 -0.773247697505441f,
+                                                 -0.4908452922015917f,
+                                                 0.9868134897291486f,
+                                                 -0.5078689994742608f,
+                                                 1.05632043744864f,
+                                                 -0.38867419409275117f,
+                                                 -0.0065547696858664194f,
+                                                 -0.3056003173415037f,
+                                                 -0.333762331930102f,
+                                                 0.4459671174011671f,
+                                                 0.08219092584580244f,
+                                                 -0.08099158579518179f,
+                                                 0.0f,
+                                                 -0.1568180656346373f,
+                                                 -0.061962372393910135f,
+                                                 0.14065868174859464f,
+                                                 -0.055925712798972765f,
+                                                 0.05136117465820622f,
+                                                 0.0907831030477633f,
+                                                 0.19518110495319604f,
+                                                 -0.7470794578145956f,
+                                                 1.5945999734733545f,
+                                                 -0.4351697502345834f,
+                                                 -0.33253649399571805f };
+
+const float av1_mv_prec_nn_bias_layer_0[] = {
+  -0.651213833993862f,   -1.1243309933417809f,  -0.2123880023097051f,
+  0.23095477452877616f,  -0.6668057665893545f,  0.3082268148379634f,
+  -0.3344916753975844f,  -0.20920185606857844f, 0.6057933917964854f,
+  0.5031857662559803f,   -1.5380096313468152f,  -0.4457245344804041f,
+  1.82368055812373f,     0.7973912064077963f,   0.25706500555622913f,
+  0.1394695119825382f,   0.4508811973450553f,   -0.5408959545111782f,
+  1.064829233697863f,    0.3733268644246235f,   1.1173169029905483f,
+  -0.2012817466400134f,  -0.16628447748302294f, 1.3086000088940826f,
+  0.7267092979664235f,   -0.9097857006590555f,  -0.7564259343863077f,
+  -0.49844128036716173f, -0.4675729246975423f,  -0.03626154526362181f,
+  -0.41957330902404616f, -0.9658160514319954f
+};
+
+const float av1_mv_prec_nn_weights_layer_1[] = {
+  1.5017296484510276f,  1.044216918060133f,   -1.066541411740906f,
+  -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f,
+  0.7117244268817873f,  -0.7695942296628597f, 0.7892157680137047f,
+  -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f,
+  -0.9699580532370483f, 0.5849682956422552f,  -1.0372272986941953f,
+  -0.5005014627824439f, 1.1816204711740521f,  -1.2204867615892114f,
+  0.4510263977504913f,  0.35567865078585165f, -0.7811389330738839f,
+  -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f,
+  0.8861643352684585f,  0.6438840651522237f,  0.6677191546466089f,
+  0.9703715021995785f,  1.250893534236489f,   0.7733742028067933f,
+  -1.249673977776904f,  -1.2890127265725608f
+};
+
+const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f };
+
+static const NN_CONFIG av1_mv_prec_dnn_config = {
+  NUM_DNN_FEATURES,
+  NUM_LOGITS,
+  NUM_DNN_LAYERS,
+  { MV_PREC_LAYER_SIZE_0 },
+  {
+      av1_mv_prec_nn_weights_layer_0,
+      av1_mv_prec_nn_weights_layer_1,
+  },
+  {
+      av1_mv_prec_nn_bias_layer_0,
+      av1_mv_prec_nn_bias_layer_1,
+  },
+};
+#undef NUM_DNN_LAYERS
+#undef NUM_DNN_FEATURES
+#undef NUM_LAYER_0_UNITS
+#undef NUM_LOGITS
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
diff --git a/libaom/av1/encoder/ml.c b/libaom/av1/encoder/ml.c
index 579900a..57228ec 100644
--- a/libaom/av1/encoder/ml.c
+++ b/libaom/av1/encoder/ml.c
@@ -15,11 +15,21 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/encoder/ml.h"
 
+void av1_nn_output_prec_reduce(float *const output, int num_output) {
+  const int prec_bits = 11;
+  const int prec = 1 << prec_bits;
+  const float inv_prec = (float)(1.0 / prec);
+  for (int i = 0; i < num_output; i++) {
+    output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec;
+  }
+}
+
 // Calculate prediction based on the given input features and neural net config.
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
 void av1_nn_predict_c(const float *input_nodes,
-                      const NN_CONFIG *const nn_config, float *const output) {
+                      const NN_CONFIG *const nn_config, int reduce_prec,
+                      float *const output) {
   int num_input_nodes = nn_config->num_inputs;
   int buf_index = 0;
   float buf[2][NN_MAX_NODES_PER_LAYER];
@@ -55,8 +65,80 @@
       val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
     output[node] = val;
   }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
 }
 
+#if CONFIG_NN_V2
+// Applies the ReLu activation to one fc layer
+// output[i] = Max(input[i],0.0f)
+static float *nn_relu(const float *input, FC_LAYER *layer) {
+  for (int i = 0; i < layer->num_outputs; ++i) {
+    layer->output[i] = AOMMAX(input[i], 0.0f);
+  }
+
+  return layer->output;
+}
+
+// Applies the Sigmoid activation to one fc layer
+// output[i] = 1/(1+exp(input[i]))
+static float *nn_sigmoid(const float *input, FC_LAYER *layer) {
+  for (int i = 0; i < layer->num_outputs; ++i) {
+    const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f);
+    layer->output[i] = 1.0f / (1.0f + expf(-tmp));
+  }
+
+  return layer->output;
+}
+
+// Forward prediction in one fc layer, used in function av1_nn_predict_V2
+static float *nn_fc_forward(const float *input, FC_LAYER *layer) {
+  const float *weights = layer->weights;
+  const float *bias = layer->bias;
+  assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER);
+  // fc
+  for (int node = 0; node < layer->num_outputs; ++node) {
+    float val = bias[node];
+    for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i];
+    layer->output[node] = val;
+    weights += layer->num_inputs;
+  }
+
+  // activation
+  switch (layer->activation) {
+    case NONE: return layer->output;
+    case RELU: return nn_relu(layer->output, layer);
+    case SIGMOID: return nn_sigmoid(layer->output, layer);
+    case SOFTSIGN:
+      assert(0 && "Softsign has not been supported in NN.");  // TO DO
+      return NULL;
+    default:
+      assert(0 && "Unknown activation");  // Unknown activation
+      return NULL;
+  }
+}
+
+void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config,
+                       int reduce_prec, float *output) {
+  const float *input_nodes = feature;
+
+  // Propagate the layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (int i = 0; i < num_layers; ++i) {
+    input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i);
+    assert(nn_config->layer[i + 1].num_inputs ==
+           nn_config->layer[i].num_outputs);
+  }
+
+  // Final layer
+  input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers);
+  assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits);
+  // Copy the final layer output
+  memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits);
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits);
+}
+#endif  // CONFIG_NN_V2
+
 void av1_nn_softmax(const float *input, float *output, int n) {
   // Softmax function is invariant to adding the same constant
   // to all input values, so we subtract the maximum input to avoid
diff --git a/libaom/av1/encoder/ml.h b/libaom/av1/encoder/ml.h
index 7f2750b..62d543d 100644
--- a/libaom/av1/encoder/ml.h
+++ b/libaom/av1/encoder/ml.h
@@ -34,11 +34,47 @@
 };
 // Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs
 
+#if CONFIG_NN_V2
+// Fully-connectedly layer configuration
+struct FC_LAYER {
+  const int num_inputs;   // Number of input nodes, i.e. features.
+  const int num_outputs;  // Number of output nodes.
+
+  float *weights;               // Weight parameters.
+  float *bias;                  // Bias parameters.
+  const ACTIVATION activation;  // Activation function.
+
+  float *output;  // The output array.
+  float *dY;      // Gradient of outputs
+  float *dW;      // Gradient of weights.
+  float *db;      // Gradient of bias
+};
+
+// NN configure structure V2
+struct NN_CONFIG_V2 {
+  const int num_hidden_layers;  // Number of hidden layers, max = 10.
+  FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1];  // The layer array
+  const int num_logits;                      // Number of output nodes.
+  float *logits;    // Raw prediction (same as output of final layer)
+  const LOSS loss;  // Loss function
+};
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
+                       int reduce_prec, float *output);
+#endif  // CONFIG_NN_V2
+
 // Applies the softmax normalization function to the input
 // to get a valid probability distribution in the output:
 // output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
 void av1_nn_softmax(const float *input, float *output, int n);
 
+// Applies a precision reduction to output of av1_nn_predict to prevent
+// mismatches between C and SIMD implementations.
+void av1_nn_output_prec_reduce(float *const output, int num_output);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/mode_prune_model_weights.h b/libaom/av1/encoder/mode_prune_model_weights.h
new file mode 100644
index 0000000..98ec368
--- /dev/null
+++ b/libaom/av1/encoder/mode_prune_model_weights.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NUM_HIDDEN_LAYERS_12 1
+#define NUM_FEATURES_12 6
+#define NUM_LAYER_0_UNITS_12 24
+#define NUM_LOGITS_12 2
+
+static const float av1_intrap_hiddenlayer_0_kernel_12[] = {
+  7.28372f,       -1.3333898f,    -1.3180022f,   -0.007156151f, -0.40799126f,
+  -0.57538104f,   -31.81647f,     6.7057495f,    6.351472f,     -0.029544508f,
+  0.026801195f,   1.12863f,       -0.70769817f,  -0.24183524f,  0.0649113f,
+  -0.7189517f,    0.21791299f,    0.12840256f,   -0.56424767f,  0.16924907f,
+  0.4605501f,     -0.170895f,     -0.60358995f,  -0.15383226f,  -4.0523643f,
+  0.6961917f,     1.3100256f,     -0.4189354f,   0.37264112f,   -0.14555685f,
+  10.628014f,     8.184437f,      8.941916f,     -0.011731001f, -0.45127156f,
+  0.42704004f,    36.84277f,      8.988796f,     8.844238f,     0.00030091056f,
+  -0.022038324f,  1.3566176f,     -8.863219f,    -0.84811693f,  -1.0908632f,
+  0.00023130262f, -1.0698471f,    -6.755927f,    7.1711984f,    4.7216063f,
+  3.5099216f,     -0.6650184f,    0.5935173f,    -0.6696286f,   11.8595295f,
+  0.3001874f,     0.29822728f,    0.04319222f,   -1.203178f,    1.1210147f,
+  0.035045594f,   -0.20559944f,   -0.015388541f, -0.7857941f,   -0.94100875f,
+  -0.1278549f,    -19.22603f,     7.9466896f,    6.5048656f,    -0.22195444f,
+  0.19061874f,    1.3927288f,     -8.896529f,    -0.48146892f,  -1.6098932f,
+  -0.0030235797f, -0.6533787f,    -2.1333003f,   -22.256454f,   -4.934058f,
+  -4.4707212f,    -0.015831878f,  -0.4243649f,   -2.776269f,    -0.23762038f,
+  0.1820098f,     -0.51865315f,   -1.1893421f,   0.34969202f,   0.10636194f,
+  14.545696f,     1.3849198f,     2.6815193f,    -0.5145498f,   0.45948258f,
+  -0.8842355f,    -0.9111363f,    -0.39652422f,  0.077266276f,  -0.68084997f,
+  0.4593515f,     -0.28872707f,   -6.936231f,    1.12253f,      1.7616503f,
+  -0.014069137f,  -0.0052156276f, -4.5095444f,   6.2076726f,    -0.058755957f,
+  -0.4675936f,    -0.13039507f,   0.12094394f,   -0.07285393f,  68.26125f,
+  7.4893136f,     8.770954f,      0.020274093f,  -0.027877754f, 1.6579602f,
+  -0.1825479f,    0.34832543f,    0.07472531f,   -0.44812247f,  -1.0941806f,
+  -0.16749863f,   1.1394324f,     0.47983396f,   -0.99983627f,  -0.00064249727f,
+  -1.3345739f,    -0.057157427f,  -18.14875f,    16.506035f,    15.539248f,
+  0.013191509f,   -0.021674965f,  -25.006235f,   0.51220596f,   0.7334426f,
+  0.81836903f,    -1.0443225f,    0.4459505f,    -1.2045046f
+};
+
+static const float av1_intrap_hiddenlayer_0_bias_12[] = {
+  -4.154915f,   14.33833f,   0.0f,       0.0f,         2.0440118f, 12.40922f,
+  -16.77514f,   0.5879813f,  3.2305415f, 0.8303539f,   0.0f,       14.488708f,
+  2.94393f,     1.874383f,   0.0f,       -0.53140444f, 0.0f,       1.8456234f,
+  -0.55427986f, -19.856262f, 0.0f,       0.17281002f,  48.31631f,  0.0f
+};
+
+static const float av1_intrap_logits_kernel_12[] = {
+  0.26843873f,   -0.09576241f,  0.34427166f,  0.09914787f,  -0.10275399f,
+  0.02999484f,   -0.1467772f,   0.11594324f,  0.29200763f,  0.0067976206f,
+  0.050393578f,  -0.018694371f, 0.3333476f,   0.2127221f,   0.35128218f,
+  0.19968672f,   0.08099991f,   0.084850654f, -0.16045967f, 0.30286232f,
+  0.6164765f,    -0.27140254f,  0.08210814f,  0.34852806f,  0.25028184f,
+  -0.12188078f,  0.16310331f,   0.31253803f,  -0.10792341f, 0.065858394f,
+  -0.1349708f,   0.08948815f,   0.31905392f,  0.03680656f,  -0.05040944f,
+  -0.051539157f, 0.3211852f,    0.2137136f,   0.45037416f,  0.22748767f,
+  -0.10978614f,  0.06475646f,   -0.16954158f, 0.32831904f,  0.16479677f,
+  -0.30020145f,  0.066221856f,  0.37213042f
+};
+
+static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f };
+
+static const NN_CONFIG av1_intrap_nn_config = {
+  NUM_FEATURES_12,
+  NUM_LOGITS_12,
+  NUM_HIDDEN_LAYERS_12,
+  {
+      NUM_LAYER_0_UNITS_12,
+  },
+  {
+      av1_intrap_hiddenlayer_0_kernel_12,
+      av1_intrap_logits_kernel_12,
+  },
+  {
+      av1_intrap_hiddenlayer_0_bias_12,
+      av1_intrap_logits_bias_12,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_12
+#undef NUM_FEATURES_12
+#undef NUM_LAYER_0_UNITS_12
+#undef NUM_LOGITS_12
+
+#define NUM_HIDDEN_LAYERS_15 1
+#define NUM_FEATURES_15 6
+#define NUM_LAYER_0_UNITS_15 24
+#define NUM_LOGITS_15 2
+
+static const float av1_intraph_hiddenlayer_0_kernel_15[] = {
+  -0.77480125f,   0.3219551f,    -0.015702145f,   -0.5310235f,   0.5254026f,
+  -1.1522819f,    2.682016f,     0.08001052f,     -0.2539285f,   0.04711023f,
+  -0.81296307f,   0.2675382f,    0.1952474f,      -0.0664705f,   1.2989824f,
+  -0.3150117f,    -0.8022715f,   0.045423955f,    -27.584324f,   -2.5608704f,
+  -3.2280366f,    0.05272543f,   -0.47141576f,    -0.07644298f,  -53.77942f,
+  -22.393923f,    -23.027853f,   -0.00015186476f, -0.010696465f, 2.7064638f,
+  -22.776028f,    11.514891f,    11.138167f,      -0.001243723f, -0.4802433f,
+  -8.758646f,     0.26398206f,   -0.23485385f,    0.27586034f,   -0.004954741f,
+  -0.4935232f,    -0.017607696f, 69.56049f,       -1.1756641f,   -0.052366666f,
+  -0.38052833f,   0.32474658f,   0.04634263f,     0.8583235f,    -0.528438f,
+  -0.7868907f,    -0.4757781f,   0.4620985f,      -0.70621157f,  231.40195f,
+  6.805205f,      9.420295f,     0.02585775f,     -0.03480937f,  1.3577378f,
+  0.1758226f,     15.056758f,    14.437874f,      -0.1305005f,   0.115103304f,
+  0.21297209f,    55.821743f,    -6.611156f,      -6.8552365f,   -0.011928095f,
+  -0.2042175f,    1.2557873f,    -1.0722278f,     -0.2683614f,   0.48318478f,
+  -0.73739994f,   0.54055226f,   -0.03224738f,    -0.06767959f,  -0.21015017f,
+  0.29171246f,    -0.6937296f,   -1.2342545f,     -0.41278538f,  -37.9365f,
+  17.68424f,      16.263042f,    -0.074828684f,   0.06607806f,   -0.16763286f,
+  13.594707f,     0.6152676f,    -0.4371223f,     -0.8365592f,   0.8273623f,
+  -1.2126317f,    0.1216157f,    -1.3002136f,     -0.18856938f,  -0.2589358f,
+  -0.76897144f,   0.21777137f,   -122.25033f,     -0.23490006f,  -3.1238277f,
+  -0.13916978f,   0.08576391f,   -1.7391548f,     -116.24812f,   14.906071f,
+  13.468357f,     0.02332889f,   -0.034617376f,   -18.506111f,   0.7500542f,
+  -1.1882535f,    0.40848416f,   -0.28434393f,    -0.71471655f,  -0.29188696f,
+  -0.46588746f,   -0.17324813f,  -0.62460244f,    -1.1801276f,   0.28993344f,
+  -0.22072886f,   129.2688f,     -0.33782578f,    -0.34836572f,  -0.034112718f,
+  -0.023666814f,  -0.5865087f,   -33.484146f,     1.1431375f,    0.56056374f,
+  -0.0049730353f, -0.24347587f,  -1.3003352f,     0.88973033f,   0.8499571f,
+  -0.5678484f,    -0.39009875f,  -0.062105156f,   -0.13965102f
+};
+
+static const float av1_intraph_hiddenlayer_0_bias_15[] = {
+  0.0f,       -0.2926711f, 0.0f,         -1.0303509f, -27.459345f,  12.412848f,
+  0.0f,       -2.5971522f, -0.02733541f, -19.881912f, 14.391992f,   -8.249469f,
+  0.0f,       0.0f,        13.676118f,   -0.6472994f, -0.07189449f, 1.1986839f,
+  52.479107f, 0.0f,        0.0f,         -3.0187025f, 1.4435643f,   0.0f
+};
+
+static const float av1_intraph_logits_kernel_15[] = {
+  0.05390722f,   -0.06859513f, 0.036842898f, 0.190772f,    0.13623567f,
+  0.09321194f,   0.2314745f,   -0.13958375f, -0.3058229f,  -0.0104543045f,
+  0.11336068f,   -0.276115f,   0.00470723f,  -0.49123898f, -0.15988174f,
+  0.087681435f,  0.022517204f, 0.073877744f, 0.2968856f,   -0.1401399f,
+  -0.38788354f,  -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f,
+  -0.032179773f, -0.35758728f, 0.25819537f,  0.11468631f,  0.13573235f,
+  -0.2672175f,   0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f,
+  -0.21821865f,  0.08434734f,  0.3129456f,   -0.18215221f, 0.08884877f,
+  -0.35621428f,  0.11405768f,  0.27370325f,  0.14956686f,  0.01604587f,
+  -0.18334487f,  -0.42385718f, -0.08033409f
+};
+
+static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f };
+
+static const NN_CONFIG av1_intrap_hd_nn_config = {
+  NUM_FEATURES_15,
+  NUM_LOGITS_15,
+  NUM_HIDDEN_LAYERS_15,
+  {
+      NUM_LAYER_0_UNITS_15,
+  },
+  {
+      av1_intraph_hiddenlayer_0_kernel_15,
+      av1_intraph_logits_kernel_15,
+  },
+  {
+      av1_intraph_hiddenlayer_0_bias_15,
+      av1_intraph_logits_bias_15,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_15
+#undef NUM_FEATURES_15
+#undef NUM_LAYER_0_UNITS_15
+#undef NUM_LOGITS_15
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
diff --git a/libaom/av1/encoder/model_rd.h b/libaom/av1/encoder/model_rd.h
new file mode 100644
index 0000000..c353c8f
--- /dev/null
+++ b/libaom/av1/encoder/model_rd.h
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MODEL_RD_H_
+#define AOM_AV1_ENCODER_MODEL_RD_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "aom_ports/system_state.h"
+#include "config/aom_dsp_rtcd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 1
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_MOTION_MODE_RD 1
+
// Signature of a whole-block model-rd estimator: fills per-plane and summed
// rate/distortion outputs for planes [plane_from, plane_to] (see
// model_rd_for_sb / model_rd_for_sb_with_curvfit below).
typedef void (*model_rd_for_sb_type)(const AV1_COMP *const cpi,
                                     BLOCK_SIZE bsize, MACROBLOCK *x,
                                     MACROBLOCKD *xd, int plane_from,
                                     int plane_to, int *out_rate_sum,
                                     int64_t *out_dist_sum, int *skip_txfm_sb,
                                     int64_t *skip_sse_sb, int *plane_rate,
                                     int64_t *plane_sse, int64_t *plane_dist);
// Signature of a per-plane estimator that models rate/distortion directly
// from the prediction SSE (see model_rd_from_sse / model_rd_with_curvfit).
typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
                                       const MACROBLOCK *const x,
                                       BLOCK_SIZE plane_bsize, int plane,
                                       int64_t sse, int num_samples, int *rate,
                                       int64_t *dist);
+
+static int64_t calculate_sse(MACROBLOCKD *const xd,
+                             const struct macroblock_plane *p,
+                             struct macroblockd_plane *pd, const int bw,
+                             const int bh) {
+  int64_t sse = 0;
+  const int shift = xd->bd - 8;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                         bw, bh);
+  } else {
+    sse =
+        aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+  }
+#else
+  sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+#endif
+  sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+  return sse;
+}
+
+static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd,
+                                            int plane, const BLOCK_SIZE bsize) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  int bw, bh;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+                     &bh);
+
+  int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+
+  return sse;
+}
+
+static AOM_INLINE void model_rd_from_sse(const AV1_COMP *const cpi,
+                                         const MACROBLOCK *const x,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         int64_t sse, int num_samples,
+                                         int *rate, int64_t *dist) {
+  (void)num_samples;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+
+  // Fast approximate the modelling function.
+  if (cpi->sf.rd_sf.simple_model_rd_from_var) {
+    const int64_t square_error = sse;
+    int quantizer = p->dequant_QTX[1] >> dequant_shift;
+    if (quantizer < 120)
+      *rate = (int)AOMMIN(
+          (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+          INT_MAX);
+    else
+      *rate = 0;
+    assert(*rate >= 0);
+    *dist = (square_error * quantizer) >> 8;
+  } else {
+    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
+                                 p->dequant_QTX[1] >> dequant_shift, rate,
+                                 dist);
+  }
+  *dist <<= 4;
+}
+
+// Fits a curve for rate and distortion using as feature:
+// log2(sse_norm/qstep^2)
+static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
+                                             const MACROBLOCK *const x,
+                                             BLOCK_SIZE plane_bsize, int plane,
+                                             int64_t sse, int num_samples,
+                                             int *rate, int64_t *dist) {
+  (void)cpi;
+  (void)plane_bsize;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = AOMMAX(p->dequant_QTX[1] >> dequant_shift, 1);
+
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    return;
+  }
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+  const double qstepsqr = (double)qstep * qstep;
+  const double xqr = log2(sse_norm / qstepsqr);
+  double rate_f, dist_by_sse_norm_f;
+  av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
+                       &dist_by_sse_norm_f);
+
+  const double dist_f = dist_by_sse_norm_f * sse_norm;
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
+
+  // Check if skip is better
+  if (rate_i == 0) {
+    dist_i = sse << 4;
+  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+             RDCOST(x->rdmult, 0, sse << 4)) {
+    rate_i = 0;
+    dist_i = sse << 4;
+  }
+
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
+
+static AOM_INLINE void model_rd_for_sb(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+    int *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+    int64_t *plane_sse, int64_t *plane_dist) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  int plane;
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
+
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    const int bw = block_size_wide[plane_bsize];
+    const int bh = block_size_high[plane_bsize];
+    int64_t sse;
+    int rate;
+    int64_t dist;
+
+    sse = calculate_sse(xd, p, pd, bw, bh);
+
+    model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    total_sse += sse;
+    rate_sum += rate;
+    dist_sum += dist;
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+    assert(rate_sum >= 0);
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+  rate_sum = AOMMIN(rate_sum, INT_MAX);
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
+static AOM_INLINE void model_rd_for_sb_with_curvfit(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+    int *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+    int64_t *plane_sse, int64_t *plane_dist) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    int64_t dist, sse;
+    int rate;
+    int bw, bh;
+    const struct macroblock_plane *const p = &x->plane[plane];
+    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                       &bw, &bh);
+
+    sse = calculate_sse(xd, p, pd, bw, bh);
+    model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+                          &dist);
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    total_sse += sse;
+    rate_sum += rate;
+    dist_sum += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0;
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
// Model selector used to index the dispatch tables below (matches the
// MODELRD_TYPE_* values 0 and 1 defined at the top of this header).
enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType);

// Whole-block model-rd implementations, indexed by ModelRdType.
static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
  model_rd_for_sb, model_rd_for_sb_with_curvfit
};

// Per-plane from-SSE model-rd implementations, indexed by ModelRdType.
static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
  model_rd_from_sse, model_rd_with_curvfit
};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_ENCODER_MODEL_RD_H_
diff --git a/libaom/av1/encoder/motion_search_facade.c b/libaom/av1/encoder/motion_search_facade.c
new file mode 100644
index 0000000..8db1423
--- /dev/null
+++ b/libaom/av1/encoder/motion_search_facade.c
@@ -0,0 +1,861 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+
+#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3)
+
+typedef struct {
+  FULLPEL_MV fmv;
+  int weight;
+} cand_mv_t;
+
+static int compare_weight(const void *a, const void *b) {
+  const int diff = ((cand_mv_t *)a)->weight - ((cand_mv_t *)b)->weight;
+  if (diff < 0)
+    return 1;
+  else if (diff > 0)
+    return -1;
+  return 0;
+}
+
// Single-reference motion search for one block (SIMPLE_TRANSLATION or
// OBMC_CAUSAL): full-pixel search from one or more start candidates
// (optionally seeded from the TPL model), followed by subpel refinement.
// On return *best_mv holds the selected MV — INVALID_MV signals the caller
// to abandon this mode — and *rate_mv its signaling cost.
void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                              BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
                              int search_range, inter_mode_info *mode_info,
                              int_mv *best_mv) {
  MACROBLOCKD *xd = &x->e_mbd;
  const AV1_COMMON *cm = &cpi->common;
  const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
  const int num_planes = av1_num_planes(cm);
  MB_MODE_INFO *mbmi = xd->mi[0];
  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
  int bestsme = INT_MAX;
  const int ref = mbmi->ref_frame[ref_idx];
  const YV12_BUFFER_CONFIG *scaled_ref_frame =
      av1_get_scaled_ref_frame(cpi, ref);
  const int mi_row = xd->mi_row;
  const int mi_col = xd->mi_col;

  if (scaled_ref_frame) {
    // Swap out the reference frame for a version that's been scaled to
    // match the resolution of the current frame, allowing the existing
    // full-pixel motion search code to be used without additional
    // modifications.
    for (int i = 0; i < num_planes; i++) {
      backup_yv12[i] = xd->plane[i].pre[ref_idx];
    }
    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
                         num_planes);
  }

  // Work out the size of the first step in the mv step search.
  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
  int step_param;
  if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) {
    // Take the weighted average of the step_params based on the last frame's
    // max mv magnitude and that based on the best ref mvs of the current
    // block for the given reference.
    step_param = (av1_init_search_range(x->max_mv_context[ref]) +
                  mv_search_params->mv_step_param) /
                 2;
  } else {
    step_param = mv_search_params->mv_step_param;
  }

  // Raise step_param (narrow the initial search) for blocks that are small
  // relative to the superblock.
  if (cpi->sf.mv_sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) {
    int boffset =
        2 * (mi_size_wide_log2[cm->seq_params.sb_size] -
             AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize]));
    step_param = AOMMAX(step_param, boffset);
  }

  if (cpi->sf.mv_sf.adaptive_motion_search) {
    int bwl = mi_size_wide_log2[bsize];
    int bhl = mi_size_high_log2[bsize];
    // Normalize the predicted-MV SAD by block size; a low value means the
    // prediction is already good, so start from a narrower search.
    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);

    if (tlevel < 5) {
      step_param += 2;
      step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1);
    }

    // prev_mv_sad is not setup for dynamically scaled frames.
    if (cpi->oxcf.resize_mode != RESIZE_RANDOM) {
      int i;
      // Bail out early (best_mv = INVALID_MV) if some other reference
      // predicts this block far better than the current one.
      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
          x->pred_mv[ref].row = 0;
          x->pred_mv[ref].col = 0;
          best_mv->as_int = INVALID_MV;

          if (scaled_ref_frame) {
            // Swap back the original buffers before returning.
            for (int j = 0; j < num_planes; ++j)
              xd->plane[j].pre[ref_idx] = backup_yv12[j];
          }
          return;
        }
      }
    }
  }

  const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
  FULLPEL_MV start_mv;
  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
    start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv);
  else
    start_mv = get_fullmv_from_mv(&ref_mv);

  // cand stores start_mv and all possible MVs in a SB.
  cand_mv_t cand[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB + 1] = {
    { { 0, 0 }, 0 }
  };
  cand[0].fmv = start_mv;
  int cnt = 1;
  int total_weight = 0;

  // Collect additional full-pel start candidates from the TPL model's MVs,
  // weighting each distinct (rounded) MV by how many TPL units share it.
  if (!cpi->sf.mv_sf.full_pixel_search_level &&
      mbmi->motion_mode == SIMPLE_TRANSLATION) {
    if (x->valid_cost_b) {
      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
      const int tplw = mi_size_wide[tpl_bsize];
      const int tplh = mi_size_high[tpl_bsize];
      const int nw = mi_size_wide[bsize] / tplw;
      const int nh = mi_size_high[bsize] / tplh;

      if (nw >= 1 && nh >= 1) {
        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
        int valid = 1;

        // Assign large weight to start_mv, so it is always tested.
        cand[0].weight = nw * nh;

        for (int k = 0; k < nh; k++) {
          for (int l = 0; l < nw; l++) {
            const int_mv mv =
                x->mv_b[start + k * x->cost_stride + l][ref - LAST_FRAME];
            if (mv.as_int == INVALID_MV) {
              valid = 0;
              break;
            }

            const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
                                     GET_MV_RAWPEL(mv.as_mv.col) };
            int unique = 1;
            for (int m = 0; m < cnt; m++) {
              if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(cand[m].fmv.row) &&
                  RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(cand[m].fmv.col)) {
                unique = 0;
                cand[m].weight++;
                break;
              }
            }

            if (unique) {
              cand[cnt].fmv = fmv;
              cand[cnt].weight = 1;
              cnt++;
            }
          }
          if (!valid) break;
        }

        if (valid) {
          total_weight = 2 * nh * nw;
          // Try higher-weight (more popular) candidates first.
          if (cnt > 2) qsort(cand, cnt, sizeof(cand[0]), &compare_weight);
        }
      }
    }
  }

  // Further reduce the search range.
  if (search_range < INT_MAX) {
    const search_site_config *ss_cfg = &mv_search_params->ss_cfg[SS_CFG_SRC];
    // Max step_param is ss_cfg->ss_count.
    if (search_range < 1) {
      step_param = ss_cfg->ss_count;
    } else {
      while (ss_cfg->radius[ss_cfg->ss_count - step_param - 1] >
                 (search_range << 1) &&
             ss_cfg->ss_count - step_param - 1 > 0)
        step_param++;
    }
  }

  int cost_list[5];
  int_mv second_best_mv;
  best_mv->as_int = second_best_mv.as_int = INVALID_MV;

  const search_site_config *src_search_sites =
      &mv_search_params->ss_cfg[SS_CFG_SRC];
  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
                                     src_search_sites);

  switch (mbmi->motion_mode) {
    case SIMPLE_TRANSLATION: {
      int sum_weight = 0;

      // Run the full-pel search from each candidate until most of the
      // total candidate weight has been covered (or 3 candidates tried).
      for (int m = 0; m < cnt; m++) {
        FULLPEL_MV smv = cand[m].fmv;
        FULLPEL_MV this_best_mv, this_second_best_mv;

        int thissme = av1_full_pixel_search(
            smv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
            &this_best_mv, &this_second_best_mv);

        if (thissme < bestsme) {
          bestsme = thissme;
          best_mv->as_fullmv = this_best_mv;
          second_best_mv.as_fullmv = this_second_best_mv;
        }

        sum_weight += cand[m].weight;
        if (m >= 2 || 4 * sum_weight > 3 * total_weight) break;
      }
    } break;
    case OBMC_CAUSAL:
      bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
                                           step_param, &best_mv->as_fullmv);
      break;
    default: assert(0 && "Invalid motion mode!\n");
  }

  if (scaled_ref_frame) {
    // Swap back the original buffers for subpel motion search.
    for (int i = 0; i < num_planes; i++) {
      xd->plane[i].pre[ref_idx] = backup_yv12[i];
    }
  }

  // Terminate search with the current ref_idx if we have already encountered
  // another ref_mv in the drl such that:
  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
  //     search process as the current fullpel_mv.
  //  2. The rate needed to encode the current fullpel_mv is larger than that
  //     for the other ref_mv.
  if (cpi->sf.inter_sf.skip_repeated_full_newmv &&
      mbmi->motion_mode == SIMPLE_TRANSLATION &&
      best_mv->as_int != INVALID_MV) {
    int_mv this_mv;
    this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
    const int ref_mv_idx = mbmi->ref_mv_idx;
    const int this_mv_rate =
        av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, x->nmv_vec_cost,
                        x->mv_cost_stack, MV_COST_WEIGHT);
    mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
    mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;

    for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
      // Check if the motion search result same as previous results
      if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) {
        // Compare the rate cost
        const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate +
                                   mode_info[prev_ref_idx].drl_cost;
        const int this_rate_cost =
            this_mv_rate + mode_info[ref_mv_idx].drl_cost;

        if (prev_rate_cost <= this_rate_cost) {
          // If the current rate_cost is worse than the previous rate_cost, then
          // we terminate the search. Since av1_single_motion_search is only
          // called by handle_new_mv in SIMPLE_TRANSLATION mode, we set the
          // best_mv to INVALID mv to signal that we wish to terminate search
          // for the current mode.
          best_mv->as_int = INVALID_MV;
          return;
        }
      }
    }
  }

  if (cpi->common.features.cur_frame_force_integer_mv) {
    convert_fullmv_to_mv(best_mv);
  }

  const int use_fractional_mv =
      bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
  if (use_fractional_mv) {
    int_mv fractional_ms_list[3];
    av1_set_fractional_mv(fractional_ms_list);
    int dis; /* TODO: use dis in distortion calculation later. */

    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
                                      cost_list);
    MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);

    switch (mbmi->motion_mode) {
      case SIMPLE_TRANSLATION:
        if (cpi->sf.mv_sf.use_accurate_subpel_search) {
          // Also refine from the runner-up full-pel MV and keep whichever
          // subpel result has the lower error.
          const int try_second = second_best_mv.as_int != INVALID_MV &&
                                 second_best_mv.as_int != best_mv->as_int;
          const int best_mv_var = mv_search_params->find_fractional_mv_step(
              xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis,
              &x->pred_sse[ref], fractional_ms_list);

          if (try_second) {
            MV this_best_mv;
            subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
            if (av1_is_subpelmv_in_range(&ms_params.mv_limits,
                                         subpel_start_mv)) {
              const int this_var = mv_search_params->find_fractional_mv_step(
                  xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis,
                  &x->pred_sse[ref], fractional_ms_list);
              if (this_var < best_mv_var) best_mv->as_mv = this_best_mv;
            }
          }
        } else {
          mv_search_params->find_fractional_mv_step(
              xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis,
              &x->pred_sse[ref], NULL);
        }
        break;
      case OBMC_CAUSAL:
        av1_find_best_obmc_sub_pixel_tree_up(xd, cm, &ms_params,
                                             subpel_start_mv, &best_mv->as_mv,
                                             &dis, &x->pred_sse[ref], NULL);
        break;
      default: assert(0 && "Invalid motion mode!\n");
    }
  }
  *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, x->nmv_vec_cost,
                             x->mv_cost_stack, MV_COST_WEIGHT);

  if (cpi->sf.mv_sf.adaptive_motion_search &&
      mbmi->motion_mode == SIMPLE_TRANSLATION)
    x->pred_mv[ref] = best_mv->as_mv;
}
+
// Joint (compound) motion search: iteratively refines the two MVs of a
// compound prediction, alternating between reference frames — each
// iteration searches one reference while holding the other's prediction
// fixed as second_pred. cur_mv[] is updated in place and *rate_mv receives
// the combined MV signaling cost.
void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                             BLOCK_SIZE bsize, int_mv *cur_mv,
                             const uint8_t *mask, int mask_stride,
                             int *rate_mv) {
  const AV1_COMMON *const cm = &cpi->common;
  const int num_planes = av1_num_planes(cm);
  const int pw = block_size_wide[bsize];
  const int ph = block_size_high[bsize];
  const int plane = 0;
  MACROBLOCKD *xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = xd->mi[0];
  // This function should only ever be called for compound modes
  assert(has_second_ref(mbmi));
  const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
  const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
  int_mv ref_mv[2];
  int ite, ref;

  // Get the prediction block from the 'other' reference frame.
  const int_interpfilters interp_filters =
      av1_broadcast_interp_filter(EIGHTTAP_REGULAR);

  InterPredParams inter_pred_params;
  const int mi_row = xd->mi_row;
  const int mi_col = xd->mi_col;

  // Do joint motion search in compound mode to get more accurate mv.
  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
  int last_besterr[2] = { INT_MAX, INT_MAX };
  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
    av1_get_scaled_ref_frame(cpi, refs[0]),
    av1_get_scaled_ref_frame(cpi, refs[1])
  };

  // Prediction buffer from second frame.
  DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
  uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
  int_mv best_mv;

  // Allow joint search multiple times iteratively for each reference frame
  // and break out of the search loop if it couldn't find a better mv.
  for (ite = 0; ite < 4; ite++) {
    struct buf_2d ref_yv12[2];
    int bestsme = INT_MAX;
    int id = ite % 2;  // Even iterations search in the first reference frame,
                       // odd iterations search in the second. The predictor
                       // found for the 'other' reference frame is factored in.
    // From the third iteration on, stop once neither MV (to full-pel
    // precision) has moved from its initial value.
    if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
      if (cur_mv[id].as_int == init_mv[id].as_int) {
        break;
      } else {
        int_mv cur_int_mv, init_int_mv;
        cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
        cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
        init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
        init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
        if (cur_int_mv.as_int == init_int_mv.as_int) {
          break;
        }
      }
    }
    for (ref = 0; ref < 2; ++ref) {
      ref_mv[ref] = av1_get_ref_mv(x, ref);
      // Swap out the reference frame for a version that's been scaled to
      // match the resolution of the current frame, allowing the existing
      // motion search code to be used without additional modifications.
      if (scaled_ref_frame[ref]) {
        int i;
        for (i = 0; i < num_planes; i++)
          backup_yv12[ref][i] = xd->plane[i].pre[ref];
        av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
                             NULL, num_planes);
      }
    }

    assert(IMPLIES(scaled_ref_frame[0] != NULL,
                   cm->width == scaled_ref_frame[0]->y_crop_width &&
                       cm->height == scaled_ref_frame[0]->y_crop_height));
    assert(IMPLIES(scaled_ref_frame[1] != NULL,
                   cm->width == scaled_ref_frame[1]->y_crop_width &&
                       cm->height == scaled_ref_frame[1]->y_crop_height));

    // Initialize based on (possibly scaled) prediction buffers.
    ref_yv12[0] = xd->plane[plane].pre[0];
    ref_yv12[1] = xd->plane[plane].pre[1];

    av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE,
                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
                          &cm->sf_identity, &ref_yv12[!id], interp_filters);
    inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);

    // Since we have scaled the reference frames to match the size of the
    // current frame we must use a unit scaling factor during mode selection.
    av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv,
                                      &inter_pred_params);

    const int order_idx = id != 0;
    av1_dist_wtd_comp_weight_assign(
        cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
        &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1);

    // Do full-pixel compound motion search on the current reference frame.
    if (id) xd->plane[plane].pre[0] = ref_yv12[id];

    // Make motion search params
    FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
    av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
                                       &ref_mv[id].as_mv, NULL);
    av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
                             mask_stride, id);

    // Use the mv result from the single mode as mv predictor.
    const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);

    // Small-range full-pixel motion search.
    bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
                                       &best_mv.as_fullmv);

    if (bestsme < INT_MAX) {
      bestsme = av1_get_mvpred_compound_var(
          &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask,
          mask_stride, id, &cpi->fn_ptr[bsize], &x->plane[0].src,
          &ref_yv12[id]);
    }

    // Restore the pointer to the first (possibly scaled) prediction buffer.
    if (id) xd->plane[plane].pre[0] = ref_yv12[0];

    for (ref = 0; ref < 2; ++ref) {
      if (scaled_ref_frame[ref]) {
        // Swap back the original buffers for subpel motion search.
        for (int i = 0; i < num_planes; i++) {
          xd->plane[i].pre[ref] = backup_yv12[ref][i];
        }
        // Re-initialize based on unscaled prediction buffers.
        ref_yv12[ref] = xd->plane[plane].pre[ref];
      }
    }

    // Do sub-pixel compound motion search on the current reference frame.
    if (id) xd->plane[plane].pre[0] = ref_yv12[id];

    if (cpi->common.features.cur_frame_force_integer_mv) {
      convert_fullmv_to_mv(&best_mv);
    }
    if (bestsme < INT_MAX &&
        cpi->common.features.cur_frame_force_integer_mv == 0) {
      int dis; /* TODO: use dis in distortion calculation later. */
      unsigned int sse;
      SUBPEL_MOTION_SEARCH_PARAMS ms_params;
      av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
                                        &ref_mv[id].as_mv, NULL);
      av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred,
                               mask, mask_stride, id);
      ms_params.forced_stop = EIGHTH_PEL;
      MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
      bestsme = cpi->mv_search_params.find_fractional_mv_step(
          xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL);
    }

    // Restore the pointer to the first prediction buffer.
    if (id) xd->plane[plane].pre[0] = ref_yv12[0];
    // Keep the refined MV only if it improved on this reference's best
    // error so far; otherwise the iteration has converged.
    if (bestsme < last_besterr[id]) {
      cur_mv[id] = best_mv;
      last_besterr[id] = bestsme;
    } else {
      break;
    }
  }

  *rate_mv = 0;

  // Total signaling cost of both MVs relative to their reference MVs.
  for (ref = 0; ref < 2; ++ref) {
    const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
    *rate_mv +=
        av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmv_vec_cost,
                        x->mv_cost_stack, MV_COST_WEIGHT);
  }
}
+
+// Search for the best mv for one component of a compound,
+// given that the other component is fixed.
+void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bsize, MV *this_mv,
+                                       const uint8_t *second_pred,
+                                       const uint8_t *mask, int mask_stride,
+                                       int *rate_mv, int ref_idx) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int ref = mbmi->ref_frame[ref_idx];
+  const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+  struct macroblockd_plane *const pd = &xd->plane[0];
+
+  struct buf_2d backup_yv12[MAX_MB_PLANE];
+  const YV12_BUFFER_CONFIG *const scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+
+  // Check that this is either an interinter or an interintra block
+  assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
+
+  // Store the first prediction buffer.
+  struct buf_2d orig_yv12;
+  struct buf_2d ref_yv12 = pd->pre[ref_idx];
+  if (ref_idx) {
+    orig_yv12 = pd->pre[0];
+    pd->pre[0] = pd->pre[ref_idx];
+  }
+
+  if (scaled_ref_frame) {
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // full-pixel motion search code to be used without additional
+    // modifications.
+    for (int i = 0; i < num_planes; i++) {
+      backup_yv12[i] = xd->plane[i].pre[ref_idx];
+    }
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
+  }
+
+  int bestsme = INT_MAX;
+  int_mv best_mv;
+
+  // Make motion search params
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+                                     &ref_mv.as_mv, NULL);
+  av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
+                           mask_stride, ref_idx);
+
+  // Use the mv result from the single mode as mv predictor.
+  const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
+
+  // Small-range full-pixel motion search.
+  bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
+                                     &best_mv.as_fullmv);
+
+  if (bestsme < INT_MAX) {
+    bestsme = av1_get_mvpred_compound_var(
+        &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask,
+        mask_stride, ref_idx, &cpi->fn_ptr[bsize], &x->plane[0].src, &ref_yv12);
+  }
+
+  if (scaled_ref_frame) {
+    // Swap back the original buffers for subpel motion search.
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+    }
+  }
+
+  if (cpi->common.features.cur_frame_force_integer_mv) {
+    convert_fullmv_to_mv(&best_mv);
+  }
+  const int use_fractional_mv =
+      bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+  if (use_fractional_mv) {
+    int dis; /* TODO: use dis in distortion calculation later. */
+    unsigned int sse;
+    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv,
+                                      NULL);
+    av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred,
+                             mask, mask_stride, ref_idx);
+    ms_params.forced_stop = EIGHTH_PEL;
+    MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+    bestsme = cpi->mv_search_params.find_fractional_mv_step(
+        xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL);
+  }
+
+  // Restore the pointer to the first unscaled prediction buffer.
+  if (ref_idx) pd->pre[0] = orig_yv12;
+
+  if (bestsme < INT_MAX) *this_mv = best_mv.as_mv;
+
+  *rate_mv = 0;
+
+  *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost,
+                              x->mv_cost_stack, MV_COST_WEIGHT);
+}
+
+static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi,
+                                               MACROBLOCK *x, BLOCK_SIZE bsize,
+                                               const MV *other_mv, int ref_idx,
+                                               uint8_t *second_pred) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int pw = block_size_wide[bsize];
+  const int ph = block_size_high[bsize];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x);
+  const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y);
+
+  // This function should only ever be called for compound modes
+  assert(has_second_ref(mbmi));
+
+  const int plane = 0;
+  struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx];
+
+  struct scale_factors sf;
+  av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
+                                    cm->width, cm->height);
+
+  InterPredParams inter_pred_params;
+
+  av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col,
+                        pd->subsampling_x, pd->subsampling_y, xd->bd,
+                        is_cur_buf_hbd(xd), 0, &sf, &ref_yv12,
+                        mbmi->interp_filters);
+  inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+
+  // Get the prediction block from the 'other' reference frame.
+  av1_enc_build_one_inter_predictor(second_pred, pw, other_mv,
+                                    &inter_pred_params);
+
+  av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
+                                  &xd->jcp_param.bck_offset,
+                                  &xd->jcp_param.use_dist_wtd_comp_avg, 1);
+}
+
+// Wrapper for av1_compound_single_motion_search, for the common case
+// where the second prediction is also an inter mode.
+void av1_compound_single_motion_search_interinter(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+    const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  // This function should only ever be called for compound modes
+  assert(has_second_ref(xd->mi[0]));
+
+  // Prediction buffer from second frame.
+  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
+  uint8_t *second_pred;
+  if (is_cur_buf_hbd(xd))
+    second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+  else
+    second_pred = (uint8_t *)second_pred_alloc_16;
+
+  MV *this_mv = &cur_mv[ref_idx].as_mv;
+  const MV *other_mv = &cur_mv[!ref_idx].as_mv;
+  build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred);
+  av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred, mask,
+                                    mask_stride, rate_mv, ref_idx);
+}
+
+static AOM_INLINE void do_masked_motion_search_indexed(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
+    int_mv *tmp_mv, int *rate_mv, int which) {
+  // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  BLOCK_SIZE sb_type = mbmi->sb_type;
+  const uint8_t *mask;
+  const int mask_stride = block_size_wide[bsize];
+
+  mask = av1_get_compound_type_mask(comp_data, sb_type);
+
+  tmp_mv[0].as_int = cur_mv[0].as_int;
+  tmp_mv[1].as_int = cur_mv[1].as_int;
+  if (which == 0 || which == 1) {
+    av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask,
+                                                 mask_stride, rate_mv, which);
+  } else if (which == 2) {
+    av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv);
+  }
+}
+
+int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+                                          MACROBLOCK *x,
+                                          const int_mv *const cur_mv,
+                                          const BLOCK_SIZE bsize,
+                                          const PREDICTION_MODE this_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  int_mv tmp_mv[2];
+  int tmp_rate_mv = 0;
+  mbmi->interinter_comp.seg_mask = xd->seg_mask;
+  const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
+
+  if (this_mode == NEW_NEWMV) {
+    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+                                    tmp_mv, &tmp_rate_mv, 2);
+    mbmi->mv[0].as_int = tmp_mv[0].as_int;
+    mbmi->mv[1].as_int = tmp_mv[1].as_int;
+  } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) {
+    // which = 1 if this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV
+    // which = 0 if this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV
+    int which = (NEWMV == compound_ref1_mode(this_mode));
+    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+                                    tmp_mv, &tmp_rate_mv, which);
+    mbmi->mv[which].as_int = tmp_mv[which].as_int;
+  }
+  return tmp_rate_mv;
+}
+
+int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+                                int mi_col, BLOCK_SIZE bsize, int ref,
+                                FULLPEL_MV start_mv, int num_planes,
+                                int use_subpixel) {
+  assert(num_planes == 1 &&
+         "Currently simple_motion_search only supports luma plane");
+  assert(!frame_is_intra_only(&cpi->common) &&
+         "Simple motion search only enabled for non-key frames");
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->sb_type = bsize;
+  mbmi->ref_frame[0] = ref;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+  struct buf_2d backup_yv12;
+  // ref_mv is used to calculate the cost of the motion vector
+  const MV ref_mv = kZeroMv;
+  const int step_param = cpi->mv_search_params.mv_step_param;
+  const search_site_config *src_search_sites =
+      &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+  int cost_list[5];
+  const int ref_idx = 0;
+  int var;
+  int_mv best_mv;
+
+  av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+                       get_ref_scale_factors(cm, ref), num_planes);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  if (scaled_ref_frame) {
+    backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
+  }
+
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
+                                     src_search_sites);
+
+  var = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                              cond_cost_list(cpi, cost_list),
+                              &best_mv.as_fullmv, NULL);
+
+  const int use_subpel_search =
+      var < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
+      use_subpixel;
+  if (scaled_ref_frame) {
+    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  }
+  if (use_subpel_search) {
+    int not_used = 0;
+
+    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+                                      cost_list);
+    // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params().
+    ms_params.forced_stop = cpi->sf.mv_sf.simple_motion_subpel_force_stop;
+
+    MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+
+    cpi->mv_search_params.find_fractional_mv_step(
+        xd, cm, &ms_params, subpel_start_mv, &best_mv.as_mv, &not_used,
+        &x->pred_sse[ref], NULL);
+  } else {
+    // Manually convert from units of pixel to 1/8-pixels if we are not doing
+    // subpel search
+    convert_fullmv_to_mv(&best_mv);
+  }
+
+  mbmi->mv[0] = best_mv;
+
+  // Get a copy of the prediction output
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+
+  aom_clear_system_state();
+
+  if (scaled_ref_frame) {
+    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  }
+
+  return best_mv;
+}
+
+int_mv av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                                 int mi_col, BLOCK_SIZE bsize,
+                                 const FULLPEL_MV start_mv, int use_subpixel,
+                                 unsigned int *sse, unsigned int *var) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const MV_REFERENCE_FRAME ref =
+      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+
+  int_mv best_mv = av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
+                                            start_mv, 1, use_subpixel);
+
+  const uint8_t *src = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *dst = xd->plane[0].dst.buf;
+  const int dst_stride = xd->plane[0].dst.stride;
+
+  *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
+
+  return best_mv;
+}
diff --git a/libaom/av1/encoder/motion_search_facade.h b/libaom/av1/encoder/motion_search_facade.h
new file mode 100644
index 0000000..3b86e93
--- /dev/null
+++ b/libaom/av1/encoder/motion_search_facade.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_
+#define AOM_AV1_ENCODER_MOTION_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  int64_t rd;
+  int drl_cost;
+
+  int rate_mv;
+  int_mv mv;
+
+  int_mv full_search_mv;
+  int full_mv_rate;
+} inter_mode_info;
+
+void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+                              BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
+                              int search_range, inter_mode_info *mode_info,
+                              int_mv *best_mv);
+
+void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                             BLOCK_SIZE bsize, int_mv *cur_mv,
+                             const uint8_t *mask, int mask_stride,
+                             int *rate_mv);
+
+int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+                                          MACROBLOCK *x,
+                                          const int_mv *const cur_mv,
+                                          const BLOCK_SIZE bsize,
+                                          const PREDICTION_MODE this_mode);
+
+void av1_compound_single_motion_search_interinter(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+    const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx);
+
+void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bsize, MV *this_mv,
+                                       const uint8_t *second_pred,
+                                       const uint8_t *mask, int mask_stride,
+                                       int *rate_mv, int ref_idx);
+
+// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
+// ref. Note that this sets the offset of mbmi, so we will need to reset it
+// after calling this function.
+int_mv av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                int ref, FULLPEL_MV start_mv, int num_planes,
+                                int use_subpixel);
+
+// Performs a simple motion search to calculate the sse and var of the residue
+int_mv av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
+                                 int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                 const FULLPEL_MV start_mv, int use_subpixel,
+                                 unsigned int *sse, unsigned int *var);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_MOTION_SEARCH_H_
diff --git a/libaom/av1/encoder/mv_prec.c b/libaom/av1/encoder/mv_prec.c
new file mode 100644
index 0000000..8fcbde9
--- /dev/null
+++ b/libaom/av1/encoder/mv_prec.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/encodemv.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/misc_model_weights.h"
+#endif  // !CONFIG_REALTIME_ONLY
+#include "av1/encoder/mv_prec.h"
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE int_mv get_ref_mv_for_mv_stats(
+    const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+    int ref_idx) {
+  int ref_mv_idx = mbmi->ref_mv_idx;
+  if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+    assert(has_second_ref(mbmi));
+    ref_mv_idx += 1;
+  }
+
+  const MV_REFERENCE_FRAME *ref_frames = mbmi->ref_frame;
+  const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+  const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack;
+
+  if (ref_frames[1] > INTRA_FRAME) {
+    assert(ref_idx == 0 || ref_idx == 1);
+    return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+                   : curr_ref_mv_stack[ref_mv_idx].this_mv;
+  }
+
+  assert(ref_idx == 0);
+  return ref_mv_idx < mbmi_ext_frame->ref_mv_count
+             ? curr_ref_mv_stack[ref_mv_idx].this_mv
+             : mbmi_ext_frame->global_mvs[ref_frame_type];
+}
+
+static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) {
+  const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]);
+  const aom_cdf_prob prev_cdf = symbol ? AOM_ICDF(cdf[symbol - 1]) : 0;
+  const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB);
+
+  return av1_cost_symbol(p15);
+}
+
+static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val,
+                                         int comp_idx, const AV1_COMP *cpi,
+                                         int *rates) {
+  assert(comp_val != 0 && "mv component should not have zero value!");
+  const int sign = comp_val < 0;
+  const int mag = sign ? -comp_val : comp_val;
+  const int mag_minus_1 = mag - 1;
+  int offset;
+  const int mv_class = av1_get_mv_class(mag_minus_1, &offset);
+  const int int_part = offset >> 3;         // int mv data
+  const int frac_part = (offset >> 1) & 3;  // fractional mv data
+  const int high_part = offset & 1;         // high precision mv data
+  const int use_hp = cpi->common.features.allow_high_precision_mv;
+  int r_idx = 0;
+
+  const MACROBLOCK *const x = &cpi->td.mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  nmv_context *nmvc = &ec_ctx->nmvc;
+  nmv_component *mvcomp_ctx = nmvc->comps;
+  nmv_component *cur_mvcomp_ctx = &mvcomp_ctx[comp_idx];
+  aom_cdf_prob *sign_cdf = cur_mvcomp_ctx->sign_cdf;
+  aom_cdf_prob *class_cdf = cur_mvcomp_ctx->classes_cdf;
+  aom_cdf_prob *class0_cdf = cur_mvcomp_ctx->class0_cdf;
+  aom_cdf_prob(*bits_cdf)[3] = cur_mvcomp_ctx->bits_cdf;
+  aom_cdf_prob *frac_part_cdf = mv_class
+                                    ? (cur_mvcomp_ctx->fp_cdf)
+                                    : (cur_mvcomp_ctx->class0_fp_cdf[int_part]);
+  aom_cdf_prob *high_part_cdf =
+      mv_class ? (cur_mvcomp_ctx->hp_cdf) : (cur_mvcomp_ctx->class0_hp_cdf);
+
+  const int sign_rate = get_symbol_cost(sign_cdf, sign);
+  rates[r_idx++] = sign_rate;
+  update_cdf(sign_cdf, sign, 2);
+
+  const int class_rate = get_symbol_cost(class_cdf, mv_class);
+  rates[r_idx++] = class_rate;
+  update_cdf(class_cdf, mv_class, MV_CLASSES);
+
+  int int_bit_rate = 0;
+  if (mv_class == MV_CLASS_0) {
+    int_bit_rate = get_symbol_cost(class0_cdf, int_part);
+    update_cdf(class0_cdf, int_part, CLASS0_SIZE);
+  } else {
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (int i = 0; i < n; ++i) {
+      int_bit_rate += get_symbol_cost(bits_cdf[i], (int_part >> i) & 1);
+      update_cdf(bits_cdf[i], (int_part >> i) & 1, 2);
+    }
+  }
+  rates[r_idx++] = int_bit_rate;
+  const int frac_part_rate = get_symbol_cost(frac_part_cdf, frac_part);
+  rates[r_idx++] = frac_part_rate;
+  update_cdf(frac_part_cdf, frac_part, MV_FP_SIZE);
+  const int high_part_rate =
+      use_hp ? get_symbol_cost(high_part_cdf, high_part) : 0;
+  if (use_hp) {
+    update_cdf(high_part_cdf, high_part, 2);
+  }
+  rates[r_idx++] = high_part_rate;
+
+  mv_stats->last_bit_zero += !high_part;
+  mv_stats->last_bit_nonzero += high_part;
+  const int total_rate =
+      (sign_rate + class_rate + int_bit_rate + frac_part_rate + high_part_rate);
+  return total_rate;
+}
+
+static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv,
+                                        const MV *cur_mv, const AV1_COMP *cpi) {
+  const MACROBLOCK *const x = &cpi->td.mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  nmv_context *nmvc = &ec_ctx->nmvc;
+  aom_cdf_prob *joint_cdf = nmvc->joints_cdf;
+  const int use_hp = cpi->common.features.allow_high_precision_mv;
+
+  const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col };
+  const int mv_joint = av1_get_mv_joint(&diff);
+  // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp
+  const MV hp_diff = diff;
+  const int hp_mv_joint = av1_get_mv_joint(&hp_diff);
+  const MV truncated_diff = { (diff.row / 2) * 2, (diff.col / 2) * 2 };
+  const MV lp_diff = use_hp ? truncated_diff : diff;
+  const int lp_mv_joint = av1_get_mv_joint(&lp_diff);
+
+  aom_clear_system_state();
+  const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint);
+  const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint);
+  const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint);
+
+  update_cdf(joint_cdf, mv_joint, MV_JOINTS);
+
+  mv_stats->total_mv_rate += mv_joint_rate;
+  mv_stats->hp_total_mv_rate += hp_mv_joint_rate;
+  mv_stats->lp_total_mv_rate += lp_mv_joint_rate;
+  mv_stats->mv_joint_count[mv_joint]++;
+
+  for (int comp_idx = 0; comp_idx < 2; comp_idx++) {
+    const int comp_val = comp_idx ? diff.col : diff.row;
+    const int hp_comp_val = comp_idx ? hp_diff.col : hp_diff.row;
+    const int lp_comp_val = comp_idx ? lp_diff.col : lp_diff.row;
+    int rates[5];
+    av1_zero_array(rates, 5);
+
+    const int comp_rate =
+        comp_val ? keep_one_comp_stat(mv_stats, comp_val, comp_idx, cpi, rates)
+                 : 0;
+    // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false
+    const int hp_rate =
+        hp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] + rates[4] : 0;
+    const int lp_rate =
+        lp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] : 0;
+
+    mv_stats->total_mv_rate += comp_rate;
+    mv_stats->hp_total_mv_rate += hp_rate;
+    mv_stats->lp_total_mv_rate += lp_rate;
+  }
+}
+
+static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats,
+                                          const AV1_COMP *cpi, int mi_row,
+                                          int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) {
+    return;
+  }
+
+  const MB_MODE_INFO *mbmi =
+      mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col];
+  const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame =
+      cpi->mbmi_ext_info.frame_base +
+      get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
+                     cpi->mbmi_ext_info.stride);
+
+  if (!is_inter_block(mbmi)) {
+    mv_stats->intra_count++;
+    return;
+  }
+  mv_stats->inter_count++;
+
+  const PREDICTION_MODE mode = mbmi->mode;
+  const int is_compound = has_second_ref(mbmi);
+
+  if (mode == NEWMV || mode == NEW_NEWMV) {
+    // All mvs are new
+    for (int ref_idx = 0; ref_idx < 1 + is_compound; ++ref_idx) {
+      const MV ref_mv =
+          get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv;
+      const MV cur_mv = mbmi->mv[ref_idx].as_mv;
+      keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi);
+    }
+  } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV ||
+             mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+    // has exactly one new_mv
+    mv_stats->default_mvs += 1;
+
+    const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV);
+    const MV ref_mv =
+        get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv;
+    const MV cur_mv = mbmi->mv[ref_idx].as_mv;
+
+    keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi);
+  } else {
+    // No new_mv
+    mv_stats->default_mvs += 1 + is_compound;
+  }
+
+  // Add texture information
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int num_rows = block_size_high[bsize];
+  const int num_cols = block_size_wide[bsize];
+  const int y_stride = cpi->source->y_stride;
+  const int px_row = 4 * mi_row, px_col = 4 * mi_col;
+  const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int bd = cm->seq_params.bit_depth;
+  if (buf_is_hbd) {
+    uint16_t *source_buf =
+        CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col;
+    for (int row = 0; row < num_rows - 1; row++) {
+      for (int col = 0; col < num_cols - 1; col++) {
+        const int offset = row * y_stride + col;
+        const int horz_diff =
+            abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8);
+        const int vert_diff =
+            abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8);
+        mv_stats->horz_text += horz_diff;
+        mv_stats->vert_text += vert_diff;
+        mv_stats->diag_text += horz_diff * vert_diff;
+      }
+    }
+  } else {
+    uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col;
+    for (int row = 0; row < num_rows - 1; row++) {
+      for (int col = 0; col < num_cols - 1; col++) {
+        const int offset = row * y_stride + col;
+        const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]);
+        const int vert_diff =
+            abs(source_buf[offset + y_stride] - source_buf[offset]);
+        mv_stats->horz_text += horz_diff;
+        mv_stats->vert_text += vert_diff;
+        mv_stats->diag_text += horz_diff * vert_diff;
+      }
+    }
+  }
+}
+
+// Split block
+static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats,
+                                           const AV1_COMP *cpi, int mi_row,
+                                           int mi_col, BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const AV1_COMMON *cm = &cpi->common;
+
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+    return;
+
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+  const int hbs = mi_size_wide[bsize] / 2;
+  const int qbs = mi_size_wide[bsize] / 4;
+  switch (partition) {
+    case PARTITION_NONE:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      break;
+    case PARTITION_HORZ:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+      break;
+    case PARTITION_VERT:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+      break;
+    case PARTITION_SPLIT:
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize);
+      break;
+    case PARTITION_HORZ_A:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+      break;
+    case PARTITION_HORZ_B:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs);
+      break;
+    case PARTITION_VERT_A:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+      break;
+    case PARTITION_VERT_B:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+      collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs);
+      break;
+    case PARTITION_HORZ_4:
+      for (int i = 0; i < 4; ++i) {
+        const int this_mi_row = mi_row + i * qbs;
+        collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (int i = 0; i < 4; ++i) {
+        const int this_mi_col = mi_col + i * qbs;
+        collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col);
+      }
+      break;
+    default: assert(0);
+  }
+}
+
+static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats,
+                                             const AV1_COMP *cpi,
+                                             const TileInfo *tile_info) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int mi_row_start = tile_info->mi_row_start;
+  const int mi_row_end = tile_info->mi_row_end;
+  const int mi_col_start = tile_info->mi_col_start;
+  const int mi_col_end = tile_info->mi_col_end;
+  const int sb_size_mi = cm->seq_params.mib_size;
+  BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) {
+    for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) {
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size);
+    }
+  }
+}
+
+void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) {
+  MV_STATS *mv_stats = &cpi->mv_stats;
+  const AV1_COMMON *cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+
+  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+    TileInfo tile_info;
+    av1_tile_set_row(&tile_info, cm, tile_row);
+    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+      const int tile_idx = tile_row * tile_cols + tile_col;
+      av1_tile_set_col(&tile_info, cm, tile_col);
+      cpi->tile_data[tile_idx].tctx = *cm->fc;
+      cpi->td.mb.e_mbd.tile_ctx = &cpi->tile_data[tile_idx].tctx;
+      collect_mv_stats_tile(mv_stats, cpi, &tile_info);
+    }
+  }
+
+  mv_stats->q = current_q;
+  mv_stats->order = cpi->common.current_frame.order_hint;
+  mv_stats->valid = 1;
+}
+
+static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats,
+                                        int current_q) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int order_hint = cpi->common.current_frame.order_hint;
+  const int order_diff = order_hint - mv_stats->order;
+  aom_clear_system_state();
+  const float area = (float)(cm->width * cm->height);
+  float features[MV_PREC_FEATURE_SIZE] = {
+    (float)current_q,
+    (float)mv_stats->q,
+    (float)order_diff,
+    mv_stats->inter_count / area,
+    mv_stats->intra_count / area,
+    mv_stats->default_mvs / area,
+    mv_stats->mv_joint_count[0] / area,
+    mv_stats->mv_joint_count[1] / area,
+    mv_stats->mv_joint_count[2] / area,
+    mv_stats->mv_joint_count[3] / area,
+    mv_stats->last_bit_zero / area,
+    mv_stats->last_bit_nonzero / area,
+    mv_stats->total_mv_rate / area,
+    mv_stats->hp_total_mv_rate / area,
+    mv_stats->lp_total_mv_rate / area,
+    mv_stats->horz_text / area,
+    mv_stats->vert_text / area,
+    mv_stats->diag_text / area,
+  };
+
+  for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) {
+    features[f_idx] =
+        (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx];
+  }
+  float score = 0.0f;
+
+  av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score);
+
+  const int use_high_hp = score >= 0.0f;
+  return use_high_hp;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) {
+  int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH;
+
+  if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) {
+    use_hp = 0;
+  }
+#if !CONFIG_REALTIME_ONLY
+  else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
+           av1_frame_allows_smart_mv(cpi) && cpi->mv_stats.valid) {
+    use_hp = get_smart_mv_prec(cpi, &cpi->mv_stats, qindex);
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
+  av1_set_high_precision_mv(cpi, use_hp,
+                            cpi->common.features.cur_frame_force_integer_mv);
+}
diff --git a/libaom/av1/encoder/mv_prec.h b/libaom/av1/encoder/mv_prec.h
new file mode 100644
index 0000000..8df8b96
--- /dev/null
+++ b/libaom/av1/encoder/mv_prec.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MV_PREC_H_
+#define AOM_AV1_ENCODER_MV_PREC_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+
+// Q threshold for high precision mv.
+#define HIGH_PRECISION_MV_QTHRESH 128
+#if !CONFIG_REALTIME_ONLY
+void av1_collect_mv_stats(AV1_COMP *cpi, int current_q);
+
+static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) {
+  const int gf_group_index = cpi->gf_group.index;
+  const int gf_update_type = cpi->gf_group.update_type[gf_group_index];
+  return !frame_is_intra_only(&cpi->common) &&
+         !(gf_update_type == INTNL_OVERLAY_UPDATE ||
+           gf_update_type == OVERLAY_UPDATE);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void av1_set_high_precision_mv(
+    AV1_COMP *cpi, int allow_high_precision_mv,
+    int cur_frame_force_integer_mv) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  const int copy_hp = cpi->common.features.allow_high_precision_mv =
+      allow_high_precision_mv && !cur_frame_force_integer_mv;
+  x->nmvcost[0] = &x->nmv_costs[0][MV_MAX];
+  x->nmvcost[1] = &x->nmv_costs[1][MV_MAX];
+  x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX];
+  x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX];
+  int *(*src)[2] = copy_hp ? &x->nmvcost_hp : &x->nmvcost;
+  x->mv_cost_stack = *src;
+}
+
+void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex);
+
+#endif  // AOM_AV1_ENCODER_MV_PREC_H_
diff --git a/libaom/av1/encoder/nonrd_pickmode.c b/libaom/av1/encoder/nonrd_pickmode.c
new file mode 100644
index 0000000..a118001
--- /dev/null
+++ b/libaom/av1/encoder/nonrd_pickmode.c
@@ -0,0 +1,2182 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/model_rd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+extern int g_pick_inter_mode_cnt;
+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
+typedef struct {
+  PRED_BUFFER *best_pred;
+  PREDICTION_MODE best_mode;
+  TX_SIZE best_tx_size;
+  TX_SIZE best_intra_tx_size;
+  MV_REFERENCE_FRAME best_ref_frame;
+  MV_REFERENCE_FRAME best_second_ref_frame;
+  uint8_t best_mode_skip_txfm;
+  int_interpfilters best_pred_filter;
+} BEST_PICKMODE;
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame;
+  PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+static const int pos_shift_16x16[4][4] = {
+  { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
+#define RT_INTER_MODES 9
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+  { LAST_FRAME, NEARESTMV },   { LAST_FRAME, NEARMV },
+  { LAST_FRAME, NEWMV },       { GOLDEN_FRAME, NEARESTMV },
+  { GOLDEN_FRAME, NEARMV },    { GOLDEN_FRAME, NEWMV },
+  { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
+  { ALTREF_FRAME, NEWMV }
+};
+
+static const THR_MODES mode_idx[REF_FRAMES][4] = {
+  { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
+  { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
+  { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 },
+  { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 },
+  { THR_NEARESTG, THR_NEARG, THR_GLOBALMV, THR_NEWG },
+};
+
+static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
+                                                   SMOOTH_PRED };
+
+static INLINE int mode_offset(const PREDICTION_MODE mode) {
+  if (mode >= NEARESTMV) {
+    return INTER_OFFSET(mode);
+  } else {
+    switch (mode) {
+      case DC_PRED: return 0;
+      case V_PRED: return 1;
+      case H_PRED: return 2;
+      case SMOOTH_PRED: return 3;
+      default: assert(0); return -1;
+    }
+  }
+}
+
+enum {
+  //  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV),
+  INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
+};
+
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+  bp->best_mode = NEARESTMV;
+  bp->best_ref_frame = LAST_FRAME;
+  bp->best_tx_size = TX_8X8;
+  bp->best_intra_tx_size = TX_8X8;
+  bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+  bp->best_mode_skip_txfm = 0;
+  bp->best_second_ref_frame = NONE_FRAME;
+  bp->best_pred = NULL;
+}
+
+static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                  int_mv *tmp_mv, int *rate_mv,
+                                  int64_t best_rd_sofar, int use_base_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  int step_param = cpi->mv_search_params.mv_step_param;
+  FULLPEL_MV start_mv;
+  const int ref = mi->ref_frame[0];
+  const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
+  MV center_mv;
+  int dis;
+  int rv = 0;
+  int cost_list[5];
+  int search_subpel = 1;
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
+  }
+
+  start_mv = get_fullmv_from_mv(&ref_mv);
+
+  if (!use_base_mv)
+    center_mv = ref_mv;
+  else
+    center_mv = tmp_mv->as_mv;
+
+  const search_site_config *src_search_sites =
+      &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
+                                     src_search_sites);
+
+  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), &tmp_mv->as_fullmv,
+                        NULL);
+
+  // calculate the bit cost on motion vector
+  MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);
+
+  *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->nmv_vec_cost,
+                             x->mv_cost_stack, MV_COST_WEIGHT);
+
+  // TODO(kyslov) Account for Rate Mode!
+  rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar);
+
+  if (rv && search_subpel) {
+    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+                                      cost_list);
+    MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv);
+    cpi->mv_search_params.find_fractional_mv_step(
+        xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis,
+        &x->pred_sse[ref], NULL);
+
+    *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmv_vec_cost,
+                               x->mv_cost_stack, MV_COST_WEIGHT);
+  }
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+  }
+  return rv;
+}
+
+static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
+                         int_mv frame_mv[][REF_FRAMES],
+                         MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
+                         BLOCK_SIZE bsize, int mi_row, int mi_col,
+                         int best_pred_sad, int *rate_mv, RD_STATS *best_rdc) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  AV1_COMMON *cm = &cpi->common;
+  if (ref_frame > LAST_FRAME && gf_temporal_ref &&
+      cpi->oxcf.rc_mode == AOM_CBR) {
+    int tmp_sad;
+    int dis;
+    int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
+    if (bsize < BLOCK_16X16) return -1;
+
+    tmp_sad = av1_int_pro_motion_estimation(
+        cpi, x, bsize, mi_row, mi_col,
+        &x->mbmi_ext->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+
+    if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
+    if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1;
+
+    frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
+    int_mv best_mv = mi->mv[0];
+    best_mv.as_mv.row >>= 3;
+    best_mv.as_mv.col >>= 3;
+    MV ref_mv = av1_get_ref_mv(x, 0).as_mv;
+
+    *rate_mv =
+        av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv,
+                        x->nmv_vec_cost, x->mv_cost_stack, MV_COST_WEIGHT);
+    frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+    frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+                                      cost_list);
+    MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+    cpi->mv_search_params.find_fractional_mv_step(
+        xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis,
+        &x->pred_sse[ref_frame], NULL);
+    frame_mv[NEWMV][ref_frame].as_int = best_mv.as_int;
+  } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                     &frame_mv[NEWMV][ref_frame], rate_mv,
+                                     best_rdc->rdcost, 0)) {
+    return -1;
+  }
+
+  return 0;
+}
+
+static INLINE void find_predictors(
+    AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+    int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int *ref_frame_skip_mask,
+    const int flag_list[4], TileDataEnc *tile_data,
+    struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
+    int force_skip_low_temp_var) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
+  const int num_planes = av1_num_planes(cm);
+  (void)tile_data;
+
+  x->pred_mv_sad[ref_frame] = INT_MAX;
+  frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+  // TODO(kyslov) this needs various further optimizations. to be continued..
+  if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+    const struct scale_factors *const sf =
+        get_ref_scale_factors_const(cm, ref_frame);
+    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+    av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                     xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                     mbmi_ext->mode_context);
+    // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+    // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+    av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+    av1_find_best_ref_mvs_from_stack(
+        cm->features.allow_high_precision_mv, mbmi_ext, ref_frame,
+        &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
+    // Early exit for non-LAST frame if force_skip_low_temp_var is set.
+    if (!av1_is_scaled(sf) && bsize >= BLOCK_8X8 &&
+        !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
+      av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+                  bsize);
+    }
+  } else {
+    *ref_frame_skip_mask |= (1 << ref_frame);
+  }
+  av1_count_overlappable_neighbors(cm, xd);
+  mbmi->num_proj_ref = 1;
+}
+
+static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
+                                            const MACROBLOCKD *xd,
+                                            const MACROBLOCK *x, int segment_id,
+                                            unsigned int *ref_costs_single) {
+  int seg_ref_active =
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+  if (seg_ref_active) {
+    memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+  } else {
+    int intra_inter_ctx = av1_get_intra_inter_context(xd);
+    ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
+    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+
+    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+      ref_costs_single[i] = base_cost;
+
+    const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
+    const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
+    const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
+    const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
+    const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
+    const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
+
+    // Determine cost of a single ref frame, where frame types are represented
+    // by a tree:
+    // Level 0: add cost whether this ref is a forward or backward ref
+    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+
+    // Level 1: if this ref is forward ref,
+    // add cost whether it is last/last2 or last3/golden
+    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+
+    // Level 1: if this ref is backward ref
+    // then add cost whether this ref is altref or backward ref
+    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
+
+    // Level 2: further add cost whether this ref is last or last2
+    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
+    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
+
+    // Level 2: last3 or golden
+    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
+    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
+
+    // Level 2: bwdref or altref2
+    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
+    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];
+  }
+}
+
+static void estimate_comp_ref_frame_costs(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
+    int segment_id, unsigned int (*ref_costs_comp)[REF_FRAMES]) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+      memset(ref_costs_comp[ref_frame], 0,
+             REF_FRAMES * sizeof((*ref_costs_comp)[0]));
+  } else {
+    int intra_inter_ctx = av1_get_intra_inter_context(xd);
+    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+
+    if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
+      // Similar to single ref, determine cost of compound ref frames.
+      // cost_compound_refs = cost_first_ref + cost_second_ref
+      const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
+      const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
+      const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
+      const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
+      const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
+
+      const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+      unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
+
+      ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
+          ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
+              base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
+      ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
+      ref_bicomp_costs[ALTREF_FRAME] = 0;
+
+      // cost of first ref frame
+      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+
+      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
+      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];
+
+      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
+      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];
+
+      // cost of second ref frame
+      ref_bicomp_costs[BWDREF_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+      ref_bicomp_costs[ALTREF2_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+      ref_bicomp_costs[ALTREF_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+
+      ref_bicomp_costs[BWDREF_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+      ref_bicomp_costs[ALTREF2_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+
+      // cost: if one ref frame is forward ref, the other ref is backward ref
+      for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+        for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
+          ref_costs_comp[ref0][ref1] =
+              ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
+        }
+      }
+
+      // cost: if both ref frames are the same side.
+      const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
+      const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
+      const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
+      ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
+          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+      ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
+          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+      ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
+          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+      ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
+          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+    } else {
+      for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+        for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
+          ref_costs_comp[ref0][ref1] = 512;
+      }
+      ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
+      ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
+      ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
+      ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
+    }
+  }
+}
+
+static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                 MACROBLOCK *const x, unsigned int var,
+                                 unsigned int sse) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TX_SIZE tx_size;
+  if (x->tx_mode_search_type == TX_MODE_SELECT) {
+    if (sse > (var << 2))
+      tx_size = AOMMIN(max_txsize_lookup[bsize],
+                       tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+    else
+      tx_size = TX_8X8;
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
+      tx_size = TX_8X8;
+    else if (tx_size > TX_16X16)
+      tx_size = TX_16X16;
+  } else {
+    tx_size = AOMMIN(max_txsize_lookup[bsize],
+                     tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+  }
+
+  if (x->tx_mode_search_type != ONLY_4X4 && bsize > BLOCK_32X32)
+    tx_size = TX_16X16;
+
+  return AOMMIN(tx_size, TX_16X16);
+}
+
+static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2,
+                                                          2, 2, 3, 3, 3, 4,
+                                                          4, 4, 5, 5 };
+static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1,
+                                                           2, 3, 2, 3, 4, 3,
+                                                           4, 5, 4, 5 };
+
+static void block_variance(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride, int w, int h,
+                           unsigned int *sse, int *sum, int block_size,
+                           uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
+  int i, j, k = 0;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      aom_get8x8var(src + src_stride * i + j, src_stride,
+                    ref + ref_stride * i + j, ref_stride, &sse8x8[k],
+                    &sum8x8[k]);
+      *sse += sse8x8[k];
+      *sum += sum8x8[k];
+      var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+      k++;
+    }
+  }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+                               unsigned int *sse_i, int *sum_i,
+                               unsigned int *var_o, unsigned int *sse_o,
+                               int *sum_o) {
+  const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+  const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+  const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+  int i, j, k = 0;
+
+  for (i = 0; i < nh; i += 2) {
+    for (j = 0; j < nw; j += 2) {
+      sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+                 sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+      sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+                 sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
+      var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+                                       (b_width_log2_lookup[unit_size] +
+                                        b_height_log2_lookup[unit_size] + 6));
+      k++;
+    }
+  }
+}
+
+// Adjust the ac_thr according to speed, width, height and normalized sum
+static int ac_thr_factor(const int speed, const int width, const int height,
+                         const int norm_sum) {
+  if (speed >= 8 && norm_sum < 5) {
+    if (width <= 640 && height <= 480)
+      return 4;
+    else
+      return 2;
+  }
+  return 1;
+}
+
+static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
+                                      int mi_row, int mi_col, MACROBLOCK *x,
+                                      MACROBLOCKD *xd, int *out_rate,
+                                      int64_t *out_dist, unsigned int *var_y,
+                                      unsigned int *sse_y, int *early_term,
+                                      int calculate_rd) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const uint32_t dc_quant = p->dequant_QTX[0];
+  const uint32_t ac_quant = p->dequant_QTX[1];
+  const int64_t dc_thr = dc_quant * dc_quant >> 6;
+  int64_t ac_thr = ac_quant * ac_quant >> 6;
+  unsigned int var;
+  int sum;
+
+  const int bw = b_width_log2_lookup[bsize];
+  const int bh = b_height_log2_lookup[bsize];
+  const int num8x8 = 1 << (bw + bh - 2);
+  unsigned int sse8x8[256] = { 0 };
+  int sum8x8[256] = { 0 };
+  unsigned int var8x8[256] = { 0 };
+  TX_SIZE tx_size;
+  int k;
+  // Calculate variance for whole partition, and also save 8x8 blocks' variance
+  // to be used in following transform skipping test.
+  block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+  var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+
+  *var_y = var;
+  *sse_y = sse;
+
+  ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
+                          cpi->common.height, abs(sum) >> (bw + bh));
+
+  tx_size = calculate_tx_size(cpi, bsize, x, var, sse);
+  // The code below for setting skip flag assumes tranform size of at least 8x8,
+  // so force this lower limit on transform.
+  if (tx_size < TX_8X8) tx_size = TX_8X8;
+  xd->mi[0]->tx_size = tx_size;
+
+  // Evaluate if the partition block is a skippable block in Y plane.
+  {
+    unsigned int sse16x16[64] = { 0 };
+    int sum16x16[64] = { 0 };
+    unsigned int var16x16[64] = { 0 };
+    const int num16x16 = num8x8 >> 2;
+
+    unsigned int sse32x32[16] = { 0 };
+    int sum32x32[16] = { 0 };
+    unsigned int var32x32[16] = { 0 };
+    const int num32x32 = num8x8 >> 4;
+
+    int ac_test = 1;
+    int dc_test = 1;
+    const int num = (tx_size == TX_8X8)
+                        ? num8x8
+                        : ((tx_size == TX_16X16) ? num16x16 : num32x32);
+    const unsigned int *sse_tx =
+        (tx_size == TX_8X8) ? sse8x8
+                            : ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+    const unsigned int *var_tx =
+        (tx_size == TX_8X8) ? var8x8
+                            : ((tx_size == TX_16X16) ? var16x16 : var32x32);
+
+    // Calculate variance if tx_size > TX_8X8
+    if (tx_size >= TX_16X16)
+      calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+                         sum16x16);
+    if (tx_size == TX_32X32)
+      calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
+                         sse32x32, sum32x32);
+
+    // Skipping test
+    *early_term = 0;
+    for (k = 0; k < num; k++)
+      // Check if all ac coefficients can be quantized to zero.
+      if (!(var_tx[k] < ac_thr || var == 0)) {
+        ac_test = 0;
+        break;
+      }
+
+    for (k = 0; k < num; k++)
+      // Check if dc coefficient can be quantized to zero.
+      if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+        dc_test = 0;
+        break;
+      }
+
+    if (ac_test && dc_test) {
+      int skip_uv[2] = { 0 };
+      unsigned int var_uv[2];
+      unsigned int sse_uv[2];
+      AV1_COMMON *const cm = &cpi->common;
+      // Transform skipping test in UV planes.
+      for (int i = 1; i <= 2; i++) {
+        int j = i - 1;
+        skip_uv[j] = 1;
+        if (x->color_sensitivity[j]) {
+          skip_uv[j] = 0;
+          struct macroblock_plane *const puv = &x->plane[i];
+          struct macroblockd_plane *const puvd = &xd->plane[i];
+          const BLOCK_SIZE uv_bsize = get_plane_block_size(
+              bsize, puvd->subsampling_x, puvd->subsampling_y);
+          // Adjust these thresholds for UV.
+          const int64_t uv_dc_thr =
+              (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> 3;
+          const int64_t uv_ac_thr =
+              (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3;
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i,
+                                        i);
+          var_uv[j] = cpi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
+                                               puvd->dst.buf, puvd->dst.stride,
+                                               &sse_uv[j]);
+          if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+              (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+            skip_uv[j] = 1;
+          else
+            break;
+        }
+      }
+      if (skip_uv[0] & skip_uv[1]) {
+        *early_term = 1;
+      }
+    }
+  }
+  if (calculate_rd && out_dist != NULL && out_rate != NULL) {
+    if (!*early_term) {
+      const int bwide = block_size_wide[bsize];
+      const int bhigh = block_size_high[bsize];
+
+      model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh,
+                            out_rate, out_dist);
+    }
+
+    if (*early_term) {
+      *out_rate = 0;
+      *out_dist = sse << 4;
+    }
+  }
+}
+
+// Model the luma-plane rate/distortion of the current block without running a
+// full transform search: variance/SSE between the source and the already-built
+// prediction feed a curve-fit RD model.
+// Outputs: *out_rate_sum / *out_dist_sum (modeled RD), *var_y / *sse_y (raw
+// variance and SSE), and optionally *skip_txfm_sb (set when modeled rate is 0)
+// and *skip_sse_sb (SSE scaled to distortion units).
+static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                              MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum,
+                              int64_t *out_dist_sum, int *skip_txfm_sb,
+                              int64_t *skip_sse_sb, unsigned int *var_y,
+                              unsigned int *sse_y, int calculate_rd) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+
+  unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
+                                           pd->dst.buf, pd->dst.stride, &sse);
+  xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse);
+
+  if (calculate_rd) {
+    const int bwide = block_size_wide[bsize];
+    const int bhigh = block_size_high[bsize];
+    model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate,
+                          &dist);
+  } else {
+    rate = INT_MAX;  // this will be overwritten later with block_yrd
+    dist = INT_MAX;
+  }
+  *var_y = var;
+  *sse_y = sse;
+  x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+  assert(rate >= 0);
+
+  if (skip_txfm_sb) *skip_txfm_sb = rate == 0;
+  // Widen before shifting: sse is a 32-bit unsigned and can exceed
+  // UINT_MAX >> 4 for large blocks, so (sse << 4) computed in 32 bits could
+  // overflow before being stored into the 64-bit output.
+  if (skip_sse_sb) *skip_sse_sb = (int64_t)sse << 4;
+  rate = AOMMIN(rate, INT_MAX);
+  *out_rate_sum = (int)rate;
+  *out_dist_sum = dist;
+}
+
+// Compute a fast luma rate/distortion estimate by forward-transforming
+// (Hadamard / 4x4 txfm) and quantizing the residual for every transform block,
+// then measuring SATD-based rate and transform-domain distortion.
+// On entry *sse may carry an externally computed SSE (INT64_MAX means "none
+// available" — see estimate_block_intra); it is rescaled in place.
+// Outputs: this_rdc->rate/dist/skip, *skippable (all EOBs zero), *sse.
+// Only TX_4X4/TX_8X8/TX_16X16 are supported here (asserts otherwise).
+static void block_yrd(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
+                      RD_STATS *this_rdc, int *skippable, int64_t *sse,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblockd_plane *pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
+  const int num_4x4_w = mi_size_wide[bsize];
+  const int num_4x4_h = mi_size_high[bsize];
+  const int step = 1 << (tx_size << 1);
+  const int block_step = (1 << tx_size);
+  int block = 0;
+  const int max_blocks_wide =
+      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+  const int max_blocks_high =
+      num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+  int eob_cost = 0;
+  const int bw = 4 * num_4x4_w;
+  const int bh = 4 * num_4x4_h;
+
+  (void)mi_row;
+  (void)mi_col;
+  (void)cpi;
+
+  // Build the residual (src - prediction) for the whole block up front.
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+                              p->src.stride, pd->dst.buf, pd->dst.stride,
+                              x->e_mbd.bd);
+  } else {
+    aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                       pd->dst.buf, pd->dst.stride);
+  }
+#else
+  aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
+#endif
+
+  *skippable = 1;
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (int r = 0; r < max_blocks_high; r += block_step) {
+    for (int c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+        const int block_offset = BLOCK_OFFSET(block);
+        // Without high bit depth, a low-precision (16-bit) coefficient path
+        // is used: the tran_low_t buffers are reinterpreted as int16_t.
+#if CONFIG_AV1_HIGHBITDEPTH
+        tran_low_t *const coeff = p->coeff + block_offset;
+        tran_low_t *const qcoeff = p->qcoeff + block_offset;
+        tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+#else
+        int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
+        int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
+        int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset;
+#endif
+        uint16_t *const eob = &p->eobs[block];
+        const int diff_stride = bw;
+        const int16_t *src_diff;
+        src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
+        // Forward transform + quantize: Hadamard for 8x8/16x16, the
+        // configured 4x4 transform otherwise; 32x32/64x64 are unsupported.
+        switch (tx_size) {
+          case TX_64X64:
+            assert(0);  // Not implemented
+            break;
+          case TX_32X32:
+            assert(0);  // Not used
+            break;
+#if CONFIG_AV1_HIGHBITDEPTH
+          case TX_16X16:
+            aom_hadamard_16x16(src_diff, diff_stride, coeff);
+            av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
+                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                            scan_order->iscan);
+            break;
+          case TX_8X8:
+            aom_hadamard_8x8(src_diff, diff_stride, coeff);
+            av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX,
+                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                            scan_order->iscan);
+            break;
+#else
+          case TX_16X16:
+            aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+            av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
+                            p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                            p->dequant_QTX, eob, scan_order->scan);
+            break;
+          case TX_8X8:
+            aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+            av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
+                            low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+                            scan_order->scan);
+            break;
+          default:
+            assert(tx_size == TX_4X4);
+            x->fwd_txfm4x4(src_diff, low_coeff, diff_stride);
+            av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
+                            low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+                            scan_order->scan);
+            break;
+#endif
+        }
+        // The block is skippable only if every transform block quantizes to
+        // all-zero; eob_cost counts the coded transform blocks.
+        *skippable &= (*eob == 0);
+        eob_cost += 1;
+      }
+      block += step;
+    }
+  }
+  this_rdc->skip = *skippable;
+  this_rdc->rate = 0;
+  // INT64_MAX marks "no external SSE supplied"; otherwise rescale it and, if
+  // everything quantized to zero, use it directly as the distortion.
+  if (*sse < INT64_MAX) {
+    *sse = (*sse << 6) >> 2;
+    if (*skippable) {
+      this_rdc->dist = *sse;
+      return;
+    }
+  }
+
+  // Second pass: accumulate a SATD-based rate estimate and transform-domain
+  // distortion over the quantized blocks.
+  block = 0;
+  this_rdc->dist = 0;
+  for (int r = 0; r < max_blocks_high; r += block_step) {
+    for (int c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        const int block_offset = BLOCK_OFFSET(block);
+        uint16_t *const eob = &p->eobs[block];
+#if CONFIG_AV1_HIGHBITDEPTH
+        int64_t dummy;
+        tran_low_t *const coeff = p->coeff + block_offset;
+        tran_low_t *const qcoeff = p->qcoeff + block_offset;
+        tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+
+        if (*eob == 1)
+          this_rdc->rate += (int)abs(qcoeff[0]);
+        else if (*eob > 1)
+          this_rdc->rate += aom_satd(qcoeff, step << 4);
+
+        this_rdc->dist +=
+            av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2;
+#else
+        int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
+        int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
+        int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset;
+
+        if (*eob == 1)
+          this_rdc->rate += (int)abs(low_qcoeff[0]);
+        else if (*eob > 1)
+          this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
+
+        this_rdc->dist +=
+            av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
+#endif
+      }
+      block += step;
+    }
+  }
+
+  // If skippable is set, rate gets clobbered later.
+  this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
+  this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
+}
+
+// Reset the mode info to a default single-reference, purely-translational
+// state for the given prediction mode and reference-frame pair.
+static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE pred_mode,
+                             MV_REFERENCE_FRAME ref_frame0,
+                             MV_REFERENCE_FRAME ref_frame1,
+                             const AV1_COMMON *cm) {
+  mbmi->mode = pred_mode;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = ref_frame0;
+  mbmi->ref_frame[1] = ref_frame1;
+  mbmi->ref_mv_idx = 0;
+  mbmi->mv[0].as_int = 0;
+  mbmi->mv[1].as_int = 0;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->num_proj_ref = 1;
+  mbmi->interintra_mode = 0;
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  PALETTE_MODE_INFO *const palette_info = &mbmi->palette_mode_info;
+  palette_info->palette_size[0] = 0;
+  palette_info->palette_size[1] = 0;
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+}
+
+// Snapshot the current coding decisions (mode info, skip flags, mv-ref data)
+// into the pick-mode context so they can be restored if this choice wins.
+// With CONFIG_INTERNAL_STATS the winning mode index is recorded as well.
+#if CONFIG_INTERNAL_STATS
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                                 int mode_index) {
+#else
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+#endif  // CONFIG_INTERNAL_STATS
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Take a snapshot of the coding context so it can be
+  // restored if we decide to encode this way
+  ctx->rd_stats.skip = x->force_skip;
+  memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk);
+  memset(ctx->tx_type_map, DCT_DCT,
+         sizeof(ctx->tx_type_map[0]) * ctx->num_4x4_blk);
+  // ctx->skippable was previously assigned twice; one assignment suffices.
+  ctx->skippable = x->force_skip;
+#if CONFIG_INTERNAL_STATS
+  ctx->best_mode_index = mode_index;
+#endif  // CONFIG_INTERNAL_STATS
+  ctx->mic = *xd->mi[0];
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  ctx->comp_pred_diff = 0;
+  ctx->hybrid_pred_diff = 0;
+  ctx->single_pred_diff = 0;
+}
+
+// Claim the first free buffer in a pool of `len` prediction buffers and mark
+// it in use. Returns its index, or -1 when every buffer is taken.
+static int get_pred_buffer(PRED_BUFFER *p, int len) {
+  int idx = 0;
+  while (idx < len) {
+    if (p[idx].in_use == 0) {
+      p[idx].in_use = 1;
+      return idx;
+    }
+    ++idx;
+  }
+  return -1;
+}
+
+// Return a prediction buffer to the pool; passing NULL is a no-op.
+static void free_pred_buffer(PRED_BUFFER *p) {
+  if (p == NULL) return;
+  p->in_use = 0;
+}
+
+// Rate cost of signalling inter prediction mode `mode` given its mode
+// context. Compound modes use a dedicated table; single-reference modes
+// accumulate NEWMV / GLOBALMV / REFMV signalling costs level by level.
+static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
+                       int16_t mode_context) {
+  if (is_inter_compound_mode(mode)) {
+    return x
+        ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+  }
+
+  assert(is_inter_mode(mode));
+
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  if (mode == NEWMV) return x->newmv_mode_cost[newmv_ctx][0];
+
+  int cost = x->newmv_mode_cost[newmv_ctx][1];
+  const int16_t zeromv_ctx =
+      (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (mode == GLOBALMV) return cost + x->zeromv_mode_cost[zeromv_ctx][0];
+
+  cost += x->zeromv_mode_cost[zeromv_ctx][1];
+  const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  cost += x->refmv_mode_cost[refmv_ctx][mode != NEARESTMV];
+  return cost;
+}
+
+// Apply an RD-cost bias: NEWMV motion vectors that differ strongly from the
+// average of the top/left neighbor MVs are penalized; for other modes, large
+// motion is penalized at speed >= 8 when spatial variance is low.
+static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
+                            RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row,
+                            int mv_col, int speed, uint32_t spatial_variance) {
+  // Bias against MVs associated with NEWMV mode that are very different from
+  // top/left neighbors.
+  if (this_mode == NEWMV) {
+    int al_mv_average_row;
+    int al_mv_average_col;
+    int row_diff, col_diff;
+    int above_mv_valid = 0;
+    int left_mv_valid = 0;
+    int above_row = 0;
+    int above_col = 0;
+    // Zero-initialize like the above_* values; only meaningful when
+    // left_mv_valid is set, but never left indeterminate.
+    int left_row = 0;
+    int left_col = 0;
+
+    if (xd->above_mbmi) {
+      above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV;
+      above_row = xd->above_mbmi->mv[0].as_mv.row;
+      above_col = xd->above_mbmi->mv[0].as_mv.col;
+    }
+    if (xd->left_mbmi) {
+      left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV;
+      left_row = xd->left_mbmi->mv[0].as_mv.row;
+      left_col = xd->left_mbmi->mv[0].as_mv.col;
+    }
+    // Average the available neighbor MVs (rounded); zero if none available.
+    if (above_mv_valid && left_mv_valid) {
+      al_mv_average_row = (above_row + left_row + 1) >> 1;
+      al_mv_average_col = (above_col + left_col + 1) >> 1;
+    } else if (above_mv_valid) {
+      al_mv_average_row = above_row;
+      al_mv_average_col = above_col;
+    } else if (left_mv_valid) {
+      al_mv_average_row = left_row;
+      al_mv_average_col = left_col;
+    } else {
+      al_mv_average_row = al_mv_average_col = 0;
+    }
+    row_diff = al_mv_average_row - mv_row;
+    col_diff = al_mv_average_col - mv_col;
+    // Large disagreement with neighbors: double the cost for big blocks,
+    // scale by 5/4 otherwise.
+    if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) {
+      if (bsize >= BLOCK_32X32)
+        this_rdc->rdcost = this_rdc->rdcost << 1;
+      else
+        this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
+    }
+  } else {
+    // Bias for speed >= 8 for low spatial variance.
+    if (speed >= 8 && spatial_variance < 150 &&
+        (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64))
+      this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
+  }
+}
+
+// Model rate/distortion for the chroma planes [start_plane, stop_plane] using
+// a Laplacian PDF model of the residual variance. Planes whose color
+// sensitivity flag is unset are skipped. Accumulates into *var_y / *sse_y and
+// fills this_rdc; sets this_rdc->skip when the modeled cost is no better than
+// skipping (distortion = total SSE).
+static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+                               MACROBLOCK *x, MACROBLOCKD *xd,
+                               RD_STATS *this_rdc, unsigned int *var_y,
+                               unsigned int *sse_y, int start_plane,
+                               int stop_plane) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  int i;
+  uint32_t tot_var = *var_y;
+  uint32_t tot_sse = *sse_y;
+
+  this_rdc->rate = 0;
+  this_rdc->dist = 0;
+  this_rdc->skip = 0;
+
+  for (i = start_plane; i <= stop_plane; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const uint32_t dc_quant = p->dequant_QTX[0];
+    const uint32_t ac_quant = p->dequant_QTX[1];
+    const BLOCK_SIZE bs = plane_bsize;
+    unsigned int var;
+    if (!x->color_sensitivity[i - 1]) continue;
+
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                             pd->dst.stride, &sse);
+    assert(sse >= var);
+    tot_var += var;
+    tot_sse += sse;
+
+    // DC component: modeled from (sse - var) against the DC quantizer.
+    av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                 dc_quant >> 3, &rate, &dist);
+
+    this_rdc->rate += rate >> 1;
+    this_rdc->dist += dist << 3;
+
+    // AC component: modeled from the variance against the AC quantizer.
+    av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
+                                 &rate, &dist);
+
+    this_rdc->rate += rate;
+    this_rdc->dist += dist << 4;
+  }
+
+  if (this_rdc->rate == 0) {
+    this_rdc->skip = 1;
+  }
+
+  if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
+      RDCOST(x->rdmult, 0, ((int64_t)tot_sse) << 4)) {
+    this_rdc->rate = 0;
+    // Widen before shifting, matching the RDCOST comparison above; tot_sse is
+    // 32 bits and (tot_sse << 4) could overflow before the int64_t store.
+    this_rdc->dist = (int64_t)tot_sse << 4;
+    this_rdc->skip = 1;
+  }
+
+  *var_y = tot_var;
+  *sse_y = tot_sse;
+}
+
+// Callback context for estimate_block_intra(), threaded through
+// av1_foreach_transformed_block_in_plane().
+struct estimate_block_intra_args {
+  AV1_COMP *cpi;  // Encoder instance.
+  MACROBLOCK *x;  // Current macroblock state.
+  PREDICTION_MODE mode;  // Intra mode being evaluated.
+  int skippable;  // In/out: remains 1 only if every sub-block is skippable.
+  RD_STATS *rdc;  // Accumulated rate/distortion for the mode.
+};
+
+// Per-transform-block callback: build the intra prediction for one transform
+// block, estimate its RD cost (block_yrd for luma, the UV model for chroma),
+// and accumulate into args->rdc. Temporarily repoints the src/dst buffers at
+// the sub-block and restores them before returning.
+static void estimate_block_intra(int plane, int block, int row, int col,
+                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                 void *arg) {
+  struct estimate_block_intra_args *const args = arg;
+  AV1_COMP *const cpi = args->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  // 64-bit strides force the offset arithmetic below into 64 bits.
+  const int64_t src_stride = p->src.stride;
+  const int64_t dst_stride = pd->dst.stride;
+  RD_STATS this_rdc;
+
+  (void)block;
+
+  // Offset the buffers to the top-left pixel of this transform block
+  // (row/col are in 4-pixel units).
+  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
+  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
+
+  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+
+  if (plane == 0) {
+    // INT64_MAX tells block_yrd no external SSE is available; tx_size is
+    // clamped because block_yrd only supports transforms up to 16x16.
+    int64_t this_sse = INT64_MAX;
+    block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, &this_sse, bsize_tx,
+              AOMMIN(tx_size, TX_16X16));
+  } else {
+    unsigned int var = 0;
+    unsigned int sse = 0;
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane,
+                       plane);
+  }
+
+  // Restore the plane buffers before accumulating the result.
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+  args->rdc->rate += this_rdc.rate;
+  args->rdc->dist += this_rdc.dist;
+}
+
+// Adaptive RD-threshold bookkeeping: decay the frequency factor of the mode
+// that won, inflate (up to a speed-feature-scaled cap) the factor of any
+// other mode, so rarely-winning modes get pruned more aggressively.
+static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize,
+                                           MV_REFERENCE_FRAME ref_frame,
+                                           THR_MODES best_mode_idx,
+                                           PREDICTION_MODE mode) {
+  const THR_MODES this_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+  int *const fact = &x->thresh_freq_fact[bsize][this_mode_idx];
+  if (this_mode_idx == best_mode_idx) {
+    *fact -= (*fact >> 4);
+    return;
+  }
+  const int cap = cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+  *fact = AOMMIN(*fact + RD_THRESH_INC, cap);
+}
+
+// For a 64x64 superblock: look up the precomputed low-temporal-variance flag
+// for the (sub)block at (mi_row, mi_col) in variance_low[], which is indexed
+// by block size and position within the superblock.
+static INLINE int get_force_skip_low_temp_var_small_sb(uint8_t *variance_low,
+                                                       int mi_row, int mi_col,
+                                                       BLOCK_SIZE bsize) {
+  // Relative indices of MB inside the superblock.
+  // NOTE(review): mi_x is derived from mi_row and mi_y from mi_col — the x/y
+  // naming looks transposed; verify against the variance_low[] fill order.
+  const int mi_x = mi_row & 0xF;
+  const int mi_y = mi_col & 0xF;
+  // Relative indices of 16x16 block inside the superblock.
+  const int i = mi_x >> 2;
+  const int j = mi_y >> 2;
+  int force_skip_low_temp_var = 0;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  switch (bsize) {
+    case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_64X32:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[1];
+      } else if (!mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[2];
+      }
+      break;
+    case BLOCK_32X64:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[3];
+      } else if (mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[4];
+      }
+      break;
+    case BLOCK_32X32:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[5];
+      } else if (mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[6];
+      } else if (!mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[7];
+      } else if (mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[8];
+      }
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+      break;
+    default: break;
+  }
+
+  return force_skip_low_temp_var;
+}
+
+// For a 128x128 superblock: look up the precomputed low-temporal-variance
+// flag for the (sub)block at (mi_row, mi_col). variance_low[] is laid out as
+// contiguous sections per block size (offsets 0, 1, 3, 5, 9, 17, 25, 41).
+static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row,
+                                              int mi_col, BLOCK_SIZE bsize) {
+  int force_skip_low_temp_var = 0;
+  int x, y;
+  x = (mi_col & 0x1F) >> 4;
+  // y = (mi_row & 0x1F) >> 4;
+  // const int idx64 = (y << 1) + x;
+  // The mask 0x17 clears bit 3, so the shift maps bit 4 of mi_row to the
+  // value 2; y + x therefore equals the (y << 1) + x of the commented-out
+  // formula. The same trick (masks 0xB and 0x5) is used for idx32 and idx16.
+  y = (mi_row & 0x17) >> 3;
+  const int idx64 = y + x;
+
+  x = (mi_col & 0xF) >> 3;
+  // y = (mi_row & 0xF) >> 3;
+  // const int idx32 = (y << 1) + x;
+  y = (mi_row & 0xB) >> 2;
+  const int idx32 = y + x;
+
+  x = (mi_col & 0x7) >> 2;
+  // y = (mi_row & 0x7) >> 2;
+  // const int idx16 = (y << 1) + x;
+  y = (mi_row & 0x5) >> 1;
+  const int idx16 = y + x;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  switch (bsize) {
+    case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_128X64:
+      assert((mi_col & 0x1F) == 0);
+      force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)];
+      break;
+    case BLOCK_64X128:
+      assert((mi_row & 0x1F) == 0);
+      force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)];
+      break;
+    case BLOCK_64X64:
+      // Location of this 64x64 block inside the 128x128 superblock
+      force_skip_low_temp_var = variance_low[5 + idx64];
+      break;
+    case BLOCK_64X32:
+      x = (mi_col & 0x1F) >> 4;
+      y = (mi_row & 0x1F) >> 3;
+      /*
+      .---------------.---------------.
+      | x=0,y=0,idx=0 | x=0,y=0,idx=2 |
+      :---------------+---------------:
+      | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+      :---------------+---------------:
+      | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+      :---------------+---------------:
+      | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+      '---------------'---------------'
+      */
+      const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
+      force_skip_low_temp_var = variance_low[9 + idx64x32];
+      break;
+    case BLOCK_32X64:
+      x = (mi_col & 0x1F) >> 3;
+      y = (mi_row & 0x1F) >> 4;
+      const int idx32x64 = (y << 2) + x;
+      force_skip_low_temp_var = variance_low[17 + idx32x64];
+      break;
+    case BLOCK_32X32:
+      force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var =
+          variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+      break;
+    default: break;
+  }
+  return force_skip_low_temp_var;
+}
+
+// Number of interpolation filters tried by search_filter_ref().
+#define FILTER_SEARCH_SIZE 2
+// Try each candidate interpolation filter, build the corresponding luma
+// prediction, model its RD cost (plus the switchable-filter signalling rate),
+// and keep the cheapest. With reuse_inter_pred, the best prediction is kept
+// in a scratch PRED_BUFFER; otherwise the predictor is rebuilt if a filter
+// other than the last one tried won.
+static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+                              int mi_row, int mi_col, PRED_BUFFER *tmp,
+                              BLOCK_SIZE bsize, int reuse_inter_pred,
+                              PRED_BUFFER **this_mode_pred, unsigned int *var_y,
+                              unsigned int *sse_y, int *this_early_term,
+                              int use_model_yrd_large, int64_t *sse_block_yrd) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const int bw = block_size_wide[bsize];
+  int pf_rate[FILTER_SEARCH_SIZE] = { 0 };
+  int64_t pf_dist[FILTER_SEARCH_SIZE] = { 0 };
+  unsigned int pf_var[FILTER_SEARCH_SIZE] = { 0 };
+  unsigned int pf_sse[FILTER_SEARCH_SIZE] = { 0 };
+  // NOTE(review): pf_sse_block_yrd is only ever zero-initialized here, so
+  // *sse_block_yrd is always written as 0 below — confirm whether a per-filter
+  // SSE from block_yrd was meant to be captured.
+  int64_t pf_sse_block_yrd[FILTER_SEARCH_SIZE] = { 0 };
+  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE] = { 0 };
+  PRED_BUFFER *current_pred = *this_mode_pred;
+  int skip_txfm[FILTER_SEARCH_SIZE] = { 0 };
+  int best_skip = 0;
+  int best_early_term = 0;
+  int64_t best_cost = INT64_MAX;
+  int best_filter_index = -1;
+  InterpFilter filters[FILTER_SEARCH_SIZE] = { EIGHTTAP_REGULAR,
+                                               EIGHTTAP_SMOOTH };
+  int i;
+  for (i = 0; i < FILTER_SEARCH_SIZE; ++i) {
+    int64_t cost;
+    InterpFilter filter = filters[i];
+    mi->interp_filters = av1_broadcast_interp_filter(filter);
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+    if (use_model_yrd_large)
+      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &pf_rate[i],
+                                &pf_dist[i], &pf_var[i], &pf_sse[i],
+                                this_early_term, 1);
+    else
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[i], &pf_dist[i],
+                        &skip_txfm[i], NULL, &pf_var[i], &pf_sse[i], 1);
+    // Include the cost of signalling this switchable filter.
+    pf_rate[i] += av1_get_switchable_rate(x, xd, cm->features.interp_filter);
+    cost = RDCOST(x->rdmult, pf_rate[i], pf_dist[i]);
+    pf_tx_size[i] = mi->tx_size;
+    if (cost < best_cost) {
+      best_filter_index = i;
+      best_cost = cost;
+      best_skip = skip_txfm[i];
+      best_early_term = *this_early_term;
+      if (reuse_inter_pred) {
+        // Park the current best prediction and redirect future builds into a
+        // fresh scratch buffer.
+        if (*this_mode_pred != current_pred) {
+          free_pred_buffer(*this_mode_pred);
+          *this_mode_pred = current_pred;
+        }
+        current_pred = &tmp[get_pred_buffer(tmp, 3)];
+        pd->dst.buf = current_pred->data;
+        pd->dst.stride = bw;
+      }
+    }
+  }
+  assert(best_filter_index >= 0 && best_filter_index < FILTER_SEARCH_SIZE);
+  if (reuse_inter_pred && *this_mode_pred != current_pred)
+    free_pred_buffer(current_pred);
+
+  // Commit the winning filter and its associated stats.
+  mi->interp_filters = av1_broadcast_interp_filter(filters[best_filter_index]);
+  mi->tx_size = pf_tx_size[best_filter_index];
+  this_rdc->rate = pf_rate[best_filter_index];
+  this_rdc->dist = pf_dist[best_filter_index];
+  *var_y = pf_var[best_filter_index];
+  *sse_y = pf_sse[best_filter_index];
+  *sse_block_yrd = pf_sse_block_yrd[best_filter_index];
+  this_rdc->skip = (best_skip || best_early_term);
+  *this_early_term = best_early_term;
+  if (reuse_inter_pred) {
+    pd->dst.buf = (*this_mode_pred)->data;
+    pd->dst.stride = (*this_mode_pred)->stride;
+  } else if (best_filter_index < FILTER_SEARCH_SIZE - 1) {
+    // The last filter tried left its prediction in dst; rebuild only when a
+    // different filter won.
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+  }
+}
+
+// Compile-time switch: set to 1 to collect per-block-size / per-mode search
+// timing statistics during pick-mode.
+#define COLLECT_PICK_MODE_STAT 0
+
+#if COLLECT_PICK_MODE_STAT
+// Aggregated pick-mode search statistics, indexed by block size and mode.
+typedef struct _mode_search_stat {
+  int32_t num_blocks[BLOCK_SIZES];
+  int64_t avg_block_times[BLOCK_SIZES];
+  int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT];
+  int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT];
+  int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT];
+  int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT];
+  struct aom_usec_timer timer1;
+  struct aom_usec_timer timer2;
+} mode_search_stat;
+#endif  // COLLECT_PICK_MODE_STAT
+
+// Build the full luma intra prediction for `mode` over the whole block,
+// iterating transform-sized sub-blocks and predicting each in place in the
+// dst buffer (dst is used as both reference and output).
+static void compute_intra_yprediction(const AV1_COMMON *cm,
+                                      PREDICTION_MODE mode, BLOCK_SIZE bsize,
+                                      MACROBLOCK *x, MACROBLOCKD *xd) {
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int plane = 0;
+  int row, col;
+  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  // transform size varies per plane, look it up in a common way.
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
+    // Skip visiting the sub blocks that are wholly within the UMV.
+    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+      // int64_t casts keep the pixel-offset arithmetic in 64 bits.
+      p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+      pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+      av1_predict_intra_block(cm, xd, block_size_wide[bsize],
+                              block_size_high[bsize], tx_size, mode, 0, 0,
+                              FILTER_INTRA_MODES, pd->dst.buf, dst_stride,
+                              pd->dst.buf, dst_stride, 0, 0, plane);
+    }
+  }
+  // Restore the plane buffer pointers.
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+}
+
+// Fast (non-RD) intra mode decision: evaluate the first four entries of
+// intra_mode_list via estimate_block_intra(), pick the one with the lowest
+// RD cost, write it into the mode info and *rd_cost, and snapshot the
+// coding context.
+void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  RD_STATS this_rdc, best_rdc;
+  // args.rdc starts NULL; it is pointed at this_rdc before each mode's scan.
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  const TX_SIZE intra_tx_size =
+      AOMMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+  int *bmode_costs;
+  const MB_MODE_INFO *above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *left_mi = xd->left_mbmi;
+  // Mode signalling cost is conditioned on the above/left neighbor modes.
+  const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+  bmode_costs = x->y_mode_costs[A][L];
+
+  av1_invalid_rd_stats(&best_rdc);
+  av1_invalid_rd_stats(&this_rdc);
+
+  init_mbmi(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm);
+  mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV;
+
+  // Change the limit of this loop to add other intra prediction
+  // mode tests.
+  for (int i = 0; i < 4; ++i) {
+    PREDICTION_MODE this_mode = intra_mode_list[i];
+    this_rdc.dist = this_rdc.rate = 0;
+    args.mode = this_mode;
+    args.skippable = 1;
+    args.rdc = &this_rdc;
+    mi->tx_size = intra_tx_size;
+    // Accumulate RD over every transform block of the luma plane.
+    av1_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
+                                           &args);
+    // Add the skip-flag signalling cost (skip=1 if all blocks were zero).
+    if (args.skippable) {
+      this_rdc.rate = av1_cost_symbol(av1_get_skip_cdf(xd)[1]);
+    } else {
+      this_rdc.rate += av1_cost_symbol(av1_get_skip_cdf(xd)[0]);
+    }
+    this_rdc.rate += bmode_costs[this_mode];
+    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = this_rdc;
+      mi->mode = this_mode;
+    }
+  }
+
+  *rd_cost = best_rdc;
+
+#if CONFIG_INTERNAL_STATS
+  store_coding_context(x, ctx, mi->mode);
+#else
+  store_coding_context(x, ctx);
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+                                  MACROBLOCK *x, RD_STATS *rd_cost,
+                                  BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+
+  BEST_PICKMODE best_pickmode;
+  int inter_mode_mask[BLOCK_SIZES];
+#if COLLECT_PICK_MODE_STAT
+  static mode_search_stat ms_stat;
+#endif
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
+  int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+  uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+  struct buf_2d yv12_mb[8][MAX_MB_PLANE];
+  static const int flag_list[8] = { 0, AOM_LAST_FLAG, 0, 0, AOM_GOLD_FLAG, 0,
+                                    0, AOM_ALT_FLAG };
+  RD_STATS this_rdc, best_rdc;
+  // var_y and sse_y are saved to be used in skipping checking
+  unsigned int sse_y = UINT_MAX;
+  unsigned int var_y = UINT_MAX;
+  const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
+  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+  InterpFilter filter_ref;
+  int ref_frame_skip_mask = 0;
+  int best_pred_sad = INT_MAX;
+  int best_early_term = 0;
+  unsigned int ref_costs_single[REF_FRAMES],
+      ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  int force_skip_low_temp_var = 0;
+  int skip_ref_find_pred[8] = { 0 };
+  unsigned int sse_zeromv_norm = UINT_MAX;
+  const unsigned int thresh_skip_golden = 500;
+  int gf_temporal_ref = 0;
+  const struct segmentation *const seg = &cm->seg;
+  int num_inter_modes = RT_INTER_MODES;
+  unsigned char segment_id = mi->segment_id;
+  PRED_BUFFER tmp[4];
+  DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]);
+  PRED_BUFFER *this_mode_pred = NULL;
+  const int reuse_inter_pred =
+      cpi->sf.rt_sf.reuse_inter_pred_nonrd && cm->seq_params.bit_depth == 8;
+  const int bh = block_size_high[bsize];
+  const int bw = block_size_wide[bsize];
+  const int pixels_in_block = bh * bw;
+  struct buf_2d orig_dst = pd->dst;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+#if COLLECT_PICK_MODE_STAT
+  aom_usec_timer_start(&ms_stat.timer2);
+#endif
+  int intra_cost_penalty = av1_get_intra_cost_penalty(
+      quant_params->base_qindex, quant_params->y_dc_delta_q,
+      cm->seq_params.bit_depth);
+  int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+  const int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
+  int use_modeled_non_rd_cost = 0;
+  int enable_filter_search = 0;
+  InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
+  int64_t thresh_sad_pred = INT64_MAX;
+
+  (void)best_rd_so_far;
+
+  init_best_pickmode(&best_pickmode);
+
+  for (int i = 0; i < BLOCK_SIZES; ++i) inter_mode_mask[i] = INTER_ALL;
+
+  // TODO(kyslov) Move this to Speed Features
+  inter_mode_mask[BLOCK_128X128] = INTER_NEAREST_NEAR;
+
+  struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
+  struct scale_factors *const sf_golden =
+      get_ref_scale_factors(cm, GOLDEN_FRAME);
+  gf_temporal_ref = 1;
+  // For temporal long term prediction, check that the golden reference
+  // is same scale as last reference, otherwise disable.
+  if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) ||
+      (sf_last->y_scale_fp != sf_golden->y_scale_fp)) {
+    gf_temporal_ref = 0;
+  }
+
+  av1_collect_neighbors_ref_counts(xd);
+
+  estimate_single_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single);
+  if (cpi->sf.rt_sf.use_comp_ref_nonrd)
+    estimate_comp_ref_frame_costs(cm, xd, x, segment_id, ref_costs_comp);
+
+  memset(&mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);
+  if (reuse_inter_pred) {
+    for (int i = 0; i < 3; i++) {
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+      tmp[i].stride = bw;
+      tmp[i].in_use = 0;
+    }
+    tmp[3].data = pd->dst.buf;
+    tmp[3].stride = pd->dst.stride;
+    tmp[3].in_use = 0;
+  }
+
+  x->force_skip = 0;
+
+  // Instead of using av1_get_pred_context_switchable_interp(xd) to assign
+  // filter_ref, we use a less strict condition on assigning filter_ref.
+  // This is to reduce the probability of entering the flow of not assigning
+  // filter_ref and then skip filter search.
+  filter_ref = cm->features.interp_filter;
+
+  // initialize mode decisions
+  av1_invalid_rd_stats(&best_rdc);
+  av1_invalid_rd_stats(&this_rdc);
+  av1_invalid_rd_stats(rd_cost);
+  mi->sb_type = bsize;
+  mi->ref_frame[0] = NONE_FRAME;
+  mi->ref_frame[1] = NONE_FRAME;
+
+  usable_ref_frame =
+      cpi->sf.rt_sf.use_nonrd_altref_frame ? ALTREF_FRAME : GOLDEN_FRAME;
+
+  if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
+    skip_ref_find_pred[GOLDEN_FRAME] = 1;
+    if (!cpi->sf.rt_sf.use_nonrd_altref_frame) usable_ref_frame = LAST_FRAME;
+  }
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
+      x->nonrd_prune_ref_frame_search) {
+    if (is_small_sb)
+      force_skip_low_temp_var = get_force_skip_low_temp_var_small_sb(
+          &x->variance_low[0], mi_row, mi_col, bsize);
+    else
+      force_skip_low_temp_var = get_force_skip_low_temp_var(
+          &x->variance_low[0], mi_row, mi_col, bsize);
+    // If force_skip_low_temp_var is set, skip golden reference.
+    if (force_skip_low_temp_var) {
+      usable_ref_frame = LAST_FRAME;
+    }
+  }
+
+  // If the segment reference frame feature is enabled and it's set to GOLDEN
+  // reference, then make sure we don't skip checking GOLDEN, this is to
+  // prevent possibility of not picking any mode.
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+    usable_ref_frame = GOLDEN_FRAME;
+    skip_ref_find_pred[GOLDEN_FRAME] = 0;
+  }
+
+  for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME;
+       ref_frame_iter <= usable_ref_frame; ++ref_frame_iter) {
+    // Skip find_predictor if the reference frame is not in the
+    // ref_frame_flags (i.e., not used as a reference for this frame).
+    skip_ref_find_pred[ref_frame_iter] =
+        !(cpi->ref_frame_flags & flag_list[ref_frame_iter]);
+    if (!skip_ref_find_pred[ref_frame_iter]) {
+      find_predictors(cpi, x, ref_frame_iter, frame_mv, &ref_frame_skip_mask,
+                      flag_list, tile_data, yv12_mb, bsize,
+                      force_skip_low_temp_var);
+    }
+  }
+
+  thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
+  // Increase threshold for less aggressive pruning.
+  if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search == 1)
+    thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
+
+  const int large_block = bsize >= BLOCK_32X32;
+  const int use_model_yrd_large =
+      cpi->oxcf.rc_mode == AOM_CBR && large_block &&
+      !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+      quant_params->base_qindex && cm->seq_params.bit_depth == 8;
+
+#if COLLECT_PICK_MODE_STAT
+  ms_stat.num_blocks[bsize]++;
+#endif
+  init_mbmi(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
+  mi->tx_size =
+      AOMMIN(AOMMIN(max_txsize_lookup[bsize],
+                    tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
+             TX_16X16);
+
+  // TODO(marpan): Look into reducing these conditions. For now constrain
+  // it to avoid significant bdrate loss.
+  if (cpi->sf.rt_sf.use_modeled_non_rd_cost &&
+      quant_params->base_qindex > 120 && x->source_variance > 100 &&
+      bsize <= BLOCK_16X16 && x->content_state_sb != kLowVarHighSumdiff &&
+      x->content_state_sb != kHighSad)
+    use_modeled_non_rd_cost = 1;
+
+  if (cpi->sf.rt_sf.use_nonrd_filter_search) {
+    enable_filter_search = 1;
+    if (cpi->sf.interp_sf.cb_pred_filter_search) {
+      const int bsl = mi_size_wide_log2[bsize];
+      enable_filter_search =
+          (((mi_row + mi_col) >> bsl) +
+           get_chessboard_index(cm->current_frame.frame_number)) &
+          0x1;
+    }
+    if (x->source_variance <=
+        cpi->sf.interp_sf.disable_filter_search_var_thresh)
+      enable_filter_search = 0;
+  }
+
+  for (int idx = 0; idx < num_inter_modes; ++idx) {
+    int rate_mv = 0;
+    int mode_rd_thresh;
+    int mode_index;
+    int64_t this_sse;
+    int is_skippable;
+    int this_early_term = 0;
+    int skip_this_mv = 0;
+    int comp_pred = 0;
+    int force_mv_inter_layer = 0;
+    PREDICTION_MODE this_mode;
+    MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+    second_ref_frame = NONE_FRAME;
+
+    this_mode = ref_mode_set[idx].pred_mode;
+    ref_frame = ref_mode_set[idx].ref_frame;
+
+#if COLLECT_PICK_MODE_STAT
+    aom_usec_timer_start(&ms_stat.timer1);
+    ms_stat.num_searches[bsize][this_mode]++;
+#endif
+    mi->mode = this_mode;
+    mi->ref_frame[0] = ref_frame;
+
+    if (ref_frame > usable_ref_frame) continue;
+    if (skip_ref_find_pred[ref_frame]) continue;
+
+    // Skip non-zero motion for SVC if skip_nonzeromv_ref is set.
+    if (cpi->use_svc && frame_mv[this_mode][ref_frame].as_int != 0) {
+      if (ref_frame == LAST_FRAME && cpi->svc.skip_nonzeromv_last)
+        continue;
+      else if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_nonzeromv_gf)
+        continue;
+    }
+
+    // If the segment reference frame feature is enabled then do nothing if the
+    // current ref frame is not allowed.
+    if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+      continue;
+
+    if (ref_frame != LAST_FRAME && cpi->oxcf.rc_mode == AOM_CBR &&
+        sse_zeromv_norm < thresh_skip_golden && this_mode == NEWMV)
+      continue;
+
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
+
+    if (!(inter_mode_mask[bsize] & (1 << this_mode))) continue;
+
+    // Skip testing non-LAST if this flag is set.
+    if (x->nonrd_prune_ref_frame_search) {
+      if (x->nonrd_prune_ref_frame_search > 1 && ref_frame != LAST_FRAME &&
+          (bsize > BLOCK_64X64 || (bsize > BLOCK_16X16 && this_mode == NEWMV)))
+        continue;
+
+      if (ref_frame != LAST_FRAME && this_mode == NEARMV) continue;
+    }
+
+    // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var
+    // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+    // later.
+    if (!force_mv_inter_layer && force_skip_low_temp_var &&
+        ref_frame != LAST_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) {
+      continue;
+    }
+
+#if 0
+        if (x->content_state_sb != kVeryHighSad &&
+        (cpi->sf.short_circuit_low_temp_var >= 2 ||
+        (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64))
+        && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode ==
+            NEWMV)  {
+          continue;
+        }
+#endif
+
+    // Disable this drop out case if the ref frame segment level feature is
+    // enabled for this segment. This is to prevent the possibility that we
+    // end up unable to pick any mode.
+    if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
+      // Check for skipping GOLDEN and ALTREF based pred_mv_sad.
+      if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0 &&
+          x->pred_mv_sad[ref_frame] != INT_MAX && ref_frame != LAST_FRAME) {
+        if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred)
+          ref_frame_skip_mask |= (1 << ref_frame);
+      }
+      if (ref_frame_skip_mask & (1 << ref_frame)) continue;
+    }
+
+    // Select prediction reference frames.
+    for (int i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+    }
+
+    mi->ref_frame[0] = ref_frame;
+    mi->ref_frame[1] = second_ref_frame;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+    mode_rd_thresh = best_pickmode.best_mode_skip_txfm
+                         ? rd_threshes[mode_index] << 1
+                         : rd_threshes[mode_index];
+
+    // Increase mode_rd_thresh value for non-LAST for improved encoding
+    // speed
+    if (ref_frame != LAST_FRAME) {
+      mode_rd_thresh = mode_rd_thresh << 1;
+      if (ref_frame == GOLDEN_FRAME && cpi->rc.frames_since_golden > 4)
+        mode_rd_thresh = mode_rd_thresh << 1;
+    }
+
+    if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                            rd_thresh_freq_fact[mode_index]))
+      if (frame_mv[this_mode][ref_frame].as_int != 0) continue;
+
+    if (this_mode == NEWMV && !force_mv_inter_layer) {
+      if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize,
+                        mi_row, mi_col, best_pred_sad, &rate_mv, &best_rdc))
+        continue;
+    }
+
+    for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
+         inter_mv_mode++) {
+      if (inter_mv_mode == this_mode || comp_pred) continue;
+      if (mode_checked[inter_mv_mode][ref_frame] &&
+          frame_mv[this_mode][ref_frame].as_int ==
+              frame_mv[inter_mv_mode][ref_frame].as_int) {
+        skip_this_mv = 1;
+        break;
+      }
+    }
+
+    if (skip_this_mv) continue;
+
+    mi->mode = this_mode;
+    mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+    mi->mv[1].as_int = 0;
+    if (reuse_inter_pred) {
+      if (!this_mode_pred) {
+        this_mode_pred = &tmp[3];
+      } else {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+        pd->dst.buf = this_mode_pred->data;
+        pd->dst.stride = bw;
+      }
+    }
+#if COLLECT_PICK_MODE_STAT
+    ms_stat.num_nonskipped_searches[bsize][this_mode]++;
+#endif
+    if (enable_filter_search &&
+        ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) &&
+        (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) {
+      search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
+                        reuse_inter_pred, &this_mode_pred, &var_y, &sse_y,
+                        &this_early_term, use_model_yrd_large, &this_sse);
+    } else {
+      mi->interp_filters =
+          (filter_ref == SWITCHABLE)
+              ? av1_broadcast_interp_filter(default_interp_filter)
+              : av1_broadcast_interp_filter(filter_ref);
+      av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+      if (use_model_yrd_large) {
+        model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, NULL, NULL,
+                                  &var_y, &sse_y, &this_early_term,
+                                  use_modeled_non_rd_cost);
+      } else {
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                          &this_rdc.skip, NULL, &var_y, &sse_y,
+                          use_modeled_non_rd_cost);
+      }
+    }
+
+    if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) {
+      sse_zeromv_norm =
+          sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+    }
+
+    const int skip_ctx = av1_get_skip_context(xd);
+    const int skip_cost = x->skip_cost[skip_ctx][1];
+    const int no_skip_cost = x->skip_cost[skip_ctx][0];
+    if (!this_early_term) {
+      if (use_modeled_non_rd_cost) {
+        if (this_rdc.skip) {
+          this_rdc.rate = skip_cost;
+        } else {
+          this_rdc.rate += no_skip_cost;
+        }
+      } else {
+        this_sse = (int64_t)sse_y;
+        block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, &this_sse,
+                  bsize, mi->tx_size);
+        if (this_rdc.skip) {
+          this_rdc.rate = skip_cost;
+        } else {
+          if (RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >=
+              RDCOST(x->rdmult, 0,
+                     this_sse)) {  // this_sse already multiplied by 16 in
+                                   // block_yrd
+            this_rdc.skip = 1;
+            this_rdc.rate = skip_cost;
+            this_rdc.dist = this_sse;
+          } else {
+            this_rdc.rate += no_skip_cost;
+          }
+        }
+      }
+    } else {
+      this_rdc.skip = 1;
+      this_rdc.rate = skip_cost;
+      this_rdc.dist = sse_y << 4;
+    }
+
+    if (!this_early_term &&
+        (x->color_sensitivity[0] || x->color_sensitivity[1])) {
+      RD_STATS rdc_uv;
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(
+          bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+      if (x->color_sensitivity[0]) {
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                      AOM_PLANE_U, AOM_PLANE_U);
+      }
+      if (x->color_sensitivity[1]) {
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                      AOM_PLANE_V, AOM_PLANE_V);
+      }
+      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
+      this_rdc.rate += rdc_uv.rate;
+      this_rdc.dist += rdc_uv.dist;
+      this_rdc.skip = this_rdc.skip && rdc_uv.skip;
+    }
+
+    // TODO(kyslov) account for UV prediction cost
+    this_rdc.rate += rate_mv;
+    const int16_t mode_ctx =
+        av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+    this_rdc.rate += cost_mv_ref(x, this_mode, mode_ctx);
+
+    this_rdc.rate += ref_costs_single[ref_frame];
+
+    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+    if (cpi->oxcf.rc_mode == AOM_CBR) {
+      newmv_diff_bias(xd, this_mode, &this_rdc, bsize,
+                      frame_mv[this_mode][ref_frame].as_mv.row,
+                      frame_mv[this_mode][ref_frame].as_mv.col, cpi->speed,
+                      x->source_variance);
+    }
+
+    mode_checked[this_mode][ref_frame] = 1;
+#if COLLECT_PICK_MODE_STAT
+    aom_usec_timer_mark(&ms_stat.timer1);
+    ms_stat.nonskipped_search_times[bsize][this_mode] +=
+        aom_usec_timer_elapsed(&ms_stat.timer1);
+#endif
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = this_rdc;
+      best_early_term = this_early_term;
+      best_pickmode.best_mode = this_mode;
+      best_pickmode.best_pred_filter = mi->interp_filters;
+      best_pickmode.best_tx_size = mi->tx_size;
+      best_pickmode.best_ref_frame = ref_frame;
+      best_pickmode.best_mode_skip_txfm = this_rdc.skip;
+      best_pickmode.best_second_ref_frame = second_ref_frame;
+      if (reuse_inter_pred) {
+        free_pred_buffer(best_pickmode.best_pred);
+        best_pickmode.best_pred = this_mode_pred;
+      }
+    } else {
+      if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
+    }
+    if (best_early_term && idx > 0) {
+      x->force_skip = 1;
+      break;
+    }
+  }
+
+  mi->mode = best_pickmode.best_mode;
+  mi->interp_filters = best_pickmode.best_pred_filter;
+  mi->tx_size = best_pickmode.best_tx_size;
+  memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
+  mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->mv[0].as_int =
+      frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+  x->force_skip = best_rdc.skip;
+
+  // Perform intra prediction search, if the best SAD is above a certain
+  // threshold.
+  mi->angle_delta[PLANE_TYPE_Y] = 0;
+  mi->angle_delta[PLANE_TYPE_UV] = 0;
+  mi->filter_intra_mode_info.use_filter_intra = 0;
+
+  uint32_t spatial_var_thresh = 50;
+  int motion_thresh = 32;
+  // Adjust thresholds to make intra mode likely tested if the other
+  // references (golden, alt) are skipped/not checked.
+  if (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
+      cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0) {
+    spatial_var_thresh = 150;
+    motion_thresh = 0;
+  }
+  int do_early_exit_rdthresh = 1;
+  // Some adjustments to checking intra mode based on source variance.
+  if (x->source_variance < spatial_var_thresh) {
+    // If the best inter mode is large motion or non-LAST ref reduce intra cost
+    // penalty, so intra mode is more likely tested.
+    if (best_pickmode.best_ref_frame != LAST_FRAME ||
+        abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+        abs(mi->mv[0].as_mv.col) >= motion_thresh) {
+      intra_cost_penalty = intra_cost_penalty >> 2;
+      inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+      do_early_exit_rdthresh = 0;
+    }
+    // For big blocks worth checking intra (since only DC will be checked),
+    // even if best_early_term is set.
+    if (bsize >= BLOCK_32X32) best_early_term = 0;
+  }
+
+  if (best_rdc.rdcost == INT64_MAX ||
+      (perform_intra_pred && !best_early_term &&
+       best_rdc.rdcost > inter_mode_thresh &&
+       bsize <= cpi->sf.part_sf.max_intra_bsize)) {
+    int64_t this_sse = INT64_MAX;
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
+    TX_SIZE intra_tx_size =
+        AOMMIN(AOMMIN(max_txsize_lookup[bsize],
+                      tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
+               TX_16X16);
+
+    if (reuse_inter_pred && best_pred != NULL) {
+      if (best_pred->data == orig_dst.buf) {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+        aom_convolve_copy(best_pred->data, best_pred->stride,
+                          this_mode_pred->data, this_mode_pred->stride, 0, 0, 0,
+                          0, bw, bh);
+        best_pickmode.best_pred = this_mode_pred;
+      }
+    }
+    pd->dst = orig_dst;
+
+    for (int i = 0; i < 4; ++i) {
+      const PREDICTION_MODE this_mode = intra_mode_list[i];
+      const THR_MODES mode_index =
+          mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+      const int mode_rd_thresh = rd_threshes[mode_index];
+
+      // Only check DC for blocks >= 32X32.
+      if (this_mode > 0 && bsize >= BLOCK_32X32) continue;
+
+      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                              rd_thresh_freq_fact[mode_index]) &&
+          (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
+        continue;
+      }
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(
+          bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+
+      mi->mode = this_mode;
+      mi->ref_frame[0] = INTRA_FRAME;
+      mi->ref_frame[1] = NONE_FRAME;
+
+      this_rdc.dist = this_rdc.rate = 0;
+      args.mode = this_mode;
+      args.skippable = 1;
+      args.rdc = &this_rdc;
+      mi->tx_size = intra_tx_size;
+      compute_intra_yprediction(cm, this_mode, bsize, x, xd);
+      // Look into selecting tx_size here, based on prediction residual.
+      if (use_modeled_non_rd_cost)
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                          &this_rdc.skip, NULL, &var_y, &sse_y, 1);
+      else
+        block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, &this_sse,
+                  bsize, mi->tx_size);
+      // TODO(kyslov@) Need to account for skippable
+      if (x->color_sensitivity[0]) {
+        av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
+                                               estimate_block_intra, &args);
+      }
+      if (x->color_sensitivity[1]) {
+        av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2,
+                                               estimate_block_intra, &args);
+      }
+
+      int mode_cost = 0;
+      if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
+        mode_cost += x->angle_delta_cost[this_mode - V_PRED]
+                                        [MAX_ANGLE_DELTA +
+                                         mi->angle_delta[PLANE_TYPE_Y]];
+      }
+      if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+        mode_cost += x->filter_intra_cost[bsize][0];
+      }
+      this_rdc.rate += ref_costs_single[INTRA_FRAME];
+      this_rdc.rate += intra_cost_penalty;
+      this_rdc.rate += mode_cost;
+      this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = this_rdc;
+        best_pickmode.best_mode = this_mode;
+        best_pickmode.best_intra_tx_size = mi->tx_size;
+        best_pickmode.best_ref_frame = INTRA_FRAME;
+        best_pickmode.best_second_ref_frame = NONE_FRAME;
+        mi->uv_mode = this_mode;
+        mi->mv[0].as_int = INVALID_MV;
+        mi->mv[1].as_int = INVALID_MV;
+      }
+    }
+
+    // Reset mb_mode_info to the best inter mode.
+    if (best_pickmode.best_ref_frame != INTRA_FRAME) {
+      mi->tx_size = best_pickmode.best_tx_size;
+    } else {
+      mi->tx_size = best_pickmode.best_intra_tx_size;
+    }
+  }
+
+  pd->dst = orig_dst;
+  mi->mode = best_pickmode.best_mode;
+  mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+
+  if (!is_inter_block(mi)) {
+    mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
+  }
+
+  if (reuse_inter_pred && best_pickmode.best_pred != NULL) {
+    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
+    if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
+      aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
+                        pd->dst.stride, 0, 0, 0, 0, bw, bh);
+    }
+  }
+  if (cpi->sf.inter_sf.adaptive_rd_thresh) {
+    THR_MODES best_mode_idx =
+        mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)];
+    if (best_pickmode.best_ref_frame == INTRA_FRAME) {
+      // Only consider the modes that are included in the intra_mode_list.
+      int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
+      for (int i = 0; i < intra_modes; i++) {
+        update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx,
+                                intra_mode_list[i]);
+      }
+    } else {
+      for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+        PREDICTION_MODE this_mode;
+        if (best_pickmode.best_ref_frame != ref_frame) continue;
+        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          update_thresh_freq_fact(cpi, x, bsize, ref_frame, best_mode_idx,
+                                  this_mode);
+        }
+      }
+    }
+  }
+
+#if CONFIG_INTERNAL_STATS
+  store_coding_context(x, ctx, mi->mode);
+#else
+  store_coding_context(x, ctx);
+#endif  // CONFIG_INTERNAL_STATS
+#if COLLECT_PICK_MODE_STAT
+  aom_usec_timer_mark(&ms_stat.timer2);
+  ms_stat.avg_block_times[bsize] += aom_usec_timer_elapsed(&ms_stat.timer2);
+  //
+  if ((mi_row + mi_size_high[bsize] >= (cpi->common.mi_params.mi_rows)) &&
+      (mi_col + mi_size_wide[bsize] >= (cpi->common.mi_params.mi_cols))) {
+    int i, j;
+    PREDICTION_MODE used_modes[3] = { NEARESTMV, NEARMV, NEWMV };
+    BLOCK_SIZE bss[5] = { BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+                          BLOCK_128X128 };
+    int64_t total_time = 0l;
+    int32_t total_blocks = 0;
+
+    printf("\n");
+    for (i = 0; i < 5; i++) {
+      printf("BS(%d) Num %d, Avg_time %f: ", bss[i], ms_stat.num_blocks[bss[i]],
+             ms_stat.num_blocks[bss[i]] > 0
+                 ? (float)ms_stat.avg_block_times[bss[i]] /
+                       ms_stat.num_blocks[bss[i]]
+                 : 0);
+      total_time += ms_stat.avg_block_times[bss[i]];
+      total_blocks += ms_stat.num_blocks[bss[i]];
+      for (j = 0; j < 3; j++) {
+        printf("Mode %d, %d/%d tps %f ", used_modes[j],
+               ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]],
+               ms_stat.num_searches[bss[i]][used_modes[j]],
+               ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]] > 0
+                   ? (float)ms_stat
+                             .nonskipped_search_times[bss[i]][used_modes[j]] /
+                         ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]]
+                   : 0l);
+      }
+      printf("\n");
+    }
+    printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
+  }
+  //
+#endif  // COLLECT_PICK_MODE_STAT
+  *rd_cost = best_rdc;
+}
diff --git a/libaom/av1/encoder/partition_cnn_weights.h b/libaom/av1/encoder/partition_cnn_weights.h
new file mode 100644
index 0000000..504038c
--- /dev/null
+++ b/libaom/av1/encoder/partition_cnn_weights.h
@@ -0,0 +1,2139 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/ml.h"
+
+#define CNN_BRANCH_0_OUT_CH 20  // output channels produced by each CNN branch
+#define CNN_BRANCH_1_OUT_CH 4
+#define CNN_BRANCH_2_OUT_CH 20
+#define CNN_BRANCH_3_OUT_CH 20
+#define CNN_TOT_OUT_CH                                                      \
+  (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \
+    (CNN_BRANCH_3_OUT_CH)))
+#define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH)  // per-branch size = channels x spatial grid (1x1, 2x2, 4x4, 8x8)
+#define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2)
+#define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4)
+#define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8)
+#define CNN_OUT_BUF_SIZE                                \
+  (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \
+    (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE)))
+
+#define NUM_DNN_BRANCHES 4  // one DNN head per CNN branch
+#define NUM_CNN_LAYERS 5
+#define BRANCH_0_NUM_DNN_LAYERS 2
+#define BRANCH_1_NUM_DNN_LAYERS 2
+#define BRANCH_2_NUM_DNN_LAYERS 2
+#define BRANCH_3_NUM_DNN_LAYERS 2
+#define CNN_LAYER_0_HEIGHT 5  // layer 0: 5x5 conv, 1 -> 20 channels, stride 4
+#define CNN_LAYER_0_WIDTH 5
+#define CNN_LAYER_0_IN_CH 1
+#define CNN_LAYER_0_OUT_CH 20
+#define CNN_LAYER_0_HORZ_STRIDE 4
+#define CNN_LAYER_0_VERT_STRIDE 4
+#define CNN_LAYER_1_HEIGHT 2  // layer 1: 2x2 conv, 20 -> 20 channels, stride 2
+#define CNN_LAYER_1_WIDTH 2
+#define CNN_LAYER_1_IN_CH 20
+#define CNN_LAYER_1_OUT_CH 20
+#define CNN_LAYER_1_HORZ_STRIDE 2
+#define CNN_LAYER_1_VERT_STRIDE 2
+#define CNN_LAYER_2_HEIGHT 2  // layer 2: 2x2 conv, 20 -> 20 channels, stride 2
+#define CNN_LAYER_2_WIDTH 2
+#define CNN_LAYER_2_IN_CH 20
+#define CNN_LAYER_2_OUT_CH 20
+#define CNN_LAYER_2_HORZ_STRIDE 2
+#define CNN_LAYER_2_VERT_STRIDE 2
+#define CNN_LAYER_3_HEIGHT 2  // layer 3: 2x2 conv, 20 -> 4 channels, stride 2
+#define CNN_LAYER_3_WIDTH 2
+#define CNN_LAYER_3_IN_CH 20
+#define CNN_LAYER_3_OUT_CH 4
+#define CNN_LAYER_3_HORZ_STRIDE 2
+#define CNN_LAYER_3_VERT_STRIDE 2
+#define CNN_LAYER_4_HEIGHT 2  // layer 4: 2x2 conv, 4 -> 20 channels, stride 2
+#define CNN_LAYER_4_WIDTH 2
+#define CNN_LAYER_4_IN_CH 4
+#define CNN_LAYER_4_OUT_CH 20
+#define CNN_LAYER_4_HORZ_STRIDE 2
+#define CNN_LAYER_4_VERT_STRIDE 2
+#define BRANCH_0_NUM_DNN_FEATURES 37  // branch 0 DNN: 37 features -> 16 -> 24 -> 1 logit
+#define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_0_NUM_LOGITS 1
+#define BRANCH_1_NUM_DNN_FEATURES 25  // branch 1 DNN: 25 features -> 16 -> 24 -> 1 logit
+#define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_1_NUM_LOGITS 1
+#define BRANCH_2_NUM_DNN_FEATURES 25  // branch 2 DNN: 25 features -> 16 -> 24 -> 1 logit
+#define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_2_NUM_LOGITS 1
+#define BRANCH_3_NUM_DNN_FEATURES 41  // branch 3 DNN: 41 features -> 16 -> 24 -> 1 logit
+#define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_3_NUM_LOGITS 1
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = {  // 500 trained weights (= 5*5*1*20, CNN_LAYER_0_*); machine-generated, do not hand-edit
+  0.131894f,    -0.593536f,  -0.212935f,  -0.00220011f, -0.396949f,
+  0.287753f,    -0.91875f,   -0.0095057f, 0.804197f,    -0.395239f,
+  0.516604f,    1.16439f,    0.445784f,   -0.163349f,   0.746488f,
+  -0.33891f,    -0.562652f,  0.481403f,   0.755378f,    -0.200753f,
+  0.0784307f,   0.105657f,   0.0205673f,  -0.524089f,   -0.476146f,
+  -0.161206f,   -0.65079f,   0.137474f,   0.28584f,     0.508768f,
+  -0.643386f,   0.227068f,   -0.899507f,  -0.413382f,   0.631466f,
+  0.398203f,    -0.544392f,  0.825155f,   0.671847f,    -0.249779f,
+  0.323121f,    0.125357f,   -0.719564f,  -0.0714854f,  -0.168472f,
+  -0.213246f,   -0.674525f,  0.330148f,   -0.138414f,   0.20462f,
+  -0.518571f,   -0.15091f,   -0.605116f,  -0.448732f,   -0.475599f,
+  0.738f,       -0.328526f,  0.755035f,   0.969414f,    -0.321039f,
+  -0.23068f,    0.408567f,   -0.377813f,  -0.273974f,   1.0684f,
+  0.373968f,    -0.450305f,  0.439258f,   -0.381846f,   -0.267331f,
+  0.30613f,     -0.39369f,   0.622438f,   -0.52877f,    -0.334991f,
+  0.263193f,    -0.402121f,  0.64142f,    0.793048f,    -0.0231174f,
+  -0.68474f,    -0.293338f,  -0.737511f,  -0.462654f,   0.474629f,
+  0.141397f,    -0.152529f,  0.345879f,   -0.499991f,   0.00174024f,
+  0.337387f,    -0.131151f,  0.427385f,   -0.457449f,   -0.879614f,
+  -0.425908f,   -0.263172f,  0.0344974f,  1.07861f,     -0.00416662f,
+  0.0208952f,   0.233905f,   0.765965f,   0.0423685f,   -0.117554f,
+  -0.248237f,   0.49848f,    -0.845131f,  0.223648f,    -0.838709f,
+  0.5834f,      0.309956f,   -0.0625093f, -0.619619f,   0.918957f,
+  0.358271f,    -0.668459f,  0.518783f,   -0.418963f,   -0.206788f,
+  0.364983f,    -0.0396087f, 0.624309f,   -0.138679f,   -0.142453f,
+  0.28309f,     0.895092f,   -0.215713f,  0.439025f,    0.659333f,
+  -0.366025f,   -0.413518f,  0.66657f,    -0.265919f,   0.473471f,
+  -1.0729f,     -0.526702f,  0.2838f,     0.367648f,    -0.61242f,
+  0.121656f,    0.547727f,   -0.0636793f, -0.33006f,    -0.306604f,
+  -0.00897731f, 0.688242f,   0.0944626f,  0.321508f,    0.0437392f,
+  -0.560035f,   -0.768334f,  0.0571051f,  -0.0427601f,  -0.0437806f,
+  -0.816209f,   -0.395829f,  0.293733f,   0.217645f,    -0.646428f,
+  0.132448f,    -0.435806f,  -0.0556814f, 0.0218857f,   0.348525f,
+  -0.17296f,    0.669057f,   0.638604f,   -0.0995596f,  -0.024099f,
+  -0.262332f,   -0.548975f,  0.357894f,   0.43873f,     -0.688234f,
+  -0.425519f,   0.190986f,   -0.074778f,  0.294232f,    -0.548969f,
+  -0.731198f,   0.03616f,    -0.475969f,  -0.306075f,   -0.111929f,
+  -0.234146f,   0.612669f,   0.882254f,   -0.622893f,   0.262431f,
+  0.465242f,    0.245384f,   -0.811016f,  0.501798f,    -0.925875f,
+  0.264373f,    0.307766f,   -0.26872f,   0.113027f,    -0.158875f,
+  0.0711483f,   0.220275f,   -0.0699022f, -0.0111303f,  -0.435384f,
+  -0.720014f,   0.593484f,   -0.964082f,  0.750925f,    0.252433f,
+  0.964332f,    -0.256904f,  -0.421715f,  -0.403851f,   -0.188081f,
+  0.694014f,    -1.00183f,   0.798921f,   0.0603123f,   0.213814f,
+  0.739642f,    -0.0203375f, 0.72569f,    -0.260224f,   0.0199516f,
+  -0.322451f,   0.318204f,   -0.38392f,   0.740994f,    -0.265215f,
+  -0.54541f,    -0.51479f,   -0.458397f,  0.519564f,    0.0509182f,
+  0.0363331f,   -0.293051f,  0.317714f,   -0.327488f,   -0.0840401f,
+  0.318437f,    -0.619403f,  0.641094f,   -0.288435f,   -0.260185f,
+  0.181083f,    -0.169294f,  0.292645f,   0.140405f,    0.0572885f,
+  -0.637428f,   -0.102616f,  0.288955f,   0.817314f,    0.116855f,
+  0.635532f,    0.283334f,   -0.236391f,  -0.305035f,   -0.217365f,
+  -0.033021f,   -0.455858f,  0.439922f,   -0.104039f,   0.373376f,
+  0.310659f,    0.388789f,   0.266341f,   0.0746306f,   -0.428192f,
+  -0.202695f,   -0.347625f,  0.00585741f, 0.366203f,    0.221413f,
+  0.518856f,    0.57245f,    -0.375071f,  -0.2436f,     -0.511895f,
+  -1.03708f,    0.681455f,   -0.111544f,  -0.183563f,   0.109729f,
+  -0.422646f,   -0.529777f,  0.747473f,   -0.270223f,   -0.11435f,
+  0.378931f,    0.420456f,   0.236331f,   0.49261f,     -0.0666801f,
+  0.0475846f,   0.906095f,   -0.4146f,    -0.020588f,   -0.653285f,
+  0.135335f,    0.543846f,   -0.309061f,  0.11899f,     -0.639168f,
+  -0.719994f,   -0.219706f,  -0.645631f,  -0.829049f,   -0.0114746f,
+  0.834604f,    0.0378035f,  0.107957f,   0.546929f,    -0.674395f,
+  -0.854817f,   -1.1443f,    0.223413f,   -0.326324f,   0.440971f,
+  0.383582f,    -0.495084f,  0.280091f,   -0.53116f,    0.0333923f,
+  -0.354339f,   -0.0449156f, -0.538896f,  -0.753355f,   0.463995f,
+  0.000969967f, -0.2832f,    0.587276f,   0.853094f,    -0.481985f,
+  -0.138202f,   0.180989f,   -0.349044f,  -0.417534f,   0.455591f,
+  0.287332f,    0.251496f,   0.381416f,   0.339632f,    -0.0825727f,
+  0.352739f,    0.161697f,   -0.319764f,  -0.258015f,   0.668833f,
+  -0.553303f,   -0.578815f,  -0.3758f,    0.289f,       0.247368f,
+  0.00681103f,  0.421092f,   -0.191033f,  -0.425868f,   -0.1239f,
+  0.0540422f,   -0.0856856f, 0.481168f,   -0.0283741f,  -0.196018f,
+  0.230923f,    -0.145288f,  0.52188f,    0.00628462f,  -0.604556f,
+  -0.562879f,   0.319282f,   0.323799f,   0.453941f,    0.271129f,
+  -0.0520196f,  0.684571f,   -0.391779f,  -0.404614f,   0.134097f,
+  -0.825482f,   0.0913949f,  0.483543f,   0.159084f,    0.301637f,
+  0.427013f,    0.196153f,   0.460091f,   -0.730573f,   -0.12278f,
+  0.221665f,    0.674622f,   -0.623363f,  -0.0761517f,  0.637979f,
+  -0.468498f,   0.527276f,   -0.596894f,  -0.34675f,    -0.251241f,
+  0.418533f,    -0.476696f,  -0.901267f,  -0.0088241f,  -0.12421f,
+  -0.660316f,   -0.0222117f, -0.470898f,  -1.10739f,    -0.441645f,
+  0.39516f,     -0.0117906f, 0.254122f,   0.00722599f,  -1.00697f,
+  0.48908f,     -0.122287f,  -0.378608f,  -0.339145f,   0.682463f,
+  0.305606f,    0.453628f,   -0.49923f,   -0.791388f,   -0.202515f,
+  0.23214f,     -0.434209f,  -0.778283f,  -0.538015f,   0.145769f,
+  0.446281f,    -0.339329f,  -0.198478f,  -0.183717f,   -0.855441f,
+  -0.105778f,   0.575067f,   -0.18592f,   -0.348094f,   0.740614f,
+  0.041549f,    -0.109663f,  0.0434492f,  0.245242f,    -1.22192f,
+  0.685896f,    -0.208115f,  -0.0616216f, -1.00552f,    0.31045f,
+  -0.184394f,   0.466705f,   -0.0984364f, -0.506252f,   0.144874f,
+  0.357038f,    0.675221f,   -0.822171f,  -0.52729f,    0.991212f,
+  0.432422f,    0.383493f,   -0.372395f,  0.35651f,     -0.25369f,
+  0.660208f,    -0.117745f,  -0.142433f,  -0.724115f,   -1.0035f,
+  -0.59178f,    0.563444f,   -0.282531f,  -0.599989f,   0.507424f,
+  -0.782875f,   0.755029f,   -0.754962f,  -0.617825f,   0.565984f,
+  -0.826878f,   -0.456563f,  0.0212161f,  0.469867f,    -0.144864f,
+  0.225748f,    -0.279029f,  0.21052f,    -0.440183f,   0.936069f,
+  0.170595f,    0.40966f,    0.452453f,   -0.576006f,   1.50696f,
+  0.649049f,    0.094957f,   -0.167706f,  -0.258342f,   0.59269f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = {  // 20 trained biases, one per CNN_LAYER_0_OUT_CH
+  0.00475215f,  -0.00362332f, -0.00317542f, 0.190083f,    0.0488147f,
+  -0.0268093f,  -0.00432231f, 0.0112229f,   0.0626653f,   -0.0025698f,
+  0.0018675f,   -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f,
+  0.000136436f, 0.0667295f,   0.0251274f,   0.00226553f,  -0.000638344f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = {  // 1600 trained weights (= 2*2*20*20, CNN_LAYER_1_*); machine-generated, do not hand-edit
+  0.228403f,    0.241933f,     0.181079f,    0.101728f,    0.278455f,
+  -0.222078f,   0.387578f,     0.0847356f,   -0.0737012f,  0.26518f,
+  -1.0817f,     0.0404161f,    -0.805199f,   0.336576f,    -0.541494f,
+  0.246264f,    0.116597f,     -0.756804f,   -0.914136f,   0.410265f,
+  0.413294f,    0.07873f,      0.450017f,    -0.264346f,   0.549095f,
+  1.03755f,     -0.203542f,    1.61018f,     0.374131f,    0.402515f,
+  -2.36115f,    0.116427f,     -0.172157f,   -0.231482f,   -0.905736f,
+  -0.0183059f,  -0.575746f,    0.110348f,    -0.268018f,   0.140399f,
+  0.427196f,    0.0718528f,    0.247936f,    -0.326661f,   0.150404f,
+  -0.659979f,   -0.157148f,    0.00826241f,  -0.679275f,   -0.131564f,
+  -1.04822f,    1.06039f,      -0.207898f,   0.510167f,    0.484233f,
+  0.138972f,    -0.0801639f,   -0.184416f,   0.0741107f,   -0.0299281f,
+  0.112263f,    0.380071f,     -0.0185269f,  -0.0821188f,  0.918796f,
+  -0.576106f,   0.593007f,     0.479446f,    0.0440703f,   0.322379f,
+  0.176783f,    -0.147111f,    0.0953247f,   -0.636377f,   0.0702104f,
+  0.130979f,    0.293892f,     -0.0112124f,  -0.040347f,   -0.16034f,
+  0.3252f,      -0.586802f,    0.601786f,    -0.487148f,   -0.458777f,
+  0.463835f,    0.144942f,     0.00339965f,  -0.779966f,   0.0585298f,
+  -1.20758f,    -0.275614f,    0.292346f,    -0.132781f,   0.337892f,
+  -0.357677f,   1.48511f,      0.172907f,    -0.148668f,   0.243184f,
+  -0.503392f,   -0.0791543f,   0.0265389f,   -0.102267f,   0.213294f,
+  0.0657801f,   0.156996f,     0.0891168f,   0.120805f,    0.261285f,
+  -0.343025f,   -0.0792235f,   -0.106415f,   0.133878f,    -0.112981f,
+  -0.00151126f, -0.0643829f,   0.0458938f,   -0.0452731f,  -0.00147422f,
+  0.1871f,      -0.0208793f,   0.0752037f,   0.0794674f,   0.167666f,
+  0.198028f,    -0.361015f,    -0.0661721f,  -0.10672f,    -0.0773641f,
+  -1.15856f,    -0.516443f,    -0.322702f,   0.15668f,     0.0075841f,
+  -0.157731f,   0.270926f,     -0.241551f,   0.0169097f,   -0.0263953f,
+  -0.303556f,   -0.239237f,    0.117792f,    -0.137871f,   0.122054f,
+  -0.587381f,   0.112938f,     0.0867262f,   -0.27909f,    -0.203622f,
+  -0.622195f,   0.42623f,      0.670704f,    0.190826f,    -0.304979f,
+  -0.570075f,   -0.240699f,    0.43744f,     0.632896f,    -0.563846f,
+  -0.0160434f,  -0.0709745f,   0.816662f,    0.269999f,    -0.358734f,
+  0.193644f,    1.19339f,      -0.118223f,   -0.363291f,   -0.723616f,
+  -1.58825f,    0.0222856f,    0.769852f,    0.322713f,    0.0857619f,
+  -0.669756f,   -1.08414f,     1.18593f,     0.486166f,    -0.520646f,
+  0.0861854f,   -0.134197f,    0.258337f,    0.223345f,    0.697639f,
+  -0.57261f,    0.54031f,      0.892644f,    0.497572f,    -0.287076f,
+  -1.95928f,    -0.0568128f,   -0.253335f,   0.00233392f,  -0.192787f,
+  -0.115203f,   -0.0975649f,   0.277954f,    0.000704534f, -0.315884f,
+  0.309583f,    0.357458f,     0.0939298f,   -0.072701f,   0.433045f,
+  -0.536938f,   0.534523f,     0.184585f,    -0.0415175f,  -0.120909f,
+  -1.2622f,     0.412449f,     -0.114741f,   0.290453f,    -0.441671f,
+  -0.0242497f,  -0.20746f,     0.139019f,    -0.422668f,   -0.146732f,
+  -0.688828f,   -0.00339426f,  0.04166f,     0.41755f,     0.405675f,
+  0.562564f,    0.0216812f,    0.0271391f,   0.215227f,    0.328183f,
+  -1.6442f,     -0.827838f,    0.115491f,    0.0951442f,   -0.133779f,
+  -0.0482928f,  0.203177f,     0.322953f,    -0.513259f,   0.0676788f,
+  -0.0877928f,  0.224448f,     0.451957f,    0.314243f,    0.307403f,
+  0.35653f,     0.0286278f,    2.27554f,     0.569313f,    -0.0488753f,
+  -2.48809f,    0.274555f,     -0.248375f,   -0.635634f,   -0.187663f,
+  0.1827f,      -0.409634f,    -0.0280568f,  -0.207119f,   -0.208192f,
+  -0.410268f,   -0.017669f,    0.134856f,    0.434551f,    0.165201f,
+  0.584608f,    -0.389997f,    -0.088713f,   0.118087f,    0.00210905f,
+  -1.07698f,    -0.520967f,    -0.198742f,   0.190255f,    -0.162639f,
+  0.0122759f,   0.460774f,     -0.684633f,   -0.149512f,   0.167556f,
+  -0.295034f,   -0.0650964f,   0.0868653f,   -0.691352f,   0.089795f,
+  0.0620608f,   0.0531289f,    0.0124286f,   0.151921f,    1.51067f,
+  -0.10586f,    -0.0311871f,   0.114706f,    0.0565205f,   -0.159634f,
+  -0.423987f,   -0.226896f,    0.0605352f,   -0.36324f,    -0.142205f,
+  -0.252249f,   0.0666312f,    0.316655f,    0.00687196f,  0.131079f,
+  -0.128281f,   -0.293468f,    1.3327f,      0.542277f,    -0.060088f,
+  -1.73475f,    0.0542297f,    -0.227522f,   -0.376004f,   -0.147028f,
+  0.0228252f,   0.0569538f,    -0.0796497f,  0.0937596f,   -0.0660153f,
+  -0.979219f,   -0.377322f,    0.0523787f,   0.467299f,    0.0824278f,
+  0.437147f,    0.263637f,     0.0325681f,   0.303581f,    0.353479f,
+  -0.142369f,   -0.394797f,    0.597185f,    0.116482f,    -0.0782593f,
+  0.364539f,    -0.30396f,     0.119016f,    -0.0022429f,  -0.044292f,
+  -0.0110531f,  0.233571f,     0.000975879f, 0.447332f,    -0.0320396f,
+  0.541609f,    0.14232f,      0.163905f,    0.848609f,    0.19954f,
+  -0.186591f,   -0.44465f,     -0.431672f,   0.159037f,    -0.129977f,
+  -0.141778f,   0.246818f,     -0.197539f,   -0.70115f,    0.185449f,
+  0.400274f,    -0.0350744f,   0.239727f,    -0.290504f,   0.0698443f,
+  -0.180374f,   -0.759591f,    -0.0569088f,  -0.50246f,    -0.0986616f,
+  -0.892114f,   0.306737f,     -0.133937f,   0.285625f,    0.495471f,
+  -0.686222f,   -0.168647f,    -0.0926158f,  0.351772f,    -0.0215394f,
+  0.361223f,    0.0657142f,    0.268229f,    -0.616299f,   0.0564718f,
+  -0.294013f,   -0.588019f,    0.0234195f,   -0.426863f,   -0.511253f,
+  -0.72177f,    0.420903f,     0.0987506f,   0.309368f,    0.523532f,
+  1.06073f,     -0.33028f,     0.0818142f,   0.0130354f,   0.0180882f,
+  0.0316898f,   -0.416614f,    -0.566344f,   -0.163083f,   0.285085f,
+  -0.0534352f,  0.385496f,     0.151068f,    -0.208295f,   -0.175648f,
+  0.0476705f,   0.190428f,     -0.643391f,   0.484004f,    -0.421836f,
+  -0.19829f,    -0.227574f,    -0.0869152f,  1.09881f,     0.345129f,
+  -0.236732f,   -0.381935f,    -1.46271f,    0.465914f,    0.610375f,
+  0.689968f,    -0.688546f,    1.95033f,     0.420946f,    0.0282428f,
+  0.147823f,    0.669393f,     0.429085f,    -0.328385f,   -0.150439f,
+  -0.419097f,   -0.828102f,    0.248743f,    0.24644f,     0.0186131f,
+  -0.384319f,   -0.126294f,    -0.417067f,   0.271483f,    -0.0128456f,
+  -0.881351f,   0.152581f,     0.185584f,    -0.745827f,   0.0551359f,
+  0.127083f,    0.936983f,     -0.0225341f,  0.575861f,    0.767417f,
+  -0.140867f,   -0.762518f,    0.422446f,    -0.0611973f,  0.0515641f,
+  -0.144168f,   -0.298882f,    0.308461f,    0.0208704f,   0.213872f,
+  -0.258708f,   1.13186f,      0.314083f,    -0.347536f,   -0.137768f,
+  0.653953f,    -0.217883f,    -0.56112f,    -0.864661f,   0.488836f,
+  0.268133f,    -0.548664f,    -0.765226f,   0.117082f,    0.326798f,
+  -0.678246f,   0.477785f,     -1.27584f,    0.198912f,    -0.710395f,
+  1.39096f,     -0.411577f,    -0.55119f,    0.51092f,     -0.295023f,
+  0.245983f,    -0.0957192f,   -0.312001f,   0.0175991f,   0.524423f,
+  -0.126379f,   0.124687f,     -1.53945f,    -0.342856f,   0.514072f,
+  0.400884f,    -0.00581101f,  -0.219327f,   0.0977873f,   0.337551f,
+  -0.058603f,   0.20034f,      0.0429945f,   0.676803f,    -0.273585f,
+  -0.173435f,   -0.581596f,    0.226263f,    -0.0946223f,  -0.060088f,
+  -0.0100809f,  -0.022242f,    -0.22218f,    -0.030463f,   -0.141389f,
+  -0.190757f,   -0.00526518f,  -0.77519f,    -0.0825695f,  0.308403f,
+  0.262792f,    -0.601842f,    0.0783697f,   0.197527f,    0.0714048f,
+  0.0392629f,   -0.388628f,    0.172541f,    -0.0222009f,  0.252096f,
+  0.0728652f,   0.173632f,     0.192914f,    -0.00969965f, 0.0530136f,
+  -0.00765759f, 0.440234f,     -0.0943323f,  0.112319f,    0.0878737f,
+  -0.739021f,   0.385305f,     0.133334f,    -0.396697f,   0.177818f,
+  -0.0712558f,  0.516923f,     0.102174f,    0.17158f,     -0.211068f,
+  0.295795f,    -0.36198f,     0.179087f,    -0.845744f,   -0.242514f,
+  -1.49073f,    0.272702f,     0.59011f,     -0.408184f,   -0.0731313f,
+  0.234643f,    0.589642f,     -0.100778f,   0.516921f,    -0.700154f,
+  0.316432f,    0.36117f,      0.0380282f,   0.480101f,    -0.0975487f,
+  0.941452f,    0.231705f,     -0.151182f,   -1.20305f,    0.28255f,
+  -0.0427662f,  -0.00717175f,  -0.842085f,   -0.357376f,   0.545581f,
+  -0.290714f,   0.741498f,     1.00377f,     0.483864f,    0.150405f,
+  0.0834512f,   -0.10031f,     0.424054f,    -0.0223491f,  -0.0696701f,
+  -0.134479f,   -0.747227f,    0.422208f,    0.123858f,    -0.392624f,
+  -0.0299847f,  -0.0376142f,   -0.392536f,   -0.0343114f,  0.298224f,
+  -0.375899f,   0.693119f,     0.27909f,     -0.53463f,    0.105459f,
+  -0.0267383f,  0.5094f,       -0.411557f,   0.451749f,    -0.348479f,
+  -0.0497316f,  -0.353913f,    -0.14858f,    0.241838f,    0.331039f,
+  0.756607f,    -0.0701661f,   -0.827264f,   -0.367772f,   0.447201f,
+  0.834616f,    -0.00497265f,  -0.0557285f,  0.055088f,    -0.300115f,
+  -0.143833f,   -1.07838f,     -0.106896f,   0.16945f,     0.0170324f,
+  0.108754f,    0.335893f,     -0.0923708f,  0.450209f,    -0.0713308f,
+  -0.0233037f,  -0.0129902f,   -1.40664f,    -0.0996218f,  0.711236f,
+  0.400716f,    0.227871f,     2.01499f,     0.572926f,    0.135673f,
+  -0.0340458f,  -0.316736f,    0.24257f,     -0.700768f,   -0.194985f,
+  0.312011f,    -0.179599f,    0.128114f,    0.0725977f,   -0.193816f,
+  0.352143f,    0.070641f,     -0.467808f,   -0.399047f,   0.10136f,
+  0.671574f,    -0.553965f,    0.105729f,    0.210383f,    0.065048f,
+  0.248198f,    -0.731674f,    0.588725f,    -0.308237f,   0.24511f,
+  0.00608906f,  0.170906f,     0.246175f,    0.149521f,    0.106071f,
+  0.160246f,    0.118487f,     -0.104102f,   0.872823f,    0.227478f,
+  0.0182631f,   -0.115083f,    0.0142445f,   0.307947f,    -0.884925f,
+  0.0767105f,   0.0414042f,    -0.448021f,   -0.0400193f,  -0.0765448f,
+  -0.411931f,   -0.199624f,    0.333371f,    0.17267f,     -0.0431816f,
+  0.190826f,    -0.0758961f,   -1.02831f,    -0.0414525f,  0.605374f,
+  -0.0188181f,  -0.2207f,      1.30004f,     -0.207005f,   -0.0333617f,
+  0.227145f,    0.105059f,     -0.0473393f,  -0.448752f,   -0.0342152f,
+  -0.0244812f,  0.220329f,     0.0313591f,   -0.0902074f,  -0.0731945f,
+  0.88488f,     0.306306f,     -0.275613f,   -0.476372f,   0.00678104f,
+  0.442029f,    0.122049f,     0.118042f,    0.270527f,    -0.462538f,
+  0.0665021f,   -0.260255f,    0.209182f,    0.162321f,    0.0629934f,
+  -0.244896f,   -0.078863f,    0.655585f,    -0.0506617f,  -0.487128f,
+  0.118765f,    -0.34408f,     0.0930615f,   -0.365632f,   -0.0670776f,
+  0.44428f,     0.286734f,     0.146608f,    0.686757f,    -0.0738428f,
+  -0.10034f,    -0.928438f,    -0.172601f,   -0.0959575f,  -0.010532f,
+  0.277549f,    0.28773f,      -0.318883f,   0.71254f,     0.273593f,
+  -0.382845f,   -0.0104587f,   -0.647769f,   0.25541f,     0.194625f,
+  0.265197f,    -0.750938f,    -0.0650515f,  -0.567092f,   0.070613f,
+  0.209531f,    0.429699f,     0.130676f,    0.514914f,    0.615778f,
+  0.594535f,    -0.0878778f,   0.40593f,     -0.303383f,   0.0907863f,
+  -0.320068f,   0.0137162f,    -0.303424f,   0.594207f,    -0.236524f,
+  -0.692627f,   -0.990063f,    -0.0262934f,  0.222375f,    0.503412f,
+  0.220224f,    0.676871f,     -0.150996f,   0.379777f,    0.841339f,
+  -1.05981f,    0.259943f,     -0.781745f,   0.0346478f,   0.115791f,
+  -0.25171f,    -0.00872158f,  0.395561f,    -0.0849893f,  -1.20134f,
+  -0.313938f,   0.789542f,     0.159606f,    -0.782095f,   -0.229754f,
+  0.266687f,    -0.0354282f,   -0.3041f,     0.0338618f,   -0.390001f,
+  -0.28362f,    -0.436144f,    0.777351f,    0.855321f,    0.653338f,
+  -0.0382912f,  -0.204577f,    1.13828f,     0.220395f,    -4.60853f,
+  0.575694f,    0.0453189f,    1.76567f,     0.466151f,    -0.366109f,
+  0.594717f,    0.278891f,     -0.750676f,   -0.332739f,   -0.942304f,
+  0.280363f,    0.284561f,     0.209326f,    0.238347f,    -0.0124311f,
+  -0.439463f,   -0.036186f,    0.165997f,    0.374717f,    -0.481148f,
+  -0.626417f,   0.0223598f,    0.039337f,    -0.379918f,   0.211046f,
+  0.0795812f,   0.863355f,     -0.341448f,   0.421494f,    0.410477f,
+  -0.117025f,   -0.511108f,    0.565193f,    -0.063582f,   -0.031349f,
+  -0.0750174f,  0.387941f,     0.541266f,    0.0919753f,   1.05041f,
+  0.263004f,    0.289006f,     0.0439694f,   -1.22439f,    -0.247832f,
+  0.260967f,    0.355794f,     0.599694f,    -0.69418f,    0.372805f,
+  -0.161731f,   0.0720574f,    0.0394657f,   0.122772f,    -0.458067f,
+  -0.370826f,   -1.34495e-05f, -0.373404f,   0.0245539f,   -2.3472f,
+  -2.61448f,    0.264794f,     0.0601582f,   -0.968597f,   -0.196022f,
+  -0.727067f,   0.167346f,     0.517478f,    0.0035377f,   0.777219f,
+  0.553128f,    0.727211f,     0.606202f,    -0.495604f,   2.41445f,
+  0.465214f,    -0.0443004f,   0.142972f,    0.141459f,    -0.17771f,
+  0.0156117f,   0.169264f,     0.0428022f,   -0.164827f,   -0.240632f,
+  0.215289f,    -0.213134f,    -0.184163f,   0.0161321f,   -0.20025f,
+  -0.0311616f,  0.00292108f,   -0.0131921f,  0.0437664f,   -0.104817f,
+  -0.131906f,   0.0822771f,    0.237307f,    -0.347567f,   -1.2485f,
+  0.253616f,    -0.442217f,    0.0514077f,   0.337561f,    -0.0147658f,
+  -0.132888f,   -0.643821f,    0.445573f,    -0.0146213f,  0.235511f,
+  0.53583f,     -0.640644f,    0.0280044f,   0.00628834f,  0.143885f,
+  0.380077f,    -0.542342f,    0.363101f,    0.0647334f,   -0.476556f,
+  -0.822676f,   0.482454f,     -0.0467326f,  -0.253083f,   0.116726f,
+  0.317333f,    0.548131f,     -0.234667f,   0.579923f,    -0.420683f,
+  0.595613f,    -0.279864f,    -0.753204f,   -0.516844f,   -0.436574f,
+  -0.120682f,   -0.278939f,    0.752202f,    -0.183443f,   -0.14632f,
+  -0.0344068f,  0.127638f,     -0.225245f,   0.489391f,    0.145082f,
+  -0.73672f,    0.980065f,     -0.0367412f,  0.40632f,     -0.802509f,
+  0.356897f,    0.366172f,     1.23858f,     -0.978381f,   -0.684924f,
+  -0.0870693f,  -0.353628f,    0.695788f,    -0.244593f,   -1.8897f,
+  -0.257803f,   0.686937f,     0.405155f,    -0.125696f,   0.258075f,
+  0.570584f,    -0.439481f,    -0.59798f,    0.0745711f,   -0.235162f,
+  0.133048f,    -0.243033f,    0.0415527f,   -0.00118735f, 0.00980514f,
+  -0.297429f,   -0.144983f,    0.463093f,    0.0965441f,   -0.338508f,
+  -0.651077f,   0.817577f,     -0.0364773f,  -0.388465f,   0.113288f,
+  0.231198f,    0.316208f,     -0.592201f,   0.530376f,    -0.431434f,
+  0.0200985f,   0.104303f,     -0.130705f,   0.4374f,      0.362342f,
+  0.70641f,     0.20037f,      0.309128f,    -0.484535f,   -1.18469f,
+  0.513893f,    0.201236f,     -0.022396f,   0.179638f,    -0.361289f,
+  -0.0794946f,  -1.04704f,     -0.0281103f,  0.0494822f,   0.00196415f,
+  0.0625478f,   -0.229033f,    0.12018f,     0.542629f,    -0.222423f,
+  -0.0123321f,  -0.0988525f,   0.773192f,    -0.192218f,   -3.19156f,
+  0.300606f,    0.462751f,     2.2968f,      0.137182f,    0.132539f,
+  0.165884f,    0.128818f,     -0.155856f,   -0.558538f,   -0.231742f,
+  -0.244377f,   -0.442397f,    0.250947f,    0.0850658f,   -0.00820139f,
+  0.391284f,    0.17453f,      0.306003f,    -0.531499f,   -0.624451f,
+  0.564584f,    -0.343953f,    -0.0278713f,  0.212664f,    -0.135969f,
+  -0.0179867f,  -0.687887f,    0.371065f,    -0.0537029f,  0.0499509f,
+  0.0980684f,   -0.0438569f,   0.186731f,    0.182105f,    0.172254f,
+  -0.149446f,   -0.0247637f,   0.148098f,    1.20772f,     -0.136664f,
+  0.00983112f,  0.0181381f,    -0.0147549f,  -0.0846561f,  -0.827022f,
+  0.00207177f,  0.0478215f,    0.0652549f,   0.0898219f,   -0.0224959f,
+  -0.0274246f,  0.0166498f,    -0.0211715f,  -0.502932f,   0.0961452f,
+  0.251206f,    -0.0623632f,   0.741566f,    0.0078449f,   -2.99162f,
+  -0.187244f,   0.0743479f,    1.46425f,     0.0737923f,   0.0133544f,
+  0.20922f,     -0.178671f,    -0.0528492f,  -0.526717f,   0.0282125f,
+  -0.0363201f,  0.37406f,      -0.303658f,   -0.066803f,   0.132237f,
+  0.962057f,    -0.399733f,    0.191765f,    -0.452606f,   -0.348732f,
+  0.444939f,    0.153025f,     0.0796317f,   0.265985f,    -0.319638f,
+  0.0278161f,   -0.333734f,    0.226108f,    0.147895f,    -0.124066f,
+  -0.37306f,    0.19541f,      0.200175f,    -0.0593244f,  0.0333887f,
+  -0.0284278f,  0.462491f,     0.0686487f,   -0.332435f,   -0.437166f,
+  0.302795f,    0.100542f,     0.0265019f,   0.767212f,    -0.140621f,
+  0.11558f,     -0.70584f,     -0.00017415f, 0.00793092f,  -0.0490901f,
+  0.0598338f,   0.484876f,     -0.13025f,    0.660349f,    0.147503f,
+  -0.462766f,   0.0843824f,    0.218493f,    0.310921f,    -0.162284f,
+  0.210404f,    -0.788799f,    0.0698512f,   -0.484799f,   0.0311505f,
+  -0.308243f,   0.417298f,     0.0593723f,   0.208908f,    0.451437f,
+  0.354546f,    -0.0700888f,   -0.281678f,   -0.311177f,   0.00914652f,
+  -0.372084f,   0.135036f,     0.185393f,    0.461347f,    -0.114241f,
+  -0.402347f,   -0.692327f,    0.0376155f,   -0.200267f,   0.565963f,
+  -0.0627442f,  0.429677f,     0.170514f,    0.350565f,    0.699528f,
+  -0.948126f,   -0.364205f,    0.348878f,    -0.137832f,   -0.0791649f,
+  -0.0462295f,  -0.255078f,    -0.398509f,   0.136783f,    -0.0164628f,
+  -0.555472f,   0.690396f,     0.147715f,    0.000523095f, 0.14874f,
+  0.524804f,    0.162974f,     0.797599f,    0.277473f,    -0.500696f,
+  0.189917f,    -0.333309f,    0.00613646f,  -1.07817f,    0.0470502f,
+  0.210766f,    0.159768f,     -0.447774f,   -0.252968f,   -1.72739f,
+  0.0658259f,   -0.448747f,    2.26511f,     0.349651f,    0.157232f,
+  0.956842f,    0.856676f,     0.149227f,    -0.626957f,   -0.566771f,
+  -0.0980846f,  0.351668f,     -0.362741f,   -0.0272282f,  -0.113632f,
+  0.366015f,    -0.00790003f,  -0.458632f,   -0.31157f,    -0.182257f,
+  -0.953975f,   0.0583582f,    0.164721f,    -0.900107f,   -0.115542f,
+  0.0654192f,   0.99056f,      -0.247976f,   0.48254f,     0.670196f,
+  0.098585f,    -0.212855f,    0.310072f,    0.0894616f,   0.151944f,
+  0.119629f,    -0.26735f,     0.162257f,    -0.0305818f,  0.681526f,
+  -0.229847f,   1.01556f,      0.29132f,     0.740113f,    0.0703937f,
+  0.537892f,    -0.18653f,     -0.0252359f,  -0.420014f,   0.197631f,
+  -0.176629f,   0.00674754f,   0.301288f,    -0.162816f,   0.636235f,
+  -0.341362f,   0.197296f,     -0.589747f,   -0.749363f,   -0.277197f,
+  -1.27291f,    -0.0857908f,   -0.147591f,   -0.0956297f,  -0.109097f,
+  0.0717554f,   0.359078f,     0.301457f,    0.486934f,    -0.260955f,
+  -0.126821f,   1.55756f,      0.477469f,    -1.45363f,    1.42198f,
+  -0.360847f,   -0.0211924f,   -0.0184957f,  -0.110706f,   -0.152136f,
+  0.104703f,    0.267615f,     0.127392f,    0.172996f,    0.258326f,
+  0.268578f,    -0.431123f,    -0.114419f,   0.0101172f,   -0.195671f,
+  0.0792025f,   -0.151505f,    -0.064077f,   0.0479777f,   -0.141882f,
+  0.121492f,    -0.139132f,    -0.348252f,   0.341043f,    -0.565367f,
+  -0.0791259f,  -0.781086f,    0.0140045f,   0.571094f,    -0.00875077f,
+  0.217132f,    -0.202345f,    0.157213f,    0.228445f,    0.366612f,
+  -0.529989f,   0.42241f,      -0.540538f,   -0.0425556f,  -0.207774f,
+  -0.0663941f,  0.37836f,      -0.0650245f,  -0.0828694f,  -0.0835478f,
+  -0.795512f,   0.470268f,     0.1551f,      -0.69017f,    -0.116735f,
+  0.157614f,    0.555973f,     -0.293311f,   0.245428f,    -0.0853701f,
+  -0.449278f,   -0.0551647f,   -0.00137429f, 0.709439f,    -0.456796f,
+  0.132062f,    -0.0449484f,   -0.308599f,   0.180608f,    -2.24196f,
+  0.421478f,    -0.640946f,    -0.460397f,   -0.920628f,   -0.184949f,
+  -0.0416982f,  0.6484f,       -0.22806f,    0.412229f,    -0.468079f,
+  -0.72372f,    -0.347698f,    -1.3899f,     0.631876f,    0.0611046f,
+  0.0294258f,   -0.128091f,    -0.205615f,   0.355348f,    -0.267725f,
+  -0.644835f,   0.435879f,     0.517477f,    -0.338123f,   -0.157764f,
+  0.32762f,     -0.166454f,    0.221007f,    -0.0438278f,  -0.0777725f,
+  0.10986f,     0.941545f,     -0.542284f,   -0.172312f,   -0.256597f,
+  -0.0181391f,  0.220623f,     -0.432456f,   0.0164074f,   0.250226f,
+  -0.522576f,   0.783109f,     0.198703f,    -0.784554f,   -0.0929628f,
+  0.326861f,    0.470293f,     0.442684f,    0.271879f,    -0.108256f,
+  0.0483558f,   -0.403151f,    0.36183f,     -0.268186f,   0.270851f,
+  -0.696826f,   -0.166037f,    -0.354658f,   0.405977f,    -0.473447f,
+  0.649689f,    -0.0863114f,   -0.147319f,   0.0869966f,   0.319792f,
+  0.493026f,    -1.07456f,     0.354751f,    0.114605f,    -0.120647f,
+  -0.238315f,   0.0290955f,    -0.355299f,   -0.45381f,    0.0812865f,
+  -0.0180434f,  0.00861318f,   -0.892943f,   -0.0127801f,  -1.66398f,
+  0.290505f,    0.126832f,     2.08173f,     -0.0454847f,  -0.162481f,
+  1.07426f,     0.228566f,     0.280528f,    -0.537625f,   -0.175288f,
+  -0.118012f,   0.649114f,     -0.349926f,   -0.0189864f,  -0.30934f,
+  -0.363178f,   -0.119822f,    -0.22656f,    0.484513f,    -0.173269f,
+  0.41987f,     -0.448517f,    -0.0950466f,  0.482443f,    0.061558f,
+  0.4219f,      -0.536388f,    0.0781972f,   0.212489f,    0.104229f,
+  -0.0792804f,  0.402066f,     -0.676313f,   -0.2272f,     -0.16379f,
+  0.260145f,    -0.0504658f,   -0.0826579f,  -1.37749f,    0.00790747f,
+  0.0841031f,   -0.0671308f,   -0.00301736f, -0.386206f,   0.190311f,
+  0.0702639f,   0.0643968f,    0.133741f,    -0.0141555f,  -0.0365324f,
+  0.87028f,     0.207894f,     -0.421266f,   0.689256f,    0.145037f,
+  -0.270796f,   0.212604f,     -0.345326f,   0.0074631f,   -1.72379f,
+  0.0672097f,   -0.273153f,    1.30503f,     -1.01324f,    0.00284696f,
+  0.851459f,    0.176847f,     0.30948f,     -0.57144f,    -0.0596695f,
+  -0.111189f,   0.130361f,     -0.298286f,   0.0567591f,   -0.0885215f,
+  -0.847601f,   0.238624f,     -0.162391f,   0.452357f,    -0.0192713f,
+  0.226661f,    0.0762922f,    -0.0894055f,  0.332702f,    0.424484f,
+  0.0443207f,   -0.162345f,    -0.601036f,   0.280527f,    -0.137362f,
+  0.266345f,    0.729438f,     -0.887182f,   0.152943f,    -0.573548f,
+  -0.0201383f,  -0.56521f,     0.033582f,    0.300284f,    -0.144472f,
+  0.633026f,    0.30866f,      0.0653073f,   0.316901f,    0.0721326f,
+  0.192252f,    -0.833162f,    0.194292f,    -0.08663f,    -0.189401f,
+  -0.178242f,   0.111488f,     0.522487f,    -0.65497f,    0.457049f,
+  0.390654f,    0.0522936f,    -0.39712f,    -0.293717f,   -0.374656f,
+  -0.118916f,   -0.853076f,    -0.0829578f,  -0.17335f,    -0.0218694f,
+  0.367968f,    0.478469f,     0.0913813f,   0.519251f,    0.803526f,
+  -0.272516f,   -0.341329f,    0.0897285f,   0.247653f,    0.000898686f,
+  0.313196f,    0.000587979f,  -0.314189f,   -0.449439f,   -0.0291611f,
+  -0.356287f,   -0.722904f,    -0.0480958f,  -0.523758f,   -0.576146f,
+  0.133754f,    0.616921f,     -0.085494f,   0.487487f,    0.745129f,
+  0.993267f,    0.256555f,     0.0822743f,   0.0411971f,   0.139388f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = {
+  0.00447951f,  0.0202534f,  0.00970833f, -0.00460874f,  0.0942288f,
+  -0.0534704f,  0.00829869f, -0.0255174f, -0.0809143f,   0.00169117f,
+  0.0177427f,   0.0259387f,  0.0291077f,  -0.0267599f,   0.100275f,
+  -0.00389366f, 0.0315499f,  0.0265846f,  -0.000206604f, 0.0302221f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = {
+  0.153048f,    0.0725422f,   0.068901f,     -0.475608f,   0.0736706f,
+  -0.134076f,   0.229289f,    0.0217921f,    0.0449205f,   -1.00002f,
+  0.149133f,    0.0497258f,   0.118988f,     0.0741764f,   0.0385486f,
+  0.225181f,    0.012966f,    0.155593f,     -3.07175f,    -0.0641051f,
+  0.09161f,     0.0259005f,   -0.209998f,    -0.420298f,   0.0587126f,
+  0.00352744f,  0.0451313f,   -0.049384f,    0.11516f,     0.083135f,
+  0.103675f,    -0.0185604f,  0.0623248f,    -0.0993726f,  0.0448522f,
+  0.0134017f,   -0.294776f,   -0.251924f,    0.0712635f,   -0.0764298f,
+  -0.463766f,   -0.0295011f,  -0.579168f,    0.573853f,    -0.00596607f,
+  0.0237762f,   -0.0500104f,  -0.0969275f,   0.155573f,    0.0515382f,
+  -0.178454f,   -0.154008f,   -0.278299f,    -0.166421f,   0.0149533f,
+  -0.0700236f,  0.239287f,    -1.19545f,     -0.0744625f,  0.143037f,
+  0.141874f,    0.086302f,    0.0838633f,    -0.454179f,   0.120308f,
+  -0.0896718f,  0.254909f,    0.0714462f,    0.00471098f,  -0.869494f,
+  0.209407f,    0.138285f,    0.0816641f,    0.0666266f,   0.0848555f,
+  0.173313f,    0.0695633f,   0.285667f,     -3.15384f,    0.00140275f,
+  -0.969824f,   -0.0318689f,  -0.00487396f,  0.412541f,    0.0263593f,
+  -0.249824f,   0.0897776f,   0.0208836f,    -0.0982745f,  -0.16049f,
+  -0.12719f,    -0.186166f,   0.102338f,     0.273931f,    -0.0886306f,
+  -0.19513f,    -0.0135712f,  -0.194127f,    -0.0834291f,  0.426623f,
+  -0.0705446f,  0.0327476f,   0.0800862f,    0.478757f,    -0.00849111f,
+  -0.554911f,   -0.0489312f,  -0.184029f,    -0.227428f,   0.159989f,
+  -0.0677731f,  -0.0901436f,  0.00308696f,   -0.352243f,   0.278715f,
+  0.306374f,    -0.0772054f,  -0.0122733f,   -0.0693457f,  0.074365f,
+  -0.267458f,   -0.123612f,   -0.495954f,    0.552604f,    -0.103951f,
+  -0.121771f,   0.179966f,    -0.377947f,    -1.35472f,    0.153294f,
+  -0.445284f,   -0.089813f,   -0.00529807f,  0.254047f,    -0.0378426f,
+  0.114597f,    -0.143052f,   0.0815258f,    -0.10528f,    0.00833533f,
+  -0.117508f,   0.129052f,    0.0706719f,    -1.39506f,    0.0124731f,
+  0.109831f,    -0.0744156f,  0.181612f,     0.0787894f,   0.0293352f,
+  0.494929f,    0.00997207f,  -0.585882f,    -0.0844138f,  -0.00864134f,
+  -0.109943f,   0.0713114f,   0.14883f,      0.0610554f,   0.204145f,
+  -0.00390313f, 0.0184763f,   -0.111387f,    0.175442f,    -0.0840215f,
+  -0.178785f,   -0.0693612f,  -0.254507f,    -0.191549f,   0.501561f,
+  -0.0858995f,  -0.164921f,   0.0250706f,    -0.0916282f,  0.247085f,
+  0.13877f,     -0.419487f,   -0.295065f,    -0.213812f,   -0.10362f,
+  0.138243f,    0.086985f,    0.113633f,     -0.459273f,   0.12388f,
+  -0.139296f,   0.253792f,    0.0421624f,    0.0665065f,   -0.977282f,
+  0.199927f,    0.115194f,    0.099045f,     0.0534806f,   0.089283f,
+  0.0815367f,   0.150901f,    0.253458f,     -3.24825f,    -0.0118163f,
+  -0.544565f,   0.0201825f,   -0.0682201f,   0.759028f,    0.00479696f,
+  -0.00625607f, 0.058007f,    -0.0811189f,   -0.114617f,   -0.0998578f,
+  0.133312f,    0.0246256f,   -0.0167416f,   0.196118f,    0.109823f,
+  0.109489f,    0.474682f,    -0.763475f,    0.0818745f,   0.0798777f,
+  -0.0994905f,  -0.00138143f, -0.108563f,    0.697289f,    -0.103702f,
+  -0.306085f,   -0.0996705f,  -0.142618f,    -0.130989f,   0.0813303f,
+  -0.0909275f,  -0.10786f,    -0.0280431f,   0.206877f,    -1.70798f,
+  0.525568f,    0.559891f,    -0.166132f,    -0.227574f,   -0.150955f,
+  0.0849226f,   0.00497342f,  -0.168667f,    -0.282575f,   0.00537805f,
+  -0.0185572f,  0.0607167f,   -0.0534948f,   -0.0215776f,  -0.14825f,
+  -0.0164577f,  -0.0611978f,  0.0347562f,    0.286917f,    0.226598f,
+  0.149497f,    -0.478101f,   -0.246006f,    0.0663239f,   -0.121728f,
+  0.267087f,    0.0802681f,   -0.184741f,    -0.558267f,   0.0437066f,
+  0.13816f,     -0.0710939f,  0.0725697f,    0.339857f,    0.161069f,
+  0.304871f,    0.108138f,    0.193396f,     0.0891607f,   -0.0701939f,
+  -0.182038f,   -0.451873f,   -0.233883f,    0.0444747f,   0.0436545f,
+  -0.245894f,   -0.0721136f,  0.309013f,     0.278996f,    0.0259377f,
+  0.0278116f,   0.0686773f,   -0.271237f,    0.235082f,    -0.0778285f,
+  -0.456541f,   -0.109303f,   -0.074565f,    -0.407301f,   -0.162191f,
+  -0.801819f,   0.372435f,    -0.559083f,    -0.039189f,   0.0477762f,
+  0.0875363f,   0.0699926f,   0.116552f,     -0.308217f,   0.0341607f,
+  -0.14202f,    0.135517f,    0.0316971f,    0.153297f,    -0.759722f,
+  0.12849f,     0.114229f,    0.0814893f,    0.275402f,    0.0403976f,
+  0.0357503f,   0.212295f,    0.0673998f,    -2.59822f,    -0.0475021f,
+  -0.0594725f,  0.0659163f,   0.0469717f,    -0.0370461f,  -0.12863f,
+  -0.381743f,   -0.0445055f,  -0.106843f,    -0.0880648f,  0.00591106f,
+  0.235514f,    -0.165162f,   -0.0696645f,   0.115374f,    0.245558f,
+  0.192049f,    -0.388628f,   -0.48291f,     0.154313f,    -0.160207f,
+  0.125928f,    0.122039f,    0.0713794f,    -0.161244f,   0.128082f,
+  -0.234659f,   0.0680219f,   0.0597933f,    0.208421f,    -0.163623f,
+  0.196873f,    0.156603f,    0.184179f,     -0.278331f,   -0.0481286f,
+  0.0828152f,   0.247004f,    0.0915582f,    -0.0906229f,  -0.20376f,
+  0.136593f,    0.0740336f,   -0.0134935f,   -0.355048f,   0.0898485f,
+  -0.0962068f,  0.185804f,    -0.0145596f,   0.0966589f,   -0.515784f,
+  0.121602f,    0.0320428f,   0.11093f,      -0.0559421f,  0.0355484f,
+  0.192128f,    0.0500888f,   0.133641f,     -1.73282f,    -0.0624599f,
+  0.122524f,    0.0757292f,   -0.0974648f,   -0.193649f,   0.0561096f,
+  0.0159959f,   0.0334472f,   -0.0168832f,   -0.12386f,    -0.112419f,
+  0.19552f,     0.0308502f,   0.0537643f,    -0.0181012f,  0.0392183f,
+  0.0461833f,   -0.52623f,    -0.238252f,    0.0821762f,   -0.212384f,
+  0.112901f,    0.096063f,    0.0540225f,    0.0773583f,   0.143045f,
+  -0.101551f,   0.282418f,    0.0176749f,    -0.00244542f, -0.780154f,
+  -0.254428f,   -5.82215f,    0.106638f,     0.11746f,     0.0486823f,
+  0.164562f,    0.0303006f,   0.229614f,     -2.41845f,    -0.117122f,
+  0.0451654f,   0.0237383f,   -0.208731f,    0.0721137f,   0.0761163f,
+  -0.0569416f,  -0.00830511f, -0.045256f,    0.14535f,     -0.0189222f,
+  -0.283363f,   -3.15502f,    0.0971161f,    -0.035913f,   0.00813281f,
+  0.0187974f,   -0.361573f,   -0.302067f,    0.118014f,    -0.0956148f,
+  -0.596567f,   0.0105443f,   -0.49019f,     -0.0801959f,  0.0322344f,
+  -0.0280032f,  0.0555038f,   -0.111495f,    -0.0994456f,  0.0178021f,
+  0.0358362f,   1.07063f,     -0.0833138f,   0.0621246f,   0.0637157f,
+  0.0999207f,   0.191975f,    -1.2811f,      0.0341681f,   0.14818f,
+  0.0957259f,   0.109909f,    0.0566115f,    0.0585633f,   0.179939f,
+  -0.104372f,   0.309091f,    0.0172941f,    0.0243182f,   -0.935252f,
+  -0.296257f,   -5.83634f,    0.0899249f,    0.455347f,    0.129505f,
+  0.220212f,    0.0214801f,   0.284802f,     -2.94585f,    -0.0805413f,
+  -1.01819f,    0.00534034f,  -0.057203f,    0.0869331f,   0.0207575f,
+  -0.124479f,   -0.0465806f,  0.0894252f,    0.32203f,     0.0858497f,
+  0.25178f,     0.0932205f,   0.0888455f,    0.233153f,    -0.446398f,
+  -0.00791233f, 0.0909603f,   -0.0904397f,   0.131835f,    0.475597f,
+  -0.1236f,     0.0231622f,   0.138602f,     -0.097731f,   -0.0282484f,
+  -0.549095f,   -0.0457428f,  -0.0895407f,   -0.293965f,   0.166872f,
+  0.46719f,     0.236254f,    0.0615991f,    0.499236f,    0.540366f,
+  0.402035f,    0.0606324f,   -0.0499928f,   -0.0155198f,  0.0994403f,
+  -0.14773f,    -0.183433f,   -0.612093f,    -0.334201f,   -0.110877f,
+  -0.143441f,   0.05815f,     -0.318586f,    -0.344235f,   0.199593f,
+  0.51109f,     -0.252281f,   -0.028834f,    0.0615421f,   0.0623699f,
+  0.210745f,    -0.236448f,   0.166279f,     0.127516f,    -0.0971157f,
+  -0.204389f,   0.208112f,    0.0377023f,    0.271837f,    -0.00859528f,
+  0.0797081f,   -0.00582115f, 0.140018f,     -0.384865f,   -0.0853243f,
+  -0.586727f,   -0.0664489f,  -0.631436f,    -0.245828f,   -0.0647894f,
+  -0.171912f,   -0.0801706f,  0.0731614f,    -0.11725f,    0.281478f,
+  -0.03047f,    0.0363488f,   -0.0481651f,   -0.326329f,   -0.0155898f,
+  -0.428316f,   -0.0989367f,  -0.271902f,    -0.00263837f, 0.366168f,
+  0.325989f,    0.165463f,    0.0668512f,    -0.142202f,   0.419992f,
+  0.164971f,    -0.515479f,   -0.187585f,    -0.151783f,   -0.0682468f,
+  0.0910191f,   0.117086f,    0.106579f,     0.0961825f,   0.162148f,
+  -0.129645f,   0.301039f,    0.000320343f,  -0.0558097f,  -0.844295f,
+  -0.218919f,   -5.7571f,     0.0982612f,    0.238955f,    0.0703565f,
+  0.0969388f,   0.107202f,    0.321585f,     -3.00594f,    -0.058755f,
+  -0.620004f,   0.052114f,    0.128423f,     -0.177673f,   -0.00341509f,
+  -0.146756f,   -0.0414309f,  -0.0893262f,   -0.0584779f,  -0.129552f,
+  0.127629f,    0.13275f,     -0.0973342f,   -0.215617f,   0.0724309f,
+  0.0102229f,   0.178137f,    -0.943374f,    -0.171465f,   0.304949f,
+  -0.0963836f,  -0.0346437f,  -0.138667f,    -0.234184f,   0.0344159f,
+  -0.319592f,   -0.0990766f,  -0.16065f,     0.369432f,    0.194911f,
+  0.363348f,    -0.356009f,   -0.00736217f,  0.241788f,    -2.21311f,
+  0.704816f,    0.697019f,    0.129186f,     -0.132799f,   -0.11861f,
+  0.0383451f,   0.0247782f,   -0.12687f,     0.0256552f,   0.048413f,
+  0.00660549f,  0.0457962f,   -0.012819f,    0.115991f,    -0.1117f,
+  -0.291045f,   -0.646138f,   0.0813613f,    0.112063f,    0.191675f,
+  0.120835f,    -0.444267f,   -0.340385f,    0.0391936f,   -0.151132f,
+  0.184419f,    0.124998f,    -0.14089f,     0.214087f,    0.00108535f,
+  0.119611f,    0.0236965f,   0.0715074f,    -0.225997f,   -0.0126552f,
+  -0.459214f,   -0.490444f,   0.173716f,     0.355811f,    -0.13607f,
+  -0.191091f,   -0.530085f,   -0.400666f,    0.011221f,    0.10527f,
+  -0.11498f,    -0.011864f,   0.364376f,     0.0319587f,   -0.0528563f,
+  0.0353899f,   0.0393453f,   -0.289211f,    -0.347785f,   -0.0417157f,
+  0.545848f,    0.741785f,    -0.0732565f,   -1.29687f,    -0.0433128f,
+  -1.44162f,    0.318894f,    -0.377784f,    0.123751f,    -0.00444347f,
+  0.0957118f,   0.0893616f,   0.0911595f,    0.092917f,    0.127681f,
+  -0.159929f,   0.190417f,    -0.0297948f,   -0.00132599f, -0.742756f,
+  -0.0364169f,  -4.00108f,    0.0784767f,    0.223048f,    0.0430138f,
+  0.0180493f,   0.212842f,    0.122987f,     -2.83267f,    -0.0641464f,
+  -0.173247f,   0.100946f,    0.0804885f,    0.0172631f,   0.0877408f,
+  -0.353222f,   0.0108262f,   -0.0452121f,   -0.116127f,   0.268154f,
+  -0.132587f,   -0.27481f,    -0.0316914f,   0.0610525f,   0.439691f,
+  0.00966415f,  -0.78962f,    -0.424823f,    -0.0214365f,  -0.113846f,
+  0.100793f,    0.126482f,    0.0415354f,    0.0427995f,   0.14273f,
+  -0.315674f,   0.110095f,    0.0061568f,    0.0320474f,   -0.3596f,
+  -0.12533f,    -1.28837f,    0.174673f,     -0.235912f,   0.00495439f,
+  0.0695473f,   0.266489f,    0.049248f,     0.0868526f,   -0.0685969f,
+  0.102984f,    0.0924639f,   -0.027535f,    0.0709277f,   0.155776f,
+  -0.190944f,   0.188273f,    -0.00897471f,  0.0964232f,   -0.475822f,
+  -0.209374f,   -5.00252f,    0.103495f,     0.110698f,    0.00682092f,
+  0.208586f,    0.0489575f,   0.0966254f,    -1.42973f,    -0.0645128f,
+  0.0515961f,   0.0571281f,   -0.0992321f,   0.00791648f,  0.0087609f,
+  0.0607367f,   0.0315705f,   0.0183317f,    0.0756087f,   -0.0292847f,
+  -0.212932f,   -0.782259f,   0.0899944f,    0.102677f,    0.0681135f,
+  0.0447764f,   -0.481969f,   -0.221459f,    0.0794475f,   -0.229157f,
+  0.136781f,    0.0832359f,   0.0297807f,    -0.00287225f, -5.97897f,
+  -0.0960581f,  0.250945f,    -0.00133314f,  -0.112396f,   -0.856922f,
+  0.115776f,    0.124536f,    0.0914194f,    -0.160775f,   0.128684f,
+  0.106718f,    0.100665f,    0.139579f,     -0.86141f,    -0.190323f,
+  0.0884896f,   0.0363845f,   -0.19831f,     0.121601f,    0.0264453f,
+  -0.00557822f, 0.0720238f,   -0.0140132f,   -0.166814f,   -0.266214f,
+  0.00500545f,  0.0146905f,   0.126035f,     0.0812372f,   0.0615973f,
+  0.0766063f,   -0.420156f,   -0.126157f,    -0.0284299f,  -0.112513f,
+  -0.567008f,   -0.0100263f,  -0.607567f,    0.193053f,    0.0067527f,
+  -0.0753897f,  0.00134269f,  -0.0512249f,   -0.161661f,   0.0667741f,
+  -0.113702f,   -0.071606f,   -0.300563f,    0.276479f,    -0.155318f,
+  -0.0512306f,  0.0896443f,   -0.987911f,    0.0440889f,   0.430958f,
+  0.175427f,    0.101385f,    0.0303662f,    0.0672653f,   -6.62463f,
+  -0.10475f,    0.228249f,    -0.00482173f,  -0.0608713f,  -0.895836f,
+  0.187976f,    0.162173f,    0.0747544f,    0.219953f,    0.0682489f,
+  0.142665f,    0.100287f,    0.301887f,     -1.97736f,    -0.295001f,
+  -1.0733f,     -0.0562668f,  -0.0604295f,   0.0304073f,   0.194274f,
+  -0.243593f,   0.0727137f,   0.0610967f,    -0.0692415f,  -0.02967f,
+  0.055633f,    0.0192402f,   0.105841f,     0.102236f,    -0.0757102f,
+  -0.0067639f,  0.0102317f,   -0.257959f,    -0.0638652f,  0.45521f,
+  -0.114967f,   0.0921177f,   0.223796f,     0.277072f,    -0.0613282f,
+  -0.564693f,   -0.151333f,   -0.158035f,    0.228491f,    0.12997f,
+  -0.192625f,   -0.125344f,   0.0983258f,    -0.931206f,   0.618715f,
+  0.273759f,    -0.145527f,   -0.099431f,    -0.119551f,   0.0663484f,
+  -0.161419f,   -0.202377f,   -0.545393f,    0.0917645f,   0.042263f,
+  -0.17117f,    -0.178622f,   -0.336977f,    0.866715f,    0.0376922f,
+  -0.319728f,   -0.127406f,   0.0599384f,    0.268804f,    -0.0331844f,
+  0.355326f,    -0.103902f,   0.0425935f,    0.00525512f,  -0.133687f,
+  -0.122695f,   0.145582f,    0.139013f,     -0.0053352f,  0.0313566f,
+  0.327295f,    -0.0117993f,  0.233524f,     0.162388f,    -0.0793262f,
+  0.454543f,    0.0442224f,   -0.742673f,    -0.144882f,   0.0874983f,
+  -0.0707259f,  0.0219869f,   0.201728f,     0.0204537f,   0.0788857f,
+  -0.0374329f,  0.0724169f,   0.0743593f,    -0.0193526f,  -0.313546f,
+  -0.418882f,   -0.0815754f,  -0.197144f,    0.305053f,    0.330196f,
+  -0.131006f,   -0.00113249f, 0.0750458f,    -0.541764f,   0.299935f,
+  0.308516f,    -0.20547f,    -0.333066f,    0.0285833f,   0.191147f,
+  0.160372f,    0.0724649f,   0.0426326f,    0.153046f,    -6.59656f,
+  -0.081237f,   0.219163f,    0.0147081f,    -0.0109837f,  -1.01487f,
+  0.170055f,    0.163386f,    0.106413f,     0.150188f,    0.0688875f,
+  0.0541359f,   0.156307f,    0.178844f,     -1.51054f,    -0.149477f,
+  -0.504503f,   0.017878f,    -0.181821f,    -0.0999659f,  0.0484548f,
+  -0.32211f,    0.0406744f,   0.0017627f,    0.0220593f,   0.0900512f,
+  -0.561625f,   0.107279f,    -0.0861521f,   -0.0862376f,  0.0816765f,
+  0.168072f,    0.150063f,    -0.816825f,    -0.13569f,    0.557555f,
+  -0.155265f,   0.025135f,    -0.109304f,    -0.0487062f,  -0.00347487f,
+  -0.454803f,   -0.0394371f,  -0.214597f,    -0.248898f,   0.286501f,
+  -0.249246f,   -0.138935f,   0.00391409f,   -0.122544f,   -2.14993f,
+  0.588942f,    0.541231f,    0.0154047f,    -0.359742f,   0.0520729f,
+  0.0667058f,   0.0418163f,   -0.132533f,    -0.184759f,   0.0546118f,
+  -0.131198f,   0.109664f,    -0.0714679f,   -0.114163f,   -0.243081f,
+  -0.0405089f,  0.0342795f,   0.0801825f,    -0.268408f,   0.192207f,
+  0.0800494f,   -0.586539f,   -0.118155f,    -0.0508569f,  -0.193987f,
+  0.261478f,    0.105719f,    -0.125361f,    -0.0956201f,  0.0233802f,
+  0.271098f,    0.0113352f,   0.0910447f,    0.00628244f,  -0.071722f,
+  0.21439f,     0.0747191f,   0.207765f,     -0.0782454f,  -0.0151716f,
+  -0.196505f,   -0.44798f,    -0.228597f,    0.0549039f,   -0.120715f,
+  -0.19388f,    -0.0768461f,  0.361102f,     0.122936f,    -0.0334211f,
+  -0.202503f,   -0.0450776f,  -0.272345f,    0.662321f,    0.109247f,
+  -0.218026f,   -0.0669386f,  -0.0864701f,   -0.633421f,   -0.158007f,
+  -1.10778f,    0.351211f,    -0.541458f,    -0.0171707f,  0.149606f,
+  0.106105f,    0.0880349f,   0.0968455f,    0.113269f,    -5.01949f,
+  -0.106404f,   0.175578f,    -0.030045f,    -0.0267249f,  -0.563713f,
+  0.173885f,    0.130772f,    0.0334519f,    0.0770157f,   0.0394389f,
+  -0.0290326f,  0.220003f,    0.180901f,     -1.62203f,    -0.151858f,
+  -0.202386f,   -0.0067836f,  0.0287665f,    -0.194183f,   -0.239834f,
+  -0.484159f,   0.00671722f,  -0.122459f,    0.0808959f,   -0.263769f,
+  -0.015066f,   -0.0429868f,  -0.111255f,    -0.231872f,   0.219659f,
+  -0.0437412f,  -0.536618f,   -0.477831f,    0.0421895f,   -0.0815851f,
+  0.119638f,    0.0786293f,   -0.000668378f, 0.0305567f,   -0.0868189f,
+  -0.178327f,   0.0799657f,   0.0280923f,    -0.211395f,   -0.464577f,
+  0.216912f,    0.0761976f,   0.160288f,     -0.416372f,   -0.10286f,
+  -0.0733786f,  0.261033f,    0.0493698f,    0.143137f,    -0.179979f,
+  0.15655f,     0.0897976f,   -0.0258041f,   -0.152852f,   -6.15512f,
+  -0.118917f,   0.227283f,    -0.0514043f,   -0.0786432f,  -0.523485f,
+  0.1644f,      0.0869001f,   0.0984082f,    -0.428288f,   0.0791992f,
+  0.141904f,    0.0652073f,   0.104429f,     -0.775125f,   -0.121479f,
+  0.0841637f,   0.0135705f,   -0.208863f,    -0.0629523f,  0.0455794f,
+  0.0513898f,   -0.0147657f,  0.0401145f,    0.0660079f,   0.0210609f,
+  -0.0151801f,  0.0562111f,   0.140308f,     -0.0196394f,  0.0230753f,
+  -0.0336115f,  -0.422411f,   -0.196974f,    -0.0405748f,  -0.283428f,
+  0.15458f,     0.0876296f,   0.0314038f,    0.16389f,     -7.01385f,
+  -0.117146f,   0.197273f,    -0.0400688f,   0.0143951f,   -0.964007f,
+  -0.0618919f,  0.0406891f,   0.07992f,      -0.144132f,   0.116416f,
+  0.0326838f,   0.103641f,    0.171805f,     -1.05158f,    -0.182589f,
+  0.116991f,    0.0530774f,   -0.212454f,    -0.016727f,   -0.0565992f,
+  0.0712873f,   0.0445466f,   -0.000107032f, -0.121449f,   -0.15148f,
+  0.0220338f,   0.0762024f,   0.12253f,      0.0622466f,   0.0835822f,
+  0.0465119f,   -0.388743f,   -0.34665f,     -0.0720734f,  -0.101581f,
+  -0.630565f,   -0.0512685f,  -0.520541f,    0.0530119f,   -0.0245276f,
+  -0.19116f,    -0.0144446f,  -0.0604486f,   0.187251f,    -0.021341f,
+  -0.217823f,   0.0510256f,   -0.197946f,    0.060955f,    -0.0617316f,
+  0.0741673f,   0.117591f,    -1.47844f,     -0.0911093f,  0.359225f,
+  0.145027f,    0.127513f,    0.0617905f,    0.141154f,    -7.63868f,
+  -0.0808127f,  0.274843f,    0.00693195f,   -0.0283113f,  -0.853871f,
+  -0.15737f,    0.0858904f,   0.0746279f,    0.109912f,    0.193775f,
+  0.0698094f,   0.174159f,    0.259556f,     -1.49885f,    -0.156706f,
+  -1.04113f,    -0.0329546f,  -0.0491449f,   -0.0304125f,  0.0514892f,
+  -0.244284f,   0.126814f,    -0.0387081f,   -0.153173f,   -0.0566748f,
+  0.294111f,    -0.0170534f,  0.102381f,     0.447606f,    -0.0613267f,
+  -0.0636869f,  -0.0347599f,  -0.259572f,    -0.0657846f,  0.454352f,
+  -0.169453f,   -0.00177987f, 0.133279f,     -0.0863932f,  -0.134423f,
+  -0.475107f,   -0.00448962f, -0.214607f,    0.111413f,    0.194377f,
+  -0.0710837f,  0.0562353f,   0.0401193f,    0.248595f,    0.538374f,
+  0.449469f,    -0.39111f,    0.0125057f,    0.0448811f,   -0.00707751f,
+  -0.164894f,   -0.317516f,   -0.56231f,     -0.270262f,   0.127016f,
+  -0.12092f,    -0.0881587f,  -0.323908f,    0.872344f,    0.103391f,
+  0.267971f,    -0.155088f,   -0.0136683f,   0.309517f,    0.119901f,
+  0.271307f,    -0.188463f,   0.185121f,     -0.142777f,   -0.110535f,
+  -0.163107f,   0.175502f,    0.0801924f,    0.240499f,    0.0874759f,
+  0.308907f,    -0.00222504f, 0.193366f,     0.109018f,    -0.0772158f,
+  -0.520675f,   0.0259432f,   -0.736666f,    -0.296579f,   0.043486f,
+  -0.128932f,   0.0417669f,   0.125747f,     0.157879f,    0.112857f,
+  -0.0595681f,  0.0611936f,   -0.042125f,    -0.270338f,   0.120072f,
+  -0.36675f,    -0.0347962f,  -0.119539f,    0.0873369f,   0.296432f,
+  -0.069501f,   -0.0383859f,  0.0913597f,    -0.40747f,    0.234276f,
+  0.332536f,    -0.732132f,   -0.312291f,    0.137759f,    0.227593f,
+  0.14165f,     0.129068f,    0.102734f,     0.135818f,    -7.35883f,
+  -0.101533f,   0.256027f,    -0.0142278f,   -0.0561601f,  -1.09899f,
+  -0.106538f,   0.0612256f,   0.099487f,     -0.0605983f,  0.134311f,
+  0.052226f,    0.143672f,    0.219944f,     -1.47539f,    -0.101828f,
+  -0.429979f,   0.010478f,    -0.0132605f,   0.103363f,    0.0267373f,
+  -0.338865f,   0.0090188f,   0.0810085f,    -0.124368f,   -0.0133776f,
+  0.595666f,    -0.00162201f, -0.212444f,    -0.26342f,    0.0913656f,
+  -0.106279f,   0.414515f,    -0.709901f,    -0.00198859f, 0.305288f,
+  -0.188536f,   -0.0377482f,  -0.131909f,    -0.116099f,   -0.236827f,
+  -0.36356f,    0.0179455f,   -0.202143f,    -0.00395508f, 0.177363f,
+  0.0630679f,   -0.145173f,   -0.0558639f,   -0.44879f,    -1.55687f,
+  0.473398f,    0.50531f,     -0.0656231f,   -0.137197f,   0.064707f,
+  0.122083f,    0.0321111f,   -0.167096f,    0.0406581f,   -0.0793592f,
+  -0.0777081f,  0.0321379f,   -0.0108834f,   -0.0652323f,  -0.102918f,
+  0.0178664f,   0.0781873f,   0.0613189f,    -0.04177f,    0.159566f,
+  0.15134f,     -0.445996f,   -0.384905f,    0.0951659f,   -0.175046f,
+  0.255746f,    0.177047f,    -0.150632f,    0.200522f,    0.00778549f,
+  0.232168f,    -0.0304652f,  0.083155f,     -0.125395f,   -0.0203289f,
+  -0.23874f,    0.0349836f,   0.231701f,     -0.14849f,    -0.204272f,
+  -0.198309f,   -0.364955f,   -0.228428f,    0.0614142f,   -0.040976f,
+  -0.227785f,   -0.0898404f,  0.271566f,     -0.209196f,   0.0226431f,
+  -0.0911715f,  0.0840369f,   -0.299411f,    -0.529182f,   0.0622292f,
+  0.202475f,    0.0155583f,   -0.083114f,    0.124253f,    -0.22721f,
+  -1.02565f,    0.193961f,    -0.54287f,     -0.00849364f, 0.11124f,
+  0.0993531f,   0.120621f,    0.0959537f,    0.136274f,    -5.23358f,
+  -0.107433f,   0.155286f,    -0.0136043f,   -0.0246768f,  -0.631187f,
+  -0.0493852f,  0.0446751f,   0.0588353f,    0.160766f,    -0.0354385f,
+  -0.0672548f,  0.243743f,    0.186004f,     -1.20199f,    -0.151872f,
+  -0.0760096f,  -0.00775123f, -0.0122227f,   0.0891327f,   -0.377876f,
+  -0.469926f,   -0.134715f,   -0.0969362f,   0.212542f,    0.0871489f,
+  0.164638f,    -0.0485785f,  -0.167754f,    -0.515052f,   0.13821f,
+  0.0515572f,   -0.430691f,   -0.394719f,    0.143947f,    -0.00670816f,
+  0.129623f,    0.140299f,    0.0336978f,    0.153545f,    -0.350927f,
+  -0.213485f,   0.0344809f,   0.0405889f,    0.0749967f,   -0.369352f,
+  -0.109398f,   0.0350649f,   0.190893f,     -0.284106f,   -0.185376f,
+  0.0105842f,   0.263692f,    0.160429f,     0.0998209f,   -0.127779f,
+  0.140558f,    0.108968f,    -0.0122672f,   0.102875f,    -5.72172f,
+  -0.161288f,   0.135935f,    -0.0143087f,   0.106556f,    -0.649813f,
+  -0.123049f,   -0.0108861f,  0.102918f,     -0.298137f,   0.0329013f,
+  0.100763f,    0.12018f,     0.100782f,     -0.648036f,   -0.111122f,
+  0.12363f,     0.0211952f,   -0.225201f,    0.0506021f,   0.0167621f,
+  0.0608759f,   -0.0245646f,  0.0503477f,    -0.0972749f,  -0.0415155f,
+  -0.00578366f, -0.0977591f,  0.124867f,     0.0134788f,   -0.0375816f,
+  -0.00581233f, -0.272292f,   -0.250393f,    0.024511f,    -0.184891f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = {
+  0.182474f,  0.0223202f,  0.204111f, 0.0573683f,  0.111143f,
+  0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f,
+  0.0133081f, 0.119719f,   0.237522f, -0.266705f,  0.129427f,
+  0.0695857f, 0.22068f,    0.231667f, 0.405829f,   -0.0972567f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = {
+  -0.0393876f,  -0.269924f,   -0.0703231f,   -0.0236484f,  0.170478f,
+  0.245566f,    0.175963f,    0.104194f,     -0.0490501f,  -0.157605f,
+  -0.0275165f,  -0.0169499f,  -0.250725f,    0.215203f,    -0.00733655f,
+  0.0111298f,   0.205606f,    0.928046f,     0.15139f,     0.0955483f,
+  -0.015115f,   -0.126643f,   0.0957605f,    -0.140178f,   -0.0246866f,
+  0.097097f,    0.116287f,    0.177746f,     0.0570021f,   -0.0518686f,
+  -0.0446482f,  -0.0125318f,  0.0116092f,    0.102431f,    0.0898519f,
+  0.0870372f,   -0.843274f,   0.383311f,     -0.102761f,   -0.0246494f,
+  0.0312555f,   0.19472f,     0.111573f,     0.0920392f,   -0.0555618f,
+  0.326461f,    0.219357f,    -0.133727f,    -0.118399f,   -0.0611432f,
+  -0.169931f,   0.123733f,    -0.204607f,    0.082592f,    0.0323181f,
+  0.201618f,    -0.00388867f, -0.053583f,    0.0266333f,   -0.0951787f,
+  -0.0358283f,  -0.0649549f,  0.0119263f,    -0.11812f,    0.209851f,
+  -0.036616f,   -0.014911f,   -0.138096f,    -0.139664f,   -0.207395f,
+  0.0128848f,   -0.201816f,   0.0899419f,    0.343308f,    -0.0096243f,
+  -0.212605f,   -0.0905284f,  -0.0597114f,   -0.055261f,   -0.0653405f,
+  0.0330484f,   -0.27681f,    -0.0994095f,   -0.0468272f,  0.145713f,
+  0.267216f,    0.185335f,    0.1798f,       -0.0437882f,  -0.200401f,
+  -0.0398117f,  -0.0736501f,  -0.166349f,    0.203316f,    0.0710647f,
+  0.061825f,    0.281131f,    0.733323f,     0.215488f,    0.00145659f,
+  -0.138995f,   -0.0833713f,  0.107809f,     -0.105343f,   -0.0672139f,
+  0.101852f,    0.135455f,    0.132903f,     0.0312017f,   -0.0643586f,
+  -0.0274546f,  -0.0687466f,  -0.020233f,    0.109444f,    0.0774587f,
+  0.139497f,    -0.800587f,   0.325783f,     -0.0546695f,  -0.092003f,
+  -0.0773301f,  0.189672f,    0.0604666f,    0.0939425f,   0.679495f,
+  0.114789f,    -0.161153f,   0.12843f,      -0.0345385f,  -0.134641f,
+  -0.153995f,   0.0823055f,   -0.0349296f,   0.0299183f,   -0.0606872f,
+  0.137588f,    0.0449805f,   -0.0555399f,   -0.00553351f, -0.120719f,
+  -0.204701f,   -0.0739813f,  0.0584115f,    -0.104833f,   -0.110989f,
+  0.00845446f,  0.0630702f,   -0.147861f,    0.0268545f,   -0.216419f,
+  0.00531986f,  -0.206641f,   0.253082f,     0.413215f,    -0.05909f,
+  -0.0939983f,  -0.116818f,   -0.0450892f,   -0.0551134f,  -0.00696931f,
+  -0.113003f,   -0.289192f,   -0.00884866f,  -0.0365724f,  0.0401887f,
+  0.238622f,    0.149151f,    0.175751f,     -0.157425f,   -0.138924f,
+  -0.0277598f,  -0.0285915f,  0.10165f,      0.209532f,    0.0862249f,
+  0.0256428f,   0.623204f,    -0.0941196f,   0.20345f,     -0.132869f,
+  0.00947298f,  -0.14753f,    0.103918f,     -0.161799f,   0.125566f,
+  0.10916f,     0.115446f,    0.135627f,     -0.0181667f,  -0.0734694f,
+  -0.0154729f,  -0.085849f,   -0.000427605f, 0.113614f,    0.0776308f,
+  0.111899f,    -0.214917f,   0.393234f,     -0.132223f,   0.020783f,
+  -0.074902f,   0.217477f,    0.107883f,     0.109466f,    0.146609f,
+  0.317061f,    0.074379f,    -0.0505457f,   -0.0503772f,  -0.0678954f,
+  -0.220003f,   0.114878f,    0.176014f,     -0.00657996f, -0.0875497f,
+  0.065582f,    0.00238612f,  -0.063395f,    0.0295323f,   -0.127126f,
+  0.099813f,    -0.115452f,   0.0106309f,    -0.179632f,   -0.0436553f,
+  0.0120295f,   0.0652713f,   -0.131512f,    -0.081714f,   -0.205363f,
+  -0.0374944f,  -0.196707f,   0.680568f,     -0.00991824f, -0.0212223f,
+  -0.186258f,   -0.432361f,   -0.0291303f,   -0.0475983f,  -0.071383f,
+  -0.0116416f,  -0.28257f,    -0.0635272f,   -0.0576546f,  -0.280129f,
+  0.286528f,    0.199997f,    0.192851f,     0.323829f,    -0.185006f,
+  -0.04791f,    -0.0882187f,  -0.0496895f,   0.293135f,    0.125539f,
+  0.0341828f,   0.993452f,    0.0369177f,    0.0453796f,   0.0329807f,
+  0.157673f,    -0.153195f,   0.122383f,     -0.161983f,   -0.317619f,
+  0.105129f,    0.155673f,    0.152489f,     0.0685417f,   -0.0595907f,
+  -0.026657f,   -0.0954336f,  -0.0359557f,   0.105617f,    0.0825066f,
+  0.100189f,    -0.22125f,    0.382508f,     -0.0247677f,  -0.115807f,
+  -0.0639787f,  0.177786f,    0.0566206f,    0.0496389f,   1.31533f,
+  0.0482907f,   -0.118743f,   0.190632f,     0.172867f,    -0.108446f,
+  -0.200186f,   0.122572f,    0.0897468f,    0.0155328f,   -0.0380217f,
+  0.125161f,    -0.141723f,   -0.023157f,    0.0270805f,   -0.101961f,
+  0.12358f,     -0.0866255f,  0.00306761f,   -0.131764f,   -0.461118f,
+  -0.00803936f, 0.0895496f,   -0.153905f,    0.207623f,    -0.249099f,
+  -0.0198487f,  -0.160013f,   0.81136f,      -0.109978f,   -0.0880332f,
+  -0.0761368f,  -0.0755881f,  -0.0384827f,   -0.0554777f,  -0.0750048f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = {
+  0.0106809f, 0.136699f, 0.285316f, 0.395746f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = {
+  -0.0161019f,  -0.088871f,  0.0463358f,   -0.198037f,   0.038122f,
+  0.0135483f,   -0.196641f,  -0.433531f,   0.527972f,    -0.143716f,
+  0.558627f,    0.459889f,   0.322864f,    -0.491514f,   -0.190915f,
+  -0.0765601f,  0.210329f,   0.689389f,    -0.100415f,   -1.8788f,
+  0.2228f,      0.292781f,   -0.954838f,   -0.0788763f,  -0.131402f,
+  -0.17154f,    0.049934f,   -0.0541183f,  -0.530529f,   -0.666165f,
+  0.195492f,    0.218548f,   -0.314895f,   0.0749444f,   -0.191344f,
+  0.349469f,    0.00811248f, -0.760157f,   0.0707434f,   -0.0719285f,
+  -0.264495f,   -0.432009f,  -0.432686f,   0.155738f,    -0.020197f,
+  0.19278f,     -0.658335f,  -0.273143f,   -0.286079f,   0.243402f,
+  0.497701f,    0.0121003f,  -0.666308f,   0.028172f,    -0.547901f,
+  -0.11755f,    0.322028f,   0.0878274f,   -0.0328334f,  0.311816f,
+  0.0951026f,   -1.11429f,   -0.0417486f,  0.123467f,    -0.0910681f,
+  -0.0154255f,  0.311201f,   -0.0156158f,  -0.600437f,   0.0274156f,
+  -0.174907f,   -1.29313f,   -0.178656f,   0.596556f,    -0.421725f,
+  -0.289137f,   0.529297f,   0.114833f,    -0.0155887f,  -0.308232f,
+  -0.0228361f,  0.184017f,   0.138232f,    0.146347f,    -0.117867f,
+  0.248351f,    -0.282846f,  -0.18058f,    0.348355f,    -0.415754f,
+  0.0657168f,   0.431728f,   -0.231043f,   -0.186745f,   0.137401f,
+  -0.282329f,   -0.159678f,  0.754262f,    0.037824f,    -1.68521f,
+  -0.290175f,   0.289588f,   -0.18683f,    -0.300385f,   0.285449f,
+  -0.00386456f, 0.0563485f,  -0.376541f,   0.159899f,    -0.697312f,
+  0.0284389f,   0.437307f,   0.3968f,      -0.372082f,   -0.232535f,
+  0.394629f,    0.00315248f, -0.38374f,    0.0311291f,   -0.624353f,
+  0.498083f,    -0.342663f,  -0.125978f,   0.186797f,    0.187723f,
+  0.149335f,    -0.82727f,   -0.0740974f,  -0.659039f,   0.42671f,
+  -0.448835f,   0.150677f,   0.830742f,    -0.233148f,   -0.65308f,
+  -0.0878935f,  -0.407797f,  -0.511826f,   -0.0739023f,  0.506305f,
+  -0.187451f,   0.0284968f,  -0.822238f,   0.362523f,    -0.270865f,
+  0.032335f,    0.560413f,   -0.00388247f, -0.446333f,   0.163147f,
+  -0.409633f,   -0.372575f,  0.306993f,    0.55953f,     -0.24362f,
+  -0.0929369f,  -0.520298f,  -0.444022f,   0.186077f,    -0.0942208f,
+  0.624049f,    -0.429625f,  -0.869528f,   0.405257f,    -0.120445f,
+  0.537685f,    -0.3911f,    0.142142f,    0.0913808f,   -0.00375967f,
+  0.382781f,    0.60505f,    -0.271608f,   -0.0630436f,  -0.150625f,
+  -0.0124598f,  0.0132878f,  0.138475f,    -0.106264f,   -0.416581f,
+  -0.518415f,   0.185127f,   -0.464622f,   -0.0102925f,  0.0389567f,
+  0.406439f,    -0.0414264f, -0.366185f,   -0.511867f,   -0.650255f,
+  0.278252f,    0.0270234f,  0.262788f,    -0.0294793f,  0.12651f,
+  0.421537f,    0.0300837f,  0.0742187f,   0.281954f,    -0.122069f,
+  -0.450145f,   -0.312206f,  -0.402633f,   -0.0868137f,  0.190433f,
+  -0.149602f,   -0.175029f,  0.00900023f,  -0.266596f,   0.21721f,
+  -0.245079f,   -1.09798f,   0.319409f,    -0.337938f,   0.358514f,
+  0.0771549f,   0.447087f,   -0.305507f,   -0.285492f,   0.383896f,
+  0.145933f,    -0.264944f,  -0.118486f,   0.068805f,    -0.194231f,
+  -1.79133f,    0.363408f,   -0.17434f,    -0.229629f,   0.132188f,
+  0.207548f,    -0.876264f,  0.265634f,    0.139332f,    0.236206f,
+  -0.0145184f,  0.562865f,   0.526612f,    -0.0333508f,  -0.421885f,
+  0.273485f,    -0.110882f,  0.425557f,    0.513303f,    -0.422322f,
+  0.0563155f,   -0.0409693f, 0.194768f,    -0.419828f,   -0.107195f,
+  -1.19224f,    0.48552f,    0.132782f,    -0.00932096f, -0.225484f,
+  -0.428484f,   -0.0392684f, 0.750697f,    0.337615f,    0.158476f,
+  0.413484f,    0.326017f,   -0.757107f,   -0.183962f,   0.00884361f,
+  0.126507f,    -0.0751588f, -0.308782f,   -0.104237f,   -0.703877f,
+  -0.491806f,   -0.204251f,  -0.317212f,   0.0815479f,   0.296323f,
+  0.219632f,    -0.039859f,  0.556257f,    0.176144f,    -0.0750654f,
+  -0.106419f,   0.00400385f, -0.172266f,   0.000178763f, 0.146532f,
+  0.255202f,    -0.427235f,  -0.182198f,   -0.256557f,   0.260255f,
+  -0.0143364f,  0.0868664f,  -0.564373f,   -0.0876947f,  0.726289f,
+  0.0160001f,   -0.381562f,  -0.638214f,   -0.803803f,   0.25945f,
+  -0.371542f,   -0.419611f,  0.238617f,    0.371834f,    -0.226777f,
+  -0.894602f,   0.37458f,    -0.354866f,   0.0249312f,   0.142374f,
+  0.433813f,    -0.0218183f, -0.33248f,    0.107223f,    0.390823f,
+  -0.0271108f,  -0.616878f,  -0.604984f,   0.517269f,    -0.293573f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = {
+  -0.290371f, -0.0560272f,  -0.118144f,  -0.270583f,  0.401388f,
+  -0.308677f, 0.150729f,    -0.0324442f, -0.135937f,  0.0875581f,
+  0.0206493f, -0.212682f,   -0.0266535f, -0.326656f,  0.0185105f,
+  -1.01429f,  -0.00315052f, -0.0273938f, -0.0263379f, -0.171702f
+};
+
+static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = {
+  NUM_CNN_LAYERS,  // num_layers
+  0,               // is_residue
+  0,               // ext_width
+  0,               // ext_height
+  0,               // strict_bounds
+  {
+      {
+          CNN_LAYER_0_IN_CH,                                // in_channels
+          CNN_LAYER_0_WIDTH,                                // filter_width
+          CNN_LAYER_0_WIDTH,                                // filter_height
+          CNN_LAYER_0_OUT_CH,                               // out_channels
+          CNN_LAYER_0_HORZ_STRIDE,                          // skip_width
+          CNN_LAYER_0_VERT_STRIDE,                          // skip_height
+          0,                                                // maxpool
+          av1_intra_mode_cnn_partition_cnn_layer_0_kernel,  // weights
+          av1_intra_mode_cnn_partition_cnn_layer_0_bias,    // bias
+          PADDING_VALID,                                    // pad
+          RELU,                                             // activation
+          0,                                                // deconvolve
+          0,                                                // branch
+          BRANCH_NO_COPY,                                   // branch_copy_type
+          BRANCH_NOC,        // branch_combine_type
+          NO_BRANCH_CONFIG,  // branch_config
+          NO_BN_PARAMS,      // bn_params
+          -1,                // output_num
+      },
+      {
+          CNN_LAYER_1_IN_CH,                                // in_channels
+          CNN_LAYER_1_WIDTH,                                // filter_width
+          CNN_LAYER_1_WIDTH,                                // filter_height
+          CNN_LAYER_1_OUT_CH,                               // out_channels
+          CNN_LAYER_1_HORZ_STRIDE,                          // skip_width
+          CNN_LAYER_1_VERT_STRIDE,                          // skip_height
+          0,                                                // maxpool
+          av1_intra_mode_cnn_partition_cnn_layer_1_kernel,  // weights
+          av1_intra_mode_cnn_partition_cnn_layer_1_bias,    // bias
+          PADDING_VALID,                                    // pad
+          RELU,                                             // activation
+          0,                                                // deconvolve
+          0,                                                // branch
+          BRANCH_NO_COPY,                                   // branch_copy_type
+          BRANCH_NOC,        // branch_combine_type
+          NO_BRANCH_CONFIG,  // branch_config
+          NO_BN_PARAMS,      // bn_params
+          3,                 // output_num
+      },
+      {
+          CNN_LAYER_2_IN_CH,                                // in_channels
+          CNN_LAYER_2_WIDTH,                                // filter_width
+          CNN_LAYER_2_WIDTH,                                // filter_height
+          CNN_LAYER_2_OUT_CH,                               // out_channels
+          CNN_LAYER_2_HORZ_STRIDE,                          // skip_width
+          CNN_LAYER_2_VERT_STRIDE,                          // skip_height
+          0,                                                // maxpool
+          av1_intra_mode_cnn_partition_cnn_layer_2_kernel,  // weights
+          av1_intra_mode_cnn_partition_cnn_layer_2_bias,    // bias
+          PADDING_VALID,                                    // pad
+          RELU,                                             // activation
+          0,                                                // deconvolve
+          0,                                                // branch
+          BRANCH_NO_COPY,                                   // branch_copy_type
+          BRANCH_NOC,        // branch_combine_type
+          NO_BRANCH_CONFIG,  // branch_config
+          NO_BN_PARAMS,      // bn_params
+          2,                 // output_num
+      },
+      {
+          CNN_LAYER_3_IN_CH,                                // in_channels
+          CNN_LAYER_3_WIDTH,                                // filter_width
+          CNN_LAYER_3_WIDTH,                                // filter_height
+          CNN_LAYER_3_OUT_CH,                               // out_channels
+          CNN_LAYER_3_HORZ_STRIDE,                          // skip_width
+          CNN_LAYER_3_VERT_STRIDE,                          // skip_height
+          0,                                                // maxpool
+          av1_intra_mode_cnn_partition_cnn_layer_3_kernel,  // weights
+          av1_intra_mode_cnn_partition_cnn_layer_3_bias,    // bias
+          PADDING_VALID,                                    // pad
+          RELU,                                             // activation
+          0,                                                // deconvolve
+          0,                                                // branch
+          BRANCH_NO_COPY,                                   // branch_copy_type
+          BRANCH_NOC,        // branch_combine_type
+          NO_BRANCH_CONFIG,  // branch_config
+          NO_BN_PARAMS,      // bn_params
+          1,                 // output_num
+      },
+      {
+          CNN_LAYER_4_IN_CH,                                // in_channels
+          CNN_LAYER_4_WIDTH,                                // filter_width
+          CNN_LAYER_4_WIDTH,                                // filter_height
+          CNN_LAYER_4_OUT_CH,                               // out_channels
+          CNN_LAYER_4_HORZ_STRIDE,                          // skip_width
+          CNN_LAYER_4_VERT_STRIDE,                          // skip_height
+          0,                                                // maxpool
+          av1_intra_mode_cnn_partition_cnn_layer_4_kernel,  // weights
+          av1_intra_mode_cnn_partition_cnn_layer_4_bias,    // bias
+          PADDING_VALID,                                    // pad
+          RELU,                                             // activation
+          0,                                                // deconvolve
+          0,                                                // branch
+          BRANCH_NO_COPY,                                   // branch_copy_type
+          BRANCH_NOC,        // branch_combine_type
+          NO_BRANCH_CONFIG,  // branch_config
+          NO_BN_PARAMS,      // bn_params
+          0,                 // output_num
+      },
+  },
+};
+
+static const float
+    av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = {
+      0.604356f,    -0.236007f,   0.342172f,   0.531397f,     -0.635698f,
+      -0.591573f,   0.833872f,    0.492814f,   -0.100308f,    0.186385f,
+      0.202779f,    0.263578f,    0.330001f,   -0.15531f,     0.879584f,
+      -0.0048796f,  0.490796f,    0.242254f,   -0.292211f,    -0.696912f,
+      0.746664f,    0.129371f,    -0.0122443f, 0.196234f,     -0.251605f,
+      -0.385617f,   0.157707f,    0.699963f,   0.0432536f,    -0.11141f,
+      -0.0353473f,  -0.0364045f,  -0.113556f,  -0.520842f,    0.231248f,
+      0.230638f,    -0.323852f,   -1.08633f,   -0.0469168f,   -0.481821f,
+      0.366838f,    0.189627f,    -0.0637262f, -0.484917f,    -0.109874f,
+      0.292237f,    0.368702f,    -0.183896f,  -0.109038f,    -1.22613f,
+      -0.880355f,   -1.63768f,    0.337426f,   -0.940994f,    0.413097f,
+      -0.37879f,    -0.480525f,   -0.594819f,  -0.0172653f,   -0.499436f,
+      -0.298395f,   -0.840181f,   -0.0758645f, -0.772089f,    -0.232727f,
+      -0.815968f,   0.160785f,    -0.0767165f, 0.0064244f,    -0.540491f,
+      0.417776f,    -0.384337f,   -0.497377f,  0.68414f,      0.00797514f,
+      0.262626f,    0.203732f,    0.702047f,   0.0617544f,    0.0878249f,
+      -0.315032f,   -0.0169776f,  0.403986f,   0.815872f,     0.135388f,
+      0.0858594f,   0.169172f,    -0.638227f,  -1.65268f,     -0.0476042f,
+      -0.982685f,   0.45707f,     -0.0577537f, 0.367329f,     0.176513f,
+      -0.356454f,   0.0979095f,   -0.277476f,  0.257271f,     -0.333451f,
+      0.0241497f,   0.0671127f,   0.221216f,   0.106065f,     0.537151f,
+      0.0257329f,   0.265559f,    -0.348353f,  0.285569f,     -0.0610511f,
+      -1.59334f,    -1.63826f,    -0.164898f,  -0.36605f,     -0.489304f,
+      0.729241f,    0.0197627f,   0.200291f,   -0.231506f,    -0.255715f,
+      -0.0932264f,  -0.728793f,   0.468297f,   -1.09592f,     -0.079791f,
+      -1.76531f,    -0.182904f,   -2.05897f,   -0.371894f,    0.207124f,
+      0.255029f,    0.186501f,    -0.005805f,  0.00160733f,   -0.178206f,
+      -0.352757f,   -0.164741f,   -0.557583f,  -0.559692f,    -0.00731467f,
+      0.149326f,    0.409735f,    0.22083f,    -0.332572f,    -0.1741f,
+      -0.0519008f,  -0.266402f,   0.294031f,   -2.4453f,      0.339851f,
+      -0.573747f,   -5.97783f,    -0.084142f,  0.20286f,      -0.576038f,
+      -0.111081f,   0.101238f,    -5.83427f,   -1.98537f,     0.322796f,
+      -0.60171f,    0.212412f,    0.247176f,   0.603694f,     -0.54357f,
+      -0.693439f,   0.250725f,    -4.31988f,   0.0935924f,    0.43669f,
+      -0.139706f,   -0.158391f,   0.244309f,   0.619213f,     -0.309154f,
+      -0.135341f,   0.475815f,    -0.290804f,  -0.109038f,    -0.0937104f,
+      0.0385907f,   -0.29105f,    -0.0597651f, -0.451187f,    -1.51821f,
+      0.141772f,    0.822204f,    -0.729661f,  -0.109908f,    0.178217f,
+      -0.750278f,   0.113762f,    -0.0959985f, 0.066579f,     -0.104209f,
+      -0.951378f,   1.4087f,      -1.13175f,   -1.09103f,     -1.50416f,
+      -0.182273f,   -1.80129f,    -0.152135f,  0.356931f,     0.205591f,
+      0.183148f,    -0.498671f,   -0.183034f,  -0.176428f,    0.395706f,
+      -0.589908f,   -0.318276f,   -0.421162f,  0.658766f,     -0.186752f,
+      0.0656253f,   0.248002f,    0.289618f,   -0.458111f,    -0.130789f,
+      -0.542988f,   0.405804f,    -0.35364f,   -0.311927f,    0.218339f,
+      0.309215f,    -0.130347f,   -0.0257543f, 0.0413234f,    -0.190205f,
+      -0.242382f,   0.819886f,    -0.255157f,  -0.181219f,    -0.290903f,
+      -0.301995f,   -0.0469988f,  0.702936f,   0.209122f,     0.0234243f,
+      0.598637f,    0.0305196f,   0.0423457f,  -0.618799f,    0.0190867f,
+      0.420584f,    -0.224752f,   -0.410077f,  0.127854f,     0.395261f,
+      -0.393685f,   -0.282822f,   0.0289504f,  0.0406515f,    -0.511531f,
+      -0.497611f,   0.0252715f,   0.0812549f,  0.80205f,      1.29084f,
+      0.764972f,    0.561258f,    -0.23499f,   0.217594f,     -0.690935f,
+      -0.26607f,    0.357955f,    0.391608f,   0.448352f,     0.458586f,
+      -0.790071f,   0.719959f,    -0.468052f,  1.24579f,      0.220705f,
+      0.284044f,    0.141346f,    0.246687f,   0.147826f,     -0.403557f,
+      -0.00648195f, 0.398034f,    -0.100464f,  -0.77107f,     -0.188274f,
+      -0.219245f,   -0.0330375f,  0.367585f,   -0.220391f,    0.308736f,
+      0.221399f,    0.340292f,    0.037597f,   0.606083f,     0.665634f,
+      -0.755529f,   -0.95989f,    -0.243673f,  0.233709f,     -0.454628f,
+      -0.110952f,   0.776062f,    0.731136f,   -0.140422f,    0.19261f,
+      0.355086f,    0.975026f,    0.190936f,   0.776205f,     0.982781f,
+      0.555569f,    0.42382f,     -0.409721f,  0.25053f,      -0.271328f,
+      0.859941f,    -0.0210901f,  0.0176916f,  -0.562895f,    -0.0787431f,
+      -0.861032f,   -0.34022f,    -0.571995f,  0.205436f,     0.346968f,
+      0.377033f,    -1.08484f,    0.297007f,   -1.01693f,     0.189463f,
+      -0.483242f,   0.147058f,    0.0159503f,  0.0908779f,    -0.46962f,
+      0.174024f,    -0.490704f,   -0.383501f,  -0.0507626f,   0.00902188f,
+      -0.202495f,   0.205047f,    0.0562261f,  -0.143371f,    0.219524f,
+      -0.317294f,   -0.0575756f,  -0.0595825f, -0.000625279f, -0.278864f,
+      -0.0516874f,  -0.225259f,   0.429046f,   -0.0952421f,   0.0799135f,
+      -0.122883f,   -0.262308f,   -0.481006f,  -0.0466122f,   -0.402822f,
+      0.150595f,    -0.0919558f,  -0.356765f,  -0.199222f,    0.219389f,
+      -0.214452f,   -0.196361f,   -0.095758f,  -0.115891f,    -0.143777f,
+      0.549843f,    -0.113036f,   0.764895f,   -0.0114812f,   -0.0684054f,
+      -0.98045f,    -0.0170634f,  0.247719f,   -0.18718f,     -0.381566f,
+      0.150758f,    -0.526257f,   1.00851f,    0.776634f,     1.69728f,
+      -0.303058f,   0.228967f,    -0.414134f,  0.0858226f,    -0.285472f,
+      0.431459f,    0.315318f,    0.587835f,   0.335737f,     -0.0222039f,
+      0.18945f,     0.274008f,    0.609263f,   0.320232f,     -0.214137f,
+      -0.0297668f,  0.0439046f,   -0.52821f,   -0.0127375f,   0.431885f,
+      0.508846f,    -0.329189f,   -0.166778f,  -0.94338f,     -0.358807f,
+      0.208641f,    -0.517986f,   -0.128278f,  0.693464f,     -0.24408f,
+      -0.0669412f,  -0.410287f,   0.0444145f,  -0.264179f,    0.143884f,
+      0.276842f,    0.498934f,    -0.682557f,  -0.217198f,    -0.8249f,
+      -0.40446f,    -0.115376f,   0.417934f,   0.65605f,      -0.00570035f,
+      -0.365742f,   -0.367625f,   0.526824f,   -0.0164913f,   -0.255998f,
+      0.247292f,    0.0846536f,   0.109302f,   -0.302996f,    0.160564f,
+      0.0228132f,   0.035211f,    -0.236951f,  0.493801f,     1.37315f,
+      -0.182348f,   0.234437f,    -0.256906f,  0.12523f,      0.667113f,
+      -0.437981f,   -0.0721831f,  0.303976f,   -0.041336f,    -0.145894f,
+      -0.733741f,   0.436056f,    0.368542f,   -0.149072f,    -0.290281f,
+      0.0946743f,   -0.0579292f,  0.264539f,   0.170048f,     0.262411f,
+      0.049679f,    0.371369f,    0.760675f,   0.482157f,     -0.0196783f,
+      0.260888f,    0.948856f,    0.170228f,   -0.134432f,    -0.942235f,
+      -1.23226f,    -0.373963f,   -0.0381773f, -0.17947f,     0.00947998f,
+      0.01086f,     0.389578f,    -0.380389f,  -0.0865851f,   -0.220328f,
+      -0.171901f,   -0.384325f,   -0.0787615f, 0.392678f,     0.123392f,
+      -0.0895824f,  0.00480886f,  -0.162918f,  0.214336f,     -0.00147339f,
+      0.203899f,    -0.00292344f, -0.148594f,  0.0425697f,    -0.306896f,
+      -0.342225f,   -0.45088f,    -0.184454f,  -0.00923638f,  -0.521993f,
+      -0.334464f,   0.156497f,    -0.0856832f, -0.277661f,    -0.0721105f,
+      -0.488781f,   -0.509543f,   -0.012664f,  0.0940558f,    -0.29869f,
+      0.0434843f,   -0.0178945f,  -0.0525666f, -0.303178f,    0.713507f,
+      -0.137413f,   -0.170289f,   -0.142942f,  -0.316002f,    0.229125f,
+      -0.277585f,   0.0125026f,   0.508316f,   -1.20614f,     -0.915129f,
+      -1.63389f,    -0.454604f,   -0.893951f,  -0.447403f,    -0.751423f,
+      1.3886f,      0.617818f,    0.611458f,   -0.884173f,    -0.7779f,
+      -0.608639f,   -0.164759f,   -0.631846f,  -0.176894f,    -0.459361f,
+      -0.187119f,   0.173283f,    -0.477191f,  -0.156736f,    0.182675f,
+      0.598854f,    -0.489941f,   -0.420493f,  -0.162002f,    0.344418f,
+      0.33832f,     -0.187463f,   -0.388721f,  -0.0733151f,   -0.138835f,
+      0.313699f,    0.0625967f,   -0.291488f,  0.114088f,     -0.356843f,
+      0.197506f,    0.0320749f,   1.16745f,    -0.36081f,     1.63416f,
+      0.198392f,    1.13928f,     -0.317971f,  0.531019f,     0.526518f,
+      0.185814f,    0.0923607f,   0.192858f,   -0.234378f,    0.18091f,
+      -0.228837f,   0.397216f,    0.581501f,   0.284376f,     -0.130434f,
+      0.20076f,     0.242662f,    -0.0480872f, 0.131746f,     0.362712f,
+      0.0146821f,   0.475679f
+    };
+
+static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = {
+  0.477356f,   0.385222f,  0.389122f, 0.539506f,   -0.0272558f, 0.581605f,
+  -0.800961f,  0.142229f,  0.117549f, -0.0724944f, 0.102095f,   -0.71319f,
+  -0.0162434f, -0.132858f, 0.543411f, -0.626599f
+};
+
+static const float
+    av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = {
+      0.195436f,   -0.623354f,  1.27907f,    0.270071f,   -0.677612f,
+      0.0266141f,  0.272991f,   -0.425446f,  0.891889f,   -0.299836f,
+      -0.611825f,  -0.0322273f, 0.185276f,   0.238639f,   -0.150954f,
+      0.083495f,   -0.472106f,  0.573506f,   1.16465f,    -0.154947f,
+      0.640631f,   -1.59467f,   -9.8166f,    -0.22889f,   -0.189912f,
+      0.227052f,   -0.540787f,  0.0840873f,  -3.04293f,   -0.0209975f,
+      -6.10979f,   -5.92801f,   0.288467f,   -0.169476f,  0.0527948f,
+      -1.21202f,   -0.280915f,  0.290863f,   -0.601877f,  0.0598784f,
+      -0.592136f,  -0.535588f,  -0.0434018f, -0.653223f,  0.00339129f,
+      -0.133273f,  0.279463f,   0.483879f,   0.463664f,   -0.14174f,
+      -1.56354f,   0.560043f,   -1.44639f,   0.673528f,   -0.108418f,
+      -0.707313f,  0.49633f,    -0.0321971f, 0.411475f,   -0.382184f,
+      -0.965501f,  -0.0507655f, 0.540415f,   -0.977297f,  0.370382f,
+      -0.375683f,  0.0844529f,  -2.0002f,    -0.346289f,  0.621251f,
+      -0.489855f,  0.191252f,   -0.576629f,  -0.35773f,   0.023167f,
+      0.180793f,   -0.417864f,  0.0587254f,  0.167824f,   0.0612058f,
+      -0.712108f,  0.155614f,   0.900036f,   -0.480124f,  0.146117f,
+      0.467011f,   0.412525f,   0.312724f,   0.551826f,   -0.179601f,
+      0.706261f,   0.00674965f, -0.495221f,  0.140829f,   -0.0619195f,
+      -0.0697912f, 0.511967f,   -0.0318237f, -0.285946f,  -0.28608f,
+      0.0894142f,  0.234351f,   -0.272328f,  -0.350369f,  -0.392605f,
+      0.287318f,   0.310426f,   0.293524f,   0.357681f,   -0.157868f,
+      0.149652f,   -0.259363f,  0.192941f,   -0.850096f,  0.456507f,
+      0.387857f,   -0.491187f,  -0.0541993f, -0.28118f,   0.193991f,
+      -0.0956664f, 0.0679829f,  0.0341118f,  0.141826f,   0.271538f,
+      -0.285295f,  -0.68666f,   0.306414f,   0.600678f,   0.494801f,
+      -1.11907f,   0.524849f,   0.151169f,   0.474068f,   -0.43441f,
+      -0.229138f,  0.0345483f,  0.682888f,   -0.471534f,  -0.0457066f,
+      -2.36721f,   0.446407f,   0.20396f,    -1.17868f,   0.815363f,
+      -1.13897f,   0.397217f,   -0.593796f,  -6.95512f,   0.650695f,
+      0.771657f,   0.15227f,    -0.824519f,  0.617854f,   -0.295353f,
+      -0.101207f,  0.600989f,   -0.550653f,  -0.722371f,  0.292006f,
+      -0.451891f,  0.54544f,    0.354278f,   0.0136258f,  0.192003f,
+      0.258275f,   -0.0443647f, 0.0928186f,  0.667775f,   0.239558f,
+      0.0523887f,  0.71586f,    0.292563f,   0.362479f,   0.373453f,
+      0.250638f,   -0.423037f,  -0.486574f,  -0.619397f,  0.343888f,
+      0.974971f,   0.574218f,   0.273989f,   -0.209956f,  -0.274333f,
+      0.0553766f,  0.263918f,   0.733824f,   0.038713f,   -0.0788992f,
+      0.292014f,   0.111808f,   -0.197507f,  0.593668f,   -0.0245337f,
+      0.0873662f,  0.530997f,   0.620717f,   0.310697f,   -1.54861f,
+      1.12915f,    0.0991346f,  -0.59214f,   0.422325f,   -0.0157936f,
+      0.380975f,   0.626403f,   0.268064f,   -0.615231f,  -1.43172f,
+      0.0928048f,  0.0949026f,  -0.470912f,  -0.0867527f, -0.0381206f,
+      0.178393f,   -1.13737f,   0.12798f,    0.258214f,   -0.803364f,
+      0.177506f,   0.542718f,   0.660656f,   0.145091f,   0.183056f,
+      -0.47338f,   0.469287f,   0.10832f,    0.0994899f,  -0.402719f,
+      0.157287f,   0.523071f,   -0.324493f,  0.343599f,   0.664839f,
+      -0.0375519f, -0.279238f,  -0.0722333f, 0.395344f,   -0.289316f,
+      0.0259298f,  -0.843245f,  -0.160021f,  0.741429f,   -1.38726f,
+      -0.2969f,    -0.240443f,  0.247731f,   -1.04088f,   -0.280454f,
+      -0.237054f,  -0.759227f,  0.0456369f,  -0.647453f,  -1.02372f,
+      -0.200395f,  -0.546839f,  -0.104226f,  -0.152727f,  -0.56685f,
+      -0.0559663f, -0.425494f,  -0.610679f,  -0.987096f,  -0.575138f,
+      -0.0887979f, 0.463646f,   -1.041f,     -0.49412f,   -0.175298f,
+      -0.463296f,  -0.955177f,  0.17852f,    -1.10694f,   0.181991f,
+      -0.18998f,   0.227818f,   0.688237f,   -1.10444f,   0.549108f,
+      -0.171849f,  -0.245614f,  0.120624f,   1.29571f,    0.607116f,
+      0.00809927f, 0.1041f,     -1.22918f,   -0.212948f,  0.430239f,
+      -1.57341f,   0.482054f,   0.275905f,   0.939785f,   -1.0209f,
+      -0.355534f,  0.397337f,   -0.0593077f, -0.239603f,  0.475483f,
+      -0.999101f,  -0.140578f,  1.04787f,    -0.591981f,  -0.306989f,
+      -0.879012f,  -0.994715f,  0.0343158f,  0.218509f,   0.34704f,
+      0.0672934f,  -0.178941f,  0.20509f,    -0.360031f,  0.161241f,
+      -0.324775f,  -0.359531f,  -0.0657085f, -0.864422f,  -0.444865f,
+      0.597095f,   -0.948691f,  0.240001f,   -0.783159f,  -0.569422f,
+      0.974205f,   -1.04539f,   0.345915f,   -0.681558f,  -0.246047f,
+      0.256174f,   0.493667f,   0.681324f,   0.155613f,   0.773309f,
+      -0.647027f,  -0.214744f,  -0.474202f,  -0.661092f,  -1.02316f,
+      0.0572593f,  -0.437082f,  -0.119874f,  -0.464877f,  -0.58067f,
+      -0.218029f,  0.319516f,   -0.378983f,  -0.0698695f, 0.554693f,
+      -0.537875f,  0.126429f,   -0.145113f,  -0.594312f,  -0.218021f,
+      -0.703569f,  0.0720548f,  0.261054f,   -0.81438f,   0.249921f,
+      0.165296f,   -0.079028f,  -0.322647f,  0.134458f,   0.0975046f,
+      0.538594f,   -0.250126f,  0.142309f,   0.526486f,   0.0532615f,
+      -0.383332f,  -0.38143f,   -0.101611f,  0.519776f,   -0.278364f,
+      -0.23287f,   -0.29139f,   0.22353f,    0.472085f,   0.366264f,
+      0.741187f,   0.42019f,    0.0676459f,  -0.230008f
+    };
+
+static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = {
+  -0.48603f,  -0.578556f,  0.257639f, 0.459915f, 0.178156f,  -1.16663f,
+  0.828891f,  0.620291f,   0.413257f, -1.00508f, -0.574179f, -1.20623f,
+  -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f,
+  0.0233112f, 0.126045f,   0.361304f, 0.655317f, 0.413134f,  0.769947f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = {
+  0.67244f,   -2.59179f, 0.50425f,  -1.86481f,  1.15891f,   -1.26447f,
+  0.761081f,  0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f,
+  -0.560935f, 0.838959f, 0.502264f, -1.28958f,  -0.205551f, 0.635671f,
+  -1.12619f,  -1.68277f, 0.83361f,  1.57235f,   1.15839f,   0.35345f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = {
+  1.14463f
+};
+
+static const float
+    av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = {
+      0.364612f,    0.237868f,    -0.192821f,   0.12364f,      0.522205f,
+      -0.205785f,   -0.503288f,   -0.426503f,   -0.083073f,    0.0164429f,
+      0.184278f,    -0.426055f,   0.0717997f,   -0.261968f,    0.176412f,
+      -0.101226f,   0.0400285f,   -0.332051f,   0.344385f,     0.189565f,
+      0.441162f,    0.330462f,    -0.719857f,   -1.14209f,     0.557831f,
+      0.104756f,    0.0562001f,   -0.465923f,   -0.344592f,    -0.191554f,
+      -0.0656866f,  -0.640162f,   0.419388f,    0.409308f,     -1.68632f,
+      -1.10829f,    0.105485f,    -0.14561f,    -0.944738f,    0.104629f,
+      -0.146837f,   0.538823f,    -0.153157f,   0.321081f,     -1.77714f,
+      -0.0559296f,  0.324136f,    -0.497023f,   -1.15793f,     -0.740144f,
+      -0.0888472f,  0.010059f,    -0.18394f,    -0.234405f,    -0.10586f,
+      0.130958f,    -0.101944f,   -0.186483f,   -0.447049f,    -0.900026f,
+      0.128444f,    0.401696f,    0.128509f,    0.123778f,     0.062168f,
+      -0.321755f,   -0.0691584f,  0.254468f,    -0.115212f,    -0.848885f,
+      0.817005f,    0.0615853f,   0.153363f,    0.513855f,     0.789225f,
+      0.356168f,    0.371613f,    0.269541f,    0.268173f,     0.220481f,
+      -0.109063f,   -0.00620798f, -0.0334622f,  0.236267f,     -0.0235294f,
+      -0.0800253f,  0.0294184f,   0.047131f,    -0.224047f,    0.0890737f,
+      -0.356293f,   0.0989534f,   0.16799f,     0.498266f,     0.612581f,
+      -0.372897f,   -0.75125f,    0.77698f,     1.1032f,       -0.0764679f,
+      0.0266299f,   0.309532f,    0.461305f,    0.0193521f,    -0.0939161f,
+      -0.276156f,   -0.102714f,   -0.0828328f,  0.40003f,      0.122542f,
+      0.0867203f,   -0.170738f,   0.0850642f,   -0.130762f,    0.082324f,
+      -0.115218f,   -0.0244491f,  0.0434331f,   0.216453f,     0.443733f,
+      -0.173679f,   -0.161617f,   0.316209f,    -0.689656f,    -1.52007f,
+      -0.421018f,   0.430833f,    -0.00734122f, 0.284499f,     -0.0207885f,
+      0.0572024f,   -0.878942f,   0.388264f,    0.0191589f,    -0.123415f,
+      -0.0461196f,  -0.0444461f,  -0.00383171f, 0.0945655f,    -0.0597219f,
+      -0.374918f,   0.0182124f,   0.523083f,    0.00519547f,   0.80513f,
+      -0.221433f,   -1.30591f,    -0.416917f,   -0.718173f,    0.622999f,
+      0.941798f,    0.0477536f,   0.0303772f,   0.268078f,     0.414778f,
+      0.394325f,    0.299733f,    -0.583208f,   0.309379f,     0.416581f,
+      0.0299948f,   -0.409145f,   -0.161557f,   -0.214082f,    -0.0098119f,
+      0.221912f,    0.107135f,    0.0692518f,   0.00490957f,   0.107613f,
+      -0.368404f,   -0.548006f,   0.208274f,    0.550475f,     0.643678f,
+      -1.65859f,    0.095938f,    -0.0434245f,  -0.0792685f,   0.838109f,
+      -0.0138653f,  -0.527573f,   -0.123472f,   -0.235618f,    -0.677401f,
+      -0.125877f,   -0.175604f,   -0.203196f,   0.113478f,     -0.228323f,
+      -0.53539f,    0.134458f,    0.0534899f,   -0.213006f,    -0.138679f,
+      -2.15023f,    0.186303f,    0.48566f,     -1.22301f,     -0.240982f,
+      -0.486836f,   -0.121181f,   -0.131382f,   -0.0320283f,   0.278828f,
+      0.342581f,    -0.182257f,   -0.365193f,   -0.226351f,    0.108928f,
+      -0.100159f,   0.448355f,    -0.0768947f,  0.0633719f,    -0.104786f,
+      0.0456653f,   0.0965752f,   0.156403f,    -0.157337f,    0.212259f,
+      0.317939f,    0.124193f,    -0.329475f,   0.206868f,     -2.15986f,
+      -0.108385f,   -0.396769f,   -0.0317231f,  -0.271524f,    -0.184697f,
+      0.662615f,    0.412926f,    -0.0217462f,  -0.0285475f,   -0.118826f,
+      0.0252706f,   -0.137091f,   0.198973f,    0.329509f,     -0.0831966f,
+      -0.621237f,   0.0896179f,   0.805261f,    -0.019675f,    0.962452f,
+      0.307433f,    0.892168f,    -0.537587f,   -2.46145f,     0.125606f,
+      0.920491f,    0.219462f,    0.292765f,    -0.748238f,    -0.0537239f,
+      -0.224326f,   0.505492f,    0.176426f,    0.0343168f,    0.16708f,
+      -0.581393f,   0.951726f,    -1.1777f,     -0.561914f,    -1.53288f,
+      0.864567f,    -1.19648f,    -1.24141f,    -0.334688f,    -0.622026f,
+      0.666876f,    -0.197005f,   -0.600507f,   -0.851924f,    0.492299f,
+      0.31078f,     -0.0736115f,  0.030999f,    -6.02463e-05f, -0.0604341f,
+      -0.0254238f,  0.139222f,    0.333235f,    0.366534f,     -0.191982f,
+      -0.0156092f,  0.44234f,     -0.0193213f,  0.0938745f,    -0.015709f,
+      -0.12043f,    0.00895591f,  0.0464401f,   0.0530699f,    -0.623018f,
+      -1.23372f,    -0.538647f,   -1.12389f,    0.26742f,      0.548694f,
+      0.00540655f,  -0.219703f,   0.314894f,    -0.573463f,    -0.241555f,
+      0.441851f,    0.422491f,    0.253785f,    -0.384683f,    0.0370165f,
+      0.226669f,    0.245587f,    0.215265f,    -0.122272f,    0.0492235f,
+      0.000658591f, -0.312877f,   0.436487f,    -0.229199f,    -0.174373f,
+      0.904268f,    -0.855845f,   -0.877293f,   -0.65409f,     0.313795f,
+      0.461748f,    -0.737766f,   -0.228523f,   0.182181f,     0.334522f,
+      0.0629676f,   -0.151087f,   0.178798f,    -0.325809f,    -0.331672f,
+      0.0865837f,   -0.0684225f,  0.0252008f,   -0.0820631f,   0.0481863f,
+      0.209473f,    -0.0242151f,  -0.0898919f,  -0.163828f,    -0.164282f,
+      0.581888f,    0.816896f,    0.0607674f,   0.364855f,     -0.346512f,
+      -0.764174f,   0.595561f,    0.302872f,    0.206361f,     0.106917f,
+      -0.972338f,   0.176948f,    0.6415f,      -0.131897f,    -0.155802f,
+      0.216337f,    -0.342511f,   0.123743f,    -0.123014f,    0.0205439f,
+      0.15173f,     -0.23801f,    -1.00387f,    0.651328f,     0.237439f,
+      -0.542952f,   1.066f,       -0.161107f,   -0.593545f,    0.219343f,
+      -0.178094f,   0.0789992f,   0.428332f,    0.23827f,      -0.327421f,
+      0.416144f,    0.00394653f,  0.052046f,    -0.238289f,    0.405942f,
+      0.00141984f,  0.161017f,    0.077111f,    0.0823985f,    0.0981208f,
+      0.109949f,    -0.0428502f,  0.343629f,    -0.722978f,    -0.375269f,
+      -0.111634f,   -0.271523f,   0.712093f,    0.684904f,     -0.572331f
+    };
+
+static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = {
+  0.583367f,  -0.202004f, -0.207626f, 0.412451f,  -0.258311f, 0.0304954f,
+  -0.102458f, 0.450087f,  -0.376851f, -0.338702f, 0.335226f,  0.889072f,
+  0.502411f,  0.649282f,  0.15345f,   -0.0109896f
+};
+
+static const float
+    av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = {
+      0.0214882f,    -0.934339f,  -0.173335f,  0.8362f,      -0.764234f,
+      0.525163f,     0.409749f,   0.821539f,   -0.784157f,   -0.455593f,
+      0.446099f,     0.406756f,   0.479242f,   -0.814038f,   -0.419332f,
+      0.328869f,     -0.340707f,  0.133219f,   0.0320347f,   0.25089f,
+      -0.324917f,    -0.0684265f, 0.0377777f,  -0.262556f,   0.673458f,
+      -0.0291454f,   -0.417957f,  -1.0075f,    -0.481537f,   0.922105f,
+      -0.000516239f, -0.40034f,   0.242067f,   -0.43178f,    0.32001f,
+      0.143599f,     -0.345172f,  0.126093f,   0.148518f,    -1.12151f,
+      -1.03435f,     0.551691f,   -0.310001f,  -0.323194f,   -0.595128f,
+      -0.395689f,    0.737268f,   -0.729227f,  0.590804f,    -0.590022f,
+      -1.01427f,     -0.521159f,  -0.617579f,  1.07292f,     -0.613047f,
+      -0.619093f,    0.335268f,   0.473753f,   -0.795027f,   1.24635f,
+      -0.556193f,    0.241046f,   -0.0354181f, -0.354215f,   0.716752f,
+      -0.00200745f,  -1.25171f,   -0.440731f,  -0.763918f,   -0.588614f,
+      -0.183901f,    -0.396056f,  0.226903f,   0.921471f,    1.10465f,
+      0.207053f,     0.57681f,    -0.555699f,  0.235469f,    -0.92149f,
+      0.625808f,     0.29653f,    -0.81775f,   -0.307889f,   -1.41384f,
+      -0.136205f,    -0.365314f,  -0.516741f,  0.748052f,    0.617947f,
+      0.0973239f,    0.839607f,   0.530668f,   -0.227032f,   -0.449044f,
+      -1.04725f,     -0.244363f,  -0.396888f,  -0.146161f,   0.359789f,
+      0.0436599f,    1.21645f,    -0.336069f,  0.0534646f,   -0.00200328f,
+      0.658551f,     -0.156142f,  -1.0728f,    0.0951015f,   0.234837f,
+      -0.380525f,    0.041783f,   -0.269273f,  0.0386013f,   -0.455589f,
+      -0.174338f,    0.0345251f,  0.17116f,    -0.507642f,   0.210453f,
+      0.739987f,     -0.0438776f, 0.570145f,   -0.118811f,   0.0548662f,
+      0.153458f,     -0.89887f,   0.493704f,   0.283351f,    0.785441f,
+      -0.586002f,    -0.0616167f, -0.714328f,  -0.145941f,   -0.449656f,
+      0.850117f,     0.279997f,   0.204143f,   -0.31356f,    0.947057f,
+      -0.135787f,    0.747071f,   0.0145968f,  -0.81414f,    0.431009f,
+      -0.275824f,    -0.342928f,  -0.0528272f, -0.592183f,   0.433915f,
+      -0.251752f,    -0.311815f,  -1.47533f,   -1.43677f,    0.0698436f,
+      1.01341f,      0.305063f,   -0.252003f,  -0.428915f,   -0.00104153f,
+      -0.368267f,    -0.354523f,  -0.27956f,   -0.771664f,   0.232092f,
+      -0.428495f,    0.424952f,   -0.343229f,  0.196899f,    -0.761084f,
+      -0.0110293f,   -0.335361f,  0.571637f,   -0.423489f,   -0.52773f,
+      0.0108043f,    -0.504715f,  -1.1419f,    -0.402904f,   -0.160747f,
+      -0.329184f,    0.375374f,   -1.02604f,   -0.601371f,   0.631652f,
+      0.0742486f,    -0.464765f,  0.467445f,   0.240562f,    -0.38211f,
+      -0.459004f,    0.704196f,   0.021357f,   0.860785f,    -1.16731f,
+      -0.479029f,    -0.139644f,  -0.444087f,  0.322326f,    -0.25455f,
+      0.874399f,     0.477696f,   0.0464487f,  1.20658f,     0.0993356f,
+      0.00682712f,   -0.10163f,   -0.371765f,  -0.629513f,   -0.679196f,
+      -0.193935f,    0.47405f,    -0.18238f,   0.254918f,    -0.35306f,
+      -0.375611f,    0.119771f,   -0.257282f,  -0.565124f,   0.162667f,
+      -0.356128f,    0.870351f,   0.241847f,   -0.264712f,   -0.384322f,
+      0.31807f,      0.211621f,   -0.180767f,  0.764944f,    0.368646f,
+      0.186111f,     1.02458f,    -0.494252f,  -0.483375f,   -0.699664f,
+      0.00415657f,   -0.189376f,  -0.677103f,  -0.030319f,   0.667087f,
+      0.810951f,     -0.488237f,  -0.387355f,  -0.726579f,   -0.304763f,
+      1.10392f,      -0.775977f,  -0.247731f,  0.532396f,    1.24089f,
+      0.206621f,     -0.670568f,  -1.08142f,   -0.342503f,   0.189854f,
+      -0.200846f,    0.784204f,   0.641112f,   -0.509346f,   0.0805264f,
+      -1.40006f,     0.322084f,   -0.823739f,  -1.12965f,    -0.215668f,
+      0.099673f,     0.425966f,   0.771697f,   0.338834f,    0.345364f,
+      -0.297826f,    -0.176746f,  -0.297299f,  -1.80029f,    -0.178348f,
+      0.421194f,     -0.19155f,   0.417653f,   0.374441f,    -0.135654f,
+      -0.895843f,    0.220647f,   0.368264f,   0.369233f,    0.382707f,
+      0.0800511f,    0.542053f,   0.318896f,   -0.385539f,   0.313305f,
+      -1.01166f,     -0.222379f,  -1.53708f,   1.32407f,     -0.665444f,
+      -0.102348f,    0.0410504f,  -0.616825f,  1.3108f,      0.405902f,
+      1.27777f,      0.0630558f,  -0.172696f,  0.16224f,     -1.10111f,
+      -3.31326f,     -0.242566f,  0.831422f,   0.917397f,    0.311749f,
+      -0.238613f,    0.438007f,   -0.407089f,  -0.0202555f,  -1.82502f,
+      -0.907965f,    -0.300031f,  -0.616669f,  -0.767921f,   0.285919f,
+      -0.112019f,    0.252677f,   0.350892f,   0.000214244f, 0.315915f,
+      0.260344f,     0.327362f,   -0.0211213f, -0.41241f,    0.0418355f,
+      0.103328f,     -0.0158439f, -0.230505f,  -0.0215114f,  0.266739f,
+      -0.234376f,    -0.352583f,  0.0709437f,  -0.90649f,    -0.535843f,
+      1.21322f,      -1.05144f,   -0.983682f,  -0.189956f,   1.14208f,
+      -0.0188492f,   -0.254821f,  -0.463214f,  -0.708714f,   0.0447348f,
+      -0.220831f,    0.476299f,   0.102544f,   1.1173f,      -0.36981f,
+      -0.814102f,    0.103604f,   -0.247871f,  0.0610701f,   -0.356616f,
+      -0.144093f,    1.66496f,    0.180206f,   -1.04384f,    -0.65883f,
+      0.0290771f,    -0.622728f,  0.761523f,   -0.909091f,   -0.0340348f,
+      0.666895f,     -0.0232575f, 0.962643f,   -2.50103f,    -1.69745f,
+      -0.0482305f,   0.771811f,   -1.32233f,   -0.778722f,   -0.203309f,
+      0.395875f,     -0.171812f,  0.253794f,   0.432799f
+    };
+
+static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = {
+  -0.152159f, 0.552347f,   -0.806068f, 0.227901f,  0.335896f,  0.180785f,
+  0.75277f,   0.982208f,   0.409823f,  -0.17755f,  -0.125365f, 0.738114f,
+  0.202331f,  0.751737f,   -0.360511f, 0.149254f,  0.085073f,  -0.214542f,
+  0.529727f,  -0.0348777f, -2.13162f,  -0.893332f, -0.136952f, -0.71258f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = {
+  -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f,  -2.31153f,
+  0.912733f,  0.879995f, -1.00602f,  -1.02467f,  0.0536835f, 1.76011f,
+  -0.898546f, 1.06959f,  1.60471f,   -1.7312f,   -0.877168f, -0.681185f,
+  -1.57286f,  -1.16038f, -4.11303f,  -3.06351f,  -3.02536f,  -2.92186f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = {
+  1.33207f
+};
+
+static const float
+    av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = {
+      0.0419551f,  0.0924078f,   -0.153084f,   0.191642f,    0.069586f,
+      -0.530661f,  0.431968f,    0.000453838f, 0.793047f,    0.0161817f,
+      -0.476075f,  -0.156638f,   -0.219066f,   0.372716f,    -0.0642299f,
+      0.156813f,   -0.105819f,   -0.0519422f,  0.149935f,    0.295544f,
+      0.192037f,   -0.0450383f,  0.828794f,    -0.0510661f,  -1.22549f,
+      -0.100293f,  -0.178274f,   0.0304427f,   -0.0664097f,  -0.0438936f,
+      0.948248f,   0.425486f,    -0.238206f,   1.3744f,      0.336897f,
+      0.0760769f,  -0.583508f,   0.0735519f,   -0.117024f,   0.0501598f,
+      0.332212f,   0.199531f,    0.424764f,    0.206712f,    0.342868f,
+      0.592673f,   -0.0961148f,  -0.190113f,   -0.155027f,   0.00789871f,
+      -0.0514839f, -0.416154f,   -0.290309f,   0.407541f,    0.48534f,
+      0.126564f,   0.0709566f,   -0.0469664f,  0.735403f,    -0.365963f,
+      0.150295f,   -0.50147f,    0.021383f,    0.76514f,     0.0085721f,
+      -0.416384f,  1.22268f,     0.0832438f,   0.367813f,    -0.12012f,
+      0.823183f,   -0.0525972f,  -0.325526f,   -0.0983032f,  0.370128f,
+      0.368778f,   0.138971f,    -0.0397997f,  0.411058f,    -0.0400404f,
+      0.588437f,   -0.29963f,    -0.107992f,   -1.75238f,    -0.274387f,
+      0.430418f,   0.495152f,    0.283172f,    -0.441166f,   0.195339f,
+      -0.436182f,  -0.252613f,   0.176204f,    -0.126541f,   -0.474833f,
+      -0.0721603f, -0.496599f,   -0.0608464f,  0.0333451f,   -0.0621485f,
+      0.0843859f,  0.0637854f,   -0.145291f,   0.14876f,     0.181665f,
+      -0.675805f,  0.294903f,    0.301118f,    -0.225957f,   0.0105897f,
+      -0.136427f,  -0.555925f,   -0.158853f,   -0.216779f,   0.0612481f,
+      -0.107158f,  0.352451f,    0.140536f,    -0.0148237f,  0.189371f,
+      -0.091046f,  -0.0476226f,  0.366054f,    -0.0723413f,  0.389883f,
+      -0.0213411f, 0.0279539f,   0.194827f,    -0.271502f,   -0.166474f,
+      0.0690549f,  0.0584665f,   0.0198415f,   -0.442348f,   0.1571f,
+      -0.113463f,  -0.16822f,    -0.0580659f,  -0.13441f,    -0.0022386f,
+      0.251521f,   -0.160494f,   -0.0753547f,  0.0897289f,   0.137917f,
+      0.129836f,   0.0816833f,   -0.626288f,   0.0643293f,   -1.20001f,
+      0.085631f,   -0.195602f,   0.251244f,    0.0321744f,   0.0493178f,
+      -0.220616f,  0.724075f,    -0.00831514f, 2.00319f,     0.407932f,
+      0.0710799f,  -0.166128f,   0.0126611f,   -0.229644f,   -0.0984299f,
+      0.632041f,   -0.0946141f,  0.295315f,    0.100934f,    0.184883f,
+      -0.236173f,  0.158081f,    0.195775f,    0.413542f,    0.789801f,
+      0.767741f,   0.166275f,    -0.348271f,   -0.384074f,   -0.291648f,
+      -0.119899f,  0.0368354f,   0.0751987f,   1.04217f,     -0.159002f,
+      -2.71592f,   -0.788502f,   -1.06268f,    0.536057f,    0.0575876f,
+      1.06811f,    0.12033f,     0.198578f,    -0.0419196f,  0.0631388f,
+      0.623138f,   -0.142226f,   1.33129f,     0.0868059f,   -0.0287825f,
+      0.139378f,   -0.143037f,   0.307452f,    0.0363987f,   -0.0976368f,
+      0.040544f,   0.0269327f,   -0.0845524f,  0.0674699f,   0.104501f,
+      -0.0351155f, 0.167071f,    0.00986971f,  0.10284f,     0.0300016f,
+      0.192601f,   0.0397177f,   0.0251346f,   -0.00912908f, -0.0452825f,
+      0.0164356f,  -0.0275149f,  0.194846f,    0.0943608f,   1.61674f,
+      0.0124345f,  0.523787f,    0.0397258f,   -0.17208f,    -0.147808f,
+      -1.23583f,   0.676385f,    0.551994f,    0.0233041f,   0.0116391f,
+      -0.466706f,  0.154725f,    -0.207371f,   0.606662f,    0.247286f,
+      0.31216f,    0.173765f,    -0.268033f,   0.224422f,    0.314649f,
+      0.481922f,   -0.190604f,   -0.0129162f,  0.270552f,    0.135195f,
+      0.0927735f,  -0.226099f,   0.53897f,     0.103309f,    -0.0257271f,
+      -0.0246776f, 0.442013f,    -0.179246f,   -1.02581f,    0.206176f,
+      -0.326365f,  0.391623f,    -0.103549f,   0.115645f,    0.0269328f,
+      -0.584517f,  -0.237502f,   0.157996f,    0.0447407f,   -0.161f,
+      -0.126072f,  -0.148967f,   -0.416347f,   0.0236496f,   -1.12612f,
+      0.0120709f,  -0.00979376f, 0.0507126f,   -0.172262f,   0.0697059f,
+      -0.212334f,  0.335731f,    -0.0301362f,  -0.839583f,   -0.238539f,
+      0.0636752f,  -0.0467217f,  -0.0372118f,  -0.144615f,   -0.161773f,
+      -0.648242f,  0.158197f,    -0.051471f,   -0.0615805f,  -0.0426936f,
+      -0.0745554f, 0.358975f,    0.358297f,    0.0568553f,   -1.14383f,
+      -0.103955f,  0.728194f,    -0.224945f,   -0.31659f,    -0.204458f,
+      0.171763f,   -0.465666f,   0.899234f,    -0.37042f,    -0.0894774f,
+      0.11478f,    -0.334957f,   0.0896514f,   0.413251f,    0.359471f,
+      1.41597f,    0.558082f,    0.153486f,    0.0270558f,   -0.0178797f,
+      0.124983f,   -0.12273f,    -1.04516f,    -0.125375f,   0.370336f,
+      -0.209423f,  -0.36816f,    -0.66077f,    -0.0180773f,  -0.628921f,
+      -0.178542f,  0.0346841f,   0.0319309f,   -0.470138f,   0.172763f,
+      0.0798846f,  -0.259737f,   -0.652461f,   -0.386283f,   -0.474447f,
+      -0.924054f,  -0.0154613f,  -0.613712f,   -0.138068f,   -0.337842f,
+      0.217921f,   -0.0711405f,  0.000404091f, -0.703766f,   0.0364683f,
+      0.150173f,   0.0126249f,   0.170594f,    0.0371879f,   -0.0862515f,
+      -0.23454f,   -0.0144143f,  0.164947f,    0.45591f,     0.115703f,
+      0.069752f,   -0.011993f,   0.0402097f,   0.00697581f,  0.0811613f,
+      0.384752f,   0.341977f,    0.06087f,     0.0590107f,   0.00812679f,
+      0.121211f,   -0.0612108f,  0.167851f,    0.195781f,    -1.62162f,
+      0.336292f,   -0.0772523f,  -0.310786f,   0.188257f,    -0.0325804f,
+      -0.240098f,  0.158748f,    -0.265264f,   3.19593f,     -0.449251f,
+      -1.33102f,   -0.482856f,   -0.435731f,   0.300808f,    0.346503f,
+      2.67378f,    -0.152379f,   0.219322f,    -0.146119f,   -0.0584806f,
+      -0.0276895f, -0.21955f,    -0.479179f,   -0.689545f,   0.152799f
+    };
+
+static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = {
+  -0.296575f, 0.101072f,  -0.208429f, 0.111585f, 0.699552f,   -0.379484f,
+  0.313244f,  -0.746369f, 0.867757f,  0.457318f, -0.0190943f, -0.290745f,
+  0.45592f,   -0.160465f, -0.634243f, 0.0829737f
+};
+
+static const float
+    av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = {
+      0.27511f,    -2.14172f,   1.25755f,    -0.554772f,  0.589508f,
+      0.228307f,   0.0754914f,  1.07061f,    0.293323f,   0.65162f,
+      -0.272016f,  -1.33519f,   -0.606759f,  -0.57827f,   0.368807f,
+      -1.48668f,   0.162439f,   0.0821667f,  0.225535f,   -0.795996f,
+      0.0328293f,  0.975476f,   -0.187514f,  2.47069f,    -1.5638f,
+      -0.461524f,  0.00310062f, 1.1556f,     -0.286206f,  0.00426021f,
+      0.585836f,   0.900007f,   0.384055f,   0.189435f,   -0.157291f,
+      -0.0710573f, -0.0663986f, -0.710772f,  -0.669136f,  -0.379493f,
+      -1.2634f,    -0.377524f,  0.824094f,   0.312308f,   0.125368f,
+      -0.382737f,  0.637109f,   0.61907f,    -0.741184f,  0.00257198f,
+      -0.0151343f, -0.669826f,  -0.439855f,  0.564852f,   -0.0588036f,
+      -1.38123f,   -1.1126f,    0.701831f,   0.198686f,   0.266866f,
+      0.270172f,   -0.692401f,  0.272533f,   -1.70914f,   0.66064f,
+      0.0886659f,  -0.132233f,  0.270531f,   -0.479581f,  0.704338f,
+      -0.307039f,  -0.111792f,  -2.05753f,   -0.231749f,  0.300528f,
+      0.383266f,   -0.130857f,  -0.373944f,  1.21025f,    0.704655f,
+      -0.589422f,  0.267185f,   -0.109065f,  -0.195991f,  0.20209f,
+      -0.0676526f, -0.183926f,  0.164894f,   0.0877923f,  0.565943f,
+      -0.0610466f, -0.86354f,   -0.80853f,   -0.176111f,  -1.45016f,
+      -2.29078f,   -0.124524f,  -0.139305f,  -0.187858f,  -0.0250151f,
+      -0.572544f,  0.185336f,   -0.69275f,   -0.430354f,  -0.30861f,
+      -0.754258f,  -0.468221f,  -0.160487f,  -0.766692f,  -0.636418f,
+      -0.71016f,   0.576125f,   -0.240476f,  -0.954556f,  -0.104693f,
+      0.155557f,   -0.840224f,  -0.685457f,  -0.0346927f, -0.644882f,
+      -1.92475f,   -0.314544f,  0.463569f,   0.323569f,   -0.990124f,
+      -0.213658f,  0.407183f,   1.19797f,    -4.77004f,   -0.0613379f,
+      -2.40345f,   -0.0591791f, -0.477622f,  -0.303556f,  0.104077f,
+      -0.974128f,  -0.035172f,  1.47064f,    0.233727f,   -0.0754056f,
+      0.158553f,   0.0614361f,  -1.38865f,   0.690729f,   0.568455f,
+      0.205866f,   -0.0236852f, -0.0921077f, -0.538954f,  0.336613f,
+      -0.427115f,  0.791754f,   -1.819f,     -0.404432f,  0.670242f,
+      -0.0343869f, -0.37191f,   0.0271262f,  0.988161f,   -0.547343f,
+      0.925304f,   0.548079f,   -0.430343f,  -0.214109f,  0.242013f,
+      1.39027f,    0.37648f,    -1.63524f,   -0.158864f,  -0.572779f,
+      -0.766801f,  -2.62032f,   0.47799f,    -1.12025f,   -0.115283f,
+      1.22349f,    -0.262132f,  -0.151274f,  0.390483f,   -0.496482f,
+      1.06166f,    -0.183052f,  0.54647f,    0.847486f,   0.0229506f,
+      0.653309f,   -0.020736f,  -1.27453f,   0.48386f,    -0.366625f,
+      -0.515725f,  -1.31196f,   0.140701f,   -0.183636f,  0.000413912f,
+      0.300993f,   -0.849529f,  -0.59764f,   -0.212992f,  -0.933365f,
+      -1.4054f,    -0.091982f,  0.41695f,    0.264004f,   -0.26379f,
+      -0.0738219f, 0.434052f,   1.16617f,    -0.639624f,  -0.146465f,
+      0.0409936f,  -0.900182f,  0.73517f,    0.805746f,   -0.208088f,
+      1.74459f,    -0.0592751f, 0.624865f,   -0.62325f,   -0.446315f,
+      0.150526f,   0.0526697f,  0.374254f,   -0.658043f,  1.02623f,
+      -0.941758f,  0.381217f,   -0.359448f,  0.160051f,   0.556455f,
+      0.239382f,   0.75851f,    0.437583f,   -0.122221f,  0.746136f,
+      0.218286f,   -0.426729f,  0.0353903f,  -0.830513f,  -0.877586f,
+      0.488077f,   -0.132354f,  -0.180756f,  0.736163f,   -0.202934f,
+      -0.882534f,  0.166305f,   0.183122f,   0.0599858f,  0.442687f,
+      0.0522908f,  -1.17755f,   -1.03733f,   0.392363f,   0.672718f,
+      -1.44704f,   0.360623f,   0.390298f,   -0.213968f,  0.169783f,
+      -0.717536f,  -0.830984f,  -0.445049f,  0.196772f,   -0.730634f,
+      -1.09497f,   0.344012f,   -0.292802f,  -0.67966f,   0.138515f,
+      -0.361803f,  0.936778f,   -0.189802f,  0.197777f,   -0.367507f,
+      -0.293653f,  0.447759f,   -0.409245f,  -0.687568f,  -0.431301f,
+      -0.271234f,  -0.585413f,  -0.936414f,  -0.396049f,  -0.29388f,
+      -0.0930843f, 0.0179339f,  0.262463f,   -0.166598f,  0.0171466f,
+      -0.329641f,  0.39343f,    0.657445f,   -0.579052f,  -0.312444f,
+      -0.0915881f, -0.432622f,  -0.247645f,  0.485749f,   -0.602508f,
+      -0.347936f,  0.287353f,   0.288705f,   0.168397f,   0.568228f,
+      -0.493586f,  1.04155f,    -0.097956f,  0.658928f,   -0.561007f,
+      0.0457783f,  2.12744f,    0.182683f,   -0.690282f,  0.183302f,
+      0.0309499f,  -0.722251f,  0.0660448f,  -0.333277f,  0.198929f,
+      -0.724102f,  -0.405597f,  0.614868f,   -0.292862f,  0.886513f,
+      0.142353f,   -1.48934f,   -0.97273f,   0.199683f,   0.522121f,
+      0.0877478f,  -0.172593f,  -1.58858f,   0.113191f,   -0.436178f,
+      0.640895f,   -0.504676f,  0.0658654f,  -0.361301f,  0.604323f,
+      0.315196f,   -0.423021f,  -0.323484f,  -0.563163f,  0.118989f,
+      -0.404508f,  -0.0550995f, -0.0359236f, -0.126574f,  -0.357288f,
+      -0.0494502f, 1.04959f,    -0.31646f,   -0.0376684f, -0.300744f,
+      -0.135016f,  0.102696f,   -0.392333f,  -1.17502f,   0.505227f,
+      0.337608f,   -0.348831f,  -0.420815f,  0.202791f,   -0.154264f,
+      -0.563686f,  0.0942187f,  0.353862f,   0.0303509f,  -0.132794f,
+      0.420746f,   0.143529f,   0.455822f,   -1.28348f,   -1.35662f,
+      -0.850688f,  -1.76361f,   -0.717546f,  0.443111f,   0.227155f,
+      -0.863307f,  -0.452033f,  -0.278151f,  1.86233f
+    };
+
+static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = {
+  -0.103218f, -0.359587f, 0.619666f,  -0.473497f,  -0.649803f, 0.86992f,
+  -0.115561f, 0.335114f,  -0.285044f, -0.59295f,   0.24497f,   0.611583f,
+  0.38568f,   0.137913f,  -0.281191f, -0.0107777f, 0.487236f,  -0.262363f,
+  0.696962f,  0.121565f,  0.312511f,  0.430916f,   0.694134f,  0.393632f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = {
+  -2.42496f,  -1.239f,   0.832673f, 1.56923f,   -2.6175f,  -1.42492f,
+  -0.311387f, -1.94237f, 0.54071f,  -2.50391f,  0.352205f, -0.96572f,
+  1.47144f,   -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f,
+  0.789163f,  -0.65236f, 1.77018f,  0.273867f,  1.19506f,  1.07022f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = {
+  0.953424f
+};
+
+static const float
+    av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = {
+      0.0485154f,    0.0496279f,    0.0268229f,    -0.0584843f,   -0.166928f,
+      0.0316731f,    -0.0895094f,   -0.0433243f,   -0.00893639f,  -0.0886265f,
+      -0.0345622f,   -0.235395f,    -0.213754f,    -0.00212398f,  0.0218857f,
+      -0.0054983f,   -0.0248236f,   0.081822f,     -0.0355708f,   -0.0795593f,
+      -0.106995f,    -0.0596378f,   0.0350686f,    -0.133863f,    -0.00582928f,
+      0.114963f,     0.193906f,     -0.00419085f,  0.0430529f,    -0.128318f,
+      0.0614715f,    -0.000952935f, -0.0345722f,   -0.109459f,    0.074204f,
+      -0.0865131f,   0.0649158f,    -0.0942417f,   -0.10122f,     -0.047551f,
+      -1.27825f,     -0.0125456f,   -0.019722f,    -0.152058f,    0.280306f,
+      -0.121231f,    -0.0565484f,   0.0959188f,    0.0603919f,    0.0457468f,
+      0.967589f,     0.105892f,     -0.118326f,    0.198933f,     0.163437f,
+      -0.056824f,    -0.0302956f,   -0.07366f,     -0.681407f,    -0.0781575f,
+      0.255732f,     -0.0712105f,   0.177882f,     0.709206f,     -0.232457f,
+      1.33809f,      -0.0328557f,   0.0572231f,    -1.01361f,     0.130676f,
+      -0.205159f,    0.975398f,     0.356293f,     0.0766364f,    -0.297397f,
+      -0.0261066f,   -0.0933549f,   0.0568851f,    -0.0123034f,   -0.0433538f,
+      0.131003f,     0.890705f,     0.0084565f,    0.00547395f,   0.00157634f,
+      0.0047937f,    -0.0511092f,   0.0300034f,    -0.00604993f,  -0.0133502f,
+      -0.000274302f, 0.129728f,     -0.00532916f,  0.0855351f,    0.136885f,
+      0.0175562f,    -0.0123633f,   -0.000512229f, -0.019924f,    -0.0316328f,
+      0.422972f,     0.0460336f,    0.0170841f,    -0.00086795f,  -0.0655137f,
+      0.0287308f,    -0.0375644f,   -0.0329215f,   -0.0273072f,   0.0241426f,
+      -0.0429052f,   0.0221593f,    -0.063881f,    -0.0347391f,   -6.44339e-07f,
+      0.0476934f,    -0.0150068f,   0.0146403f,    -0.0653099f,   0.0107635f,
+      0.012407f,     0.0048935f,    1.50975f,      0.322256f,     0.17881f,
+      0.0943775f,    -0.100583f,    -0.367022f,    -0.156525f,    -0.0397161f,
+      0.0752784f,    -0.00219022f,  -0.887456f,    0.0153415f,    -0.0148185f,
+      -0.56435f,     0.163996f,     -0.0221024f,   -0.0115872f,   -0.0529284f,
+      0.156838f,     -1.13813f,     -0.207863f,    -0.00484959f,  0.135719f,
+      0.131004f,     0.0417939f,    0.31453f,      0.121719f,     -0.101515f,
+      0.267951f,     0.219727f,     0.0398821f,    0.0713504f,    3.65918e-06f,
+      -0.00659998f,  0.477343f,     -0.128426f,    0.0648877f,    0.111884f,
+      0.224552f,     0.0617426f,    0.117742f,     0.031377f,     0.0586865f,
+      -0.459293f,    0.100211f,     -0.14127f,     0.624412f,     0.014659f,
+      -1.41807f,     -0.382452f,    -0.695931f,    -0.103153f,    0.145808f,
+      0.333526f,     -0.256367f,    0.096842f,     0.102458f,     -0.181224f,
+      0.729272f,     0.151177f,     1.46729f,      0.111044f,     -4.28813f,
+      0.0178379f,    0.47641f,      -6.57533f,     0.0633335f,    0.496934f,
+      -0.154657f,    -9.07298e-05f, 0.848937f,     -5.40143f,     0.375685f,
+      0.23586f,      -0.166591f,    -0.0191648f,   -0.039862f,    -3.25093f,
+      0.168472f,     -0.260317f,    -5.51548f,     0.0575334f,    0.328979f,
+      0.112644f,     0.231339f,     -0.122641f,    0.0567331f,    1.19541f,
+      -0.038735f,    0.0630576f,    0.176668f,     0.0757184f,    -0.833104f,
+      0.133669f,     0.982669f,     0.0311783f,    0.0908558f,    -0.10065f,
+      -0.0386599f,   -0.231587f,    -0.83876f,     -0.347148f,    0.225529f,
+      -1.29625f,     0.0806834f,    0.369648f,     -1.63367f,     0.118057f,
+      -0.311948f,    0.95022f,      -0.354807f,    -0.648657f,    -1.72048f,
+      0.260397f,     0.915555f,     0.057737f,     -0.162019f,    -0.453543f,
+      -1.70388f,     -0.311632f,    -0.731593f,    -0.678089f,    0.10438f,
+      -0.293911f,    0.144864f,     0.039212f,     0.0289241f,    -0.0685266f,
+      0.634592f,     -0.0798614f,   -0.119197f,    -0.00517433f,  -0.04653f,
+      -0.127568f,    -0.0582645f,   0.0735302f,    -0.0946823f,   0.00865585f,
+      0.0115748f,    0.0194847f,    0.0455664f,    0.181006f,     -0.0824601f,
+      0.0869093f,    0.264767f,     -0.0750432f,   0.135136f,     0.316511f,
+      0.399015f,     0.0994808f,    -0.166944f,    -0.102126f,    0.457858f,
+      0.300488f,     0.467582f,     0.830244f,     -0.0511439f,   -0.522892f,
+      -0.183049f,    0.2626f,       0.118382f,     0.241674f,     0.250399f,
+      -0.0963507f,   -0.83231f,     -0.227699f,    -0.133314f,    0.231718f,
+      -0.0700274f,   0.891311f,     0.224742f,     -0.572836f,    0.402798f,
+      -0.191576f,    0.740922f,     -0.00374073f,  0.658178f,     -0.209364f,
+      -0.416259f,    0.166297f,     0.0095577f,    -0.0876076f,   0.424954f,
+      0.265226f,     -0.129343f,    -0.203146f,    -0.194637f,    -0.818142f,
+      -0.164152f,    -0.368962f,    0.273373f,     0.599927f,     -0.19859f,
+      0.0939651f,    -0.12458f,     -0.751816f,    -0.302997f,    -0.139176f,
+      -0.372737f,    0.332704f,     -0.206045f,    -0.00593763f,  -0.452363f,
+      -0.2704f,      -0.198846f,    0.0976308f,    -0.216124f,    0.110122f,
+      -0.220342f,    0.00763426f,   -0.0272775f,   -0.190395f,    -0.0359411f,
+      -0.0395759f,   0.000941162f,  -1.49959f,     0.0914233f,    0.448346f,
+      -0.420435f,    -0.0102102f,   -0.0757978f,   -0.0177687f,   -0.0231492f,
+      -0.142125f,    1.31774f,      0.0269368f,    0.134566f,     0.152079f,
+      -0.139933f,    0.139226f,     -0.214467f,    -0.194446f,    -0.555893f,
+      0.271197f,     -0.111047f,    0.0888069f,    -0.198121f,    0.0871713f,
+      0.100612f,     0.429782f,     -0.3787f,      0.123147f,     -0.12538f,
+      0.235678f,     0.139237f,     0.223326f,     0.85806f,      -0.00554756f,
+      0.285095f,     0.0954683f,    0.0464989f,    0.100806f,     -0.0211297f,
+      0.121672f,     0.242473f,     0.0810475f,    -0.834356f,    0.119629f,
+      0.111338f,     -0.227126f,    0.159296f,     -0.0584685f,   -0.108265f,
+      -0.0909221f,   -0.21749f,     0.0929309f,    -0.176815f,    0.178067f,
+      -0.0025905f,   0.317883f,     0.313045f,     0.26774f,      -0.589329f,
+      -1.19882f,     -0.285513f,    -0.109478f,    0.309441f,     -0.0604479f,
+      0.947461f,     -0.142342f,    -0.9086f,      -0.814788f,    0.184588f,
+      -0.0736317f,   0.276237f,     0.13132f,      -0.3931f,      -0.381744f,
+      -0.0122719f,   0.0246101f,    -0.0920412f,   0.11331f,      -0.110355f,
+      0.00848064f,   0.0931248f,    -0.0638655f,   -4.30869e-05f, -0.300367f,
+      0.0489508f,    0.464441f,     -0.0466243f,   -0.0137732f,   0.0099241f,
+      -0.223972f,    0.188966f,     -0.653173f,    -0.354322f,    0.189237f,
+      -0.624276f,    -1.46218f,     -0.075161f,    -0.516172f,    0.40993f,
+      0.291178f,     -1.95088f,     -0.0352157f,   0.196354f,     -0.335897f,
+      0.0857039f,    0.605319f,     -1.12923f,     -0.638387f,    1.41868f,
+      0.0955757f,    -0.00913477f,  0.315935f,     -0.671223f,    -0.851436f,
+      -0.157464f,    -0.296763f,    0.182277f,     -0.139309f,    0.232789f,
+      0.869562f,     0.248894f,     0.242709f,     0.195479f,     0.106153f,
+      0.358881f,     0.167443f,     0.982987f,     0.104767f,     -0.033925f,
+      -0.0263185f,   0.0045304f,    0.0722479f,    -0.111307f,    0.00128896f,
+      0.406128f,     -0.00944947f,  0.121592f,     0.546284f,     -0.00175696f,
+      0.776588f,     0.238846f,     0.064469f,     0.27082f,      0.269187f,
+      0.0294455f,    0.62364f,      -0.27872f,     -0.0488013f,   0.229024f,
+      0.154457f,     0.0445898f,    0.349943f,     0.0710998f,    0.0820674f,
+      0.0279449f,    0.172826f,     -0.122156f,    -0.164688f,    0.0292124f,
+      0.0496112f,    -0.741762f,    0.0673926f,    0.108159f,     -0.0942327f,
+      -0.0562883f,   0.558231f,     0.0552399f,    0.211393f,     0.0376817f,
+      -0.275788f,    0.0548436f,    0.212732f,     0.163603f,     0.0663363f,
+      -0.0252315f,   0.164533f,     0.0826088f,    0.0301389f,    0.345705f,
+      -0.0378046f,   -0.139581f,    1.30162f,      1.23551f,      -0.446693f,
+      0.682534f,     -0.0831157f,   -0.0121595f,   1.50505f,      0.0839017f,
+      -0.953413f,    0.0820985f,    -0.125556f,    0.699796f,     -0.140453f,
+      0.168438f,     -0.110966f,    0.173806f,     0.114683f,     0.132502f,
+      -0.0453539f,   -0.133096f,    0.511947f,     -0.180657f,    -0.0298605f,
+      0.291437f,     -0.0275017f,   -0.229703f,    -0.0504205f,   0.559622f,
+      0.384601f,     0.111024f,     -0.0773559f,   -0.0591752f,   -0.0866182f,
+      -0.189437f,    -0.262345f,    -0.0372182f,   0.149925f,     0.154644f,
+      -0.188298f,    0.236949f,     -0.199328f,    -0.378909f,    -0.680128f,
+      0.277184f,     -0.172784f,    0.184717f,     -0.23899f,     0.0712069f,
+      0.0235425f,    0.4225f,       -0.441487f,    0.177434f,     -0.298303f,
+      0.295696f,     0.17346f,      0.220542f,     -0.680116f,    0.00266223f,
+      -0.0408459f,   -0.15486f,     0.24335f,      0.237258f,     -0.0283245f,
+      0.19703f,      -0.100027f,    0.0554843f,    -1.03081f,     0.151745f,
+      0.538582f,     0.370368f,     0.196683f,     0.0222123f,    -0.0831401f,
+      -0.0832803f,   -0.286743f,    -0.686003f,    0.0995004f,    0.148901f,
+      -0.0436037f,   -0.316508f,    0.00391835f,   -0.228452f,    0.940058f,
+      0.520047f,     -0.334211f,    0.652142f,     -0.0755971f,   0.0965123f,
+      -0.98191f,     0.394096f,     -0.420466f,    0.327284f,     -0.134651f,
+      0.849297f,     -0.523372f,    0.010327f,     0.133636f,     0.298119f,
+      -0.257389f,    0.0376153f,    -0.198298f,    0.0736235f,    0.608809f,
+      0.0291836f,    -0.290005f,    -0.141316f,    0.0184599f,    0.0554437f,
+      0.0621519f,    0.485276f,     0.617062f,     -0.0924811f,   -0.0120834f,
+      0.0817611f,    0.100421f,     -0.0153553f,   -0.135958f,    -0.0185322f,
+      -0.395803f,    -0.204862f,    0.547916f,     -0.438117f,    0.0229788f,
+      0.406981f,     0.795584f,     -2.02756f,     -0.8355f,      -0.386789f,
+      0.00968368f,   1.2147f,       -0.740869f,    -1.18415f,     -0.954918f,
+      -0.541142f,    0.0596003f,    0.107189f,     -0.411708f,    -0.964593f,
+      0.511906f
+    };
+
+static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = {
+  -0.485545f, 0.131552f,   0.796833f,   -0.157582f, -0.0948124f, 0.00818613f,
+  -0.485562f, 0.3826f,     -0.0839326f, 0.170998f,  0.279545f,   -0.287143f,
+  0.184986f,  -0.0719864f, 0.19748f,    0.404145f
+};
+
+static const float
+    av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = {
+      1.30172f,     0.720189f,   0.261675f,   -0.466201f,   1.21773f,
+      0.495525f,    0.62398f,    0.44567f,    -0.330993f,   -0.269798f,
+      0.835161f,    -0.294874f,  0.186981f,   0.0162467f,   0.367654f,
+      0.658468f,    1.08325f,    1.01558f,    0.12783f,     -0.280581f,
+      2.2204f,      0.0337286f,  -0.403649f,  -0.230908f,   -0.35188f,
+      0.437712f,    -0.103634f,  -0.645929f,  1.17407f,     0.157385f,
+      0.212438f,    1.41874f,    0.284242f,   -0.493105f,   1.0703f,
+      0.00632116f,  1.18222f,    -0.26003f,   0.276795f,    -0.823156f,
+      0.29577f,     -0.157467f,  -0.18092f,   0.0237336f,   0.205715f,
+      -0.295679f,   0.165443f,   -0.628279f,  1.00804f,     0.361232f,
+      0.646155f,    -0.028651f,  1.64317f,    0.334251f,    -1.50713f,
+      -1.51685f,    -0.488522f,  0.169694f,   -0.593176f,   -0.372682f,
+      -1.50223f,    0.35076f,    -0.24641f,   -0.237189f,   0.190502f,
+      -0.948191f,   -0.303346f,  0.45108f,    -0.794368f,   -2.3116f,
+      0.404008f,    -2.67269f,   -0.941992f,  -0.45336f,    0.0655987f,
+      -0.288432f,   0.106068f,   0.286978f,   0.121403f,    0.462739f,
+      0.0130292f,   0.240597f,   -2.30983f,   -0.453309f,   -0.149335f,
+      0.856424f,    -0.186576f,  0.769961f,   -0.0657097f,  -0.976188f,
+      0.972971f,    -0.532728f,  -0.699334f,  -0.168803f,   0.361945f,
+      0.950769f,    1.5368f,     -0.223899f,  1.17547f,     -0.281483f,
+      0.533619f,    0.315344f,   0.0854543f,  0.464701f,    0.346828f,
+      0.271794f,    -0.0185388f, 0.109517f,   0.371662f,    -0.10852f,
+      0.244092f,    0.491959f,   -0.750281f,  1.41865f,     -3.51221f,
+      0.298194f,    -0.0790832f, -0.134158f,  -0.424084f,   0.189593f,
+      -0.238361f,   -0.407872f,  -0.366222f,  -0.606813f,   -0.230498f,
+      0.387248f,    -0.102734f,  -0.190544f,  -1.43649f,    0.141338f,
+      -0.0438917f,  0.204628f,   1.57033f,    0.0366937f,   -0.14733f,
+      0.048198f,    -0.122631f,  0.183354f,   0.0658753f,   -0.243381f,
+      0.0246889f,   -0.768798f,  -0.0644054f, 0.775073f,    1.63419f,
+      0.491624f,    0.21898f,    -0.358944f,  3.31304f,     0.0195916f,
+      0.236174f,    0.530704f,   0.140124f,   0.0736778f,   -0.27361f,
+      -0.598836f,   -1.01659f,   0.361765f,   0.00455986f,  -0.345222f,
+      1.68731f,     0.764082f,   0.193555f,   0.322782f,    1.19801f,
+      0.538935f,    -0.0393231f, -0.0248292f, -0.151168f,   0.479879f,
+      -0.208582f,   0.22798f,    0.335473f,   -0.00295455f, 0.139539f,
+      0.400814f,    0.478307f,   -0.189376f,  0.540084f,    0.466072f,
+      0.920231f,    0.398774f,   -0.472403f,  -0.0431972f,  -0.581665f,
+      -0.990058f,   0.258995f,   -0.0148889f, 0.27105f,     0.340334f,
+      0.223576f,    -0.0405193f, -1.23888f,   -1.45229f,    -1.44543f,
+      -0.376146f,   0.132601f,   -0.4064f,    -0.583611f,   -0.374588f,
+      0.0659428f,   0.325652f,   -0.338456f,  0.253767f,    -0.0181164f,
+      0.681732f,    0.222041f,   0.837496f,   1.09735f,     0.156328f,
+      0.177236f,    -0.702702f,  0.473689f,   0.322118f,    0.43343f,
+      0.315441f,    -0.40798f,   0.0811291f,  0.631431f,    0.361929f,
+      0.0723276f,   0.0164498f,  0.0293847f,  0.156406f,    -1.10453f,
+      0.837977f,    -1.03449f,   -0.348408f,  1.71953f,     -0.401765f,
+      0.64272f,     -0.182438f,  -0.233954f,  0.364597f,    0.269177f,
+      -0.578512f,   0.397216f,   0.0425122f,  -0.258728f,   1.41621f,
+      -0.688768f,   0.0944726f,  0.253163f,   -0.989037f,   1.72726f,
+      1.15976f,     -0.0460612f, 0.534186f,   -0.136814f,   0.49327f,
+      0.115744f,    -0.633052f,  -0.433855f,  -1.01874f,    -0.324035f,
+      0.489487f,    1.08696f,    0.836376f,   -0.423477f,   -0.421309f,
+      1.07348f,     0.323266f,   0.717604f,   0.366422f,    0.32983f,
+      0.336583f,    0.749292f,   -0.210666f,  0.387101f,    -0.583376f,
+      0.0391101f,   -1.07537f,   0.914591f,   -0.51303f,    1.15023f,
+      -0.0378782f,  0.262889f,   -0.841128f,  0.41619f,     -0.669704f,
+      -0.109995f,   1.01825f,    -0.194853f,  0.120739f,    0.627889f,
+      -0.00269221f, 0.751152f,   -0.529865f,  -1.50238f,    0.184521f,
+      0.795464f,    0.106099f,   1.83117f,    0.0883305f,   0.306844f,
+      -0.0671504f,  -0.169306f,  -0.214575f,  -0.121606f,   -0.234965f,
+      0.109752f,    -0.35831f,   -0.07894f,   0.497203f,    -2.63013f,
+      0.815608f,    -0.193593f,  -0.62292f,   0.338941f,    0.0970922f,
+      -0.531178f,   0.723346f,   0.35063f,    0.182647f,    -0.257013f,
+      0.784924f,    -0.217915f,  -0.0797363f, -0.399706f,   -0.485602f,
+      1.23155f,     0.345998f,   0.322949f,   -0.168196f,   -0.173313f,
+      0.282205f,    0.45117f,    0.918706f,   -0.046172f,   -0.0873883f,
+      0.56103f,     -0.485768f,  0.546199f,   0.254997f,    0.394296f,
+      0.607178f,    0.667532f,   -0.343883f,  0.374402f,    -0.531439f,
+      2.27782f,     -1.13255f,   0.505867f,   -0.514742f,   0.998571f,
+      -1.60984f,    -0.172873f,  -0.0604094f, 0.719791f,    -0.733982f,
+      0.348905f,    1.39008f,    -0.895343f,  -0.677064f,   -1.84221f,
+      0.0434018f,   -0.534794f,  0.0434753f,  -0.266576f,   0.268099f,
+      -0.242935f,   0.00166289f, 0.0263789f,  -0.224794f,   -0.113493f,
+      -0.236397f,   0.0879936f,  0.510895f,   -0.511789f,   -1.48962f,
+      -2.78268f,    -0.0495784f, -0.0343907f, 0.440459f,    -0.364209f,
+      0.833223f,    -0.0589337f, 0.00181418f, 0.455499f,    0.101762f,
+      -1.16424f,    0.270405f,   0.219033f,   -4.91105f
+    };
+
+static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = {
+  -0.40114f,  -0.372342f, -0.216186f, -0.240014f,  -0.341773f, -0.344489f,
+  -0.113037f, 0.198479f,  0.482958f,  -0.630072f,  -0.728704f, -0.171963f,
+  0.519883f,  0.253003f,  -0.121618f, -0.0569875f, -0.485568f, -0.147577f,
+  0.533305f,  -0.587251f, -0.120837f, -0.483953f,  0.445641f,  -0.125136f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = {
+  -1.57431f,  -1.09069f,  1.67996f,   -0.669702f, 0.499807f, -3.03145f,
+  -0.878135f, 0.637818f,  -1.58419f,  -3.79756f,  0.62755f,  -0.446646f,
+  0.653269f,  -0.667854f, -2.19774f,  -3.53349f,  2.6107f,   -0.685892f,
+  -1.2603f,   -0.89707f,  -0.715551f, 0.382202f,  2.09574f,  0.469386f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = {
+  -0.022787f
+};
+
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = {
+  BRANCH_0_NUM_DNN_FEATURES,
+  BRANCH_0_NUM_LOGITS,
+  BRANCH_0_NUM_DNN_LAYERS,
+  {
+      BRANCH_0_NUM_DNN_LAYER_0_UNITS,
+      BRANCH_0_NUM_DNN_LAYER_1_UNITS,
+  },
+  {
+      av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel,
+      av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel,
+      av1_intra_mode_cnn_partition_branch_0_logits_kernel,
+  },
+  {
+      av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias,
+      av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias,
+      av1_intra_mode_cnn_partition_branch_0_logits_bias,
+  },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = {
+  BRANCH_1_NUM_DNN_FEATURES,
+  BRANCH_1_NUM_LOGITS,
+  BRANCH_1_NUM_DNN_LAYERS,
+  {
+      BRANCH_1_NUM_DNN_LAYER_0_UNITS,
+      BRANCH_1_NUM_DNN_LAYER_1_UNITS,
+  },
+  {
+      av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel,
+      av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel,
+      av1_intra_mode_cnn_partition_branch_1_logits_kernel,
+  },
+  {
+      av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias,
+      av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias,
+      av1_intra_mode_cnn_partition_branch_1_logits_bias,
+  },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = {
+  BRANCH_2_NUM_DNN_FEATURES,
+  BRANCH_2_NUM_LOGITS,
+  BRANCH_2_NUM_DNN_LAYERS,
+  {
+      BRANCH_2_NUM_DNN_LAYER_0_UNITS,
+      BRANCH_2_NUM_DNN_LAYER_1_UNITS,
+  },
+  {
+      av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel,
+      av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel,
+      av1_intra_mode_cnn_partition_branch_2_logits_kernel,
+  },
+  {
+      av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias,
+      av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias,
+      av1_intra_mode_cnn_partition_branch_2_logits_bias,
+  },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = {
+  BRANCH_3_NUM_DNN_FEATURES,
+  BRANCH_3_NUM_LOGITS,
+  BRANCH_3_NUM_DNN_LAYERS,
+  {
+      BRANCH_3_NUM_DNN_LAYER_0_UNITS,
+      BRANCH_3_NUM_DNN_LAYER_1_UNITS,
+  },
+  {
+      av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel,
+      av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel,
+      av1_intra_mode_cnn_partition_branch_3_logits_kernel,
+  },
+  {
+      av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias,
+      av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias,
+      av1_intra_mode_cnn_partition_branch_3_logits_bias,
+  },
+};
+
+#undef NUM_DNN_BRANCHES
+#undef NUM_CNN_LAYERS
+#undef BRANCH_0_NUM_DNN_LAYERS
+#undef BRANCH_1_NUM_DNN_LAYERS
+#undef BRANCH_2_NUM_DNN_LAYERS
+#undef BRANCH_3_NUM_DNN_LAYERS
+#undef CNN_LAYER_0_HEIGHT
+#undef CNN_LAYER_0_WIDTH
+#undef CNN_LAYER_0_IN_CH
+#undef CNN_LAYER_0_OUT_CH
+#undef CNN_LAYER_0_HORZ_STRIDE
+#undef CNN_LAYER_0_VERT_STRIDE
+#undef CNN_LAYER_1_HEIGHT
+#undef CNN_LAYER_1_WIDTH
+#undef CNN_LAYER_1_IN_CH
+#undef CNN_LAYER_1_OUT_CH
+#undef CNN_LAYER_1_HORZ_STRIDE
+#undef CNN_LAYER_1_VERT_STRIDE
+#undef CNN_LAYER_2_HEIGHT
+#undef CNN_LAYER_2_WIDTH
+#undef CNN_LAYER_2_IN_CH
+#undef CNN_LAYER_2_OUT_CH
+#undef CNN_LAYER_2_HORZ_STRIDE
+#undef CNN_LAYER_2_VERT_STRIDE
+#undef CNN_LAYER_3_HEIGHT
+#undef CNN_LAYER_3_WIDTH
+#undef CNN_LAYER_3_IN_CH
+#undef CNN_LAYER_3_OUT_CH
+#undef CNN_LAYER_3_HORZ_STRIDE
+#undef CNN_LAYER_3_VERT_STRIDE
+#undef CNN_LAYER_4_HEIGHT
+#undef CNN_LAYER_4_WIDTH
+#undef CNN_LAYER_4_IN_CH
+#undef CNN_LAYER_4_OUT_CH
+#undef CNN_LAYER_4_HORZ_STRIDE
+#undef CNN_LAYER_4_VERT_STRIDE
+#undef BRANCH_0_NUM_DNN_FEATURES
+#undef BRANCH_0_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_0_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_0_NUM_LOGITS
+#undef BRANCH_1_NUM_DNN_FEATURES
+#undef BRANCH_1_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_1_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_1_NUM_LOGITS
+#undef BRANCH_2_NUM_DNN_FEATURES
+#undef BRANCH_2_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_2_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_2_NUM_LOGITS
+#undef BRANCH_3_NUM_DNN_FEATURES
+#undef BRANCH_3_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_3_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_3_NUM_LOGITS
+
+static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = {
+  100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = {
+  -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f,
+};
+
+static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = {
+  100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = {
+  -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f,
+};
+
+static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = {
+  100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = {
+  -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f,
+};
+
+static const float av1_intra_mode_cnn_partition_mean[1] = {
+  1.191922f,
+};
+
+static const float av1_intra_mode_cnn_partition_std[1] = {
+  1.730044f,
+};
+
+static const int quad_to_linear_0[1] = { 0 };
+static const int quad_to_linear_1[4] = { 0, 1, 2, 3 };
+static const int quad_to_linear_2[16] = { 0, 1, 4,  5,  2,  3,  6,  7,
+                                          8, 9, 12, 13, 10, 11, 14, 15 };
+static const int quad_to_linear_3[64] = {
+  0,  1,  8,  9,  2,  3,  10, 11, 16, 17, 24, 25, 18, 19, 26, 27,
+  4,  5,  12, 13, 6,  7,  14, 15, 20, 21, 28, 29, 22, 23, 30, 31,
+  32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59,
+  36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
diff --git a/libaom/av1/encoder/partition_model_weights.h b/libaom/av1/encoder/partition_model_weights.h
index b754c88..71c1ace 100644
--- a/libaom/av1/encoder/partition_model_weights.h
+++ b/libaom/av1/encoder/partition_model_weights.h
@@ -18,6 +18,10 @@
 
 #include "av1/encoder/ml.h"
 
+// TODO(chiyotsai@google.com): The performance of these models is getting worse
+// due to the changes in the encoder. We should retrain the models here to get
+// better performance once we have the time.
+
 #define FEATURE_SIZE 10
 #define LABEL_SIZE 16
 // nn model for ab partition pruning, 128x128.
@@ -2442,54 +2446,359 @@
 #undef LABEL_SIZE
 
 // Below are the models used for simple_motion_search_based_split
-static const float av1_simple_motion_search_based_split_thresh_128 = 2.0f;
-static const float av1_simple_motion_search_based_split_thresh_64 = 2.0f;
-static const float av1_simple_motion_search_based_split_thresh_32 = 2.0f;
-static const float av1_simple_motion_search_based_split_thresh_16 = 2.0f;
-static const float av1_simple_motion_search_based_split_thresh_8 = 2.0f;
+// Thresholds
+// The first index level is for aggressiveness, the second is frame
+// resolution, and the third is bsize.
+static const float av1_simple_motion_search_split_thresh[4][3][5] = {
+  // Aggressiveness = 0
+  {
+      // lowres
+      {
+          1.40402595879f,  // p = 0.8028197
+          4.72845183649f,  // p = 0.99123732
+          1.86517797783f,  // p = 0.86589934
+          1.58715223005f,  // p = 0.83021506
+          7.22695596987f,  // p = 0.9992738
+      },
+      // midres
+      {
+          5.839480f,  // p = 0.997098
+          1.877167f,  // p = 0.867285
+          3.073499f,  // p = 0.955783
+          1.405601f,  // p = 0.803071
+          2.555636f,  // p = 0.927951
+      },
+      // hdres
+      {
+          5.839480f,  // p = 0.997098
+          1.877167f,  // p = 0.867285
+          3.073499f,  // p = 0.955783
+          1.405601f,  // p = 0.803071
+          2.555636f,  // p = 0.927951
+      },
+  },
+  // Aggressiveness = 1
+  {
+      // Lowres
+      {
+          100.0000f,  // p = 1.000000
+          4.952535f,  // p = 0.992984
+          1.720880f,  // p = 0.848242
+          1.426233f,  // p = 0.806314
+          1.491905f,  // p = 0.816364
+      },
+      // Midres
+      {
+          100.0000f,  // p = 1.000000
+          3.137263f,  // p = 0.958404
+          2.703262f,  // p = 0.937219
+          1.877166f,  // p = 0.867285
+          2.221149f,  // p = 0.902133
+      },
+      // Hdres
+      {
+          4.417680f,  // p = 0.988082
+          3.086898f,  // p = 0.956349
+          3.966704f,  // p = 0.981416
+          1.532565f,  // p = 0.822381
+          3.449975f,  // p = 0.969230
+      },
+  },
+  // Aggressiveness = 2
+  {
+      // lowres
+      {
+          100.000000f,  // p = 0.998048
+          1.484020f,    // p = 0.815179
+          1.866781f,    // p = 0.866085
+          1.706711f,    // p = 0.846409
+          2.080369f,    // p = 0.888980
+      },
+      // midres
+      {
+          100.000000f,  // p = 1.0
+          3.265763f,    // p = 0.963235428881
+          2.024598f,    // p = 0.883355591569
+          1.846446f,    // p = 0.863709256976
+          2.240962f,    // p = 0.903868036126
+      },
+      // hdres
+      {
+          3.133026f,  // p = 0.958234684141
+          2.940954f,  // p = 0.949834204693
+          2.484544f,  // p = 0.923051170045
+          1.702972f,  // p = 0.845922460525
+          1.655562f,  // p = 0.839641385729
+      },
+  },
+  // Aggressiveness = 3
+  {
+      // lowres
+      { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f,
+        0.762099214988f },
+      // midres
+      { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f,
+        0.557298794638f },
+      // hdres
+      { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f,
+        1.86572095242f },
+  },
+};
 
-// BLOCK_128X128
+static const float av1_simple_motion_search_no_split_thresh[4][3][5] = {
+  // Aggressiveness = 0
+  {
+      // lowres
+      {
+          -100.0f,  // p = 0.0
+          -100.0f,  // p = 0.0
+          -100.0f,  // p = 0.0
+          -100.0f,  // p = 0.0
+          -100.0f,  // p = 0.0
+      },
+      // midres
+      {
+          -3.38168078f,  // p = 0.032872917
+          -4.08610739f,  // p = 0.016526795
+          -1.78302370f,  // p = 0.15270848
+          -100.000000f,  // p = 0.0
+          -100.000000f,  // p = 0.0
+      },
+      // hdres
+      {
+          -100.000000f,  // p = 0.0
+          -100.000000f,  // p = 0.0
+          -2.98718897f,  // p = 0.048008
+          -100.000000f,  // p = 0.0
+          -3.33229488f,  // p = 0.03447975
+      },
+  },
+  // Aggressiveness = 1
+  {
+      // Lowres
+      {
+          -100.0000f,  // p = 0.0
+          -4.893793f,  // p = 0.007437
+          -3.387766f,  // p = 0.032680
+          -2.982806f,  // p = 0.048209
+          -2.330372f,  // p = 0.088639
+      },
+      // Midres
+      {
+          -100.0000f,  // p = 0.000000
+          -6.131853f,  // p = 0.002168
+          -2.346579f,  // p = 0.087338
+          -2.712849f,  // p = 0.062219
+          -3.195430f,  // p = 0.039338
+      },
+      // Hdres
+      {
+          -3.491416f,  // p = 0.029557
+          -2.192853f,  // p = 0.100394
+          -3.620180f,  // p = 0.026079
+          -2.030855f,  // p = 0.116001
+          -2.797586f,  // p = 0.057455
+      },
+  },
+  // Aggressiveness = 2
+  {
+      // lowres
+      {
+          -100.0000f,  // p = 0.0
+          -3.617350f,  // p = 0.026151
+          -5.902503f,  // p = 0.002725
+          -4.677840f,  // p = 0.009213
+          -2.168378f,  // p = 0.102626
+      },
+      // midres
+      {
+          -100.0000f,  // p = 0.0
+          -3.204195f,  // p = 0.0390081679555
+          -2.354128f,  // p = 0.0867382128969
+          -2.523326f,  // p = 0.0742390077132
+          -3.112328f,  // p = 0.0426016085803
+      },
+      // hdres
+      {
+          -5.047760f,  // p = 0.00638270448225
+          -3.414994f,  // p = 0.0318301469487
+          -5.628090f,  // p = 0.00358255438917
+          -2.122691f,  // p = 0.10691083145
+          -1.972387f,  // p = 0.122132728355
+      },
+  },
+  // Aggressiveness = 3
+  {
+      // lowres
+      { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f,
+        -1.0830321897f },
+      // midres
+      { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f,
+        -0.228236297886f },
+      // hdres
+      { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f,
+        -1.36741555171f },
+  },
+};
+
+static const float av1_simple_motion_search_split_mean_128[17] = {
+  14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f,
+  12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f,  0.714786f,
+  3.535450f,  3.566207f,  0.835913f,  3.315452f,  3.302908f,
+};
+
+static const float av1_simple_motion_search_split_std_128[17] = {
+  1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f,
+  1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f,
+  1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f,
+};
+
+static const float av1_simple_motion_search_split_mean_64[17] = {
+  12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f,
+  10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f,  0.896393f,
+  2.819613f,  2.855845f,  0.926296f,  2.808782f,  2.798229f,
+};
+
+static const float av1_simple_motion_search_split_std_64[17] = {
+  1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f,
+  1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f,
+  1.144391f, 1.125088f, 0.261289f, 1.145059f, 1.131215f,
+};
+
+static const float av1_simple_motion_search_split_mean_32[17] = {
+  10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f,
+  8.759780f,  8.656299f,  8.772563f, 8.669839f, 4.208026f, 0.958573f,
+  2.308769f,  2.347375f,  0.961685f, 2.323464f, 2.296322f,
+};
+
+static const float av1_simple_motion_search_split_std_32[17] = {
+  1.879269f, 1.883531f, 1.935828f, 1.935677f, 1.915823f, 1.914773f,
+  1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f,
+  0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f,
+};
+
+static const float av1_simple_motion_search_split_mean_16[17] = {
+  9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f,
+  7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f,
+  1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f,
+};
+
+static const float av1_simple_motion_search_split_std_16[17] = {
+  1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f,
+  1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f,
+  0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f,
+};
+
+static const float av1_simple_motion_search_split_mean_8[17] = {
+  7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f,
+  5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f,
+  1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f,
+};
+
+static const float av1_simple_motion_search_split_std_8[17] = {
+  1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f,
+  1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f,
+  0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f,
+};
+
+static const float *const av1_simple_motion_search_split_mean[5] = {
+  av1_simple_motion_search_split_mean_128,
+  av1_simple_motion_search_split_mean_64,
+  av1_simple_motion_search_split_mean_32,
+  av1_simple_motion_search_split_mean_16,
+  av1_simple_motion_search_split_mean_8,
+};
+
+static const float *const av1_simple_motion_search_split_std[5] = {
+  av1_simple_motion_search_split_std_128, av1_simple_motion_search_split_std_64,
+  av1_simple_motion_search_split_std_32,  av1_simple_motion_search_split_std_16,
+  av1_simple_motion_search_split_std_8,
+};
+
 #define NUM_HIDDEN_LAYERS_128 1
-#define NUM_FEATURES_128 6
-#define NUM_LAYER_0_UNITS_128 16
+#define NUM_FEATURES_128 17
+#define NUM_LAYER_0_UNITS_128 20
 #define NUM_LOGITS_128 1
 
-static const float av1_simple_motion_search_based_split_layer_0_kernel_128[] = {
-  -0.807346f,  0.242298f,   12.9862f,   -1.19161f,  5.21734f,    -1.1363f,
-  -2.39127f,   0.930915f,   -2.44285f,  -2.42966f,  5.73476f,    0.0506879f,
-  -0.234878f,  -0.317875f,  0.361322f,  0.431648f,  -0.39105f,   -0.110225f,
-  -2.46236f,   0.979713f,   -10.5596f,  -7.76653f,  -3.06518f,   2.42554f,
-  0.0492961f,  -0.467176f,  0.130746f,  0.494527f,  -0.0336645f, 0.501755f,
-  0.176486f,   -0.869541f,  7.77757f,   6.81303f,   6.00771f,    7.35696f,
-  0.150731f,   -0.307017f,  -0.437639f, -0.082924f, 0.379107f,   0.452278f,
-  -0.0143836f, -0.183691f,  -0.604698f, -9.2681f,   -2.06087f,   11.0256f,
-  0.0487599f,  -0.249168f,  -0.180407f, 0.304772f,  0.218642f,   -0.406073f,
-  -0.0289919f, -0.794381f,  5.45092f,   5.38374f,   3.25745f,    5.32903f,
-  1.12718f,    -0.0215478f, 2.78552f,   4.8951f,    -0.959671f,  0.694264f,
-  -0.0611219f, -0.331937f,  0.258252f,  -0.495331f, -0.285923f,  0.294713f,
-  -0.119947f,  0.0753204f,  10.2021f,   -5.82147f,  -12.0137f,   3.0365f,
-  0.366697f,   0.142683f,   -3.29731f,  -5.76651f,  -5.62578f,   10.9462f,
-  -0.325459f,  0.092602f,   -0.868027f, -0.691768f, -0.292017f,  -0.00841203f,
-  0.702545f,   -0.612227f,  -7.68881f,  9.52225f,   -1.18581f,   -2.56762f
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = {
+  0.24095f,    -0.397761f,  -0.388619f,  -0.0629548f, -0.44577f,   0.688212f,
+  -0.20889f,   -1.08227f,   -0.0313894f, -0.615505f,  -0.401839f,  0.40233f,
+  -0.171305f,  0.439803f,   1.58527f,    -0.968535f,  -1.29255f,   1.14846f,
+  0.885777f,   0.116412f,   -0.225704f,  0.316506f,   0.793951f,   -0.63591f,
+  0.097789f,   -0.327027f,  -0.778396f,  -0.231667f,  -0.9622f,    1.0044f,
+  0.32594f,    0.179768f,   -0.115529f,  -0.499395f,  -1.14727f,   -1.26111f,
+  0.269818f,   -0.0882028f, -0.349107f,  0.100901f,   0.0249506f,  0.528929f,
+  0.113961f,   0.929794f,   0.242494f,   -0.122828f,  -0.0477379f, 0.170659f,
+  0.0500187f,  0.28859f,    0.78783f,    0.482412f,   0.795298f,   0.179517f,
+  0.453911f,   -0.298029f,  -0.903332f,  0.510615f,   0.691994f,   0.433383f,
+  -0.140802f,  -1.11635f,   -0.547326f,  1.11318f,    0.71905f,    0.978538f,
+  0.097444f,   -0.0386012f, 0.713599f,   0.465164f,   0.391278f,   -0.472864f,
+  0.230224f,   -0.279508f,  0.558192f,   -0.468625f,  0.55995f,    -0.57507f,
+  -1.39947f,   -0.755819f,  -1.04512f,   -0.411552f,  -0.830444f,  -0.106571f,
+  -0.0972184f, 0.251842f,   0.269955f,   0.230492f,   -0.290581f,  -0.484799f,
+  0.0151041f,  0.171047f,   0.829999f,   -0.384581f,  0.220301f,   -0.121687f,
+  1.88848f,    -0.482809f,  -0.48185f,   1.34482f,    -0.716438f,  -0.284482f,
+  -1.78592f,   -1.29333f,   0.886867f,   0.80106f,    0.456415f,   0.649095f,
+  0.231093f,   0.361562f,   0.290018f,   0.128009f,   -0.196343f,  0.0607802f,
+  0.576761f,   -0.0413836f, 0.0300984f,  -0.318998f,  0.204434f,   -0.712524f,
+  0.833394f,   -0.81168f,   0.765488f,   -0.720973f,  1.12866f,    -0.838694f,
+  1.295f,      -0.159127f,  1.05404f,    0.736519f,   0.248662f,   0.229233f,
+  0.0434302f,  0.0551856f,  0.197862f,   0.354823f,   -0.32429f,   -0.227353f,
+  -0.132198f,  -0.438118f,  -0.210401f,  -0.81046f,   0.653555f,   0.826737f,
+  0.154235f,   0.228945f,   0.123089f,   0.614964f,   -0.0940471f, -0.00676807f,
+  0.24996f,    0.949233f,   0.746526f,   -0.044474f,  0.386414f,   0.503221f,
+  0.155133f,   -0.698848f,  -0.735356f,  -0.255091f,  0.413235f,   -0.335295f,
+  -0.145757f,  0.326299f,   -0.602629f,  -0.844474f,  -0.346722f,  -0.42598f,
+  -0.491016f,  -0.447732f,  -0.965366f,  -0.0242841f, 0.836606f,   -0.104877f,
+  1.23236f,    0.683986f,   0.787005f,   -0.0253437f, 1.2145f,     1.29554f,
+  -1.24302f,   -0.229495f,  0.439415f,   0.885087f,   -0.408704f,  -0.119299f,
+  -0.0960972f, 0.60148f,    0.683271f,   -0.057129f,  -0.180295f,  -0.264815f,
+  -0.363184f,  0.638271f,   0.631083f,   -0.252899f,  -0.164364f,  -1.31274f,
+  0.354408f,   0.0429172f,  0.371154f,   -1.0978f,    0.0433642f,  -0.467394f,
+  -0.706572f,  1.57198f,    -0.0701271f, 1.93149f,    -0.446267f,  1.4519f,
+  -1.29567f,   0.309978f,   -0.878062f,  0.891494f,   0.364005f,   -0.209611f,
+  -0.125927f,  0.184097f,   0.0629695f,  -0.43375f,   -0.0980562f, 1.08547f,
+  0.578312f,   0.16566f,    -0.198852f,  -0.241854f,  -0.523934f,  -0.206037f,
+  -0.867721f,  1.00041f,    1.09848f,    -2.12562f,   -0.19992f,   -0.186128f,
+  -0.03507f,   0.0484884f,  0.160856f,   0.10802f,    -0.805141f,  -1.06902f,
+  0.290363f,   0.0222096f,  -0.849266f,  0.112932f,   0.148682f,   -0.0457585f,
+  1.139f,      1.79141f,    0.194122f,   -0.342508f,  -0.403572f,  0.133678f,
+  0.217553f,   -0.263759f,  0.18441f,    0.254529f,   0.0471115f,  0.733178f,
+  -0.416205f,  0.441447f,   -0.443335f,  0.725005f,   -0.78946f,   0.71301f,
+  -0.644969f,  1.5445f,     0.365277f,   -0.455775f,  -0.365066f,  0.4742f,
+  -0.381714f,  -0.545794f,  -0.0464861f, -0.222768f,  -0.0106466f, -0.069743f,
+  0.0335566f,  0.378348f,   -0.249663f,  0.922286f,   0.125711f,   -0.894619f,
+  0.444682f,   0.447893f,   -1.98936f,   -1.41978f,   0.0406667f,  -0.199928f,
+  -0.199786f,  0.463481f,   0.334931f,   -0.396222f,  -0.0732259f, 0.796684f,
+  -0.140817f,  -0.26878f,   0.194642f,   0.895784f,   -0.369976f,  -2.26981f,
+  -0.0791776f, -0.0492268f, 0.6715f,     0.281805f,   0.0156664f,  -0.779785f,
+  0.17743f,    0.188786f,   -0.588077f,  -0.359153f,  0.258319f,   0.881688f,
+  0.846894f,   1.00292f,    0.838134f,   0.680632f,   0.273098f,   -0.329261f,
+  0.217757f,   -0.506726f,  -0.336523f,  -0.695875f,  -0.252006f,  0.751216f,
+  0.334409f,   -0.0151467f, 0.0885474f,  0.0973114f,  -0.248754f,  -0.263716f,
+  0.369906f,   -0.213749f,  -0.0355395f, -0.137799f,  2.43233f,    -0.944233f,
+  -0.745167f,  0.318558f,   0.316608f,   0.568678f
 };
 
-static const float av1_simple_motion_search_based_split_logits_kernel_128[] = {
-  0.364895f,    0.577553f,  0.115758f,  -0.999496f, 0.124885f, 3.23193f,
-  -0.00386642f, 0.970794f,  0.136637f,  -4.28052f,  -1.49234f, 0.370436f,
-  0.576981f,    -0.469656f, -0.124071f, 1.07669f
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128[] = {
+  0.821344f,  1.11542f,   -1.24172f,  1.03642f,  1.13511f,
+  1.16414f,   -0.278655f, -1.35558f,  -1.26788f, -1.63189f,
+  -0.323271f, 1.21319f,   -0.888415f, 0.987145f, -1.16767f,
+  0.255833f,  -0.1392f,   1.43265f,   -1.54952f, 1.65159f
 };
 
-static const float av1_simple_motion_search_based_split_layer_0_bias_128[] = {
-  1.32916f,    0.817212f,  0.0f,       -0.921066f, 0.0f,      3.57649f,
-  -0.0204517f, 2.97286f,   0.0f,       5.49957f,   -8.14518f, 0.0f,
-  1.30826f,    -0.349536f, -0.638933f, 5.4496f
+static const float av1_simple_motion_search_split_logits_kernel_128[] = {
+  0.3565753f, 0.5490161f, -1.015597f, 0.565366f,   0.751604f,
+  0.922747f,  -1.931846f, 1.759353f,  -0.7362949f, 0.5707034f,
+  -1.092127f, 0.936767f,  2.034499f,  2.08148f,    0.9509507f,
+  -1.342504f, -0.834566f, 0.618184f,  0.844113f,   1.182693f
 };
 
-static const float av1_simple_motion_search_based_split_logits_bias_128[] = {
-  0.683442f
+static const float av1_simple_motion_search_split_logits_bias_128[] = {
+  1.819351f
 };
 
-static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_128 = {
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_128 = {
   NUM_FEATURES_128,
   NUM_LOGITS_128,
   NUM_HIDDEN_LAYERS_128,
@@ -2497,12 +2806,12 @@
       NUM_LAYER_0_UNITS_128,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_kernel_128,
-      av1_simple_motion_search_based_split_logits_kernel_128,
+      av1_simple_motion_search_split_hiddenlayer_0_kernel_128,
+      av1_simple_motion_search_split_logits_kernel_128,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_bias_128,
-      av1_simple_motion_search_based_split_logits_bias_128,
+      av1_simple_motion_search_split_hiddenlayer_0_bias_128,
+      av1_simple_motion_search_split_logits_bias_128,
   },
 };
 
@@ -2511,48 +2820,115 @@
 #undef NUM_LAYER_0_UNITS_128
 #undef NUM_LOGITS_128
 
-// BLOCK_64X64
 #define NUM_HIDDEN_LAYERS_64 1
-#define NUM_FEATURES_64 6
-#define NUM_LAYER_0_UNITS_64 16
+#define NUM_FEATURES_64 17
+#define NUM_LAYER_0_UNITS_64 24
 #define NUM_LOGITS_64 1
 
-static const float av1_simple_motion_search_based_split_layer_0_kernel_64[] = {
-  0.0345945f,  -0.394064f,  0.0919978f, 0.270358f,  -0.384502f, -0.504608f,
-  -0.25759f,   0.155981f,   2.62567f,   -10.7204f,  -0.709802f, 8.15948f,
-  0.589866f,   -0.445645f,  -1.68232f,  10.0061f,   -3.17671f,  4.87259f,
-  -0.448886f,  -0.205568f,  -0.462388f, 0.385001f,  -0.451687f, 0.49602f,
-  -0.256708f,  0.803322f,   3.25594f,   0.38541f,   -1.83867f,  -2.15132f,
-  0.936059f,   -0.203056f,  -5.92959f,  -6.24554f,  -6.68631f,  -6.85977f,
-  -0.0407565f, -0.258902f,  0.195053f,  -0.366515f, 0.339543f,  -0.433017f,
-  -2.67026f,   0.385457f,   1.86683f,   1.9501f,    0.0381398f, 1.086f,
-  -0.153729f,  0.173772f,   -42.9029f,  -36.8934f,  -2.892f,    -0.0540691f,
-  0.77469f,    -0.380145f,  2.2689f,    -9.53332f,  1.15712f,   2.86601f,
-  -0.437036f,  0.247132f,   -8.51058f,  -3.62972f,  -8.99449f,  -0.638738f,
-  0.0609263f,  -0.0614603f, 5.42307f,   5.35926f,   5.27437f,   5.26599f,
-  -0.0729677f, 0.0306104f,  -7.77867f,  5.03598f,   -8.17832f,  5.85461f,
-  -0.253269f,  0.164582f,   -4.49713f,  3.83265f,   9.04851f,   -2.85668f,
-  1.22618f,    0.166904f,   -1.51975f,  -4.01576f,  -1.44374f,  -2.22147f,
-  -0.217072f,  -0.0984913f, -0.265515f, 0.360021f,  0.0779512f, 0.361516f
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = {
+  -1.40663f,    -0.851503f,   -0.0613111f,  0.741591f,    0.302754f,
+  0.184001f,    0.0474853f,   0.371096f,    0.0541624f,   0.381508f,
+  0.355427f,    0.0428822f,   0.154916f,    -0.00490099f, 0.025484f,
+  0.0208921f,   0.140596f,    -0.292525f,   -0.459067f,   -0.081393f,
+  0.109824f,    -0.290183f,   0.720236f,    0.385835f,    -0.150643f,
+  -0.078518f,   0.0979819f,   -0.102135f,   0.137152f,    -0.0786457f,
+  0.0171441f,   0.991338f,    -0.546583f,   -1.0714f,     -0.0842851f,
+  0.244072f,    0.427379f,    0.146775f,    -0.921613f,   -0.912093f,
+  0.393566f,    -0.232375f,   0.19963f,     0.312355f,    0.55659f,
+  -0.104714f,   -0.137563f,   0.0985237f,   0.0788307f,   -0.225514f,
+  0.0228832f,   -0.288733f,   -0.00737685f, -0.711657f,   -0.256796f,
+  0.0869605f,   0.583977f,    0.384306f,    1.46692f,     -0.741126f,
+  -0.21105f,    -0.276604f,   -0.0151463f,  -0.0227997f,  -0.0403232f,
+  0.044122f,    0.0185784f,   -0.0451951f,  0.00489513f,  -0.387131f,
+  0.0966724f,   -0.599174f,   -0.00243351f, -0.21439f,    0.302043f,
+  0.130334f,    -0.191251f,   0.863261f,    -1.50112f,    0.00901057f,
+  0.000324294f, -0.0572545f,  0.0117685f,   -0.0734682f,  -0.0570435f,
+  -0.126253f,   1.2313f,      -0.328267f,   0.211788f,    -0.175438f,
+  -0.0419298f,  0.166447f,    -0.178739f,   -0.326221f,   -0.0439188f,
+  1.01182f,     -0.390678f,   -0.426343f,   0.0944665f,   -0.225042f,
+  -0.183344f,   0.0500763f,   -0.377393f,   -0.673401f,   -0.436907f,
+  -0.00366876f, -0.363412f,   0.195194f,    0.250248f,    -0.397193f,
+  -0.0917222f,  -0.0221579f,  1.7693f,      -0.0694484f,  -0.0410764f,
+  -0.134571f,   -0.159992f,   -0.170359f,   -0.249333f,   -0.128056f,
+  -0.617054f,   -0.808701f,   -0.540642f,   0.396391f,    0.147787f,
+  0.346916f,    0.709852f,    0.116064f,    0.0509731f,   0.073713f,
+  -0.365082f,   -1.09287f,    -0.618214f,   0.20545f,     0.126161f,
+  -0.140012f,   0.62592f,     0.316326f,    -0.392765f,   -0.15934f,
+  0.337617f,    -0.41669f,    -0.295225f,   0.0602025f,   -0.0150657f,
+  -0.319629f,   0.783729f,    -0.0661199f,  -0.362657f,   0.390042f,
+  -0.043614f,   -0.0414596f,  0.121155f,    -0.309775f,   -0.284761f,
+  -0.243932f,   0.279855f,    -0.266823f,   0.734824f,    -0.164028f,
+  0.261776f,    -0.105585f,   0.10733f,     -0.180469f,   1.18875f,
+  -1.12836f,    -0.173008f,   0.150221f,    0.111598f,    0.148306f,
+  -1.2833f,     -1.06346f,    0.233546f,    0.16432f,     0.00142378f,
+  0.340574f,    -0.0140885f,  0.634761f,    -0.122096f,   0.821487f,
+  0.421424f,    -0.0256687f,  -0.035503f,   -0.0453547f,  -0.0215179f,
+  -0.0671277f,  -0.0486862f,  -0.962761f,   -0.208383f,   0.109573f,
+  -0.210668f,   -0.176485f,   0.421279f,    0.41605f,     0.342084f,
+  0.619364f,    0.103718f,    -0.00341643f, 0.00266677f,  0.249089f,
+  -0.22848f,    -0.0368968f,  1.12092f,     -0.64912f,    -0.456579f,
+  0.477823f,    0.418345f,    1.41515f,     0.0936279f,   0.886155f,
+  -0.785656f,   -0.217109f,   -0.561829f,   -0.286435f,   -0.884068f,
+  -0.148839f,   -0.282848f,   0.0683745f,   0.0962815f,   -0.111975f,
+  0.0509158f,   -0.211274f,   0.744909f,    -0.8982f,     0.315232f,
+  -0.78624f,    0.598387f,    -0.530952f,   0.677357f,    0.0371339f,
+  0.99209f,     -0.681899f,   -0.291416f,   -0.224822f,   -0.26049f,
+  -0.0436525f,  -0.380004f,   -0.27187f,    0.534779f,    0.717939f,
+  0.418197f,    -0.152539f,   -0.0684039f,  -0.186308f,   -0.0653121f,
+  0.194145f,    -0.196367f,   0.256997f,    -0.726269f,   -0.307672f,
+  -0.153362f,   0.450827f,    0.708842f,    -0.0667079f,  0.555564f,
+  0.0486892f,   0.0715072f,   -0.7211f,     -0.849797f,   0.0650271f,
+  1.2747f,      -0.646738f,   -0.53042f,    0.182197f,    0.928203f,
+  0.180621f,    -0.00640791f, -0.171416f,   0.092688f,    -0.391275f,
+  -0.0650657f,  0.0843773f,   0.170824f,    0.378085f,    0.0596657f,
+  0.844398f,    -1.3083f,     -1.27828f,    -0.199179f,   0.557855f,
+  0.241479f,    0.385804f,    0.169533f,    -0.0028072f,  0.0538041f,
+  0.00136234f,  0.0130481f,   0.0349449f,   -0.0366494f,  -0.000474055f,
+  0.437956f,    0.286724f,    -0.298187f,   0.461967f,    0.43065f,
+  -0.0877194f,  -0.19133f,    0.379121f,    -0.687751f,   -1.64077f,
+  -0.375191f,   -0.336836f,   -0.323904f,   -0.101859f,   0.0126672f,
+  -0.346332f,   0.112303f,    -0.863336f,   0.155538f,    0.366509f,
+  -0.0976829f,  0.635278f,    -0.681967f,   -0.527729f,   0.591839f,
+  0.366678f,    0.189981f,    0.0208007f,   -0.565809f,   0.70183f,
+  -0.282844f,   -0.327485f,   0.347243f,    -1.13014f,    -0.373378f,
+  -0.514978f,   0.662994f,    -0.144931f,   0.1402f,      -0.820049f,
+  0.711498f,    0.681156f,    1.06515f,     -0.423409f,   -0.0392664f,
+  0.0675396f,   -0.0508602f,  0.0431443f,   0.0212639f,   -0.0279887f,
+  -0.62611f,    -0.202064f,   0.701934f,    1.28452f,     -0.00858481f,
+  -0.517249f,   0.0615832f,   -0.260215f,   0.0949119f,   -0.28423f,
+  -0.39573f,    -0.0574246f,  -0.318658f,   0.0601775f,   -0.0629386f,
+  -0.134208f,   0.111686f,    -0.23355f,    0.078667f,    0.741023f,
+  0.828523f,    -0.345067f,   -0.315135f,   -0.0957154f,  0.522825f,
+  -0.190057f,   -0.473789f,   -0.390489f,   0.200677f,    -0.0271802f,
+  0.110336f,    0.493302f,    0.663126f,    0.570148f,    -0.380042f,
+  -0.437349f,   -0.660884f,   0.301908f,    0.0644179f,   0.172494f,
+  0.461917f,    0.330938f,    -0.140041f,   -0.0430205f,  -1.51003f,
+  -0.410984f,   -0.182161f,   0.0235313f,   -0.364849f,   0.154183f,
+  -0.592465f,   0.272701f,    0.192389f,    -0.0497777f,  -0.924467f,
+  -0.179513f,   -0.592217f,   0.436363f,    -0.0716164f,  0.189094f,
+  -0.574697f,   -0.304303f,   0.326441f,    -0.0865553f,  0.735948f,
+  0.266912f,    0.435824f,    -0.123322f
 };
 
-static const float av1_simple_motion_search_based_split_logits_kernel_64[] = {
-  0.470821f, 0.474747f, -0.571292f, 0.403221f,  0.628966f,  -0.617029f,
-  0.501105f, 0.499962f, -1.5451f,   -0.473518f, -0.730568f, -5.55817f,
-  0.776761f, 0.42569f,  0.311925f,  0.469968f
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = {
+  -1.19333f,  1.01834f,   -1.10844f,  0.0454873f, -1.45506f,   0.580864f,
+  -0.040979f, -0.505681f, -1.15072f,  0.692697f,  -0.520812f,  -0.479384f,
+  0.529652f,  0.507252f,  -1.08619f,  0.0586375f, 0.0929614f,  -0.46753f,
+  -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f
 };
 
-static const float av1_simple_motion_search_based_split_layer_0_bias_64[] = {
-  -0.134085f, 0.0758715f, 1.10419f,  0.0f,       -5.75737f, 1.65494f,
-  0.0f,       3.44047f,   0.394852f, 3.43858f,   3.65871f,  -4.84987f,
-  1.21207f,   -1.7705f,   -5.46469f, -0.0889634f
+static const float av1_simple_motion_search_split_logits_kernel_64[] = {
+  -3.32501f,  0.43082f,   -1.060692f, 1.328908f,  0.8892894f,  0.6488833f,
+  -1.096516f, -0.664786f, -1.301339f, 0.508805f,  -2.128406f,  -0.757304f,
+  0.383839f,  0.694763f,  -0.591725f, 0.770385f,  1.021594f,   0.589181f,
+  -0.76238f,  1.488826f,  0.709135f,  -0.575738f, 0.26421759f, -0.2484219f
 };
 
-static const float av1_simple_motion_search_based_split_logits_bias_64[] = {
-  -0.479491f
+static const float av1_simple_motion_search_split_logits_bias_64[] = {
+  0.699037f
 };
 
-static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_64 = {
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = {
   NUM_FEATURES_64,
   NUM_LOGITS_64,
   NUM_HIDDEN_LAYERS_64,
@@ -2560,12 +2936,12 @@
       NUM_LAYER_0_UNITS_64,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_kernel_64,
-      av1_simple_motion_search_based_split_logits_kernel_64,
+      av1_simple_motion_search_split_hiddenlayer_0_kernel_64,
+      av1_simple_motion_search_split_logits_kernel_64,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_bias_64,
-      av1_simple_motion_search_based_split_logits_bias_64,
+      av1_simple_motion_search_split_hiddenlayer_0_bias_64,
+      av1_simple_motion_search_split_logits_bias_64,
   },
 };
 
@@ -2574,48 +2950,101 @@
 #undef NUM_LAYER_0_UNITS_64
 #undef NUM_LOGITS_64
 
-// BLOCK_32X32
 #define NUM_HIDDEN_LAYERS_32 1
-#define NUM_FEATURES_32 6
-#define NUM_LAYER_0_UNITS_32 16
+#define NUM_FEATURES_32 17
+#define NUM_LAYER_0_UNITS_32 20
 #define NUM_LOGITS_32 1
 
-static const float av1_simple_motion_search_based_split_layer_0_kernel_32[] = {
-  -1.61796f,   0.0585128f,  1.57904f,   1.52703f,   0.367779f, 0.220434f,
-  1.66652f,    -1.77782f,   6.41118f,   4.16976f,   4.97299f,  4.84111f,
-  -0.0956536f, -0.163284f,  -0.143662f, 0.129329f,  0.449659f, -0.528844f,
-  -1.00067f,   1.17203f,    -4.26777f,  -4.78521f,  8.45658f,  -3.49498f,
-  -1.78386f,   0.111488f,   4.176f,     6.31911f,   -10.5369f, 6.26983f,
-  -1.32233f,   1.22999f,    -4.1666f,   -10.0359f,  -4.14779f, -10.4695f,
-  1.83011f,    -0.333152f,  -9.87986f,  -8.11992f,  -8.2775f,  -7.79918f,
-  -0.101404f,  0.00401393f, 8.89046f,   -7.32186f,  -6.59597f, 9.66257f,
-  -1.1492f,    1.23067f,    -3.6341f,   6.59275f,   -3.2373f,  -3.42564f,
-  0.371736f,   -0.140902f,  -2.75715f,  5.92487f,   -7.9185f,  9.13743f,
-  -3.52698f,   -0.191044f,  5.96691f,   6.26327f,   4.36378f,  5.69354f,
-  -0.608845f,  -0.191236f,  -0.482191f, -0.180474f, -3.8838f,  -3.92934f,
-  -1.03191f,   0.994568f,   7.95516f,   -4.0035f,   -2.86266f, -4.96105f,
-  1.75022f,    0.125058f,   -1.52159f,  -3.59304f,  -2.82634f, -2.49556f,
-  -2.05557f,   -0.222577f,  3.7608f,    5.50475f,   2.7046f,   5.25952f,
-  -1.91327f,   -0.0356497f, 1.47611f,   1.27499f,   -1.76108f, -0.578954f
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = {
+  -0.980626f,   -0.946611f,    0.103761f,    0.408899f,    0.498149f,
+  0.0490161f,   0.253279f,     0.332029f,    0.00367441f,  0.364401f,
+  -0.236433f,   0.0592119f,    -0.0978848f,  0.159733f,    -0.018052f,
+  -1.10726f,    1.16167f,      -0.244982f,   -0.147819f,   -0.147095f,
+  0.111404f,    -0.349502f,    0.441178f,    0.0984191f,   -0.135537f,
+  -0.0423312f,  0.0123079f,    0.358012f,    -0.266796f,   0.0125811f,
+  0.196563f,    0.337093f,     -1.07266f,    -1.25134f,    0.57337f,
+  -0.521717f,   0.259824f,     0.537383f,    -0.463688f,   -0.336128f,
+  0.373385f,    0.483443f,     -0.229293f,   -0.33373f,    -0.656021f,
+  0.768647f,    0.179279f,     0.315415f,    0.187749f,    1.07839f,
+  0.0626629f,   -0.230299f,    0.662606f,    -0.414154f,   0.459334f,
+  -0.6312f,     0.427704f,     -0.249849f,   0.701056f,    -0.707969f,
+  0.057401f,    0.620434f,     0.665748f,    -0.501356f,   -0.230685f,
+  0.0722371f,   -0.0988625f,   -0.114035f,   -0.653799f,   0.571353f,
+  0.268276f,    1.13251f,      -1.0695f,     -0.225607f,   -0.984355f,
+  -0.42213f,    0.300422f,     1.21492f,     -0.139931f,   -0.000726004f,
+  0.045964f,    -0.0817352f,   -0.0278813f,  -0.0102341f,  -0.0144087f,
+  -0.475882f,   1.20682f,      -0.359919f,   0.277189f,    -0.166401f,
+  0.599211f,    -0.129872f,    0.574211f,    -0.247573f,   0.824405f,
+  -1.53329f,    -0.202151f,    -0.328698f,   -0.516322f,   -0.281416f,
+  -0.383651f,   -0.252862f,    -0.43185f,    0.456802f,    -0.430055f,
+  -0.55245f,    -0.6884f,      -0.541456f,   -0.281376f,   1.10425f,
+  -0.140706f,   1.59816f,      -0.0343895f,  -0.00920039f, -0.0307667f,
+  0.0560132f,   -0.0340302f,   -0.10848f,    0.0593314f,   -0.951795f,
+  0.876831f,    -1.00548f,     -0.566244f,   0.430061f,    1.10109f,
+  -0.634212f,   -0.0755369f,   -0.108953f,   1.03191f,     0.109036f,
+  -0.0415309f,  0.0681162f,    -0.0611775f,  -0.0231938f,  0.0973158f,
+  -0.0558169f,  -0.823484f,    -0.918509f,   0.16756f,     0.27087f,
+  0.286074f,    0.174069f,     0.1304f,      0.386074f,    0.433953f,
+  0.0291467f,   -1.74087f,     0.0296094f,   -0.00793714f, -0.13041f,
+  0.00990992f,  -0.0137848f,   -0.0742606f,  -0.251029f,   -0.645316f,
+  0.640029f,    0.550607f,     0.470097f,    0.549451f,    -0.285723f,
+  -0.164759f,   -0.128166f,    -0.391496f,   -0.80287f,    0.0769472f,
+  1.34391f,     0.0215005f,    0.0669497f,   0.131919f,    0.291674f,
+  0.0952889f,   -0.677953f,    -0.364054f,   0.144823f,    0.246198f,
+  -0.12393f,    0.363661f,     0.215091f,    -0.239658f,   0.18491f,
+  0.118703f,    0.0064156f,    1.38619f,     -1.3845f,     0.0567323f,
+  1.20812f,     -0.720374f,    -1.92158f,    -1.48657f,    0.335601f,
+  0.409379f,    0.373618f,     0.231274f,    0.292194f,    0.368619f,
+  0.2398f,      0.473579f,     0.83402f,     -0.0133751f,  -0.00344358f,
+  2.20688e-05f, 0.00836757f,   0.00405377f,  0.0110539f,   -0.260154f,
+  0.192112f,    -0.666986f,    0.302875f,    -0.113302f,   0.17882f,
+  -0.221493f,   0.146161f,     -0.448697f,   0.584187f,    0.122109f,
+  0.989981f,    -1.14706f,     -0.734042f,   0.0638213f,   0.213357f,
+  0.068543f,    -0.808558f,    0.404741f,    0.808313f,    1.57523f,
+  -0.113448f,   0.254102f,     -0.350065f,   -0.615f,      0.0753549f,
+  -0.540936f,   -0.0250732f,   -0.225681f,   -0.161384f,   0.0128342f,
+  -0.0933368f,  -0.286904f,    0.130133f,    -0.874747f,   0.392585f,
+  -0.493135f,   0.169708f,     0.0909804f,   1.89921f,     -0.469954f,
+  0.65165f,     -0.953401f,    -0.21595f,    -0.37479f,    0.0451146f,
+  0.0234621f,   -0.0596903f,   -0.0682308f,  -0.0830426f,  0.130011f,
+  -0.409141f,   0.0627038f,    -0.581148f,   -0.513922f,   0.631676f,
+  0.0637034f,   0.0539081f,    0.0638872f,   0.515863f,    -0.0123463f,
+  0.177238f,    0.279506f,     -0.930345f,   1.23726f,     0.202851f,
+  0.708792f,    -0.445086f,    -0.0267075f,  -0.913822f,   -0.0714978f,
+  -0.281107f,   -0.0770565f,   -0.23086f,    -0.165893f,   -0.319683f,
+  0.216235f,    -0.490999f,    2.04841f,     -0.0524071f,  -0.239043f,
+  -0.0526375f,  0.023002f,     -0.132685f,   -0.155354f,   -0.186503f,
+  -0.904296f,   0.166478f,     0.063268f,    -0.302842f,   -0.27179f,
+  -0.428299f,   0.50193f,      0.480717f,    -0.864275f,   0.317096f,
+  0.40698f,     0.0286107f,    0.189432f,    -0.0374374f,  0.0671728f,
+  0.203681f,    -0.457959f,    -0.155776f,   0.340948f,    0.542841f,
+  0.342675f,    -0.000952399f, 0.470957f,    0.744418f,    -1.11763f,
+  -0.658812f,   -0.044832f,    0.0688237f,   -0.357766f,   0.428662f,
+  -0.087152f,   -0.291903f,    0.373244f,    -0.587853f,   0.415895f,
+  -0.535694f,   0.621785f,     -0.143648f,   0.0451373f,   0.00068827f,
+  1.84432f,     -1.26239f,     -0.432087f,   -0.152307f,   0.0293551f,
+  0.184744f,    -0.0173156f,   -0.00572154f, -0.0305062f,  -0.0900071f
 };
 
-static const float av1_simple_motion_search_based_split_logits_kernel_32[] = {
-  -0.220382f, -0.693902f, 0.424827f, 0.379952f, -0.413791f, -0.326785f,
-  -0.455086f, 0.242402f,  0.307986f, 0.175746f, 0.498901f,  -0.628053f,
-  0.285447f,  0.230052f,  0.415151f, -0.842946f
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = {
+  0.160011f,  0.903856f,   -0.13738f,  0.358221f, -0.0906044f,
+  -0.606558f, -0.0215651f, -0.03377f,  -1.67017f, -0.144554f,
+  -0.201482f, -0.87719f,   0.639815f,  -0.51976f, -0.309922f,
+  -1.33421f,  0.721328f,   -0.889354f, -1.7158f,  -0.285963f
 };
 
-static const float av1_simple_motion_search_based_split_layer_0_bias_32[] = {
-  -1.80751f, 6.40356f,   -0.0512058f, -4.59163f, -0.369933f, -0.195755f,
-  -0.16648f, -0.599755f, -5.35975f,   -1.21349f, 2.48414f,   1.07096f,
-  -3.66684f, -6.17761f,  4.2159f,     -1.05286f
+static const float av1_simple_motion_search_split_logits_kernel_32[] = {
+  -0.2745374f,  0.333548f,  -0.2437388f, 0.288009f,   0.55635f,
+  0.4560176f,   0.2970518f, 0.391192f,   1.311854f,   -0.231219f,
+  -0.2968651f,  -1.819984f, 0.2775824f,  0.28929857f, 0.419126f,
+  -0.32868411f, -0.916399f, -0.1921077f, -0.617489f,  0.637953f
 };
 
-static const float av1_simple_motion_search_based_split_logits_bias_32[] = {
-  -2.58676f
+static const float av1_simple_motion_search_split_logits_bias_32[] = {
+  0.208473f
 };
 
-static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_32 = {
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = {
   NUM_FEATURES_32,
   NUM_LOGITS_32,
   NUM_HIDDEN_LAYERS_32,
@@ -2623,12 +3052,12 @@
       NUM_LAYER_0_UNITS_32,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_kernel_32,
-      av1_simple_motion_search_based_split_logits_kernel_32,
+      av1_simple_motion_search_split_hiddenlayer_0_kernel_32,
+      av1_simple_motion_search_split_logits_kernel_32,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_bias_32,
-      av1_simple_motion_search_based_split_logits_bias_32,
+      av1_simple_motion_search_split_hiddenlayer_0_bias_32,
+      av1_simple_motion_search_split_logits_bias_32,
   },
 };
 
@@ -2637,48 +3066,101 @@
 #undef NUM_LAYER_0_UNITS_32
 #undef NUM_LOGITS_32
 
-// BLOCK_16X16
 #define NUM_HIDDEN_LAYERS_16 1
-#define NUM_FEATURES_16 6
-#define NUM_LAYER_0_UNITS_16 16
+#define NUM_FEATURES_16 17
+#define NUM_LAYER_0_UNITS_16 20
 #define NUM_LOGITS_16 1
 
-static const float av1_simple_motion_search_based_split_layer_0_kernel_16[] = {
-  -0.611497f,  -0.0422086f, -0.555957f,   -0.632451f, -0.144179f, -0.152722f,
-  -0.330265f,  -0.419866f,  0.287343f,    0.385295f,  -0.424486f, 0.424281f,
-  2.27442f,    -2.47933f,   5.24731f,     4.33827f,   4.73215f,   3.41909f,
-  1.16058f,    -0.364505f,  0.12207f,     -0.287749f, 0.0509783f, -0.0200119f,
-  1.52907f,    -1.1905f,    -2.56978f,    -3.00186f,  -3.56084f,  -3.89276f,
-  0.00365657f, 1.57125f,    -4.421f,      -2.48803f,  -2.51531f,  -4.28646f,
-  2.52248f,    -1.03377f,   -1.09607f,    -1.44633f,  -1.58736f,  -1.25927f,
-  -1.45841f,   -0.566619f,  -0.246166f,   -0.182289f, -0.238156f, 0.177991f,
-  0.0112509f,  -0.17677f,   -0.485877f,   0.0812852f, 0.104975f,  0.222793f,
-  -0.372858f,  -0.48624f,   -0.00870389f, -0.385019f, 0.405842f,  0.288523f,
-  0.167374f,   -0.204208f,  -8.74148f,    -8.59267f,  -8.42492f,  -8.3778f,
-  -5.57063f,   -0.406818f,  -0.873199f,   -0.896224f, -0.701479f, -0.985736f,
-  -0.625956f,  -0.0446202f, -0.509987f,   -0.321804f, -0.470759f, -0.248556f,
-  -0.369436f,  -0.160828f,  0.0591148f,   0.405218f,  0.142584f,  -0.130106f,
-  0.125321f,   0.0888179f,  7.34822f,     -6.71488f,  -7.06592f,  6.33224f,
-  0.0333619f,  -0.377782f,  0.160767f,    -0.128169f, -0.484818f, -0.311973f
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = {
+  0.0136957f,   0.182135f,    -0.583394f,    0.0556956f,   0.211152f,
+  0.168234f,    -0.694203f,   -0.678216f,    0.289943f,    1.00014f,
+  -0.0427784f,  -0.0427538f,  -0.0276009f,   -0.00133608f, 0.0901944f,
+  0.0674892f,   0.104068f,    -0.308582f,    -0.43596f,    0.855997f,
+  -0.223414f,   0.0390026f,   0.366492f,     0.216065f,    -0.386863f,
+  -0.148823f,   -0.297022f,   0.0529546f,    -0.202885f,   1.26471f,
+  -0.861163f,   -0.0949431f,  0.573627f,     -0.00277083f, -0.616063f,
+  -0.626927f,   0.371583f,    -0.411743f,    0.173387f,    -0.209734f,
+  0.293697f,    -0.260714f,   0.442728f,     -0.594486f,   1.38987f,
+  0.208025f,    -0.0433776f,  0.01173f,      0.921766f,    -0.168379f,
+  0.000697326f, 0.209967f,    -0.304577f,    0.149551f,    -0.196658f,
+  0.389251f,    -0.449106f,   -0.456329f,    0.669073f,    -0.163806f,
+  0.083348f,    -0.0783998f,  0.0678355f,    0.0510435f,   0.103964f,
+  0.104537f,    -0.778093f,   -1.0641f,      -0.626102f,   -2.02131f,
+  0.159591f,    0.254161f,    -0.000362642f, 0.289859f,    0.192713f,
+  0.139801f,    -0.0251327f,  0.164002f,     1.22892f,     -0.0852193f,
+  0.0769487f,   0.0296408f,   -0.0418688f,   0.0936023f,   0.0448523f,
+  0.674015f,    -0.0732944f,  0.313575f,     -0.593432f,   0.642067f,
+  -1.06063f,    0.468223f,    -0.769085f,    -0.173798f,   -0.175663f,
+  0.692808f,    0.00753295f,  -0.123327f,    -0.0234937f,  -0.0923153f,
+  0.0216917f,   -0.0690157f,  -0.397488f,    0.426628f,    0.264475f,
+  0.342074f,    -0.139817f,   0.215915f,     0.422544f,    -0.321102f,
+  0.0355587f,   0.460193f,    0.0315326f,    0.080556f,    -0.0256533f,
+  -0.0857874f,  -0.488283f,   -0.299653f,    -0.245987f,   0.104383f,
+  0.203731f,    0.328734f,    0.668104f,     -0.586909f,   -0.501335f,
+  -0.661292f,   -0.359811f,   0.00951363f,   0.816315f,    -0.0124104f,
+  0.0545827f,   0.089863f,    0.0125486f,    0.043609f,    -0.0259544f,
+  0.0123911f,   0.12557f,     -0.539875f,    -0.0556721f,  0.16532f,
+  0.265834f,    -0.384171f,   0.646496f,     0.366147f,    -0.111272f,
+  0.262096f,    -0.0845724f,  0.382724f,     0.165783f,    0.1025f,
+  0.392988f,    0.290525f,    0.038659f,     0.540269f,    -0.485586f,
+  -0.273065f,   -0.154052f,   -0.0896895f,   -0.35394f,    0.193214f,
+  -0.423728f,   0.654576f,    -0.373321f,    0.814914f,    0.026278f,
+  -0.0328304f,  -0.220913f,   -0.0442121f,   0.487545f,    -0.509537f,
+  -0.777581f,   -1.23886f,    0.223482f,     0.206009f,    0.20391f,
+  0.194628f,    0.226762f,    0.171609f,     -0.219037f,   0.557892f,
+  -0.312011f,   1.27709f,     0.064013f,     0.105384f,    0.0493933f,
+  0.074059f,    -0.0100078f,  -0.0176888f,   -0.440005f,   0.302922f,
+  -0.197456f,   0.296128f,    -0.326647f,    0.305323f,    -0.30696f,
+  0.201951f,    -0.15874f,    -0.793042f,    0.0197254f,   0.0569867f,
+  -0.0295468f,  -0.0215012f,  0.025855f,     -0.0196102f,  0.215558f,
+  -0.253069f,   0.298469f,    0.261269f,     0.435305f,    0.0120354f,
+  -0.384789f,   -0.2772f,     0.0366613f,    -0.494994f,   0.149072f,
+  1.32981f,     -0.427717f,   0.43938f,      -0.16375f,    -0.444342f,
+  0.548214f,    0.127955f,    -1.24387f,     0.0863676f,   0.175071f,
+  0.172673f,    -0.0906204f,  0.444454f,     -0.546669f,   0.215857f,
+  -0.100621f,   0.200699f,    -0.0985915f,   0.134706f,    -0.256396f,
+  0.393427f,    0.119606f,    -0.214278f,    -0.0183637f,  0.194266f,
+  -0.238025f,   0.182203f,    0.599718f,     0.846933f,    0.0607852f,
+  -0.183434f,   -0.723743f,   -0.72414f,     -0.124701f,   0.0227527f,
+  -0.0664636f,  -0.0385867f,  -0.0257377f,   -0.149054f,   0.12077f,
+  0.678029f,    -0.624456f,   0.189644f,     -0.518604f,   0.134397f,
+  -0.189777f,   -0.309376f,   -0.00377086f,  0.701132f,    -0.170915f,
+  0.00736111f,  -0.121906f,   0.329136f,     0.165514f,    0.0328356f,
+  0.171275f,    0.248619f,    0.247704f,     -0.449933f,   0.0841684f,
+  0.136982f,    0.122703f,    -0.0169439f,   -0.0726496f,  0.302648f,
+  -0.128556f,   0.0667425f,   -0.289717f,    -0.207532f,   -1.20269f,
+  -0.68892f,    0.045259f,    0.0973945f,    0.0988314f,   -0.944748f,
+  -0.180401f,   0.134331f,    0.033834f,     0.109023f,    0.265723f,
+  0.38063f,     -0.106518f,   -0.0686953f,   0.3744f,      -1.0957f,
+  0.0302782f,   0.0515164f,   0.00188222f,   0.0014413f,   -0.0404425f,
+  0.0124618f,   -0.0828645f,  0.506166f,     -0.776352f,   -0.405138f,
+  -0.123887f,   0.0732116f,   0.379928f,     0.604524f,    -0.492317f,
+  0.439191f,    0.0744193f,   0.389101f,     0.0604518f,   0.0943165f,
+  0.0339942f,   0.0917975f,   0.0161988f,    0.512227f,    0.538021f,
+  -0.411495f,   0.307281f,    0.33746f,      -0.218639f,   0.265742f,
+  0.39738f,     -0.12442f,    0.125236f,     -0.0845223f,  -0.150396f,
+  0.0334878f,   -0.00391915f, 0.0406864f,    -0.0487059f,  0.0377073f
 };
 
-static const float av1_simple_motion_search_based_split_logits_kernel_16[] = {
-  -0.132207f,   0.15176f,   -0.680086f, 0.605921f, -0.43294f,  0.485811f,
-  -0.306286f,   0.551368f,  0.413904f,  0.548748f, -0.437391f, 0.560778f,
-  -0.00685266f, -0.558657f, 0.122127f,  0.260165f
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = {
+  0.0535976f, -0.0130279f, 0.150146f,   -0.511132f, -0.357698f,
+  0.6719f,    -1.27877f,   -0.0208048f, 0.0961914f, 0.263603f,
+  0.704574f,  -1.48998f,   0.728063f,   0.941829f,  -0.199981f,
+  0.797802f,  -0.29816f,   -0.60894f,   -0.116624f, -1.16723f
 };
 
-static const float av1_simple_motion_search_based_split_layer_0_bias_16[] = {
-  -0.200928f, -0.074132f, 8.69963f,    -9.00807f,  9.08983f, -6.83586f,
-  -3.89329f,  10.4881f,   -0.0670618f, 0.0f,       9.21614f, 8.41773f,
-  -0.145851f, 0.0f,       -1.43038f,   -0.0460311f
+static const float av1_simple_motion_search_split_logits_kernel_16[] = {
+  0.343153f,   -0.2110482f, -0.487199f,   0.3274144f, -2.1975f,
+  -0.6051438f, 0.1901127f,  0.4741924f,   -0.24029f,  -0.185018f,
+  -0.652635f,  2.57714f,    -0.31033031f, -0.307222f, 0.329035f,
+  -0.430181f,  0.3429f,     0.742292f,    0.3269808f, 0.4142165f
 };
 
-static const float av1_simple_motion_search_based_split_logits_bias_16[] = {
-  -4.19885f
+static const float av1_simple_motion_search_split_logits_bias_16[] = {
+  -0.783658f
 };
 
-static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_16 = {
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = {
   NUM_FEATURES_16,
   NUM_LOGITS_16,
   NUM_HIDDEN_LAYERS_16,
@@ -2686,12 +3168,12 @@
       NUM_LAYER_0_UNITS_16,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_kernel_16,
-      av1_simple_motion_search_based_split_logits_kernel_16,
+      av1_simple_motion_search_split_hiddenlayer_0_kernel_16,
+      av1_simple_motion_search_split_logits_kernel_16,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_bias_16,
-      av1_simple_motion_search_based_split_logits_bias_16,
+      av1_simple_motion_search_split_hiddenlayer_0_bias_16,
+      av1_simple_motion_search_split_logits_bias_16,
   },
 };
 
@@ -2700,49 +3182,100 @@
 #undef NUM_LAYER_0_UNITS_16
 #undef NUM_LOGITS_16
 
-#if !CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8
-// BLOCK_8X8
 #define NUM_HIDDEN_LAYERS_8 1
-#define NUM_FEATURES_8 6
-#define NUM_LAYER_0_UNITS_8 16
+#define NUM_FEATURES_8 17
+#define NUM_LAYER_0_UNITS_8 20
 #define NUM_LOGITS_8 1
 
-static const float av1_simple_motion_search_based_split_layer_0_kernel_8[] = {
-  0.0370236f,   -0.580211f,  2.0134f,    1.69637f,    2.43181f,   -0.521648f,
-  -0.00375187f, 0.122712f,   -4.74411f,  7.36187f,    5.42574f,   -5.53557f,
-  0.0993344f,   -0.358843f,  0.0765453f, -0.615987f,  -0.754633f, -0.175846f,
-  0.714976f,    0.492862f,   0.346604f,  -1.23922f,   -2.67031f,  2.12749f,
-  1.71511f,     -1.4239f,    2.09396f,   2.42478f,    2.40151f,   2.90487f,
-  0.540813f,    -0.0954257f, -4.57571f,  -4.88078f,   -4.62386f,  -5.75167f,
-  1.35351f,     -1.08114f,   1.43744f,   1.44333f,    0.608153f,  0.193742f,
-  -0.405512f,   -0.155164f,  0.0771456f, -0.473182f,  -0.057984f, 0.140435f,
-  0.743021f,    -0.418589f,  -0.377622f, -0.531411f,  -0.668025f, -0.826607f,
-  1.37834f,     -1.07753f,   0.870466f,  0.516756f,   0.708689f,  0.286795f,
-  -3.97895f,    -0.338629f,  2.79427f,   1.80561f,    1.46275f,   1.50438f,
-  0.0232533f,   -0.43174f,   -0.348251f, 0.0863006f,  0.0321103f, 0.129674f,
-  -1.12024f,    -0.0990596f, -0.283472f, -0.238713f,  -0.239175f, -0.40816f,
-  -0.00106566f, 0.0972736f,  5.19284f,   -3.70862f,   6.39657f,   -5.27588f,
-  -2.08003f,    0.38825f,    2.38771f,   -1.27501f,   -2.45619f,  3.07324f,
-  0.616966f,    -0.451472f,  -0.319365f, 0.00807278f, -0.303261f, -0.351679f
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = {
+  0.079443f,   -1.04068f,   0.336819f,    -0.20901f,   0.796251f,
+  0.181066f,   0.0118876f,  -0.207145f,   0.250671f,   -0.402119f,
+  -0.0847227f, 1.88683f,    0.303469f,    0.0718458f,  0.0338589f,
+  0.158896f,   0.0540238f,  -0.385426f,   0.955925f,   0.424506f,
+  0.492584f,   -0.795058f,  -0.248667f,   -0.905349f,  -0.316989f,
+  0.545471f,   0.63762f,    -0.232613f,   -0.238947f,  -0.395338f,
+  -0.322673f,  -0.0761563f, -0.125357f,   0.0694415f,  -0.371599f,
+  0.358387f,   -0.486841f,  0.403863f,    -0.0295666f, 0.283074f,
+  -0.424396f,  0.156318f,   -0.685355f,   0.6663f,     0.337949f,
+  0.273198f,   0.517448f,   0.458911f,    0.157252f,   0.692096f,
+  0.64965f,    -0.23987f,   -1.08431f,    -0.252475f,  -0.332614f,
+  -0.712291f,  -0.380973f,  0.460545f,    0.48936f,    0.337601f,
+  0.489223f,   1.65336f,    -0.223585f,   0.17367f,    -0.235057f,
+  -0.456773f,  0.327877f,   -0.221192f,   -0.940151f,  -1.06616f,
+  0.687084f,   -0.109973f,  0.106636f,    0.445895f,   0.163432f,
+  0.378306f,   0.201902f,   0.176811f,    0.693082f,   1.62156f,
+  -0.178346f,  0.455175f,   1.61943f,     0.231376f,   0.0890932f,
+  -0.889693f,  -1.03298f,   0.778196f,    -0.0289539f, 0.137848f,
+  0.18707f,    0.171889f,   0.119157f,    0.24893f,    -0.313628f,
+  0.00250735f, -0.0758209f, 0.272974f,    -0.229825f,  2.47926f,
+  -0.0354665f, 0.175366f,   0.0411555f,   -1.52149f,   -0.0258663f,
+  0.253027f,   -0.0520839f, -0.0189782f,  0.362387f,   -0.371154f,
+  0.622929f,   0.0447056f,  0.242529f,    -0.168391f,  0.308935f,
+  -0.117294f,  2.16307f,    0.0673638f,   0.080771f,   -0.460779f,
+  -0.940176f,  0.473266f,   -0.0125302f,  0.475145f,   -0.218187f,
+  0.43258f,    -0.0380196f, 0.413607f,    -0.110856f,  -1.52076f,
+  0.0896812f,  0.246636f,   -0.0612008f,  0.189583f,   0.0106902f,
+  -0.158403f,  -0.629377f,  -0.0634279f,  -0.0864584f, -0.226568f,
+  -0.286234f,  -0.0721132f, -0.43702f,    0.113702f,   0.433372f,
+  0.743396f,   0.14312f,    0.29914f,     0.801188f,   0.7609f,
+  0.385046f,   0.480314f,   0.171119f,    -1.59058f,   -1.18853f,
+  0.150676f,   0.408123f,   -0.00677924f, 0.398145f,   0.0914611f,
+  0.176945f,   0.0677457f,  0.316478f,    0.998219f,   -0.22618f,
+  0.0756793f,  -0.0156674f, 0.105716f,    0.0496245f,  -0.0827133f,
+  -0.423119f,  -0.161033f,  0.212962f,    -0.234453f,  0.743366f,
+  1.04108f,    0.0597604f,  -0.285993f,   -0.114829f,  -0.557364f,
+  -0.840051f,  0.326509f,   -0.192508f,   -0.141769f,  0.370626f,
+  -0.126353f,  0.00672923f, 0.493623f,    -0.852076f,  0.466798f,
+  -0.226436f,  0.259268f,   -0.452662f,   0.0721126f,  0.0198245f,
+  0.2048f,     0.02506f,    0.316194f,    0.814651f,   1.01288f,
+  -0.569607f,  -0.0838994f, 1.37146f,     -0.613135f,  0.441761f,
+  -0.643901f,  0.364269f,   -0.147177f,   0.338001f,   -0.332376f,
+  0.518875f,   -0.628964f,  -0.291889f,   -0.050736f,  0.108047f,
+  1.05673f,    0.0479492f,  0.466756f,    -0.0867334f, -0.0355575f,
+  0.57626f,    -0.227583f,  -0.146421f,   0.0990489f,  0.117351f,
+  -0.103858f,  -0.0336936f, 0.0201903f,   -0.0766383f, -0.010211f,
+  0.0400779f,  0.0725462f,  0.137142f,    0.478261f,   0.287869f,
+  0.0882359f,  -0.739754f,  -0.853521f,   -0.43703f,   0.316856f,
+  0.27593f,    0.312149f,   0.175575f,    0.441839f,   0.264325f,
+  0.0148051f,  -0.005559f,  0.373176f,    0.933701f,   -0.0197615f,
+  0.0219723f,  -0.0559883f, -0.103456f,   -0.0323009f, 0.0773202f,
+  -0.390838f,  0.855488f,   -0.596525f,   -0.249093f,  0.124262f,
+  0.220172f,   0.0552478f,  1.04041f,     -0.960992f,  -0.495255f,
+  -0.211612f,  0.350007f,   -0.238998f,   -0.0265068f, 0.384686f,
+  -0.0815808f, -0.0570019f, 0.123903f,    -0.485114f,  -0.00282573f,
+  -0.0649603f, 0.163719f,   -0.469479f,   -0.439713f,  0.0602562f,
+  -0.527993f,  -0.111458f,  2.48686f,     -0.180723f,  0.0553895f,
+  0.0560679f,  -0.0978928f, -0.216063f,   0.089457f,   -1.5602f,
+  -1.62332f,   -0.147388f,  0.736155f,    0.440409f,   0.243519f,
+  0.0622638f,  0.522932f,   0.109686f,    0.422849f,   0.510589f,
+  1.01116f,    0.174019f,   0.0191171f,   -0.0717751f, -0.0068308f,
+  0.172932f,   -0.834888f,  -0.635788f,   0.32012f,    0.298656f,
+  0.274309f,   -0.155456f,  0.1755f,      -0.175171f,  0.343498f,
+  -0.122832f,  -0.107696f,  0.279924f,    -0.797633f,  -0.344658f,
+  0.162669f,   0.389092f,   0.644479f,    -0.635216f,  -0.181868f,
+  0.0579244f,  -0.0568976f, 0.433003f,    -0.591067f,  0.71013f,
+  -0.165515f,  0.225725f,   -0.358156f,   0.0541944f,  1.95485f,
+  -0.315223f,  0.61537f,    -0.0401568f,  0.22811f,    0.271147f
 };
 
-static const float av1_simple_motion_search_based_split_logits_kernel_8[] = {
-  -0.625847f, 0.381323f, 0.342475f, 0.526161f,  -0.665965f, -0.515317f,
-  -0.406218f, 0.568007f, 0.479397f, -0.426116f, 0.615638f,  0.338572f,
-  0.185583f,  0.308031f, 0.260748f, 0.531619f
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = {
+  1.63441f,  -0.616459f, -0.437775f, -0.71669f,  1.56616f,  2.28109f, 1.64054f,
+  -1.51476f, 0.0274108f, 0.935156f,  -0.966329f, 0.906069f, 1.19954f, -1.25867f,
+  -1.7376f,  -0.594211f, 0.322242f,  0.438631f,  -1.01682f, 1.30032f
 };
 
-static const float av1_simple_motion_search_based_split_layer_0_bias_8[] = {
-  4.73775f,  -1.12658f, -0.258038f, -6.06696f, 1.79131f, 2.49609f,
-  4.28388f,  0.0f,      -4.63598f,  3.06034f,  5.31994f, -0.152142f,
-  0.514738f, -1.30098f, 3.00296f,   -3.83481f
+static const float av1_simple_motion_search_split_logits_kernel_8[] = {
+  -0.463187f, 0.2936127f, 0.16762f,    -0.1663271f, -0.292418f,
+  -0.421457f, -0.378265f, 1.053049f,   0.32432879f, -0.49775575f,
+  0.427357f,  -0.239251f, -0.1631546f, 0.335468f,   0.255371f,
+  0.276901f,  -0.665683f, -0.7021493f, 0.381513f,   -0.1339761f
 };
 
-static const float av1_simple_motion_search_based_split_logits_bias_8[] = {
-  -3.44508f
+static const float av1_simple_motion_search_split_logits_bias_8[] = {
+  -1.739754f
 };
 
-static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_8 = {
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = {
   NUM_FEATURES_8,
   NUM_LOGITS_8,
   NUM_HIDDEN_LAYERS_8,
@@ -2750,76 +3283,114 @@
       NUM_LAYER_0_UNITS_8,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_kernel_8,
-      av1_simple_motion_search_based_split_logits_kernel_8,
+      av1_simple_motion_search_split_hiddenlayer_0_kernel_8,
+      av1_simple_motion_search_split_logits_kernel_8,
   },
   {
-      av1_simple_motion_search_based_split_layer_0_bias_8,
-      av1_simple_motion_search_based_split_logits_bias_8,
+      av1_simple_motion_search_split_hiddenlayer_0_bias_8,
+      av1_simple_motion_search_split_logits_bias_8,
   },
 };
 
-#endif
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
 
-// Model based on simple_motion_search
-
-// Thresholds for doing a single type of partition
-// TODO(chiyotsai@google.com): Set the thresholds for PARTITION_SPLIT.
-static const float av1_simple_motion_search_prune_part_only_thresh_128[10] = {
-  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = {
+  &av1_simple_motion_search_split_nn_config_128,
+  &av1_simple_motion_search_split_nn_config_64,
+  &av1_simple_motion_search_split_nn_config_32,
+  &av1_simple_motion_search_split_nn_config_16,
+  &av1_simple_motion_search_split_nn_config_8,
 };
 
-static const float av1_simple_motion_search_prune_part_only_thresh_64[10] = {
-  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
-};
-
-static const float av1_simple_motion_search_prune_part_only_thresh_32[10] = {
-  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
-};
-
-static const float av1_simple_motion_search_prune_part_only_thresh_16[10] = {
-  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
-};
-
-static const float av1_simple_motion_search_prune_part_only_thresh_8[10] = {
-  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
-};
-
-// Thresholds for pruning a partition type
-static const float av1_simple_motion_search_prune_part_prune_thresh_128[10] = {
-  0.0f, 0.0288721601835f, 0.0288721601835f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-  0.0f
-};
-
-static const float av1_simple_motion_search_prune_part_prune_thresh_64[10] = {
-  0.0f, 0.0281573780991f, 0.0281573780991f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-  0.0f
-};
-
-static const float av1_simple_motion_search_prune_part_prune_thresh_32[10] = {
-  0.0f, 0.0225501403434f, 0.0225501403434f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-  0.0f
-};
-
-static const float av1_simple_motion_search_prune_part_prune_thresh_16[10] = {
-  0.0f,
-  0.000961189195907f,
-  0.000961189195907f,
-  0.0f,
-  0.0f,
-  0.0f,
-  0.0f,
-  0.0f,
-  0.0f,
-  0.0f
-};
-
-static const float av1_simple_motion_search_prune_part_prune_thresh_8[10] = {
-  0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+// Model based on simple_motion_search for pruning rect
+// Thresholds. The first idx level is aggressiveness, second is frame resolution,
+// third is bsize
+static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = {
+  // Aggressiveness = 0
+  {
+      // Lowres
+      { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+        0.000961189195907f, 0.0f },
+      // Midres
+      { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+        0.000961189195907f, 0.0f },
+      // Hdres
+      { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+        0.000961189195907f, 0.0f },
+  },
+  // Aggressiveness = 1
+  {
+      // Lowres
+      {
+          0.000000f,
+          0.116076f,
+          0.049759f,
+          0.057747f,
+          0.006001f,
+      },
+      // Midres
+      {
+          0.000000f,
+          0.017380f,
+          0.026077f,
+          0.078111f,
+          0.064477f,
+      },
+      // Hdres
+      {
+          0.002994f,
+          0.103093f,
+          0.076408f,
+          0.010456f,
+          0.187211f,
+      },
+  },
+  // Aggressiveness = 2
+  {
+      // Lowres
+      {
+          0.000000f,
+          0.003111f,
+          0.144294f,
+          0.144884f,
+          0.069924f,
+      },
+      // Midres
+      {
+          0.000000f,
+          0.013696f,
+          0.055203f,
+          0.152271f,
+          0.078886f,
+      },
+      // Hdres
+      {
+          0.030577f,
+          0.082486f,
+          0.040690f,
+          0.140924f,
+          0.067608f,
+      },
+  },
+  // Aggressiveness = 3
+  {
+      // Lowres
+      { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f,
+        0.287219697095f },
+      // Midres
+      { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f,
+        0.178833795641f },
+      // Hdres
+      { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f,
+        0.21329309279f },
+  },
 };
 
 // Mean and std
-static const float av1_simple_motion_search_prune_part_mean_128[25] = {
+static const float av1_simple_motion_search_prune_rect_mean_128[25] = {
   13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
   10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
   12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
@@ -2827,14 +3398,14 @@
   4.012611f,  4.052191f,  0.853365f,  3.954503f,  3.944135f,
 };
 
-static const float av1_simple_motion_search_prune_part_std_128[25] = {
+static const float av1_simple_motion_search_prune_rect_std_128[25] = {
   2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
   3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
   2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
   1.208679f, 0.353742f, 1.228122f, 1.211777f,
 };
 
-static const float av1_simple_motion_search_prune_part_mean_64[25] = {
+static const float av1_simple_motion_search_prune_rect_mean_64[25] = {
   11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
   9.084122f,  8.559063f,  8.499496f, 8.095865f, 8.041795f,
   10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
@@ -2842,61 +3413,77 @@
   3.306144f,  3.351039f,  0.928582f, 3.319739f, 3.287726f,
 };
 
-static const float av1_simple_motion_search_prune_part_std_64[25] = {
+static const float av1_simple_motion_search_prune_rect_std_64[25] = {
   2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
   3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
   2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
   1.081292f, 0.257521f, 1.112510f, 1.089404f,
 };
 
-static const float av1_simple_motion_search_prune_part_mean_32[25] = {
+static const float av1_simple_motion_search_prune_rect_mean_32[25] = {
   9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
   7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
   8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
   2.751266f, 0.963302f, 2.716584f, 2.709725f,
 };
 
-static const float av1_simple_motion_search_prune_part_std_32[25] = {
+static const float av1_simple_motion_search_prune_rect_std_32[25] = {
   1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
   1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
   1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 0.973824f,
   0.952221f, 0.188018f, 0.985295f, 0.946228f,
 };
 
-static const float av1_simple_motion_search_prune_part_mean_16[25] = {
+static const float av1_simple_motion_search_prune_rect_mean_16[25] = {
   8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f,
   6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f,
   7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f,
   2.131698f, 0.981005f, 2.110868f, 2.106539f,
 };
 
-static const float av1_simple_motion_search_prune_part_std_16[25] = {
+static const float av1_simple_motion_search_prune_rect_std_16[25] = {
   1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f,
   1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f,
   1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f,
   0.829935f, 0.136507f, 0.828972f, 0.808563f,
 };
 
-static const float av1_simple_motion_search_prune_part_mean_8[25] = {
+static const float av1_simple_motion_search_prune_rect_mean_8[25] = {
   6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f,
   4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f,
   6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f,
   1.531762f, 0.989606f, 1.496581f, 1.484139f,
 };
 
-static const float av1_simple_motion_search_prune_part_std_8[25] = {
+static const float av1_simple_motion_search_prune_rect_std_8[25] = {
   1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f,
   1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f,
   1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f,
   0.754040f, 0.101419f, 0.738239f, 0.729455f,
 };
 
+static const float *const av1_simple_motion_search_prune_rect_mean[5] = {
+  av1_simple_motion_search_prune_rect_mean_128,
+  av1_simple_motion_search_prune_rect_mean_64,
+  av1_simple_motion_search_prune_rect_mean_32,
+  av1_simple_motion_search_prune_rect_mean_16,
+  av1_simple_motion_search_prune_rect_mean_8,
+};
+
+static const float *const av1_simple_motion_search_prune_rect_std[5] = {
+  av1_simple_motion_search_prune_rect_std_128,
+  av1_simple_motion_search_prune_rect_std_64,
+  av1_simple_motion_search_prune_rect_std_32,
+  av1_simple_motion_search_prune_rect_std_16,
+  av1_simple_motion_search_prune_rect_std_8,
+};
+
 #define NUM_HIDDEN_LAYERS_128 1
 #define NUM_FEATURES_128 25
 #define NUM_LAYER_0_UNITS_128 8
 #define NUM_LOGITS_128 4
 
-static const float av1_simple_motion_search_prune_part_logits_kernel_128[] = {
+static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = {
   -0.129103f, 0.457758f,  -0.489986f, 0.65462f,   -0.184312f, 3.81202f,
   -0.444407f, -0.64198f,  -0.575008f, 0.0311711f, 0.525243f,  -20.892f,
   1.08811f,   -65.0976f,  -12.3973f,  -1.38278f,  -0.264233f, 0.241636f,
@@ -2905,12 +3492,12 @@
   0.398452f,  0.696949f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_bias_128[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = {
   1.22789f, -1.34527f, 0.759048f,  0.315086f,
   1.0834f,  -1.58019f, -0.465158f, 1.20716f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_kernel_128[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = {
   -0.668677f,  0.58694f,    -0.417094f,   0.754735f,   -0.7859f,
   0.377479f,   -0.0415929f, -0.0140585f,  -0.730001f,  0.747528f,
   -0.135247f,  0.406505f,   -0.234184f,   0.956362f,   -0.637555f,
@@ -2953,11 +3540,11 @@
   -0.197626f,  0.130044f,   -0.234488f,   -0.0373991f, -0.0717973f
 };
 
-static const float av1_simple_motion_search_prune_part_logits_bias_128[] = {
+static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = {
   1.58571f, -4.6314f, -2.00273f, 0.543699f
 };
 
-static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_128 = {
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = {
   NUM_FEATURES_128,
   NUM_LOGITS_128,
   NUM_HIDDEN_LAYERS_128,
@@ -2965,12 +3552,12 @@
       NUM_LAYER_0_UNITS_128,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_kernel_128,
-      av1_simple_motion_search_prune_part_logits_kernel_128,
+      av1_simple_motion_search_prune_rect_layer_0_kernel_128,
+      av1_simple_motion_search_prune_rect_logits_kernel_128,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_bias_128,
-      av1_simple_motion_search_prune_part_logits_bias_128,
+      av1_simple_motion_search_prune_rect_layer_0_bias_128,
+      av1_simple_motion_search_prune_rect_logits_bias_128,
   },
 };
 
@@ -2984,7 +3571,7 @@
 #define NUM_LAYER_0_UNITS_64 32
 #define NUM_LOGITS_64 10
 
-static const float av1_simple_motion_search_prune_part_logits_kernel_64[] = {
+static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = {
   0.10424f,    -0.346025f,  0.534547f,   -0.385925f,  2.58341f,    -0.256414f,
   -0.232498f,  0.329823f,   -0.0777376f, -0.590939f,  0.062657f,   -0.628252f,
   0.0934588f,  2.04029f,    -0.224448f,  0.371168f,   -0.385348f,  -0.589883f,
@@ -3041,7 +3628,7 @@
   -0.359633f,  0.668108f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_bias_64[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = {
   0.0735592f, -0.045064f, -0.0114103f, 1.39246f,    -0.683467f,  0.155765f,
   -0.667652f, -0.202425f, -0.585433f,  -0.146752f,  -0.0812931f, 0.580642f,
   0.578542f,  -0.831916f, 0.610063f,   0.0101856f,  -0.235863f,  0.538141f,
@@ -3050,7 +3637,7 @@
   0.656818f,  0.0169274f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_kernel_64[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = {
   -0.310947f,   -0.232675f,    0.0171092f,    0.0834474f,   0.373977f,
   0.300429f,    0.215072f,     -0.454074f,    0.187565f,    0.282742f,
   0.562562f,    -0.0419322f,   0.000978486f,  -0.298267f,   0.216934f,
@@ -3213,12 +3800,12 @@
   -0.737297f,   -0.201515f,    -0.025122f,    -0.109854f,   0.36738f
 };
 
-static const float av1_simple_motion_search_prune_part_logits_bias_64[] = {
+static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = {
   0.346819f,  0.442965f,  -0.0216032f,  0.0229235f, -0.402797f,
   -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f
 };
 
-static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_64 = {
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = {
   NUM_FEATURES_64,
   NUM_LOGITS_64,
   NUM_HIDDEN_LAYERS_64,
@@ -3226,12 +3813,12 @@
       NUM_LAYER_0_UNITS_64,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_kernel_64,
-      av1_simple_motion_search_prune_part_logits_kernel_64,
+      av1_simple_motion_search_prune_rect_layer_0_kernel_64,
+      av1_simple_motion_search_prune_rect_logits_kernel_64,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_bias_64,
-      av1_simple_motion_search_prune_part_logits_bias_64,
+      av1_simple_motion_search_prune_rect_layer_0_bias_64,
+      av1_simple_motion_search_prune_rect_logits_bias_64,
   },
 };
 
@@ -3245,7 +3832,7 @@
 #define NUM_LAYER_0_UNITS_32 28
 #define NUM_LOGITS_32 10
 
-static const float av1_simple_motion_search_prune_part_logits_kernel_32[] = {
+static const float av1_simple_motion_search_prune_rect_logits_kernel_32[] = {
   0.486581f,    0.340847f,   -0.109226f,   0.467224f,   -0.541561f,
   0.0943619f,   -0.429442f,  -0.207442f,   0.959963f,   0.618666f,
   -0.0636751f,  0.144508f,   -0.0278289f,  0.332293f,   -0.751493f,
@@ -3304,7 +3891,7 @@
   -0.280626f,   0.42476f,    0.157411f,    0.0358675f,  -0.192591f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_bias_32[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = {
   0.940498f,  0.15602f,   -0.234831f, 0.0268585f, 0.144769f,  0.243081f,
   0.611406f,  0.366093f,  0.361868f,  0.39668f,   0.401479f,  0.369467f,
   0.0909503f, 0.710595f,  0.032786f,  0.525891f,  -1.0232f,   0.732557f,
@@ -3312,7 +3899,7 @@
   0.59681f,   -0.472405f, 0.0969218f, -0.250624f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_kernel_32[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = {
   0.355607f,    0.126701f,    -0.0825159f,  0.200675f,     -0.011308f,
   -0.280057f,   0.559816f,    0.142689f,    0.0422419f,    -0.151692f,
   -0.0275637f,  -0.283101f,   -0.20822f,    -0.200394f,    0.465427f,
@@ -3455,12 +4042,12 @@
   -0.092764f,   0.0295707f,   -0.0462887f,  -0.00636006f,  0.0334169f
 };
 
-static const float av1_simple_motion_search_prune_part_logits_bias_32[] = {
+static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = {
   0.176459f,  0.154405f, 0.281821f,  0.375264f,  -0.882863f,
   -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f
 };
 
-static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_32 = {
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = {
   NUM_FEATURES_32,
   NUM_LOGITS_32,
   NUM_HIDDEN_LAYERS_32,
@@ -3468,12 +4055,12 @@
       NUM_LAYER_0_UNITS_32,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_kernel_32,
-      av1_simple_motion_search_prune_part_logits_kernel_32,
+      av1_simple_motion_search_prune_rect_layer_0_kernel_32,
+      av1_simple_motion_search_prune_rect_logits_kernel_32,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_bias_32,
-      av1_simple_motion_search_prune_part_logits_bias_32,
+      av1_simple_motion_search_prune_rect_layer_0_bias_32,
+      av1_simple_motion_search_prune_rect_logits_bias_32,
   },
 };
 
@@ -3487,7 +4074,7 @@
 #define NUM_LAYER_0_UNITS_16 32
 #define NUM_LOGITS_16 10
 
-static const float av1_simple_motion_search_prune_part_logits_kernel_16[] = {
+static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = {
   -0.520913f,   0.395611f,    0.0369091f,   -0.318591f,  -0.463252f,
   0.134992f,    -0.43154f,    -0.0739112f,  -0.118817f,  0.476373f,
   -0.281406f,   0.3413f,      0.456255f,    0.33307f,    0.2942f,
@@ -3554,7 +4141,7 @@
   -0.202705f,   -0.0852339f,  -0.62572f,    -0.0734234f, -0.838088f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_bias_16[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = {
   -0.0616197f, 0.939947f, 0.521161f,  0.213886f,  0.130324f,  -0.127443f,
   -0.0538715f, 0.708746f, 0.445031f,  0.418781f,  -0.114539f, 0.521941f,
   1.13719f,    0.606545f, -0.32193f,  -0.150788f, 0.158487f,  -0.224005f,
@@ -3563,7 +4150,7 @@
   0.661496f,   0.95533f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_kernel_16[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = {
   -0.203489f,   0.00686229f,  -0.161414f,   0.0637276f,   0.27516f,
   0.512219f,    0.164205f,    0.00326062f,  -0.41914f,    -0.400334f,
   0.554419f,    0.715772f,    -0.295569f,   -0.703503f,   0.0137744f,
@@ -3726,12 +4313,12 @@
   -0.0679228f,  -0.203457f,   0.131948f,    -0.0041251f,  -0.209054f
 };
 
-static const float av1_simple_motion_search_prune_part_logits_bias_16[] = {
+static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = {
   0.304025f,  0.131887f, 0.259279f,  -0.561564f, -0.161729f,
   -0.208036f, 0.102206f, -0.162937f, -1.42311f,  -0.708305f
 };
 
-static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_16 = {
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = {
   NUM_FEATURES_16,
   NUM_LOGITS_16,
   NUM_HIDDEN_LAYERS_16,
@@ -3739,12 +4326,12 @@
       NUM_LAYER_0_UNITS_16,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_kernel_16,
-      av1_simple_motion_search_prune_part_logits_kernel_16,
+      av1_simple_motion_search_prune_rect_layer_0_kernel_16,
+      av1_simple_motion_search_prune_rect_logits_kernel_16,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_bias_16,
-      av1_simple_motion_search_prune_part_logits_bias_16,
+      av1_simple_motion_search_prune_rect_layer_0_bias_16,
+      av1_simple_motion_search_prune_rect_logits_bias_16,
   },
 };
 
@@ -3758,7 +4345,7 @@
 #define NUM_LAYER_0_UNITS_8 32
 #define NUM_LOGITS_8 4
 
-static const float av1_simple_motion_search_prune_part_logits_kernel_8[] = {
+static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = {
   -0.266303f,  -0.387676f,  0.204501f,   -0.120842f,  -0.0752326f, 0.0337739f,
   0.0243477f,  -0.356748f,  0.0143051f,  -0.16403f,   -0.139013f,  0.175003f,
   -0.206754f,  0.349059f,   0.181763f,   0.212768f,   -0.313783f,  0.182829f,
@@ -3783,7 +4370,7 @@
   -0.112242f,  0.295184f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_bias_8[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = {
   -2.16023f,  -3.12831f, -0.213206f,  -2.97875f, -1.83791f, -2.84713f,
   -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f,
   -0.853224f, -3.29503f, -0.537517f,  0.923106f, -3.18665f, -1.29905f,
@@ -3792,7 +4379,7 @@
   -0.490783f, -0.415782f
 };
 
-static const float av1_simple_motion_search_prune_part_layer_0_kernel_8[] = {
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = {
   -0.702198f,  -0.102148f,   0.0564545f,   -0.0555548f,  0.16184f,
   0.0950792f,  0.136974f,    -0.00824146f, 0.05746f,     0.0447542f,
   0.145978f,   0.0855769f,   -0.041449f,   0.301347f,    -0.0206691f,
@@ -3955,11 +4542,11 @@
   -0.0811161f, 0.00237994f,  0.850042f,    0.0665473f,   0.134413f
 };
 
-static const float av1_simple_motion_search_prune_part_logits_bias_8[] = {
+static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = {
   1.63404f, -0.715866f, -1.0132f, -2.08745f
 };
 
-static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_8 = {
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = {
   NUM_FEATURES_8,
   NUM_LOGITS_8,
   NUM_HIDDEN_LAYERS_8,
@@ -3967,12 +4554,12 @@
       NUM_LAYER_0_UNITS_8,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_kernel_8,
-      av1_simple_motion_search_prune_part_logits_kernel_8,
+      av1_simple_motion_search_prune_rect_layer_0_kernel_8,
+      av1_simple_motion_search_prune_rect_logits_kernel_8,
   },
   {
-      av1_simple_motion_search_prune_part_layer_0_bias_8,
-      av1_simple_motion_search_prune_part_logits_bias_8,
+      av1_simple_motion_search_prune_rect_layer_0_bias_8,
+      av1_simple_motion_search_prune_rect_logits_bias_8,
   },
 };
 
@@ -3981,77 +4568,14 @@
 #undef NUM_LAYER_0_UNITS_8
 #undef NUM_LOGITS_8
 
-#define FEATURE_SIZE 19
-static const float av1_2pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
-  2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
-  0.125296f, -1.134961f, 0.862757f,  -0.418799f, -0.637666f,
-  0.016232f, 0.345013f,  0.018823f,  -0.393394f, -1.130700f,
-  0.695357f, 0.112569f,  -0.341975f, -0.513882f, 5.7488966f,
-};
-
-static const float av1_2pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
-  2.990993f,  0.423273f,  -0.926544f, 0.454646f,  -0.292698f,
-  -1.311632f, -0.284432f, 0.717141f,  -0.419257f, -0.574760f,
-  -0.674444f, 0.669047f,  -0.374255f, 0.380624f,  -0.804036f,
-  0.264021f,  0.004163f,  1.896802f,  0.924287f,  0.13490619f,
-};
-
-static const float av1_2pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
-  2.795181f,  -0.136943f, -0.924842f, 0.405330f,  -0.463505f,
-  -0.584076f, -0.831472f, 0.382985f,  -0.597544f, -0.138915f,
-  -1.354350f, 0.466035f,  -0.553961f, 0.213202f,  -1.166429f,
-  0.010776f,  -0.096236f, 2.335084f,  1.699857f,  -0.58178353f,
-};
-
-static const float av1_2pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
-  1.987888f,  -0.431100f, -1.687703f, 0.262602f,  -0.425298f,
-  -0.463870f, -1.493457f, 0.470917f,  -0.528457f, -0.087700f,
-  -1.815092f, 0.152883f,  -0.337908f, 0.093679f,  -1.548267f,
-  -0.042387f, -0.000861f, 2.556746f,  1.619192f,  0.03643292f,
-};
-
-static const float av1_2pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
-  2.188344f,  -0.817528f, -2.119219f, 0.000000f,  -0.348167f,
-  -0.658074f, -1.960362f, 0.000000f,  -0.403080f, 0.282699f,
-  -2.061088f, 0.000000f,  -0.431919f, -0.127960f, -1.099550f,
-  0.000000f,  0.121622f,  2.017455f,  2.058228f,  -0.15475988f,
-};
-
-static const float av1_2pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
-  -1.006689f, 0.777908f,  4.461072f,  -0.395782f, -0.014610f,
-  -0.853863f, 0.729997f,  -0.420477f, 0.282429f,  -1.194595f,
-  3.181220f,  -0.511416f, 0.117084f,  -1.149348f, 1.507990f,
-  -0.477212f, 0.202963f,  -1.469581f, 0.624461f,  -0.89081228f,
-};
-
-static const float av1_2pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
-  -1.241117f, 0.844878f,  5.638803f,  -0.489780f, -0.108796f,
-  -4.576821f, 1.540624f,  -0.477519f, 0.227791f,  -1.443968f,
-  1.586911f,  -0.505125f, 0.140764f,  -0.464194f, 1.466658f,
-  -0.641166f, 0.195412f,  1.427905f,  2.080007f,  -1.98272777f,
-};
-
-static const float av1_2pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
-  -2.130825f, 0.476023f,  5.907343f,  -0.516002f, -0.097471f,
-  -2.662754f, 0.614858f,  -0.576728f, 0.085261f,  -0.031901f,
-  0.727842f,  -0.600034f, 0.079326f,  0.324328f,  0.504502f,
-  -0.547105f, -0.037670f, 0.304995f,  0.369018f,  -2.66299987f,
-};
-
-static const float av1_2pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
-  -1.626410f, 0.872047f,  5.414965f,  -0.554781f, -0.084514f,
-  -3.020550f, 0.467632f,  -0.382280f, 0.199568f,  0.426220f,
-  0.829426f,  -0.467100f, 0.153098f,  0.662994f,  0.327545f,
-  -0.560106f, -0.141610f, 0.403372f,  0.523991f,  -3.02891231f,
-};
-
-static const float av1_2pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
-  -1.463349f, 0.375376f,  4.751430f, 0.000000f, -0.184451f,
-  -1.655447f, 0.443214f,  0.000000f, 0.127961f, 0.152435f,
-  0.083288f,  0.000000f,  0.143105f, 0.438012f, 0.073238f,
-  0.000000f,  -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
-};
-#undef FEATURE_SIZE
+static const NN_CONFIG
+    *const av1_simple_motion_search_prune_rect_nn_config[5] = {
+      &av1_simple_motion_search_prune_rect_nn_config_128,
+      &av1_simple_motion_search_prune_rect_nn_config_64,
+      &av1_simple_motion_search_prune_rect_nn_config_32,
+      &av1_simple_motion_search_prune_rect_nn_config_16,
+      &av1_simple_motion_search_prune_rect_nn_config_8,
+    };
 
 // nn model for predicting max square partition level of a superblock
 #define NUM_HIDDEN_LAYERS 1
@@ -4368,438 +4892,752 @@
   -0.5396254205f,
 };
 
-// Early termination in firstpass
-static const float av1_fp_simple_motion_search_term_none_mean_32[20] = {
-  10.216787f, 10.167575f, 8.405353f, 8.340786f,  8.436503f,
-  8.373259f,  8.444113f,  8.379074f, 8.448215f,  8.384669f,
-  4.107491f,  0.923902f,  2.702687f, 2.712742f,  0.953166f,
-  2.703244f,  2.707070f,  9.549801f, 12.013671f, 17.059454f,
+#define FEATURES 31
+#define HIDDEN_NODES 32
+static const float av1_early_term_after_split_nn_weights_64_layer0[] = {
+  -0.306296f, -0.691664f, 0.335148f,  -0.298465f, -0.509241f, -0.632796f,
+  -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f,
+  -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f,
+  -0.240293f, 0.121749f,  -0.489777f, -0.756647f, 0.001047f,  -0.016528f,
+  0.145714f,  0.172910f,  0.086197f,  0.162882f,  -0.070588f, -0.077104f,
+  0.502730f,  -0.244954f, 0.265605f,  -0.323994f, 0.223397f,  -1.086453f,
+  0.391886f,  0.200343f,  0.253878f,  0.018925f,  0.201819f,  -0.205136f,
+  0.427314f,  0.041155f,  0.070484f,  0.159925f,  -0.057095f, -0.146544f,
+  -0.073792f, 0.152628f,  0.003986f,  -0.515965f, -0.209754f, 0.037457f,
+  0.070622f,  -0.143571f, -0.059602f, 0.111734f,  0.319674f,  0.149894f,
+  -0.219883f, 0.206678f,  0.015809f,  -0.210549f, 0.130156f,  -0.189502f,
+  -0.850392f, -0.156363f, -0.060354f, 0.189044f,  0.266495f,  0.151305f,
+  -0.563677f, -0.354896f, 0.300637f,  0.257568f,  -0.008359f, -0.535497f,
+  -0.003127f, 0.293054f,  -0.020212f, -0.157278f, 0.229972f,  -0.309799f,
+  -0.329927f, -0.077140f, 0.001177f,  -0.024415f, 0.134044f,  -0.181587f,
+  -0.135380f, 0.230989f,  -0.281451f, 0.912282f,  0.511562f,  -3.900779f,
+  -0.039917f, 1.956406f,  -0.357589f, 0.292998f,  -0.950158f, 0.422041f,
+  0.526572f,  0.605746f,  -0.147110f, 0.256576f,  0.090010f,  0.221641f,
+  0.029763f,  0.351592f,  0.458324f,  -0.005888f, 0.010521f,  -0.389326f,
+  -0.094006f, -0.171489f, -0.013153f, 0.026333f,  -0.454571f, -1.932891f,
+  -0.168211f, 0.051298f,  -0.258061f, -0.028936f, -0.555937f, -0.475566f,
+  -0.304046f, -0.318113f, 0.099697f,  -0.217145f, 0.139433f,  -0.203986f,
+  -0.164012f, 0.051527f,  0.138603f,  -0.085100f, -0.082887f, -0.242955f,
+  -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f,  0.135086f,
+  0.146200f,  0.184827f,  -0.199041f, 0.162570f,  -0.300167f, 0.017748f,
+  -0.140111f, 0.103553f,  0.206929f,  0.193446f,  0.123141f,  -1.201898f,
+  -0.052254f, -0.750121f, 0.111741f,  0.204092f,  -0.166266f, 0.124008f,
+  -0.455496f, 0.306035f,  0.275903f,  0.193599f,  -0.730011f, 0.126808f,
+  0.051059f,  0.103634f,  -0.044334f, 0.048889f,  0.405228f,  0.574099f,
+  0.061167f,  0.260576f,  0.070032f,  -0.038040f, 0.229183f,  -0.243269f,
+  -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f,
+  -0.290828f, -0.233006f, 0.068712f,  0.618085f,  -0.407008f, 0.686868f,
+  0.172247f,  0.826287f,  -0.002672f, 0.239825f,  -0.051548f, 0.420773f,
+  0.218747f,  0.041057f,  -0.071189f, 0.286987f,  -0.113915f, 0.122561f,
+  0.013979f,  -0.049046f, 0.148175f,  0.031313f,  -0.248601f, 0.209488f,
+  0.069008f,  0.072763f,  0.332475f,  0.079986f,  -0.151042f, -0.205110f,
+  -0.155550f, -0.510408f, 0.330429f,  0.577729f,  0.266524f,  -0.378489f,
+  0.228204f,  0.055318f,  0.117583f,  -0.588557f, -0.778201f, 0.434622f,
+  -0.227820f, 0.611642f,  0.170548f,  0.817761f,  0.006642f,  -1.005794f,
+  -0.911490f, 1.633684f,  -0.290664f, 0.308128f,  0.295986f,  0.243377f,
+  -0.001275f, -0.131156f, 0.275205f,  -0.041865f, -0.201951f, -0.016380f,
+  0.336604f,  -0.258118f, 0.890810f,  0.441065f,  -0.968006f, 0.135989f,
+  -1.447191f, 0.353426f,  -0.343235f, 0.376837f,  -0.071602f, -0.319639f,
+  -0.072347f, 0.547450f,  -0.215380f, 0.182141f,  -0.066186f, 0.033787f,
+  0.257482f,  0.217428f,  -0.130249f, 0.057525f,  0.263991f,  0.230664f,
+  -0.245113f, 0.048610f,  -0.079955f, 0.251737f,  -0.070368f, -0.017968f,
+  -0.151815f, 0.025945f,  -0.257769f, 0.299735f,  0.077263f,  -0.565526f,
+  0.326263f,  0.096429f,  0.113414f,  0.092754f,  -0.141908f, 0.172060f,
+  0.393117f,  -0.216755f, 0.331051f,  -0.363369f, -0.113363f, -0.095164f,
+  -0.072784f, 0.214572f,  0.010993f,  0.209456f,  0.260381f,  -0.314747f,
+  -0.422173f, -0.189963f, -0.225130f, 0.339448f,  0.153814f,  0.265616f,
+  -0.103575f, -0.123841f, -0.106236f, 0.155894f,  -0.156264f, -1.361406f,
+  -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f,
+  0.105758f,  0.040788f,  -0.313589f, -1.359318f, 0.071329f,  0.176404f,
+  -0.476141f, 0.010108f,  -0.201440f, -0.221167f, -0.197448f, -0.013927f,
+  -0.610270f, -0.607285f, 0.178070f,  0.174320f,  0.313115f,  0.026191f,
+  -0.112330f, 0.122338f,  -0.367751f, 0.196794f,  0.153709f,  -0.205454f,
+  -0.397471f, -1.879336f, -0.030129f, 0.143429f,  -0.079832f, 0.435259f,
+  -1.729539f, 0.518301f,  -0.141393f, 0.199399f,  -1.914601f, 0.142865f,
+  -0.219899f, 0.508458f,  0.086365f,  -0.220740f, -0.012507f, 1.263320f,
+  0.042136f,  0.050922f,  -0.329644f, -0.188198f, 0.251522f,  0.394731f,
+  -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f,  0.359257f,
+  -0.427732f, -0.100652f, 0.192129f,  0.075572f,  0.916708f,  0.255747f,
+  0.486384f,  0.127989f,  -0.556449f, -0.484913f, 0.392298f,  0.045401f,
+  -0.839551f, -0.703619f, 0.069263f,  -0.040720f, 0.542265f,  0.443739f,
+  0.862552f,  -0.021726f, 0.230858f,  -0.261004f, -0.125697f, -0.106435f,
+  0.002341f,  0.013904f,  0.011034f,  0.542296f,  -0.284325f, 0.135736f,
+  0.113882f,  0.040610f,  -0.255485f, 0.224061f,  -0.087140f, 0.127872f,
+  -0.002638f, 0.164889f,  -0.335958f, -0.031166f, -0.393581f, 0.075455f,
+  0.055995f,  0.087934f,  -0.133859f, -0.342187f, 0.002492f,  -0.340722f,
+  0.058304f,  0.104165f,  -0.142136f, -0.351111f, -0.158037f, -0.079924f,
+  -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f,
+  0.076088f,  -0.232091f, -0.070052f, 0.097595f,  0.063173f,  -0.211195f,
+  0.126478f,  -0.178828f, 0.278723f,  -0.070807f, -0.179783f, 0.034123f,
+  0.035721f,  -0.200431f, 0.170640f,  0.107933f,  0.226594f,  -0.301499f,
+  -0.291096f, 0.228076f,  -0.272951f, 0.002490f,  -0.210707f, -0.128033f,
+  -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f,
+  -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f,
+  0.749334f,  -1.161845f, 0.505480f,  0.221733f,  0.210490f,  -0.234984f,
+  0.014183f,  -0.510401f, 0.238692f,  -0.134111f, 0.083844f,  -0.478751f,
+  -0.088434f, 0.304063f,  0.150336f,  -0.749682f, -0.081999f, 0.729739f,
+  0.412508f,  0.132571f,  0.058306f,  -0.047451f, -0.117435f, -0.445395f,
+  -0.005182f, -0.025757f, 0.175051f,  -0.258194f, -0.150311f, -0.196533f,
+  -1.314316f, -0.428627f, 0.512451f,  0.045138f,  -0.200925f, 0.081538f,
+  -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f,
+  -0.419858f, -0.154321f, 0.376970f,  0.094017f,  0.783520f,  0.110641f,
+  0.077966f,  -0.093064f, 0.160522f,  -0.863041f, 0.086210f,  0.560764f,
+  0.057032f,  0.159224f,  0.323068f,  -0.173109f, 0.014042f,  -0.126856f,
+  -0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f,
+  -0.215834f, 0.062076f,  -0.270596f, 0.271581f,  -0.153486f, -0.247165f,
+  0.079737f,  -0.157049f, -0.027459f, -0.299397f, 0.136729f,  -0.334192f,
+  -0.191722f, 0.145865f,  -0.031324f, -0.307165f, -0.244923f, -0.228027f,
+  0.063807f,  0.054965f,  -0.005709f, -0.041977f, -0.276245f, 0.020003f,
+  0.133323f,  -0.145992f, -0.951030f, 0.414083f,  -1.063323f, 0.137872f,
+  0.104732f,  -0.123728f, 0.542532f,  0.213654f,  0.542954f,  0.155619f,
+  0.543072f,  0.399067f,  0.191402f,  -0.102552f, -0.176734f, -0.136776f,
+  -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f,
+  0.058331f,  0.126601f,  0.104420f,  -0.148684f, 0.343218f,  0.093604f,
+  -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f,
+  0.042791f,  -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f,
+  -0.070898f, -0.156415f, 0.112831f,  -0.065931f, -0.353007f, 0.058453f,
+  -0.136982f, 0.233393f,  0.017240f,  -0.018428f, 0.229104f,  -0.371440f,
+  -0.262212f, 0.203075f,  -0.263293f, 0.034413f,  -0.299354f, 0.227269f,
+  0.204977f,  -0.118107f, -0.359832f, -0.068252f, 0.480105f,  -0.214711f,
+  -0.614381f, 0.209048f,  -0.456014f, -0.188819f, -0.220995f, -0.322104f,
+  -0.191457f, 0.420874f,  -0.454919f, 0.023119f,  0.291700f,  -0.532885f,
+  -0.032642f, 0.043271f,  0.133974f,  0.002399f,  -0.179899f, -0.044158f,
+  -0.027078f, -0.350075f, 0.236766f,  0.346771f,  -0.118534f, -0.421221f,
+  0.019544f,  0.109349f,  0.141517f,  0.403561f,  0.409102f,  0.054555f,
+  -0.561751f, 0.577183f,  -0.705156f, -0.231188f, -1.969772f, 0.172289f,
+  -0.048122f, 0.205671f,  -0.667130f, -0.066870f, 0.202838f,  -0.095538f,
+  -0.842651f, 0.254170f,  0.046256f,  -0.271891f, -0.369254f, 0.492101f,
+  0.001189f,  -0.186525f, 0.188470f,  -0.207072f, 0.030086f,  -0.132904f,
+  0.127001f,  0.116662f,  -0.079246f, 0.227241f,  -0.462178f, 0.446304f,
+  -1.660753f, 0.241832f,  -0.288040f, 0.054663f,  -0.435804f, 0.296782f,
+  -0.026421f, -0.115618f, 0.163416f,  0.834001f,  0.008019f,  -0.014243f,
+  0.524658f,  0.067894f,  -0.253936f, -0.100657f, 1.285389f,  -0.005952f,
+  0.087134f,  -0.088375f, -0.121866f, -0.171172f, 0.279463f,  -0.598593f,
+  -0.727761f, 0.189831f,  -0.822575f, -0.291141f, -0.012410f, -0.069999f,
+  0.098842f,  -0.218513f, 0.009494f,  0.100106f,  -0.402884f, -0.299236f,
+  -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f,
+  -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f,
+  -0.012113f, -0.032962f, -0.450648f, 0.129060f,  -0.135227f, -0.298593f,
+  0.001435f,  0.278790f,  -0.272945f, 0.162759f,  -0.290208f, 0.058481f,
+  -0.490971f, 0.019630f,  -0.210347f, 0.000520f,  -0.340413f, 0.641562f,
+  0.023104f,  0.194832f,  -0.441894f, -0.253538f, -0.228332f, 0.423264f,
+  -1.094073f, -0.475657f, -0.238752f, 0.033910f,  0.440425f,  0.036320f,
+  0.566989f,  -0.065326f, -0.297939f, 0.406098f,  0.529561f,  -0.113084f,
+  0.141472f,  -0.024462f, -0.179212f, 0.187801f,  -0.235787f, -0.229624f,
+  0.357791f,  0.061110f,  -0.607788f, -1.713694f, -0.651041f, 1.734283f,
+  -0.334701f, 0.161687f,  0.010215f,  0.320708f,  0.169447f,  0.513558f,
+  0.488340f,  -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f,
+  0.327028f,  -0.100539f, 0.012048f,  -0.223013f, -0.239680f, 0.323035f,
+  0.165950f,  -0.155110f, 0.128664f,  -0.157378f, -0.124490f, 0.291553f,
+  0.055849f,  -0.221664f, 0.077770f,  -0.350658f, -0.181939f, 0.110230f,
+  -0.078219f, 0.007472f,  -0.031620f, 0.007708f,  -0.201794f, 0.017594f,
+  -0.027480f, 0.058884f,  -0.369166f, -0.369770f, 0.181635f,  -0.183318f,
+  -0.389184f, -0.256661f, 0.160107f,  0.037127f,  -0.082573f, -0.095815f,
+  -0.322782f, 0.072528f,  -0.348875f, 0.216247f,  -0.161757f, -0.385502f,
+  -0.315738f, 0.020123f,  -0.155609f, 0.114403f,  -0.383232f, 0.629529f,
+  0.066142f,  0.448392f,  -0.389557f, -0.083315f, 0.829535f,  -0.015531f,
+  -0.050728f, -0.325127f, 0.812992f,  -0.196780f, 0.021060f,  -0.952647f,
+  0.006687f,  -0.512715f, -0.066778f, 0.410067f,  -0.116945f, -0.288283f,
+  0.189334f,  -0.083153f, 0.159980f,  -0.068208f, 0.107358f,  -0.154411f,
+  -0.068914f, 0.186816f,  0.032251f,  0.109242f,  0.134825f,  0.035101f,
+  -0.253175f, 0.157309f,  -0.363597f, -0.138176f, -0.334141f, -0.172697f,
+  0.045800f,  -0.286057f, 0.173403f,  -0.172444f, -0.117996f, -0.383848f,
+  -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f,
+  0.056121f,  0.155046f,  0.044708f,  -0.295609f, -0.211688f, -0.233229f,
+  -0.264980f, 0.145549f,  0.045323f,  -0.027112f, 0.175638f,  -0.207251f,
+  -0.055274f, 0.092706f,  0.086200f,  -0.241340f, -0.147416f, 0.024510f,
+  -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f,
+  -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f,
+  -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f,  -0.090609f,
+  0.006595f,  -0.200790f, 0.171856f,  -0.027766f, -0.032017f, -0.006745f,
+  0.566426f,  -0.096850f, 0.727633f,  -0.408065f, -0.012436f, 0.005646f,
+  -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f,
+  -0.231385f, -0.203175f, 0.041903f,  -0.373694f, 0.058239f,  -0.101116f,
+  0.183772f,  0.164523f,  -0.099046f, -0.201272f, -0.394523f, -0.157517f,
+  0.032079f,  -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f,
+  0.100268f,  -0.023806f, 0.004978f,  0.184916f,  0.142699f,  -0.113240f,
+  -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f,
+  -0.113085f, -0.279699f, -0.267434f, 0.126263f,  -0.260527f, -0.153904f,
+  -0.494653f, -0.355144f, 0.030549f,  -0.216400f, -0.123363f, 0.189090f,
+  0.219122f,  0.096677f,  -0.202037f, -0.014489f, -0.137859f, -0.114184f,
+  -0.279423f, -0.270683f,
 };
 
-static const float av1_fp_simple_motion_search_term_none_std_32[20] = {
-  1.886182f, 1.886638f, 1.884324f, 1.883410f, 1.851800f, 1.851652f, 1.847129f,
-  1.848014f, 1.832187f, 1.832360f, 1.758185f, 0.265155f, 0.939592f, 0.932395f,
-  0.211284f, 0.950024f, 0.945295f, 1.846744f, 1.453674f, 1.505994f,
+static const float av1_early_term_after_split_nn_bias_64_layer0[] = {
+  -0.491455f, 0.464538f,  -0.005742f, -0.219951f, -0.073682f, 0.102027f,
+  0.567071f,  0.441402f,  0.277521f,  0.314498f,  -0.448199f, -0.065032f,
+  0.488139f,  -0.079632f, 0.000000f,  0.521555f,  -0.151950f, -0.034616f,
+  0.393438f,  -0.072242f, -0.087343f, -0.571308f, 0.017372f,  -0.126144f,
+  0.372261f,  -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f,
+  -0.109614f, -0.164492f,
 };
 
-static const float av1_fp_simple_motion_search_term_none_mean_16[20] = {
-  9.131485f, 9.065489f, 7.254479f, 7.158092f, 7.274240f,  7.178158f,  7.278780f,
-  7.182110f, 7.278793f, 7.182714f, 3.981902f, 0.964040f,  2.080875f,  2.087185f,
-  0.973397f, 2.088189f, 2.090166f, 9.386505f, 10.826546f, 15.985614f,
+static const float av1_early_term_after_split_nn_weights_64_layer1[] = {
+  -0.373195f, -0.283141f, 0.416113f,  0.483659f,  0.230583f,  0.349197f,
+  -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f,  0.339355f,
+  -0.828033f, 0.019617f,  0.118757f,  -0.619360f, 0.282295f,  -0.054116f,
+  -0.730596f, 0.068567f,  -0.248707f, 0.461225f,  0.330224f,  -0.287080f,
+  -0.458103f, 0.591852f,  -0.008491f, 0.632119f,  -0.007872f, 0.007869f,
+  -0.230698f, -0.011437f,
 };
 
-static const float av1_fp_simple_motion_search_term_none_std_16[20] = {
-  1.681172f, 1.688587f, 1.710854f, 1.717533f, 1.684010f, 1.691476f, 1.683537f,
-  1.691523f, 1.674699f, 1.682130f, 1.639731f, 0.186191f, 0.796448f, 0.795075f,
-  0.160921f, 0.791005f, 0.790048f, 1.430960f, 1.337976f, 1.370498f,
+static const float av1_early_term_after_split_nn_bias_64_layer1[] = {
+  -0.55403697f,
 };
 
-static const float av1_fp_simple_motion_search_term_none_mean_8[20] = {
-  7.821461f, 7.714526f, 5.799360f, 5.606948f, 5.805885f, 5.614357f,  5.794252f,
-  5.599669f, 5.798780f, 5.605399f, 4.069016f, 0.977720f, 1.577513f,  1.581266f,
-  0.983371f, 1.524603f, 1.524952f, 9.221803f, 9.508886f, 14.972815f,
-};
-
-static const float av1_fp_simple_motion_search_term_none_std_8[20] = {
-  1.618036f, 1.634415f, 1.652861f, 1.672006f, 1.646337f, 1.664935f, 1.650876f,
-  1.670476f, 1.645141f, 1.664301f, 1.502258f, 0.147592f, 0.760353f, 0.762547f,
-  0.127879f, 0.741096f, 0.742186f, 1.042003f, 1.292524f, 1.250398f,
-};
-
-#define NUM_HIDDEN_LAYERS_32 1
-#define NUM_FEATURES_32 20
-#define NUM_LAYER_0_UNITS_32 20
-#define NUM_LOGITS_32 1
-
-static const float
-    av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32[] = {
-      -0.293987f,   0.796773f,     -0.0888487f, -0.00796495f, -0.343768f,
-      0.0783252f,   0.0596814f,    -0.235432f,  -0.0780005f,  -0.409017f,
-      -0.256821f,   -0.281654f,    1.00889f,    0.701893f,    -0.0181661f,
-      0.119718f,    0.0956582f,    0.76792f,    0.235693f,    0.351628f,
-      -1.28111f,    -1.45847f,     0.387732f,   0.476054f,    0.384561f,
-      0.427465f,    0.11875f,      -0.0176598f, -0.0528453f,  0.395589f,
-      -0.331994f,   0.0442108f,    0.195171f,   -0.0377402f,  -0.0736457f,
-      -0.0490903f,  0.116165f,     -0.549512f,  0.12968f,     0.641055f,
-      -1.03066f,    -0.601979f,    0.351981f,   -0.122019f,   0.00869275f,
-      0.399222f,    -0.343995f,    -0.444257f,  -0.160805f,   -0.537537f,
-      0.261478f,    -0.163785f,    0.218916f,   0.106506f,    -0.103819f,
-      0.0121841f,   0.284757f,     -0.362989f,  1.10793f,     0.477236f,
-      -0.424117f,   -0.884156f,    -0.468291f,  -0.510531f,   0.791441f,
-      0.75243f,     0.839871f,     0.604127f,   -0.182956f,   -0.246703f,
-      -1.25861f,    0.0546303f,    0.0811323f,  0.00655988f,  0.0286305f,
-      -0.00938366f, -0.0291418f,   -0.231632f,  -0.331077f,   1.12479f,
-      -0.635514f,   -0.146066f,    0.853122f,   0.923699f,    0.180011f,
-      -0.252973f,   0.1474f,       -0.454344f,  0.354736f,    0.576872f,
-      -1.43275f,    0.0327868f,    0.140849f,   -0.102523f,   0.0524867f,
-      0.007091f,    -0.00232578f,  -0.536116f,  -0.700144f,   0.166646f,
-      0.0636548f,   0.44645f,      -0.346062f,  -0.685779f,   -1.0792f,
-      -0.999219f,   0.442744f,     0.371198f,   0.777914f,    0.719409f,
-      -0.417984f,   0.0602868f,    0.0225539f,  0.0457407f,   0.0249501f,
-      0.0126021f,   0.00450792f,   0.0485095f,  0.203485f,    0.584116f,
-      -0.599426f,   -0.244633f,    0.168231f,   -0.00134934f, -0.106987f,
-      -0.0490239f,  -0.22029f,     0.138017f,   0.373674f,    0.00638684f,
-      -2.08003f,    0.106453f,     0.124456f,   -0.0286108f,  0.0422698f,
-      0.013734f,    0.0780971f,    -0.40173f,   0.473453f,    1.16836f,
-      -0.251035f,   0.0119074f,    0.319241f,   0.0422023f,   -0.730454f,
-      -0.745948f,   0.796709f,     0.277634f,   0.09711f,     -0.212224f,
-      0.825348f,    0.0208521f,    -0.0238098f, 0.00929265f,  0.0516351f,
-      -0.02329f,    0.0983163f,    -0.180721f,  0.0122096f,   -0.246159f,
-      0.61468f,     0.923765f,     0.240435f,   -0.294845f,   -0.495317f,
-      -0.0563837f,  -0.417936f,    0.154874f,   -0.604407f,   -0.0681337f,
-      -0.65738f,    -0.0270073f,   0.0920023f,  -0.0742724f,  0.820862f,
-      -0.602758f,   -1.20617f,     -0.201707f,  0.869499f,    -0.0539076f,
-      0.403097f,    0.429168f,     -0.938227f,  -0.830894f,   -0.362462f,
-      -0.0658648f,  0.471469f,     -0.264827f,  0.610275f,    0.367995f,
-      0.735662f,    -0.0473157f,   -0.0380545f, -0.0848067f,  -0.146108f,
-      -0.125875f,   -0.0576117f,   -0.296198f,  -0.100443f,   -0.212971f,
-      0.593524f,    1.23111f,      -0.810009f,  -0.604572f,   0.203021f,
-      0.256285f,    -1.17049f,     -1.19156f,   0.24365f,     0.727876f,
-      -0.466826f,   0.0298762f,    -0.0331735f, -0.0109056f,  0.0114862f,
-      0.00396703f,  0.0385985f,    -0.0587946f, 0.821079f,    0.0582033f,
-      0.349156f,    1.03529f,      -0.407036f,  0.200308f,    -0.265649f,
-      -0.104567f,   0.161149f,     -0.0717528f, -0.0112724f,  0.0681578f,
-      0.103809f,    -0.0807997f,   0.0316814f,  -0.332323f,   0.112254f,
-      -0.163981f,   0.118988f,     -0.777055f,  -1.34047f,    -0.910482f,
-      0.74599f,     -0.59633f,     0.165649f,   -0.594998f,   0.0845802f,
-      0.00440975f,  0.122606f,     -0.463991f,  0.418502f,    -0.339126f,
-      1.41847f,     -0.109594f,    -0.411879f,  -0.444865f,   -0.0404821f,
-      -0.0607352f,  -0.663753f,    -0.724327f,  -0.138642f,   0.834144f,
-      -0.811695f,   -0.930264f,    0.150993f,   -0.325565f,   0.0615853f,
-      -0.473993f,   0.0966587f,    0.315197f,   1.0345f,      0.35441f,
-      0.703234f,    -0.335715f,    0.783153f,   0.467976f,    -0.0234736f,
-      0.549724f,    0.539107f,     -0.510182f,  -0.154442f,   0.0126656f,
-      1.66711f,     0.884555f,     0.118675f,   -0.341705f,   0.195316f,
-      -0.0366564f,  -0.619244f,    -0.634092f,  -0.559951f,   0.0564255f,
-      0.765917f,    0.0510238f,    0.0667615f,  0.0699302f,   -0.0351751f,
-      -0.0484402f,  -0.000792665f, -0.10775f,   -0.337121f,   -0.983947f,
-      0.517793f,    1.34977f,      -0.567602f,  0.129921f,    -0.443722f,
-      -0.276277f,   -0.501404f,    -0.183234f,  -0.553055f,   -0.447434f,
-      -0.35529f,    -0.0444689f,   0.0192031f,  0.0372702f,   -0.195202f,
-      -0.020753f,   -0.0247035f,   0.420298f,   1.39373f,     0.203699f,
-      -0.218818f,   0.250734f,     -0.0282348f, 0.411986f,    -0.262946f,
-      0.526339f,    0.242769f,     -0.159857f,  -0.546788f,   -0.0410147f,
-      0.954238f,    -0.0252765f,   0.639488f,   -0.491367f,   -0.0572638f,
-      0.285763f,    -0.45764f,     0.121657f,   -1.24374f,    -0.372479f,
-      -0.111521f,   0.194134f,     -0.271364f,  0.179678f,    0.121237f,
-      -0.14305f,    -0.205662f,    0.216891f,   0.344568f,    -0.523745f,
-      -1.00908f,    0.180965f,     0.0263031f,  -0.0556144f,  0.0831083f,
-      -0.0623274f,  0.112748f,     0.597137f,   -0.502616f,   -1.10624f,
-      -0.0487462f,  -1.10744f,     -0.125653f,  0.277049f,    -0.141329f,
-      -0.00457003f, -0.161038f,    0.588462f,   0.323317f,    0.49762f,
-      0.477561f,    0.901705f,     -0.264511f,  0.256557f,    0.076023f,
-      -0.0460696f,  0.0830666f,    -0.0651269f, -0.881245f,   -0.285999f,
-      0.53127f,     0.914533f,     0.0505795f,  -0.3054f,     -0.0988696f,
-      -0.0658403f,  0.15979f,      -0.453316f,  -0.824834f,   -0.280222f,
-      -0.686952f,   -0.0768344f,   -1.12235f,   -0.815408f,   0.0202134f,
-      -0.111892f,   0.0847659f,    -0.18763f,   0.597782f,    0.364016f
-    };
-
-static const float
-    av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32[] = {
-      -1.541f,     -0.00935641f, -1.50754f, -0.638648f, -0.679403f,
-      -0.0387804f, -0.714791f,   -1.69522f, 0.435677f,  -1.5846f,
-      0.108788f,   0.614982f,    0.111048f, -0.465826f, -0.611358f,
-      0.637197f,   0.929621f,    -1.20889f, 0.954558f,  0.716529f
-    };
-
-static const float av1_fp_simple_motion_search_term_none_logits_kernel_32[] = {
-  0.396195f,   -0.791364f,  -0.881893f, 1.0542069f, 0.772562f,
-  0.60815647f, 1.117405f,   -1.272638f, 0.483183f,  -0.917147f,
-  0.690799f,   -0.601466f,  -0.545536f, -0.416353f, -0.927874f,
-  0.972198f,   -0.3770457f, 0.542694f,  -0.591889f, 0.464565f
-};
-
-static const float av1_fp_simple_motion_search_term_none_logits_bias_32[] = {
-  -0.590318f
-};
-
-static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_32 = {
-  NUM_FEATURES_32,
-  NUM_LOGITS_32,
-  NUM_HIDDEN_LAYERS_32,
+static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = {
+  FEATURES,
+  1,
+  1,
   {
-      NUM_LAYER_0_UNITS_32,
+      HIDDEN_NODES,
   },
   {
-      av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32,
-      av1_fp_simple_motion_search_term_none_logits_kernel_32,
+      av1_early_term_after_split_nn_weights_64_layer0,
+      av1_early_term_after_split_nn_weights_64_layer1,
   },
   {
-      av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32,
-      av1_fp_simple_motion_search_term_none_logits_bias_32,
+      av1_early_term_after_split_nn_bias_64_layer0,
+      av1_early_term_after_split_nn_bias_64_layer1,
   },
 };
 
-#undef NUM_HIDDEN_LAYERS_32
-#undef NUM_FEATURES_32
-#undef NUM_LAYER_0_UNITS_32
-#undef NUM_LOGITS_32
-
-#define NUM_HIDDEN_LAYERS_16 1
-#define NUM_FEATURES_16 20
-#define NUM_LAYER_0_UNITS_16 24
-#define NUM_LOGITS_16 1
-
-static const float
-    av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_16[] = {
-      -0.315922f,   0.74455f,     -0.0196939f,  0.238336f,    0.288554f,
-      0.0845902f,   -0.0121831f,  0.455303f,    0.0235902f,   0.218997f,
-      -0.0445164f,  0.0752211f,   0.0539915f,   -0.0439682f,  -0.397139f,
-      -0.0030004f,  -0.106365f,   0.845384f,    0.684638f,    -0.965702f,
-      0.307643f,    -0.0433377f,  -0.0644826f,  -0.214946f,   -0.44467f,
-      0.142967f,    0.0109982f,   -0.344458f,   -0.42947f,    0.269175f,
-      -0.88534f,    -0.28077f,    -1.36018f,    -0.33725f,    -0.0885953f,
-      -0.123887f,   0.218107f,    -0.0759977f,  0.739124f,    0.684048f,
-      0.577964f,    -0.328481f,   -0.247837f,   0.00546713f,  0.191895f,
-      -0.145274f,   0.320121f,    -0.482379f,   0.534585f,    -0.1582f,
-      0.944784f,    0.944665f,    0.0494451f,   -0.0399724f,  -0.170375f,
-      -0.0869746f,  0.106216f,    -0.120556f,   -1.57849f,    -0.752895f,
-      0.424454f,    -0.0269515f,  0.00398589f,  0.214165f,    -0.142986f,
-      0.199223f,    0.049624f,    -0.116783f,   -0.648119f,   -0.311599f,
-      0.122629f,    -0.0338422f,  0.345092f,    -0.408254f,   0.601037f,
-      -0.00146985f, 0.00133926f,  0.0392668f,   -0.931156f,   0.31429f,
-      -0.150243f,   0.0755763f,   -0.32177f,    0.258521f,    -0.104078f,
-      -0.144506f,   0.0199566f,   -0.454723f,   -0.292959f,   -0.0953681f,
-      -1.24843f,    0.446814f,    -0.311363f,   0.0590878f,   -0.0568717f,
-      -0.421585f,   0.179852f,    0.668763f,    0.48914f,     0.290584f,
-      -1.14053f,    -1.37576f,    0.420112f,    -0.158582f,   0.268231f,
-      0.252999f,    0.276423f,    0.529033f,    0.141127f,    0.702762f,
-      0.181407f,    -0.0279289f,  -0.0194757f,  0.0752152f,   -0.136963f,
-      0.00902489f,  0.125334f,    0.0680212f,   -0.370449f,   0.438003f,
-      -0.600869f,   0.154209f,    -0.36306f,    -0.484209f,   0.140093f,
-      0.0743079f,   -0.143317f,   0.0442872f,   0.272089f,    0.601531f,
-      1.20687f,     -0.280695f,   0.222235f,    -0.0106747f,  -0.017026f,
-      0.204008f,    -0.0316111f,  -0.64679f,    -0.866749f,   -0.774231f,
-      0.306231f,    -0.0940114f,  -0.56555f,    -0.34399f,    0.425142f,
-      0.424064f,    -0.50189f,    -0.146558f,   0.544899f,    0.141728f,
-      1.14592f,     -0.0124826f,  0.111613f,    -0.0862228f,  0.0211737f,
-      0.0614017f,   0.0245077f,   -0.454523f,   -0.0766391f,  -0.436808f,
-      0.251409f,    -0.13354f,    -0.242447f,   -0.311807f,   -0.844505f,
-      -0.671486f,   0.0946297f,   0.241702f,    0.856521f,    0.529763f,
-      -0.869772f,   -0.0016341f,  0.14511f,     0.0136254f,   -0.0359721f,
-      -0.0454713f,  0.00664495f,  0.0373555f,   0.653991f,    -0.075867f,
-      -0.102728f,   -0.947685f,   -0.119479f,   -0.145413f,   0.148364f,
-      0.310885f,    -0.266837f,   0.354087f,    0.299469f,    0.603911f,
-      0.257161f,    0.0190527f,   0.152862f,    -0.0987196f,  -0.293369f,
-      0.139026f,    -0.128421f,   0.0505933f,   -0.703803f,   1.08628f,
-      -0.562294f,   -0.818943f,   0.102178f,    0.727399f,    -0.228433f,
-      0.484057f,    0.0595919f,   -0.0559087f,  -0.549447f,   0.176168f,
-      1.41744f,     -0.126284f,   0.0987251f,   -0.00123073f, 0.00510827f,
-      0.105209f,    0.0671775f,   -0.438525f,   0.211028f,    -0.782459f,
-      0.286411f,    -0.459887f,   0.0633669f,   0.329958f,    -0.0736945f,
-      0.45188f,     -0.2447f,     0.676601f,    0.600321f,    -0.0336198f,
-      0.108531f,    0.0452834f,   -0.0848577f,  0.0731281f,   1.32381f,
-      -0.118349f,   0.129497f,    -0.840938f,   -1.45444f,    -0.559047f,
-      -0.248109f,   -0.491559f,   -0.139812f,   0.175964f,    0.168687f,
-      0.123031f,    0.201625f,    0.422849f,    0.34436f,     0.0426694f,
-      0.558045f,    -0.246772f,   0.679483f,    -0.0959578f,  -0.102879f,
-      0.391029f,    0.280906f,    0.0867408f,   -1.10932f,    0.402526f,
-      -0.227285f,   0.336087f,    -0.237765f,   0.185619f,    -0.309732f,
-      0.0781132f,   -0.0234955f,  0.0828806f,   0.19966f,     -0.241288f,
-      -0.224634f,   0.0638918f,   -0.143521f,   -0.0206692f,  -0.27131f,
-      0.973051f,    1.12031f,     0.262846f,    0.471585f,    0.105231f,
-      -0.386434f,   -0.355846f,   0.7359f,      0.567308f,    0.130768f,
-      0.242369f,    -0.0272523f,  -0.118436f,   0.374145f,    0.24802f,
-      -1.00186f,    -0.0241195f,  0.0140446f,   0.0202831f,   0.163197f,
-      0.0399298f,   -0.00912791f, -0.280572f,   -0.309893f,   -0.644495f,
-      0.243838f,    0.731391f,    0.0725078f,   0.350308f,    -0.136691f,
-      0.208814f,    0.0218567f,   -0.0805393f,  -0.18681f,    -0.214638f,
-      0.273354f,    -0.355047f,   0.242748f,    0.472951f,    -0.202705f,
-      0.405247f,    0.161622f,    -0.284883f,   -1.31181f,    -0.661056f,
-      -0.248219f,   -0.827307f,   0.289221f,    0.660529f,    0.48563f,
-      0.407366f,    0.0327303f,   -0.0610309f,  -0.647064f,   0.0899991f,
-      0.376267f,    1.27555f,     0.0264175f,   0.153931f,    1.07345f,
-      0.0715052f,   0.174473f,    0.01322f,     -0.715723f,   0.113909f,
-      0.100968f,    -0.457287f,   -0.672022f,   -0.20532f,    0.895176f,
-      0.357034f,    0.5413f,      0.918393f,    -0.455f,      -0.499617f,
-      -1.21799f,    0.0634338f,   0.144944f,    -0.106715f,   0.0227713f,
-      -0.0203213f,  0.030851f,    -0.0726756f,  0.589192f,    -0.060841f,
-      -0.198521f,   0.497179f,    -0.0591156f,  -0.135466f,   -0.132638f,
-      -0.181333f,   -0.332358f,   0.0349959f,   0.212885f,    -0.536206f,
-      -0.425009f,   -0.035525f,   0.0384449f,   0.0360549f,   -0.0383953f,
-      -0.0263281f,  -0.0228435f,  1.11771f,     0.928061f,    -0.163923f,
-      -0.327868f,   -0.894518f,   0.00448907f,  0.0805977f,   0.329559f,
-      0.157429f,    0.292729f,    0.497688f,    0.188659f,    0.203724f,
-      -1.26001f,    -0.0392533f,  -0.0566088f,  0.000859925f, 0.125254f,
-      0.054261f,    0.0357295f,   -0.393813f,   -0.275944f,   0.299657f,
-      -0.211421f,   0.038172f,    -0.439829f,   -0.913949f,   0.35642f,
-      0.865473f,    -0.472033f,   -0.752376f,   0.995255f,    0.417965f,
-      -0.680645f,   0.0622027f,   0.128878f,    -0.0357859f,  0.0793577f,
-      0.203629f,    -0.0600867f,  0.0512268f,   0.528584f,    0.23889f,
-      0.38255f,     -0.216407f,   -0.0338828f,  0.0328103f,   -0.885678f,
-      -0.716634f,   0.438663f,    0.320841f,    -0.119656f,   0.626092f,
-      0.8526f,      -0.0325005f,  -0.0275416f,  -0.171131f,   0.0260563f,
-      -0.0162027f,  0.0879367f,   -0.340473f,   0.0220265f,   -0.1731f,
-      0.512539f,    0.587822f,    -0.175619f,   0.177215f,    -0.35458f,
-      -0.159059f,   -0.423754f,   0.0198413f,   -0.336208f,   -0.359052f,
-      -1.50819f,    0.0628184f,   0.054506f,    0.0048834f,   0.361657f,
-      0.00986886f,  -0.0721521f,  -0.256765f,   1.41173f,     0.376196f,
-      -0.0783331f,  0.174803f,    -0.00240091f, -0.306571f,   -0.304654f,
-      -0.0348377f,  0.115569f,    -0.20359f,    -0.162341f,   -0.0443526f,
-      -0.848317f,   -0.228167f,   0.699534f,    0.482092f,    -0.0921484f,
-      -0.172425f,   -0.0610094f,  -0.188327f,   0.836209f,    0.541725f
-    };
-
-static const float
-    av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_16[] = {
-      -0.388147f, -0.0868767f, 0.702129f,  0.376659f, -0.709988f, 0.496603f,
-      -0.238442f, -1.35761f,   -0.391887f, 0.235468f, -0.327982f, 0.731842f,
-      1.0949f,    -0.789218f,  -0.881452f, 0.514341f, 0.727894f,  -0.494498f,
-      -1.32304f,  -1.22643f,   -0.294287f, -1.3974f,  -0.128148f, -0.0956137f
-    };
-
-static const float av1_fp_simple_motion_search_term_none_logits_kernel_16[] = {
-  0.456147f,   0.248707f,  -0.5205241f, -0.1506567f, 0.388359f,   -0.6074409f,
-  -0.4719775f, -0.733864f, 0.5588447f,  -0.4021345f, -1.140733f,  -0.73399f,
-  -0.4299591f, 0.450688f,  0.817564f,   -0.265486f,  -0.3525806f, 0.55188314f,
-  1.365457f,   1.180764f,  0.587772f,   -0.870683f,  0.818839f,   0.318488f
+static const float av1_early_term_after_split_nn_weights_32_layer0[] = {
+  0.026050f,  -0.226531f, 0.308107f,  -0.083744f, 0.201785f,  0.098562f,
+  0.147595f,  -0.495771f, -0.245741f, 0.201616f,  -0.272070f, -0.579545f,
+  -0.127261f, -0.229588f, 0.250831f,  -0.176929f, -0.031689f, 0.284718f,
+  0.085845f,  -0.285027f, 0.012304f,  0.382402f,  -0.204591f, 0.272514f,
+  -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f,  0.195689f,
+  0.242530f,  0.023528f,  -0.294242f, -0.272132f, 0.460180f,  -0.731281f,
+  -0.208103f, 0.208204f,  0.348250f,  0.016328f,  0.043707f,  -0.169551f,
+  0.108521f,  0.226895f,  -0.020471f, 0.102443f,  0.429640f,  -0.252555f,
+  -0.218434f, -0.163665f, 0.175531f,  0.101588f,  -0.135798f, -0.158102f,
+  0.142565f,  0.128277f,  0.174985f,  -0.100073f, 0.113967f,  0.223682f,
+  -0.145576f, -0.008443f, 0.112748f,  -0.037845f, 0.076954f,  -0.287137f,
+  -0.518185f, -0.106833f, 0.175359f,  0.031408f,  0.219069f,  -0.294440f,
+  0.007766f,  0.067754f,  -0.049168f, -0.212368f, -0.261708f, 0.309252f,
+  0.220859f,  -0.274852f, -0.653157f, 0.083438f,  -0.265386f, 0.174429f,
+  -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f,  0.212890f,
+  0.272053f,  -0.425315f, -0.107726f, 0.294444f,  -0.354629f, 0.104402f,
+  -0.307663f, 0.558430f,  0.140334f,  -0.054831f, -0.449456f, 0.058274f,
+  -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f,  -0.079297f,
+  -0.638571f, 0.181823f,  -0.039611f, 0.206310f,  -0.659157f, -0.102930f,
+  -0.067303f, -0.176881f, -0.001038f, 0.091835f,  0.079739f,  -0.121923f,
+  0.211070f,  0.362719f,  -0.154915f, -0.151876f, -0.165460f, 0.023469f,
+  -0.251036f, 0.210014f,  -0.537125f, 0.156832f,  -0.216987f, 0.062975f,
+  -0.198462f, 0.329123f,  0.125870f,  0.225830f,  0.086377f,  -0.128773f,
+  -0.179673f, -0.074612f, 0.456645f,  0.021905f,  -0.243140f, 0.059145f,
+  -0.273942f, -0.277822f, 0.154556f,  -0.025459f, 0.227614f,  -0.313076f,
+  0.044705f,  -0.019017f, 0.108999f,  -0.020243f, -0.016373f, 0.560270f,
+  -0.064818f, 0.050880f,  -0.218458f, 0.825699f,  -0.534056f, -0.258253f,
+  0.222073f,  0.013295f,  0.477870f,  -0.386727f, 0.388509f,  0.004128f,
+  0.451388f,  -0.175788f, 0.264093f,  -0.109812f, 0.358132f,  0.500992f,
+  -0.446933f, -0.222397f, 0.345834f,  0.370943f,  -0.233115f, -0.047005f,
+  -0.111335f, -0.111586f, 0.026975f,  -0.052191f, -0.111800f, -0.129782f,
+  0.225132f,  0.102524f,  0.544557f,  -0.111674f, -0.857884f, 0.133258f,
+  0.310001f,  0.043829f,  0.104143f,  0.256493f,  0.242520f,  -0.342082f,
+  0.421447f,  0.124227f,  0.061542f,  -0.090206f, 0.316681f,  0.353452f,
+  -0.918408f, -0.001903f, -0.052303f, -0.004816f, -0.446393f, -0.053038f,
+  0.255725f,  -0.126346f, 0.034095f,  -0.240276f, -0.135918f, 0.095682f,
+  -0.147457f, -0.338216f, -0.200426f, 0.010265f,  -0.243915f, -0.231375f,
+  -0.323924f, -0.014353f, 0.150252f,  -0.264346f, 0.205303f,  -0.194610f,
+  -0.282527f, 0.180555f,  -0.000087f, 0.027240f,  -0.000903f, -0.345877f,
+  -0.353274f, -0.311829f, 0.172985f,  -0.111748f, -0.309380f, 0.108110f,
+  -0.260914f, -0.164990f, 0.183625f,  -0.319692f, -0.096988f, 0.094147f,
+  -0.047062f, -0.080978f, 0.227387f,  -0.000450f, -0.220159f, -0.211448f,
+  -0.020885f, -0.139646f, -0.086721f, 0.067928f,  -0.033084f, -0.251996f,
+  0.090317f,  0.086313f,  -0.228420f, -0.111356f, -0.314304f, -0.223664f,
+  0.188176f,  -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f,
+  -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f,
+  -0.038595f, 0.119537f,  0.260477f,  -0.168014f, -0.172751f, 0.532861f,
+  -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f,
+  -0.532110f, 0.359323f,  -0.254786f, 0.471316f,  -0.545024f, 0.291912f,
+  -0.836939f, 0.443427f,  -0.441709f, 0.168866f,  -0.140372f, 0.546607f,
+  -0.315465f, 0.023328f,  0.137709f,  -0.083492f, -0.049986f, -0.071302f,
+  -0.293680f, -0.105049f, 0.315317f,  0.279569f,  0.220762f,  0.088161f,
+  -0.756456f, -0.074512f, 0.958318f,  -0.332924f, -0.004906f, -0.629271f,
+  0.212050f,  0.279123f,  0.311523f,  -0.599580f, 0.516150f,  0.456952f,
+  0.020255f,  0.247290f,  -0.182670f, -0.335554f, 0.021203f,  0.131081f,
+  -0.208584f, 0.112530f,  -0.198980f, 0.211583f,  -0.101271f, -0.206453f,
+  -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f,  -0.165483f,
+  0.094953f,  -0.182578f, 0.055068f,  0.135605f,  -0.266941f, -0.297556f,
+  0.199181f,  0.015979f,  -0.158659f, -0.226841f, 0.171306f,  0.013438f,
+  -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f,  -0.026230f,
+  -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f,
+  0.248576f,  -0.144425f, -0.113948f, -0.247297f, 0.276682f,  0.010963f,
+  -0.737786f, 0.026347f,  0.007830f,  0.753543f,  0.371904f,  0.305614f,
+  0.105028f,  0.073530f,  -0.119137f, 0.102352f,  -0.080523f, 0.176366f,
+  -0.159457f, -0.339948f, 0.360131f,  -0.007051f, -0.388378f, -0.101695f,
+  0.663041f,  -0.234486f, -0.142536f, -0.099931f, 0.041478f,  0.230425f,
+  0.005743f,  0.154060f,  0.056233f,  -0.080668f, -0.009754f, -0.194356f,
+  0.185474f,  -0.296474f, 0.192700f,  0.257767f,  0.348529f,  0.458265f,
+  0.060276f,  -0.130473f, 0.139889f,  0.310073f,  -0.306869f, -0.272922f,
+  -0.259862f, 0.409207f,  0.431991f,  -0.100357f, -0.050415f, -0.071830f,
+  -0.239665f, 0.153399f,  0.177192f,  -0.611644f, -0.176114f, -0.022694f,
+  -0.033701f, -0.345842f, 0.015660f,  0.158931f,  -0.097586f, 0.222001f,
+  0.257887f,  -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f,
+  -0.010895f, 0.052815f,  -0.265306f, -0.081059f, 0.219162f,  -0.256084f,
+  -0.372676f, 0.148977f,  0.174831f,  0.086980f,  0.108518f,  0.074011f,
+  0.038032f,  -0.070856f, -0.109407f, 0.126174f,  0.022341f,  -0.249786f,
+  -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f,  -0.017953f,
+  -0.028353f, 0.233621f,  0.109426f,  0.232798f,  -0.104950f, -0.241798f,
+  -0.018995f, -0.167954f, 0.002473f,  0.060418f,  -0.232717f, -0.195980f,
+  -0.283971f, -0.371881f, 0.219728f,  0.018072f,  -0.166694f, -0.083301f,
+  -0.000616f, -0.212641f, -0.173158f, 0.222739f,  -0.235302f, 0.237624f,
+  0.222232f,  -0.041235f, -0.342411f, 0.121194f,  0.211291f,  -0.032237f,
+  -0.249401f, -0.291668f, 0.206055f,  -0.148200f, 0.011824f,  -0.272728f,
+  -0.194854f, 0.367175f,  -0.257243f, 0.103433f,  -0.231077f, 0.236734f,
+  0.135733f,  -0.362845f, 0.197147f,  0.242782f,  -0.135289f, 0.123311f,
+  0.259420f,  -0.116278f, 0.127287f,  0.236789f,  -0.097438f, 0.118073f,
+  0.112796f,  -0.035949f, 0.184408f,  0.200948f,  -0.008859f, 0.195989f,
+  0.161970f,  -0.295320f, -0.330389f, 0.141034f,  0.066081f,  -0.707857f,
+  0.357037f,  0.149633f,  0.679877f,  0.548674f,  0.469076f,  0.194123f,
+  -0.209872f, -0.071764f, -0.126960f, 0.199420f,  0.327116f,  -0.169053f,
+  -0.429156f, 0.443429f,  -0.225530f, -0.130738f, -0.028351f, 0.644393f,
+  0.049606f,  -0.243602f, -0.409920f, 0.117028f,  -0.258557f, 0.073865f,
+  -0.200454f, -0.139957f, -0.031314f, 0.162325f,  0.247221f,  0.071909f,
+  -0.336276f, 0.079922f,  0.192780f,  -0.148882f, 0.133192f,  -0.143177f,
+  -0.121327f, 0.126221f,  -0.089521f, -0.181826f, 0.149923f,  -0.280682f,
+  0.391572f,  0.108990f,  -0.445494f, -0.170787f, 0.225182f,  0.223313f,
+  -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f,  -0.049377f,
+  0.057976f,  0.033558f,  0.068733f,  -0.283353f, 0.217877f,  0.158093f,
+  -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f,  -0.474510f,
+  -0.096738f, 0.256940f,  0.234203f,  -0.226667f, -0.260576f, -0.183403f,
+  -0.035578f, 0.141570f,  0.078764f,  -0.028086f, 0.155800f,  -0.251115f,
+  -0.286703f, -0.014739f, -0.072621f, -0.311506f, -0.048639f, 0.081621f,
+  0.043057f,  0.068136f,  -0.179903f, 0.143699f,  -0.002571f, 0.239012f,
+  0.197456f,  0.035745f,  -0.311927f, 0.220320f,  0.102687f,  -0.294105f,
+  0.426740f,  0.209050f,  0.211907f,  0.083453f,  0.006578f,  -0.143338f,
+  0.003157f,  0.040295f,  0.234497f,  0.035344f,  -0.163909f, 0.411115f,
+  0.289453f,  -0.075357f, -0.008884f, 0.469798f,  -0.033304f, -0.153293f,
+  -0.229322f, -0.004162f, 0.113363f,  0.395381f,  0.067414f,  -0.188966f,
+  -0.117424f, -0.166423f, 0.066839f,  0.595641f,  -0.204782f, -0.451727f,
+  0.198509f,  -0.921583f, -0.246765f, -0.153411f, 0.046491f,  0.365906f,
+  0.376710f,  -0.017355f, -0.035232f, 0.138785f,  -0.163918f, -0.283449f,
+  -0.094340f, 0.192127f,  0.154815f,  0.035787f,  -0.029087f, 0.115649f,
+  -0.220133f, -0.452741f, 0.311667f,  0.157666f,  0.091401f,  0.236040f,
+  -0.168523f, 0.122176f,  -0.219016f, -0.214856f, 0.172824f,  -0.091810f,
+  0.031520f,  -0.857420f, 0.643446f,  -0.017471f, 0.206082f,  -0.933517f,
+  -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f,
+  0.289970f,  -0.889775f, -0.044741f, 0.232647f,  -0.319416f, 0.073030f,
+  0.278549f,  0.238782f,  -0.202206f, 0.272540f,  0.201412f,  0.175574f,
+  -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f,  0.505169f,
+  -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f,
+  -0.161019f, 0.344837f,  0.361958f,  -0.097050f, 0.014375f,  0.267110f,
+  0.341442f,  -0.016688f, 0.073393f,  0.131500f,  0.246331f,  0.011059f,
+  0.033597f,  0.014779f,  -0.269366f, -0.504788f, 0.048651f,  0.295682f,
+  0.237363f,  0.227484f,  -0.235814f, -0.160530f, 0.182682f,  -0.172999f,
+  -0.126630f, 0.168357f,  -0.078729f, 0.052805f,  0.377021f,  -0.004727f,
+  0.230415f,  -0.876673f, 0.458457f,  0.099401f,  -0.019616f, 0.611982f,
+  -0.231508f, -0.070894f, -0.056142f, 0.548969f,  -0.376599f, -0.600428f,
+  0.241930f,  -0.592893f, 0.189371f,  0.488651f,  -0.092446f, -0.272569f,
+  0.251643f,  0.315945f,  -0.301468f, 0.112961f,  0.052119f,  -0.066076f,
+  -0.082249f, 0.252805f,  -0.195539f, 0.150386f,  -0.865534f, 0.673447f,
+  0.030177f,  -0.438528f, -1.006174f, 0.575176f,  -0.271656f, 0.035835f,
+  -1.056916f, 0.495267f,  -0.092428f, -0.109511f, -0.192359f, 0.166669f,
+  -0.624326f, -0.000354f, -0.089075f, 0.176279f,  -0.289347f, 0.021346f,
+  0.020375f,  0.255282f,  -0.045588f, 0.173675f,  0.100957f,  -0.294373f,
+  0.049303f,  -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f,
+  0.079024f,  0.101113f,  0.135742f,  -0.348869f, -0.026759f, -0.134155f,
+  -0.179275f, -0.054297f, -0.054948f, 0.029351f,  0.190560f,  0.102476f,
+  -0.025785f, 0.169442f,  -0.271303f, 0.200667f,  0.099063f,  0.074767f,
+  -0.326533f, 0.044426f,  -0.290251f, -0.082443f, -0.164482f, -0.349412f,
+  0.045109f,  -0.157330f, 0.165935f,  0.012672f,  -0.059818f, 0.399140f,
+  -0.316620f, 0.386638f,  -0.285399f, -0.296777f, -0.200473f, -0.144232f,
+  0.251851f,  -0.203768f, 0.001071f,  -0.179063f, 0.248952f,  -0.143029f,
+  0.010423f,  -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f,
+  0.322825f,  0.133683f,  0.017388f,  0.142467f,  0.221320f,  0.004059f,
+  -0.115770f, 0.143363f,  0.137972f,  -0.272584f, 0.489366f,  -0.091828f,
+  -0.014703f, 0.082332f,  -0.476226f, -0.202859f, 0.356094f,  -0.283049f,
+  0.218086f,  0.202015f,  0.201724f,  0.012617f,  0.050720f,  0.255695f,
+  0.244653f,  0.111296f,  -0.151450f, -0.056210f, -0.757348f, 0.441724f,
+  -0.022455f, -0.244662f, 0.296205f,  -0.421883f, -0.217386f, -0.254301f,
+  0.409105f,  -0.031309f, 0.050147f,  -0.337170f, -0.106620f, -0.606455f,
+  0.308024f,  0.298144f,  0.363993f,  0.704870f,  -0.047292f, 0.166901f,
+  0.105991f,  -0.536757f, -0.424031f, -0.226034f, 0.213635f,  -0.526754f,
+  0.310990f,  -0.116038f, 0.007775f,  0.538330f,  -0.177912f, 0.445357f,
+  -0.290365f, 0.451169f,  0.030931f,  0.033388f,  0.209905f,  -0.244492f,
+  -0.097792f, -0.246042f, 0.132047f,  0.032576f,  0.115516f,  0.022890f,
+  0.093508f,  -0.071840f, 0.362948f,  -0.135245f, 0.659911f,  -0.321413f,
+  0.193118f,  -0.795001f, -0.218311f, 0.024862f,  0.206172f,  -0.832878f,
+  -0.255670f, 0.343402f,  -0.275211f, -0.898363f, -0.025172f, 0.158565f,
+  0.171347f,  -0.127518f, -0.215156f, -0.159198f, 0.250355f,  -0.132452f,
+  0.061254f,  -0.097544f, -0.223246f, 0.013183f,  0.239468f,  0.259017f,
+  -0.217739f, -0.032263f, 0.123755f,  -0.701777f, 0.150049f,  -0.555293f,
+  0.062430f,  -0.260304f, 0.494894f,  -0.168702f, -0.134829f, -0.113989f,
+  0.150092f,  -0.060248f, 0.115711f,  -0.277202f, 0.499811f,  0.417116f,
+  0.191081f,  -0.376432f, -0.321092f, 0.033992f,  0.057193f,  0.127077f,
+  -0.009042f, 0.014443f,  0.142808f,  -0.124349f, 0.213087f,  -0.381686f,
+  0.129726f,  -0.038396f,
 };
 
-static const float av1_fp_simple_motion_search_term_none_logits_bias_16[] = {
-  -0.1046478f
+static const float av1_early_term_after_split_nn_bias_32_layer0[] = {
+  -0.107171f, 0.060848f,  -0.069480f, -0.121982f, 0.037637f,  -0.291839f,
+  0.102257f,  -0.065889f, -0.032452f, 0.034171f,  -0.073984f, -0.005236f,
+  0.218820f,  0.132123f,  -0.089621f, -0.067679f, 0.049368f,  0.329444f,
+  -0.184729f, 0.031702f,  0.009735f,  -0.039964f, -0.018024f, -0.073031f,
+  -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f,  0.216609f,
+  -0.078358f, -0.007740f,
 };
 
-static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_16 = {
-  NUM_FEATURES_16,
-  NUM_LOGITS_16,
-  NUM_HIDDEN_LAYERS_16,
+static const float av1_early_term_after_split_nn_weights_32_layer1[] = {
+  0.047869f,  -0.231773f, -0.185663f, 0.460676f,  -0.208182f, 0.590555f,
+  -0.622627f, 0.279377f,  0.351681f,  0.633504f,  1.069884f,  0.332449f,
+  -0.457703f, -0.435817f, -0.028853f, 0.327490f,  -0.282469f, -0.975792f,
+  -0.062975f, -0.147187f, 0.348340f,  -1.207116f, 0.516159f,  -1.509626f,
+  -0.805072f, 0.522999f,  0.143671f,  0.304246f,  -0.360720f, -0.612472f,
+  0.260045f,  -0.223243f,
+};
+
+static const float av1_early_term_after_split_nn_bias_32_layer1[] = {
+  -0.07571174f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = {
+  FEATURES,
+  1,
+  1,
   {
-      NUM_LAYER_0_UNITS_16,
+      HIDDEN_NODES,
   },
   {
-      av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_16,
-      av1_fp_simple_motion_search_term_none_logits_kernel_16,
+      av1_early_term_after_split_nn_weights_32_layer0,
+      av1_early_term_after_split_nn_weights_32_layer1,
   },
   {
-      av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_16,
-      av1_fp_simple_motion_search_term_none_logits_bias_16,
+      av1_early_term_after_split_nn_bias_32_layer0,
+      av1_early_term_after_split_nn_bias_32_layer1,
   },
 };
 
-#undef NUM_HIDDEN_LAYERS_16
-#undef NUM_FEATURES_16
-#undef NUM_LAYER_0_UNITS_16
-#undef NUM_LOGITS_16
-
-#define NUM_HIDDEN_LAYERS_8 1
-#define NUM_FEATURES_8 20
-#define NUM_LAYER_0_UNITS_8 16
-#define NUM_LOGITS_8 1
-
-static const float
-    av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_8[] = {
-      -1.11024f,    -0.530449f,    -0.164768f,  0.675431f,   0.456155f,
-      0.711099f,    -0.248095f,    0.112132f,   -0.131481f,  0.234457f,
-      0.128073f,    0.306214f,     0.175471f,   0.220189f,   -0.270533f,
-      0.293534f,    -0.0795547f,   0.234901f,   -0.191754f,  0.101171f,
-      -0.108621f,   0.395477f,     -0.529459f,  -0.354854f,  -0.941334f,
-      -0.237689f,   0.39357f,      0.527129f,   0.174333f,   -0.00520422f,
-      1.22219f,     -0.21815f,     0.0866816f,  -0.29591f,   -0.212968f,
-      0.00431436f,  -0.295382f,    -0.582317f,  -0.284654f,  0.486427f,
-      -0.202448f,   -0.0421883f,   -0.116346f,  -0.345832f,  -0.0471637f,
-      -0.149954f,   -0.0969526f,   -0.59491f,   0.594364f,   0.298285f,
-      -1.33301f,    0.149562f,     0.097433f,   0.157641f,   -0.231132f,
-      -0.0191656f,  0.149396f,     0.811553f,   1.07336f,    0.140674f,
-      1.02134f,     0.455909f,     -0.0548795f, 0.0459996f,  -0.0589837f,
-      -0.116328f,   -0.607502f,    -0.232595f,  -0.517977f,  -0.325901f,
-      1.35047f,     -0.148698f,    0.0313182f,  0.181634f,   0.06539f,
-      0.00820322f,  0.0522113f,    -1.06071f,   -0.817999f,  -0.527422f,
-      -1.39175f,    -0.110088f,    0.0858626f,  -0.247541f,  0.29043f,
-      1.13767f,     0.185834f,     0.390613f,   -0.501175f,  -0.214176f,
-      -0.256376f,   0.496687f,     0.240471f,   0.218852f,   0.513543f,
-      0.400559f,    -0.249168f,    -0.752987f,  0.430491f,   -0.72299f,
-      0.339754f,    0.396623f,     -0.0638322f, 0.353122f,   0.355662f,
-      -0.0704821f,  0.195448f,     0.179396f,   0.486533f,   0.0815535f,
-      -0.503726f,   -0.000321223f, 0.501591f,   -0.117849f,  0.217667f,
-      -0.123391f,   -0.4026f,      0.149756f,   -0.0359276f, -0.0990213f,
-      -0.215278f,   -0.293649f,    0.301629f,   -0.11081f,   -0.206725f,
-      -0.00147108f, 0.363644f,     -0.430092f,  0.169524f,   0.116091f,
-      -0.583605f,   -0.0974948f,   0.253256f,   0.22648f,    0.136902f,
-      -0.882541f,   -0.75078f,     -0.0629343f, 0.411035f,   0.265742f,
-      -0.360904f,   -0.899324f,    0.605871f,   0.0318372f,  0.0735312f,
-      -0.00960722f, 0.691249f,     0.127449f,   -0.133021f,  -0.0793589f,
-      0.665591f,    -0.0682262f,   -0.0437626f, 0.0783621f,  2.25727f,
-      0.126529f,    -0.0320763f,   -0.261759f,  -1.19987f,   0.216295f,
-      -0.253886f,   -0.642908f,    0.1865f,     0.00299179f, 0.0246782f,
-      -0.00750628f, 0.566367f,     0.99916f,    -0.0209625f, 0.273254f,
-      1.09724f,     0.30026f,      0.21585f,    -0.0276715f, 0.338996f,
-      0.129884f,    -0.00628438f,  0.0461783f,  -1.36378f,   -0.394756f,
-      -0.395261f,   0.215928f,     0.252803f,   -0.207108f,  -0.0506214f,
-      -0.0138889f,  0.124197f,     -0.0522996f, 0.533803f,   -0.25729f,
-      -0.463514f,   0.128322f,     -1.04751f,   -0.605498f,  -0.107235f,
-      -0.00813289f, 0.539742f,     -0.0524178f, 0.272101f,   0.151935f,
-      0.607511f,    -0.0608427f,   0.36342f,    0.0999134f,  0.69712f,
-      -0.152471f,   0.364244f,     0.410644f,   0.312606f,   0.405679f,
-      -0.371656f,   -0.0492209f,   -0.148911f,  0.214996f,   -0.274749f,
-      -0.0372888f,  0.079023f,     -0.429136f,  -1.30393f,   -0.833824f,
-      -1.31373f,    -0.445343f,    0.526917f,   1.30569f,    -0.0626746f,
-      0.282353f,    -0.28552f,     0.28084f,    -0.234934f,  0.227076f,
-      1.09919f,     0.33248f,      -0.114933f,  0.40629f,    0.331031f,
-      0.245334f,    -0.0318782f,   0.00735305f, -1.58715f,   0.126443f,
-      -0.09472f,    -0.182152f,    0.311673f,   -0.186136f,  0.817743f,
-      0.928961f,    0.117334f,     -0.373644f,  -0.0797864f, 0.205565f,
-      0.0789797f,   0.0757131f,    -0.152409f,  0.30301f,    -0.0170824f,
-      -0.194496f,   0.485547f,     0.370124f,   -0.802044f,  -0.789671f,
-      0.669258f,    0.55082f,      -0.438853f,  0.0597597f,  -0.0148101f,
-      -0.41603f,    0.0486339f,    -0.464523f,  -0.413725f,  0.00907629f,
-      0.70351f,     -0.136422f,    -0.145957f,  -0.0626726f, -0.115773f,
-      -0.333937f,   0.135474f,     -0.379598f,  -0.134422f,  0.227595f,
-      0.908927f,    0.759504f,     -0.0088258f, -0.349333f,  0.122667f,
-      -0.682175f,   0.2201f,       -0.332003f,  -0.44433f,   -0.620308f,
-      -1.36716f,    -0.0167907f,   -0.538969f,  0.256824f,   -0.0706724f,
-      -0.0392471f,  -0.156312f,    0.153699f,   1.41967f,    0.0434739f,
-      0.428178f,    -0.0714879f,   0.0912104f,  0.00687985f, 0.341789f,
-      0.217381f,    0.128288f,     0.0286751f,  0.527344f,   -0.428139f,
-      0.60908f,     1.02074f,      -0.0977894f, 0.158067f,   0.28958f,
-      -0.065152f,   0.120616f,     -0.882976f,  -1.10413f,   -1.37497f
-    };
-
-static const float
-    av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_8[] = {
-      1.37086f,  -1.61858f, -1.32395f,  0.276031f, -0.124696f, -1.71489f,
-      -1.68429f, 1.79103f,  -0.335306f, -1.81523f, 0.841083f,  -0.542628f,
-      -1.82168f, 0.459829f, 0.0949306f, 0.918486f
-    };
-
-static const float av1_fp_simple_motion_search_term_none_logits_kernel_8[] = {
-  -0.283418f, -0.444453f, 0.4977782f, -0.4138758f, 0.41890771f, 0.22149438f,
-  0.545079f,  -0.729164f, 0.619389f,  0.5169534f,  -0.4236282f, 0.7304213f,
-  0.531938f,  -0.14828f,  0.75119f,   -0.464074f
+static const float av1_early_term_after_split_nn_weights_16_layer0[] = {
+  -0.113798f, 0.053357f,  -0.037947f, -0.477171f, 0.276517f,  -0.349252f,
+  -0.177284f, 0.189597f,  0.141744f,  0.230207f,  -0.328104f, 0.074328f,
+  0.247717f,  0.233533f,  0.145167f,  0.018029f,  -0.398725f, -0.226199f,
+  -0.309724f, 0.125279f,  0.194759f,  0.025531f,  0.349714f,  -0.273944f,
+  0.186871f,  0.181735f,  -0.520614f, -0.264076f, 0.308207f,  0.157438f,
+  -0.137791f, -0.054582f, 0.125879f,  0.796218f,  -0.897562f, 0.885439f,
+  0.381640f,  0.106625f,  -2.027456f, 0.000874f,  0.179581f,  0.013287f,
+  -2.329439f, -0.163169f, -0.136191f, 0.320108f,  -2.318779f, -0.196722f,
+  -0.295721f, 0.203658f,  -0.182275f, 0.615941f,  0.015762f,  0.257181f,
+  -0.115297f, 0.295774f,  -0.026144f, -0.022686f, -0.219423f, -0.042861f,
+  0.207647f,  -0.057791f, 0.201671f,  -0.169569f, 0.291492f,  -0.994991f,
+  0.137473f,  0.230948f,  0.505626f,  -1.065860f, 0.275225f,  -0.250861f,
+  0.519466f,  -1.217242f, -0.087384f, 0.053441f,  0.030729f,  -1.702304f,
+  -0.034635f, 0.010177f,  -0.035422f, -0.749979f, 0.355499f,  0.408166f,
+  -0.086883f, 0.017203f,  0.195706f,  -0.218056f, -0.029153f, 0.367335f,
+  -0.061732f, -0.241068f, 0.078496f,  -0.370346f, -0.124223f, -0.172708f,
+  0.037971f,  0.038875f,  -0.282489f, -0.266323f, -0.210864f, 0.214714f,
+  0.234695f,  -0.045625f, 0.015357f,  -0.007464f, -0.362003f, -0.113465f,
+  0.145141f,  0.238470f,  -0.202664f, -0.286587f, -0.347112f, 0.054501f,
+  -0.190290f, -0.283256f, 0.062179f,  0.041165f,  -0.006935f, -0.220351f,
+  -0.088800f, 0.220924f,  -0.200982f, 0.058493f,  -0.225175f, 0.057175f,
+  -0.618187f, 0.761023f,  -0.743774f, -0.500599f, -0.584999f, 1.545211f,
+  0.123055f,  -0.106848f, -0.353057f, 1.552187f,  0.174104f,  0.068060f,
+  -0.449859f, 1.254299f,  -0.161716f, -0.060630f, -0.230721f, 0.165976f,
+  -0.101582f, -0.422415f, 0.110384f,  -0.130098f, 0.104428f,  0.083518f,
+  0.031626f,  0.083048f,  0.158877f,  0.173340f,  0.063962f,  0.427845f,
+  0.663268f,  0.376996f,  0.146435f,  -0.091329f, 0.443447f,  0.518432f,
+  -0.182777f, -0.091313f, 0.331229f,  0.532604f,  -0.187001f, 0.054774f,
+  0.298068f,  0.502295f,  -0.362378f, 0.054283f,  0.292806f,  0.168901f,
+  -0.214787f, 0.025637f,  0.458009f,  -0.322714f, -0.264059f, 0.140313f,
+  -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f,  -0.009061f,
+  -0.050681f, 0.108681f,  0.043272f,  -1.073133f, 0.206410f,  0.469576f,
+  0.291494f,  -2.021244f, -0.001183f, -0.067542f, 0.364907f,  -2.470543f,
+  0.049147f,  -0.018868f, 0.658500f,  -2.531048f, 0.275433f,  -0.034224f,
+  -0.171386f, 0.096369f,  0.728069f,  0.272332f,  0.222255f,  -0.030426f,
+  0.026994f,  0.208928f,  -0.173943f, -0.227581f, -0.214798f, 0.079341f,
+  0.032344f,  -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f,
+  -0.086592f, 0.066487f,  0.337353f,  -0.168704f, 0.015702f,  0.022607f,
+  0.286647f,  0.218106f,  0.193319f,  -0.358714f, 0.030796f,  0.007646f,
+  -0.045617f, 0.165007f,  -0.284641f, -0.291812f, 0.207544f,  0.082823f,
+  -0.141907f, -0.331336f, -0.052908f, 0.120716f,  0.202521f,  0.232782f,
+  -0.348141f, -0.017332f, 1.191126f,  -0.391987f, -0.154537f, -0.206551f,
+  -2.378690f, 0.057918f,  -0.328183f, 2.151556f,  0.238803f,  0.164880f,
+  -0.480039f, 1.616200f,  0.260243f,  0.083704f,  -0.174461f, 1.804634f,
+  0.194810f,  0.223837f,  0.550107f,  -0.068171f, -0.293435f, -0.186770f,
+  -0.364846f, 0.127181f,  0.105556f,  -0.016202f, 0.278403f,  -0.344995f,
+  -0.009761f, -0.082555f, 0.046731f,  -0.301452f, 0.604259f,  0.055895f,
+  0.049862f,  0.314249f,  -0.305811f, -0.112937f, 0.658787f,  -0.549288f,
+  -0.307567f, -0.460650f, -0.840643f, 0.082576f,  0.373711f,  0.138318f,
+  0.336901f,  0.284984f,  -0.281400f, 0.408210f,  -0.449858f, 0.461054f,
+  0.227629f,  -0.131705f, 0.301769f,  -0.278540f, 0.189290f,  -0.269041f,
+  0.111350f,  -0.300257f, 0.436858f,  -0.265920f, -0.211938f, 0.272631f,
+  0.206291f,  0.253273f,  -0.229776f, -0.031112f, -0.171183f, -0.109676f,
+  -0.202390f, -0.068857f, 0.182125f,  -0.140523f, -0.308742f, -0.045840f,
+  0.256545f,  -0.262405f, 0.225951f,  -0.287463f, -0.189203f, -0.055552f,
+  -0.052448f, -0.242839f, -0.278877f, 0.140920f,  -0.175755f, 0.215402f,
+  -0.248841f, -0.264080f, -0.178303f, 0.147777f,  0.049460f,  -0.279877f,
+  -0.539725f, -0.004622f, 0.182874f,  0.338814f,  0.265974f,  0.249851f,
+  -0.141154f, 0.157228f,  -0.090972f, 0.179444f,  0.305255f,  0.127788f,
+  0.123270f,  0.355320f,  0.076797f,  0.263495f,  0.235965f,  -0.133816f,
+  0.243624f,  0.227062f,  -0.213629f, 0.002075f,  0.061203f,  -0.077820f,
+  -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f,
+  -0.274248f, 0.053950f,  -0.225750f, -0.367097f, -0.122391f, 0.181212f,
+  -0.411824f, -0.084241f, -0.302288f, 0.077860f,  -0.187443f, -0.300262f,
+  0.083156f,  -0.392461f, -0.332320f, -0.346474f, 0.140658f,  -0.283656f,
+  0.120714f,  -0.056577f, -0.280968f, 0.017795f,  -0.024686f, 0.073113f,
+  -0.346637f, 0.082567f,  -0.036556f, -0.369730f, 0.081225f,  -0.005211f,
+  0.144886f,  -0.003544f, 0.178307f,  -0.366035f, -0.063887f, -0.191767f,
+  0.105835f,  -0.273978f, -0.266532f, -0.023984f, 0.039166f,  0.065848f,
+  -0.026802f, -0.268923f, 0.189659f,  0.086300f,  0.030718f,  0.216565f,
+  -0.130025f, -0.215687f, 0.146341f,  -0.286438f, -0.394226f, -0.181509f,
+  -0.005612f, 0.186040f,  0.133491f,  0.032096f,  -0.261609f, 0.074007f,
+  -0.042929f, -0.234479f, 0.189704f,  0.088395f,  -0.003671f, -0.125055f,
+  -0.252418f, -0.086387f, 0.111197f,  -0.297071f, -0.018793f, -0.031902f,
+  -0.333191f, -0.186279f, 0.039868f,  0.091419f,  -0.264438f, -0.216150f,
+  -0.212550f, 0.203412f,  -0.113028f, -0.197169f, -0.346771f, 0.086066f,
+  0.091443f,  -0.128507f, -0.007281f, -0.118389f, 0.003370f,  -0.338661f,
+  0.026739f,  -0.063571f, -0.281567f, -0.166824f, 0.167455f,  0.216173f,
+  0.199163f,  0.256314f,  -0.222679f, 0.040282f,  -0.154808f, -0.133943f,
+  -0.270163f, -0.357398f, 0.260373f,  0.176950f,  -0.125162f, -0.085050f,
+  0.226376f,  -0.124585f, -0.324804f, 0.035536f,  -0.133600f, 0.173450f,
+  0.068107f,  -0.337442f, 0.169629f,  0.047223f,  0.057878f,  0.055555f,
+  -0.317449f, -0.103768f, 0.080899f,  -0.194759f, -1.137593f, 0.508999f,
+  0.045372f,  1.746454f,  1.250347f,  -0.342930f, -0.127821f, -0.220175f,
+  -0.417649f, -0.480595f, 0.071902f,  0.050231f,  -0.562554f, -0.677866f,
+  -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f,  0.572936f,
+  0.047325f,  0.050619f,  0.112611f,  -0.035393f, 0.052585f,  -0.071076f,
+  -0.015798f, -0.050228f, -0.142875f, 0.189329f,  0.048833f,  0.503633f,
+  0.249588f,  0.175492f,  -0.137664f, -0.018533f, 0.288453f,  -0.025644f,
+  0.079131f,  0.195096f,  -0.154039f, -0.104220f, -0.224072f, 0.095946f,
+  -0.208424f, 0.214745f,  0.056468f,  0.182603f,  0.341784f,  -0.134664f,
+  -0.194050f, 0.058532f,  -0.107336f, -0.087783f, -0.238795f, -0.387212f,
+  0.049055f,  -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f,
+  0.407375f,  -0.462654f, -0.609488f, 0.027742f,  -0.985512f, -0.109154f,
+  -0.423276f, 2.347960f,  0.129240f,  0.187610f,  -0.057081f, 2.424892f,
+  0.087666f,  0.106716f,  -0.039379f, 2.764866f,  0.113309f,  0.028196f,
+  -0.582789f, 0.335385f,  -0.538029f, -0.477337f, -0.114207f, 0.178829f,
+  0.006276f,  0.123179f,  0.095101f,  0.139898f,  -0.372074f, -0.111010f,
+  0.136330f,  0.272900f,  0.126737f,  -0.097808f, -0.363697f, 0.108665f,
+  -0.227749f, -0.083421f, 1.714677f,  0.451943f,  0.107931f,  -0.392281f,
+  1.615846f,  0.022307f,  -0.247011f, 0.257703f,  1.039134f,  0.537789f,
+  0.022177f,  -0.271532f, 0.351350f,  -0.399205f, -0.240534f, -0.315399f,
+  0.026928f,  -0.005618f, 0.053179f,  -0.010277f, 0.000501f,  0.040896f,
+  -0.109160f, 0.018282f,  0.003887f,  0.199599f,  0.095349f,  -0.337284f,
+  0.169929f,  -0.109409f, -0.166983f, 0.059908f,  -0.226574f, -0.120114f,
+  0.077329f,  -0.333133f, -0.220936f, 0.114309f,  -0.233965f, -0.281551f,
+  0.042948f,  0.100940f,  0.116037f,  -0.313122f, 0.215149f,  -0.309057f,
+  -0.341052f, -0.294417f, -0.179722f, 0.010795f,  0.192053f,  -0.275261f,
+  -0.033077f, 0.117348f,  0.090206f,  0.781573f,  0.602456f,  -0.220296f,
+  0.172159f,  0.758513f,  0.157910f,  -0.217897f, -0.372659f, 0.031935f,
+  0.791463f,  0.267195f,  0.931593f,  -0.057349f, 0.405512f,  -0.058512f,
+  -0.641663f, -0.076592f, 0.550227f,  -0.024094f, 0.048218f,  -0.289971f,
+  0.180940f,  0.167533f,  0.052711f,  -0.360726f, 0.019210f,  -0.488879f,
+  0.380498f,  0.151608f,  -0.276895f, -0.596554f, 0.106076f,  -0.245833f,
+  -0.048783f, 0.073823f,  0.098780f,  0.000211f,  0.113958f,  -0.068964f,
+  -0.265533f, -0.185457f, 0.175586f,  -0.163621f, -0.204919f, 0.145802f,
+  -0.163421f, 0.129576f,  -0.153486f, -0.105573f, 0.067289f,  -0.213120f,
+  -0.286103f, 0.249543f,  -0.044970f, -0.170464f, -0.105501f, -0.094765f,
+  -0.050734f, -0.369468f, 0.180020f,  -0.363328f, -0.151654f, -0.262550f,
+  -0.424503f, 0.829032f,  -0.559452f, 0.506837f,  0.143823f,  0.276660f,
+  -1.808608f, -0.259517f, -0.053945f, 0.035676f,  -1.842195f, -0.065960f,
+  -0.069285f, 0.462022f,  -2.319453f, -0.370299f, 0.183329f,  -0.146412f,
+  -0.563875f, 0.305068f,  0.480904f,  0.044319f,  -0.016098f, 0.168516f,
+  0.114874f,  -0.097621f, -0.030373f, 0.177700f,  0.181591f,  -0.146003f,
+  -0.330853f, -0.259200f, 0.779319f,  -1.517524f, 0.178781f,  0.135451f,
+  0.088784f,  -2.076089f, 0.628717f,  -0.048685f, 0.281327f,  -2.341596f,
+  0.422171f,  0.006135f,  0.367096f,  -1.663118f, 0.365253f,  -0.072884f,
+  -0.197620f, -0.688634f, 0.477354f,  0.395841f,  -0.098505f, 0.208709f,
+  -0.027523f, 0.127119f,  0.106274f,  0.114424f,  -0.122877f, -0.087245f,
+  0.086923f,  -0.527398f, -0.342062f, -0.764662f, 0.713094f,  -0.626453f,
+  -0.081454f, -0.087683f, 0.885047f,  0.323440f,  -0.018579f, -0.217166f,
+  1.617984f,  -0.159038f, 0.265991f,  -0.390313f, 1.933182f,  -0.032431f,
+  -0.057513f, -0.300841f, 0.461248f,  -0.072147f, -0.287052f, -0.078056f,
+  0.011734f,  0.044013f,  0.177174f,  0.093400f,  0.028819f,  0.193686f,
+  -0.224853f, 0.268321f,  -0.075059f, 0.074526f,  -0.015618f, 0.165615f,
+  -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f,
+  -0.224625f, -0.124980f, -0.104482f, 0.076864f,  -0.009631f, -0.164682f,
+  0.150480f,  -0.111880f, -0.260425f, 0.086234f,  -0.176936f, -0.136771f,
+  -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f,
+  -0.109061f, -0.098970f, 0.090792f,  -0.109623f, 0.349851f,  0.266341f,
+  -0.088602f, -0.108071f, 0.082519f,  0.472650f,  -1.838758f, 0.456694f,
+  0.119927f,  0.461077f,  -2.860022f, 0.231495f,  0.235771f,  0.256424f,
+  -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f,  0.505510f,
+  0.615657f,  0.193760f,  0.224600f,  0.265732f,  -0.121553f, -0.354597f,
+  -0.242414f, -0.276639f, -0.057591f, 0.026369f,  -0.261148f, -0.356155f,
+  -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f,  0.221299f,
+  -0.108857f, -0.156514f, 0.050901f,  0.058541f,  -0.077141f, 0.071515f,
+  -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f,
+  -0.143374f, -0.091811f, 0.165161f,  0.060156f,  -0.086103f, -0.039031f,
+  -0.377759f, -0.370533f, 0.074431f,  0.064192f,  0.186576f,  0.447858f,
+  -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f,  0.176286f,
+  2.850013f,  0.019385f,  -0.225361f, -0.235315f, 1.654694f,  -0.073978f,
+  -0.341412f, -1.187575f, 2.815900f,  -0.228063f, -0.174547f, 0.623825f,
+  -0.010676f, 0.157189f,  0.111879f,  -0.198965f, 0.051851f,  0.158396f,
+  0.045194f,  0.293531f,  -0.246714f, -0.351493f, 0.026954f,  0.076233f,
+  0.420367f,  0.168154f,  -0.131450f, 0.134487f,  -0.288851f, -0.134553f,
+  0.014902f,  0.756381f,  0.277713f,  0.190080f,  -0.020869f, 1.446672f,
+  0.029792f,  -0.025927f, 0.060640f,  0.559864f,  0.422229f,  0.198459f,
+  0.036167f,  0.029432f,  0.001882f,  0.038480f,  -0.160528f, -0.288855f,
+  -0.310886f, 0.291296f,  0.190558f,  -0.182816f, -0.002252f, 0.073101f,
+  -0.172245f, -0.305980f, 0.112492f,  -0.422839f, -0.295999f, -0.078160f,
+  -0.173405f, -0.032819f, 0.373774f,  -0.715223f, 0.018911f,  0.131753f,
+  -0.237364f, -0.128499f, -0.228406f, 0.341619f,  0.343552f,  -0.521581f,
+  -0.263790f, 0.362502f,  -0.018450f, 0.054233f,  0.183068f,  0.382772f,
+  0.188811f,  -0.627287f, 0.040399f,  -0.487338f, -0.192591f, 0.247426f,
+  0.154372f,  -0.483994f,
 };
 
-static const float av1_fp_simple_motion_search_term_none_logits_bias_8[] = {
-  -2.22338f
+static const float av1_early_term_after_split_nn_bias_16_layer0[] = {
+  -0.173976f, 0.305495f,  0.250981f,  -0.067127f, -0.313100f, 0.242464f,
+  0.315196f,  -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f,
+  -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f,
+  -0.239698f, -0.182082f, -0.065296f, 0.021503f,  -0.036787f, 0.311861f,
+  0.118135f,  -0.320456f, -0.110719f, 0.220692f,  -0.071727f, -0.088226f,
+  -0.110874f, -0.111671f,
 };
 
-static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_8 = {
-  NUM_FEATURES_8,
-  NUM_LOGITS_8,
-  NUM_HIDDEN_LAYERS_8,
+static const float av1_early_term_after_split_nn_weights_16_layer1[] = {
+  -0.338573f, 0.398159f,  0.314774f,  -0.037448f, -0.271950f, -0.774991f,
+  0.950901f,  -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f,
+  -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f,
+  -0.657195f, 0.927632f,  -0.040150f, 0.578920f,  0.212301f,  0.292495f,
+  0.563590f,  -0.205735f, 0.195877f,  0.582122f,  -0.217860f, 1.613379f,
+  0.313278f,  -0.555802f,
+};
+
+static const float av1_early_term_after_split_nn_bias_16_layer1[] = {
+  0.16553f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = {
+  FEATURES,
+  1,
+  1,
   {
-      NUM_LAYER_0_UNITS_8,
+      HIDDEN_NODES,
   },
   {
-      av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_8,
-      av1_fp_simple_motion_search_term_none_logits_kernel_8,
+      av1_early_term_after_split_nn_weights_16_layer0,
+      av1_early_term_after_split_nn_weights_16_layer1,
   },
   {
-      av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_8,
-      av1_fp_simple_motion_search_term_none_logits_bias_8,
+      av1_early_term_after_split_nn_bias_16_layer0,
+      av1_early_term_after_split_nn_bias_16_layer1,
   },
 };
 
-#undef NUM_HIDDEN_LAYERS_8
-#undef NUM_FEATURES_8
-#undef NUM_LAYER_0_UNITS_8
-#undef NUM_LOGITS_8
+static const float av1_early_term_after_split_nn_weights_8_layer0[] = {
+  -0.719472f, 0.305806f,  0.855829f,  0.100094f,  0.412517f,  1.254673f,
+  1.552105f,  -5.890773f, -0.089957f, -0.016736f, 1.418074f,  -5.393506f,
+  -0.028214f, 0.117758f,  1.479209f,  -5.299794f, 0.171585f,  -0.084182f,
+  -0.162105f, 0.388577f,  -0.044319f, -0.025861f, 0.251782f,  -0.181462f,
+  -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f,
+  0.038038f,  -0.119492f, 0.049104f,  -0.344384f, -0.354513f, 0.036977f,
+  0.017513f,  -0.004025f, -0.163212f, -0.261999f, 0.146575f,  0.207541f,
+  0.130365f,  -0.252127f, 0.097419f,  -0.231057f, -0.309421f, 0.347866f,
+  -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f,
+  -0.233553f, 0.156354f,  -0.184009f, 0.344289f,  -0.308058f, -0.205202f,
+  -0.325068f, 0.183820f,  -0.361667f, -0.069559f, -0.121834f, -0.038357f,
+  -0.210043f, -0.266129f, 0.003188f,  0.074902f,  -0.328843f, 0.293679f,
+  -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f,
+  0.166074f,  -0.262899f, 0.102114f,  -0.323420f, 0.057064f,  -0.203318f,
+  -0.397413f, -0.317324f, -0.307093f, 0.020574f,  -0.188627f, 0.132529f,
+  0.118992f,  -0.487387f, -0.282975f, 0.573231f,  -0.266071f, 0.125140f,
+  -0.970034f, 1.424008f,  -0.487366f, -0.196415f, 3.680273f,  -0.008407f,
+  0.081109f,  -0.187479f, 3.876021f,  0.159168f,  0.111721f,  -0.337423f,
+  3.901760f,  0.261268f,  -0.245555f, -0.187632f, -0.324298f, 0.167234f,
+  0.170986f,  -0.473055f, 0.087016f,  -0.003469f, 0.051035f,  0.251794f,
+  0.153549f,  0.217609f,  -0.326870f, -0.175511f, 0.637341f,  -0.694837f,
+  -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f,
+  0.071414f,  0.215265f,  -0.835999f, 5.735746f,  0.300101f,  0.089626f,
+  -0.450261f, 5.608051f,  0.190491f,  0.110220f,  -0.595360f, -0.446324f,
+  0.311380f,  0.268812f,  -0.339656f, -0.008708f, 0.011111f,  -0.027557f,
+  0.171534f,  0.000676f,  0.227232f,  0.033993f,  0.146684f,  0.094817f,
+  -0.175381f, -0.211927f, -0.362471f, 0.168834f,  0.264149f,  -0.350538f,
+  -0.463249f, -0.288105f, 0.347155f,  0.183231f,  -0.229732f, -0.252202f,
+  -0.218074f, -0.008769f, -0.156103f, 0.181233f,  -0.354736f, 0.263270f,
+  -0.106636f, 0.081057f,  0.060634f,  -0.046887f, 0.050468f,  0.071259f,
+  0.221287f,  0.199071f,  -0.180185f, -0.406902f, -0.239351f, -0.034957f,
+  0.369140f,  0.864600f,  0.233798f,  0.423612f,  -0.468918f, 0.976987f,
+  0.691198f,  -1.597908f, 0.102926f,  0.305546f,  0.391196f,  -3.909059f,
+  0.333635f,  0.311561f,  0.738886f,  -4.002001f, 0.236394f,  -0.233141f,
+  0.263342f,  0.679898f,  0.136233f,  0.254743f,  -0.367571f, 0.066412f,
+  0.001606f,  -0.059542f, 0.051726f,  -0.347145f, -0.045501f, -0.313847f,
+  -0.021952f, 1.386316f,  -0.579139f, -1.275844f, -0.003493f, -1.716577f,
+  0.250209f,  0.192086f,  4.177055f,  0.351835f,  0.338177f,  0.140163f,
+  4.099592f,  0.321866f,  -0.128153f, -0.360414f, 4.350767f,  0.025943f,
+  -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f,
+  0.149441f,  -0.019823f, 0.012759f,  0.404442f,  -0.108881f, 0.067974f,
+  -0.188278f, 0.136327f,  0.109927f,  -0.179270f, -0.272342f, 0.018064f,
+  -0.304216f, -0.469470f, 0.109310f,  -0.326214f, 0.061909f,  -0.278997f,
+  -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f,
+  0.042441f,  -0.126699f, -0.420399f, -0.033842f, 0.016773f,  -0.273789f,
+  0.081928f,  -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f,
+  -0.232576f, 0.082955f,  -0.490080f, 0.073820f,  -0.090384f, 0.035781f,
+  -0.158880f, -0.506793f, -0.069132f, 0.047602f,  -0.349640f, -0.058389f,
+  -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f,  -0.548909f,
+  -0.131561f, 0.247196f,  -0.207923f, 0.133056f,  -0.509854f, -0.193685f,
+  -0.181327f, -0.242442f, 0.091821f,  0.114430f,  -0.375233f, -0.015254f,
+  -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f,
+  -0.076332f, -0.186232f, -0.268491f, 0.075561f,  -0.389082f, -0.077435f,
+  0.352562f,  -0.020086f, -0.338181f, -0.404629f, 0.254983f,  0.150477f,
+  -0.265903f, 0.003341f,  0.099969f,  -0.211964f, -0.129372f, -0.166366f,
+  0.327712f,  -0.276234f, 0.140675f,  -0.433677f, -0.163050f, -0.143578f,
+  -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f,
+  1.394155f,  -0.922486f, -1.350222f, 2.030201f,  0.057717f,  0.227650f,
+  -0.193179f, 0.037224f,  0.065555f,  0.020558f,  -0.059205f, -0.023690f,
+  -0.008718f, 0.095976f,  -0.549587f, -0.321164f, -0.243728f, 1.344381f,
+  -1.254107f, 0.294244f,  -0.154737f, -0.152597f, 0.342419f,  0.301883f,
+  0.069866f,  -0.327766f, 0.209323f,  -0.364913f, -0.005530f, -0.558972f,
+  0.057684f,  -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f,
+  -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f,
+  0.269628f,  -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f,
+  -0.135487f, -0.326108f, -0.039019f, 0.185029f,  -0.264883f, -0.563447f,
+  -0.163532f, -0.447652f, -0.141851f, 0.001714f,  -0.193184f, 0.032609f,
+  -0.112883f, 0.074599f,  0.490665f,  0.434764f,  0.021652f,  -0.219618f,
+  0.743267f,  0.147195f,  -0.303479f, -0.097674f, 0.195813f,  0.704007f,
+  -1.290851f, 0.119701f,  0.224065f,  0.260246f,  -0.580657f, -0.096201f,
+  -0.333214f, -0.586689f, 0.567178f,  0.157340f,  -0.043184f, 0.194358f,
+  -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f,  -0.564178f,
+  -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f,  -0.346106f,
+  -0.256892f, 0.110915f,  -0.337464f, -0.341474f, -0.216113f, 0.249445f,
+  -0.070175f, -0.412141f, 0.153458f,  -0.081280f, 0.164669f,  -0.356396f,
+  -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f,  -0.253233f,
+  -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f,
+  -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f,  0.011957f,
+  0.000498f,  -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f,
+  0.124249f,  -0.550804f, -0.420397f, -0.123462f, 0.333292f,  -0.240230f,
+  -0.025604f, 0.337536f,  -0.295006f, -0.272614f, -0.496850f, -0.278521f,
+  0.234591f,  -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f,
+  0.008714f,  -0.064018f, -0.124873f, -0.334014f,
+};
 
-static const float av1_fp_simple_motion_search_term_none_thresh_32 =
-    -2.2884985045792563f;
-static const float av1_fp_simple_motion_search_term_none_thresh_16 =
-    -1.6656874577527165f;
-static const float av1_fp_simple_motion_search_term_none_thresh_8 =
-    -3.608804354309157f;
+static const float av1_early_term_after_split_nn_bias_8_layer0[] = {
+  1.202379f,  -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f,
+  0.615653f,  -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f,
+  -0.225995f, 0.370877f,  -0.214821f, -0.227752f,
+};
+
+static const float av1_early_term_after_split_nn_weights_8_layer1[] = {
+  0.376594f,  0.266703f,  -0.039847f, 1.680142f,  -0.879939f, 0.286806f,
+  -0.378223f, -0.405295f, -0.021107f, 0.039188f,  0.259308f,  0.193091f,
+  0.077994f,  -0.269141f, 0.011180f,  -0.019262f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer1[] = {
+  -1.29585564f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = {
+  FEATURES,
+  1,
+  1,
+  {
+      16,
+  },
+  {
+      av1_early_term_after_split_nn_weights_8_layer0,
+      av1_early_term_after_split_nn_weights_8_layer1,
+  },
+  {
+      av1_early_term_after_split_nn_bias_8_layer0,
+      av1_early_term_after_split_nn_bias_8_layer1,
+  },
+};
+#undef FEATURES
+#undef HIDDEN_NODES
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/partition_strategy.c b/libaom/av1/encoder/partition_strategy.c
index e8270b3..cc820ba 100644
--- a/libaom/av1/encoder/partition_strategy.c
+++ b/libaom/av1/encoder/partition_strategy.c
@@ -11,145 +11,312 @@
 
 #include <float.h>
 
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/system_state.h"
 
 #include "av1/common/enums.h"
 #include "av1/common/reconinter.h"
 
-#include "av1/encoder/encoder.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/cnn.h"
 #include "av1/encoder/partition_model_weights.h"
+#include "av1/encoder/partition_cnn_weights.h"
+#endif
+#include "av1/encoder/encoder.h"
+
+#include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/rdopt.h"
 
-// Performs a simple_motion_search with a single reference frame and extract
-// the variance of residues. Here features is assumed to be a length 6 array.
-// After this function is called, we will store the following in to features:
-// features[0] = log(1 + dc_q**2/256)
-// features[1] = log(1 + variance_of_residue)
-// for i in [2, 3, 4, 5]:
-//  features[i] = log(1 + variance_of_residue_in_block[i]/variance_of_residue)
-static void get_res_var_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
-                                 int mi_col, BLOCK_SIZE bsize,
-                                 float *features) {
-  // TODO(chiyotsai@google.com): The data this model trained on did not also use
-  // SIMPLE_TRANSLATION to build the inter_predictor. Retraining and tuning the
-  // model with the correct data should give better performance.
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void simple_motion_search_prune_part_features(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get);
+#endif
 
-  MACROBLOCKD *xd = &x->e_mbd;
+static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_128X128: return 0;
+    case BLOCK_64X64: return 1;
+    case BLOCK_32X32: return 2;
+    case BLOCK_16X16: return 3;
+    case BLOCK_8X8: return 4;
+    default: assert(0 && "Invalid bsize"); return -1;
+  }
+}
 
-  // Perform a single motion search in Y_PLANE to make a prediction
-  const int use_subpixel = 0;
+#if !CONFIG_REALTIME_ONLY
+// TODO(chiyotsai@google.com): This is very much a work in progress. We still
+// need to the following:
+//   -- add support for hdres
+//   -- add support for pruning rectangular partitions
+//   -- use reconstructed pixels instead of source pixels for padding
+//   -- use chroma pixels in addition to luma pixels
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int bsize, int quad_tree_idx,
+                                  int *partition_none_allowed,
+                                  int *partition_horz_allowed,
+                                  int *partition_vert_allowed,
+                                  int *do_rectangular_split,
+                                  int *do_square_split) {
+  assert(cm->seq_params.sb_size >= BLOCK_64X64 &&
+         "Invalid sb_size for intra_cnn!");
+  const int bsize_idx = convert_bsize_to_idx(bsize);
 
-  // Start getting the features
-  int f_idx = 0;
+  if (bsize == BLOCK_128X128) {
+    return;
+  }
 
-  // Q_INDEX
-  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
-  aom_clear_system_state();
-  features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
-
-  // VARIANCE
-  unsigned int sse = 0;
-  unsigned int var = 0;
-  const MV ref_mv_full = { .row = 0, .col = 0 };
-  av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full,
-                            use_subpixel, &sse, &var);
-  aom_clear_system_state();
-  features[f_idx++] = logf(1.0f + (float)var);
-
-  // Regional
-  const uint8_t *src = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *dst = xd->plane[0].dst.buf;
-  const int dst_stride = xd->plane[0].dst.stride;
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-  int r_idx = 0;
-  for (r_idx = 0; r_idx < 4; r_idx++) {
-    const int x_idx = (r_idx & 1) * bw / 2;
-    const int y_idx = (r_idx >> 1) * bh / 2;
-    const int src_offset = y_idx * src_stride + x_idx;
-    const int dst_offset = y_idx * dst_stride + x_idx;
-    const unsigned int sub_var = cpi->fn_ptr[subsize].vf(
-        src + src_offset, src_stride, dst + dst_offset, dst_stride, &sse);
+  // Precompute the CNN part and cache the result in MACROBLOCK
+  if (bsize == BLOCK_64X64 && !x->cnn_output_valid) {
     aom_clear_system_state();
-    const float var_ratio = (1.0f + (float)sub_var) / (4.0f + (float)var);
-    features[f_idx++] = var_ratio;
+    const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+    // Prepare the output
+    const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL };
+    const int num_outputs = 4;
+    const int output_dims[4] = { 1, 2, 4, 8 };
+    const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH,
+                             CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH };
+    float *output_buffer[CNN_TOT_OUT_CH];
+
+    float **cur_output_buf = output_buffer;
+    float *curr_buf_ptr = x->cnn_buffer;
+    for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+      const int num_chs = out_chs[output_idx];
+      const int ch_size = output_dims[output_idx] * output_dims[output_idx];
+      for (int ch = 0; ch < num_chs; ch++) {
+        cur_output_buf[ch] = curr_buf_ptr;
+        curr_buf_ptr += ch_size;
+      }
+      cur_output_buf += num_chs;
+    }
+
+    CNN_MULTI_OUT output = {
+      .num_outputs = 4,
+      .output_channels = out_chs,
+      .output_strides = output_dims,
+      .output_buffer = output_buffer,
+    };
+
+    // Prepare the input
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bit_depth = xd->bd;
+    const int dc_q =
+        av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
+    x->log_q = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+    x->log_q = (x->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+               av1_intra_mode_cnn_partition_std[0];
+
+    const int width = 65, height = 65,
+              stride = x->plane[AOM_PLANE_Y].src.stride;
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *image[1] = {
+        CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
+      };
+
+      av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+                                           cnn_config, &thread_data, bit_depth,
+                                           &output);
+    } else {
+      uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
+
+      av1_cnn_predict_img_multi_out(image, width, height, stride, cnn_config,
+                                    &thread_data, &output);
+    }
+
+    x->cnn_output_valid = 1;
+  }
+
+  if (!x->cnn_output_valid) {
+    return;
+  }
+
+  const NN_CONFIG *dnn_configs[5] = {
+    NULL,
+    &av1_intra_mode_cnn_partition_branch_0_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_1_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_2_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_3_dnn_config,
+  };
+
+  const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
+
+  aom_clear_system_state();
+  float dnn_features[100];
+  float logits[4] = { 0.0f };
+
+  const float *branch_0 = x->cnn_buffer;
+  const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE;
+  const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE;
+  const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE;
+
+  if (bsize == BLOCK_64X64) {
+    int f_idx = 0;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_0[ch_idx];
+    }
+
+    const int spa_stride = 2 * 2;
+    for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) {
+      for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+        dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride];
+      }
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_32X32) {
+    int f_idx = 0;
+    for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) {
+      dnn_features[f_idx++] = branch_0[idx];
+    }
+
+    const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1];
+    const int spa_stride = 2 * 2;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_16X16) {
+    int f_idx = 0;
+    const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+    const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1];
+    const int prev_spa_stride = 2 * 2;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride];
+    }
+
+    const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5];
+    const int spa_stride = 4 * 4;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_8X8) {
+    int f_idx = 0;
+    const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+    const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5];
+    const int prev_spa_stride = 4 * 4;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride];
+    }
+
+    const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21];
+    const int spa_stride = 8 * 8;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else {
+    assert(0 && "Invalid bsize in intra_cnn partition");
+  }
+
+  // Make decision
+  av1_nn_predict(dnn_features, dnn_config, 1, logits);
+  aom_clear_system_state();
+
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  float split_only_thresh = 100.0f, no_split_thresh = -100.0f;
+  if (is_720p_or_larger) {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx];
+  } else if (is_480p_or_larger) {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx];
+  } else {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx];
+  }
+
+  if (logits[0] > split_only_thresh) {
+    *partition_none_allowed = 0;
+    *partition_horz_allowed = 0;
+    *partition_vert_allowed = 0;
+    *do_rectangular_split = 0;
+  }
+
+  if (logits[0] < no_split_thresh) {
+    *do_square_split = 0;
   }
 }
 
 void av1_simple_motion_search_based_split(
-    AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col,
-    BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed,
-    int *partition_vert_allowed, int *do_rectangular_split,
-    int *do_square_split) {
-  const NN_CONFIG *nn_config = NULL;
-  float split_only_thresh = 0.0f;
-  if (bsize == BLOCK_128X128) {
-    nn_config = &av1_simple_motion_search_based_split_nn_config_128;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_128;
-  } else if (bsize == BLOCK_64X64) {
-    nn_config = &av1_simple_motion_search_based_split_nn_config_64;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_64;
-  } else if (bsize == BLOCK_32X32) {
-    nn_config = &av1_simple_motion_search_based_split_nn_config_32;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_32;
-  } else if (bsize == BLOCK_16X16) {
-    nn_config = &av1_simple_motion_search_based_split_nn_config_16;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_16;
-  } else if (bsize == BLOCK_8X8) {
-    // Disable BLOCK_8X8 for now
-#if !CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8
-    nn_config = &av1_simple_motion_search_based_split_nn_config_8;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_8;
-#endif
-  } else {
-    assert(0 && "Unexpected block size in simple_motion_based_split");
-  }
-  if (nn_config) {
-    float features[6] = { 0 };
-    float score = 0;
-    get_res_var_features(cpi, x, mi_row, mi_col, bsize, features);
-    av1_nn_predict(features, nn_config, &score);
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+    int *partition_horz_allowed, int *partition_vert_allowed,
+    int *do_rectangular_split, int *do_square_split) {
+  aom_clear_system_state();
 
-    if (score > split_only_thresh) {
-      *partition_none_allowed = 0;
-      *partition_horz_allowed = 0;
-      *partition_vert_allowed = 0;
-      *do_rectangular_split = 0;
-    }
-    if (cpi->sf.simple_motion_search_split_only >= 2) {
-      if (score < -split_only_thresh) *do_square_split = 0;
-      // For larger scores (>split_only_thresh), none and rectangular partitions
-      // are skipped. As score reduces, possibility of split decreases. Hence
-      // for near larger scores (.875 * split_only_thresh to split_only_thresh)
-      // none partition is disabled, but rectangular partitions are evaluated
-      // additionally.
-      if (score > (split_only_thresh * 0.875)) *partition_none_allowed = 0;
-    }
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for res < 480p, 1 for 480p, 2 for 720p+
+  const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+  assert(bsize_idx >= 0 && bsize_idx <= 4 &&
+         "Invalid bsize in simple_motion_search_based_split");
+
+  const float *ml_mean = av1_simple_motion_search_split_mean[bsize_idx];
+  const float *ml_std = av1_simple_motion_search_split_std[bsize_idx];
+  const NN_CONFIG *nn_config =
+      av1_simple_motion_search_split_nn_config[bsize_idx];
+  const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+
+  const float split_only_thresh =
+      av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
+  const float no_split_thresh =
+      av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx];
+
+  float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_SPLIT_MODEL_FLAG);
+  for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
+    features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
+  }
+
+  float score = 0.0f;
+
+  av1_nn_predict(features, nn_config, 1, &score);
+  aom_clear_system_state();
+
+  if (score > split_only_thresh) {
+    *partition_none_allowed = 0;
+    *partition_horz_allowed = 0;
+    *partition_vert_allowed = 0;
+    *do_rectangular_split = 0;
+  }
+
+  if (cpi->sf.part_sf.simple_motion_search_split >= 2 &&
+      score < no_split_thresh) {
+    *do_square_split = 0;
   }
 }
 
 // Given a list of ref frames in refs, performs simple_motion_search on each of
 // the refs and returns the ref with the smallest sse. Returns -1 if none of the
 // ref in the list is available. Also stores the best sse and var in best_sse,
-// best_var, respectively. If save_mv_code is -1, don't update mv_ref_fulls in
-// pc_tree. If save_mv_code is between 0 and 3, update mv_ref_fulls under
-// pc_tree->split[i]. If save_mv_code is 4, update mv_ref_fulls under pc_tree.
+// best_var, respectively. If save_mv is 0, don't update mv_ref_fulls in
+// pc_tree. If save_mv is 1, update mv_ref_fulls under pc_tree and the
+// subtrees.
 static int simple_motion_search_get_best_ref(
     AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
     int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs,
-    int use_subpixel, int save_mv_code, unsigned int *best_sse,
+    int use_subpixel, int save_mv, unsigned int *best_sse,
     unsigned int *best_var) {
-  // TODO(chiyotsai@google.com): The calculation of variance currently uses
-  // bsize, so we might take area outside of the image into account. We need to
-  // modify the SIMD functions to fix this later.
   const AV1_COMMON *const cm = &cpi->common;
   int best_ref = -1;
 
-  if (mi_col >= cm->mi_cols || mi_row >= cm->mi_rows) {
+  if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) {
     // If the whole block is outside of the image, set the var and sse to 0.
     *best_var = 0;
     *best_sse = 0;
@@ -160,7 +327,6 @@
   // Otherwise do loop through the reference frames and find the one with the
   // minimum SSE
   const MACROBLOCKD *xd = &x->e_mbd;
-  const MV *mv_ref_fulls = pc_tree->mv_ref_fulls;
 
   const int num_planes = 1;
 
@@ -170,9 +336,11 @@
     const int ref = refs[ref_idx];
 
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
+      const FULLPEL_MV *start_mvs = pc_tree->start_mvs;
       unsigned int curr_sse = 0, curr_var = 0;
-      av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
-                               mv_ref_fulls[ref], num_planes, use_subpixel);
+      int_mv best_mv =
+          av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
+                                   start_mvs[ref], num_planes, use_subpixel);
       curr_var = cpi->fn_ptr[bsize].vf(
           x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
           xd->plane[0].dst.stride, &curr_sse);
@@ -182,18 +350,17 @@
         best_ref = ref;
       }
 
-      const int new_mv_row = x->best_mv.as_mv.row / 8;
-      const int new_mv_col = x->best_mv.as_mv.col / 8;
-      if (save_mv_code == 4) {
-        pc_tree->mv_ref_fulls[ref].row = new_mv_row;
-        pc_tree->mv_ref_fulls[ref].col = new_mv_col;
-      } else if (save_mv_code >= 0 && save_mv_code < 4) {
-        // Propagate the new motion vectors to a lower level
-        pc_tree->split[save_mv_code]->mv_ref_fulls[ref].row = new_mv_row;
-        pc_tree->split[save_mv_code]->mv_ref_fulls[ref].col = new_mv_col;
-      } else {
-        assert(save_mv_code == -1 &&
-               "Unknown code in simple_motion_search_get_best_ref.");
+      if (save_mv) {
+        pc_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
+        pc_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
+
+        if (bsize >= BLOCK_8X8) {
+          for (int r_idx = 0; r_idx < 4; r_idx++) {
+            // Propagate the new motion vectors to a lower level
+            PC_TREE *sub_tree = pc_tree->split[r_idx];
+            sub_tree->start_mvs[ref] = pc_tree->start_mvs[ref];
+          }
+        }
       }
     }
   }
@@ -201,81 +368,110 @@
   return best_ref;
 }
 
-// Performs fullpixel simple_motion_search with LAST_FRAME and ALTREF_FRAME on
-// each subblock and extract the variance and sse of residues. Then store the
-// var and sse from each partition subblock to features. The DC qindex is also
-// stored in features.
-// Here features is assumed to be a length 19 array.
-// After this function is called, we will store the following to features:
-// features[0:17] = var and sse from subblocks
-// features[18] = DC q_index
-static void simple_motion_search_prune_part_features(
+// Collects features using simple_motion_search and store them in features. The
+// features are also cached in PC_TREE. By default, the features collected are
+// the sse and var from the subblocks flagged by features_to_get. Furthermore,
+// if features is not NULL, then 7 more features are appended to the end of
+// features:
+//  - log(1.0 + dc_q ** 2)
+//  - whether an above macroblock exists
+//  - width of above macroblock
+//  - height of above macroblock
+//  - whether a left macroblock exists
+//  - width of left macroblock
+//  - height of left macroblock
+static AOM_INLINE void simple_motion_search_prune_part_features(
     AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, float *features) {
-  // TODO(chiyotsai@google.com): Cache the result of the motion search from the
-  // larger bsize.
+    int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get) {
   const int w_mi = mi_size_wide[bsize];
   const int h_mi = mi_size_high[bsize];
-  int f_idx = 0;
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
   assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
          cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
 
   // Setting up motion search
-  const int ref_list[] = { LAST_FRAME, ALTREF_FRAME };
-  const int num_refs = 2;
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+                                                        : LAST_FRAME };
+  const int num_refs = 1;
   const int use_subpixel = 1;
 
-  unsigned int int_features[FEATURE_SIZE_SMS_PRUNE_PART - 1];
-
   // Doing whole block first to update the mv
-  simple_motion_search_get_best_ref(
-      cpi, x, pc_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel,
-      4, &int_features[f_idx], &int_features[f_idx + 1]);
-  f_idx += 2;
+  if (!pc_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) {
+    simple_motion_search_get_best_ref(cpi, x, pc_tree, mi_row, mi_col, bsize,
+                                      ref_list, num_refs, use_subpixel, 1,
+                                      &pc_tree->sms_none_feat[0],
+                                      &pc_tree->sms_none_feat[1]);
+    pc_tree->sms_none_valid = 1;
+  }
 
   // Split subblocks
-  BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-  int r_idx = 0;
-  for (r_idx = 0; r_idx < 4; r_idx++) {
-    const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
-    const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+  if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int r_idx = 0; r_idx < 4; r_idx++) {
+      const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
+      const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+      PC_TREE *sub_tree = pc_tree->split[r_idx];
 
-    simple_motion_search_get_best_ref(
-        cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
-        use_subpixel, r_idx, &int_features[f_idx], &int_features[f_idx + 1]);
-    f_idx += 2;
+      if (!sub_tree->sms_none_valid) {
+        simple_motion_search_get_best_ref(
+            cpi, x, sub_tree, sub_mi_row, sub_mi_col, subsize, ref_list,
+            num_refs, use_subpixel, 1, &sub_tree->sms_none_feat[0],
+            &sub_tree->sms_none_feat[1]);
+        sub_tree->sms_none_valid = 1;
+      }
+    }
   }
 
-  // Horz subblocks
-  subsize = get_partition_subsize(bsize, PARTITION_HORZ);
-  for (r_idx = 0; r_idx < 2; r_idx++) {
-    const int sub_mi_col = mi_col + 0;
-    const int sub_mi_row = mi_row + r_idx * h_mi / 2;
+  // Rectangular subblocks
+  if (!pc_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) {
+    // Horz subblock
+    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+    for (int r_idx = 0; r_idx < 2; r_idx++) {
+      const int sub_mi_col = mi_col + 0;
+      const int sub_mi_row = mi_row + r_idx * h_mi / 2;
 
-    simple_motion_search_get_best_ref(
-        cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
-        use_subpixel, -1, &int_features[f_idx], &int_features[f_idx + 1]);
+      simple_motion_search_get_best_ref(
+          cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+          use_subpixel, 0, &pc_tree->sms_rect_feat[2 * r_idx],
+          &pc_tree->sms_rect_feat[2 * r_idx + 1]);
+    }
 
-    f_idx += 2;
+    // Vert subblock
+    subsize = get_partition_subsize(bsize, PARTITION_VERT);
+    for (int r_idx = 0; r_idx < 2; r_idx++) {
+      const int sub_mi_col = mi_col + r_idx * w_mi / 2;
+      const int sub_mi_row = mi_row + 0;
+
+      simple_motion_search_get_best_ref(
+          cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+          use_subpixel, 0, &pc_tree->sms_rect_feat[4 + 2 * r_idx],
+          &pc_tree->sms_rect_feat[4 + 2 * r_idx + 1]);
+    }
+    pc_tree->sms_rect_valid = 1;
   }
 
-  // Vert subblock
-  subsize = get_partition_subsize(bsize, PARTITION_VERT);
-  for (r_idx = 0; r_idx < 2; r_idx++) {
-    const int sub_mi_col = mi_col + r_idx * w_mi / 2;
-    const int sub_mi_row = mi_row + 0;
-
-    simple_motion_search_get_best_ref(
-        cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
-        use_subpixel, -1, &int_features[f_idx], &int_features[f_idx + 1]);
-
-    f_idx += 2;
-  }
+  if (!features) return;
 
   aom_clear_system_state();
-  for (int idx = 0; idx < f_idx; idx++) {
-    features[idx] = logf(1.0f + (float)int_features[idx]);
+  int f_idx = 0;
+  if (features_to_get & FEATURE_SMS_NONE_FLAG) {
+    for (int sub_idx = 0; sub_idx < 2; sub_idx++) {
+      features[f_idx++] = logf(1.0f + pc_tree->sms_none_feat[sub_idx]);
+    }
+  }
+
+  if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
+    for (int sub_idx = 0; sub_idx < 4; sub_idx++) {
+      PC_TREE *sub_tree = pc_tree->split[sub_idx];
+      features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[0]);
+      features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[1]);
+    }
+  }
+
+  if (features_to_get & FEATURE_SMS_RECT_FLAG) {
+    for (int sub_idx = 0; sub_idx < 8; sub_idx++) {
+      features[f_idx++] = logf(1.0f + pc_tree->sms_rect_feat[sub_idx]);
+    }
   }
 
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -296,73 +492,44 @@
   features[f_idx++] = (float)has_left;
   features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
   features[f_idx++] = (float)mi_size_high_log2[left_bsize];
-
-  assert(f_idx == FEATURE_SIZE_SMS_PRUNE_PART);
 }
 
-void av1_simple_motion_search_prune_part(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
-    int *partition_horz_allowed, int *partition_vert_allowed,
-    int *do_square_split, int *do_rectangular_split, int *prune_horz,
-    int *prune_vert, float *features, int *valid) {
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         PC_TREE *pc_tree, int mi_row,
+                                         int mi_col, BLOCK_SIZE bsize,
+                                         int *partition_horz_allowed,
+                                         int *partition_vert_allowed,
+                                         int *prune_horz, int *prune_vert) {
+  aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
-  // Get model parameters
-  const NN_CONFIG *nn_config = NULL;
-  const float *prune_thresh = NULL, *only_thresh = NULL;
-  const float *ml_mean = NULL, *ml_std = NULL;
-  float normalized_features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for lowres, 1 for 480p, 2 for 720p+
+  const int res_idx = is_480p_or_larger + is_720p_or_larger;
 
-  if (bsize == BLOCK_128X128) {
-    nn_config = &av1_simple_motion_search_prune_part_nn_config_128;
-    ml_mean = av1_simple_motion_search_prune_part_mean_128;
-    ml_std = av1_simple_motion_search_prune_part_std_128;
-    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_128;
-    only_thresh = av1_simple_motion_search_prune_part_only_thresh_128;
-  } else if (bsize == BLOCK_64X64) {
-    nn_config = &av1_simple_motion_search_prune_part_nn_config_64;
-    ml_mean = av1_simple_motion_search_prune_part_mean_64;
-    ml_std = av1_simple_motion_search_prune_part_std_64;
-    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_64;
-    only_thresh = av1_simple_motion_search_prune_part_only_thresh_64;
-  } else if (bsize == BLOCK_32X32) {
-    nn_config = &av1_simple_motion_search_prune_part_nn_config_32;
-    ml_mean = av1_simple_motion_search_prune_part_mean_32;
-    ml_std = av1_simple_motion_search_prune_part_std_32;
-    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_32;
-    only_thresh = av1_simple_motion_search_prune_part_only_thresh_32;
-  } else if (bsize == BLOCK_16X16) {
-    nn_config = &av1_simple_motion_search_prune_part_nn_config_16;
-    ml_mean = av1_simple_motion_search_prune_part_mean_16;
-    ml_std = av1_simple_motion_search_prune_part_std_16;
-    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_16;
-    only_thresh = av1_simple_motion_search_prune_part_only_thresh_16;
-  } else if (bsize == BLOCK_8X8) {
-    nn_config = &av1_simple_motion_search_prune_part_nn_config_8;
-    ml_mean = av1_simple_motion_search_prune_part_mean_8;
-    ml_std = av1_simple_motion_search_prune_part_std_8;
-    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_8;
-    only_thresh = av1_simple_motion_search_prune_part_only_thresh_8;
-  } else {
-    assert(0 && "Unexpected block size in simple_motion_prune_part");
-  }
+  // Get model parameters
+  const NN_CONFIG *nn_config =
+      av1_simple_motion_search_prune_rect_nn_config[bsize_idx];
+  const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx],
+              *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx];
+
+  const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+  const float prune_thresh =
+      av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
 
   // If there is no valid threshold, return immediately.
-  if (!nn_config || (prune_thresh[PARTITION_HORZ] == 0.0f &&
-                     prune_thresh[PARTITION_VERT] == 0.0f)) {
-    return;
-  }
-  if (bsize < BLOCK_8X8) {
+  if (!nn_config || prune_thresh == 0.0f) {
     return;
   }
 
   // Get features
+  float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
   simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
-                                           bsize, features);
-  *valid = 1;
+                                           bsize, features,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
   for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
-    normalized_features[f_idx] =
-        (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+    features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
   }
 
   // Get probabilities
@@ -372,24 +539,19 @@
                               ? PARTITION_TYPES
                               : EXT_PARTITION_TYPES;
 
-  av1_nn_predict(normalized_features, nn_config, scores);
+  av1_nn_predict(features, nn_config, 1, scores);
   aom_clear_system_state();
 
   av1_nn_softmax(scores, probs, num_classes);
 
   // Determine if we should prune rectangular partitions.
-  if (cpi->sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) &&
+  if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) &&
       (*partition_horz_allowed || *partition_vert_allowed) &&
       bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
-    *prune_horz = probs[PARTITION_HORZ] <= prune_thresh[PARTITION_HORZ];
-    *prune_vert = probs[PARTITION_VERT] <= prune_thresh[PARTITION_VERT];
+    *prune_horz = probs[PARTITION_HORZ] <= prune_thresh;
+    *prune_vert = probs[PARTITION_VERT] <= prune_thresh;
   }
-
-  // Silence compiler warnings
-  (void)only_thresh;
-  (void)partition_none_allowed;
-  (void)do_square_split;
-  (void)do_rectangular_split;
 }
 
 // Early terminates PARTITION_NONE using simple_motion_search features and the
@@ -398,24 +560,23 @@
 //  - The frame is not intra only
 //  - The current bsize is > BLOCK_8X8
 //  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
-void av1_simple_motion_search_early_term_none(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
-    int *early_terminate, float *simple_motion_features,
-    int *simple_motion_features_are_valid) {
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+                                              MACROBLOCK *x, PC_TREE *pc_tree,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize,
+                                              const RD_STATS *none_rdc,
+                                              int *early_terminate) {
   // TODO(chiyotsai@google.com): There are other features we can extract from
   // PARTITION_NONE. Play with this later.
-  int f_idx = 0;
-  if (!*simple_motion_features_are_valid) {
-    simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
-                                             bsize, simple_motion_features);
-    *simple_motion_features_are_valid = 1;
-  }
-  f_idx = 25;
+  float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
+  int f_idx = FEATURE_SIZE_SMS_PRUNE_PART;
 
-  simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->rate);
-  simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->dist);
-  simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost);
+  features[f_idx++] = logf(1.0f + (float)none_rdc->rate);
+  features[f_idx++] = logf(1.0f + (float)none_rdc->dist);
+  features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost);
 
   assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE);
 
@@ -446,8 +607,8 @@
   if (ml_model) {
     float score = 0.0f;
     for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) {
-      score += ml_model[f_idx] *
-               (simple_motion_features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+      score +=
+          ml_model[f_idx] * (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
     }
     score += ml_model[FEATURE_SIZE_SMS_TERM_NONE];
 
@@ -457,122 +618,6 @@
   }
 }
 
-static void firstpass_simple_motion_search_features(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, float *features) {
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-  assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
-         cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
-
-  // Setting up motion search
-  const int ref_list[] = { LAST_FRAME, ALTREF_FRAME };
-  const int num_refs = 2;
-  const int use_subpixel = 0;
-
-  unsigned int int_features[10] = { 0 };
-
-  int f_idx = 0;
-  // Doing whole block first to update the mv
-  simple_motion_search_get_best_ref(
-      cpi, x, pc_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel,
-      4, &int_features[f_idx], &int_features[f_idx + 1]);
-  f_idx += 2;
-
-  // Split subblocks
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-  const int w_mi = mi_size_wide[bsize];
-  const int h_mi = mi_size_high[bsize];
-  for (int r_idx = 0; r_idx < 4; r_idx++) {
-    const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
-    const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
-
-    simple_motion_search_get_best_ref(
-        cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
-        use_subpixel, r_idx, &int_features[f_idx], &int_features[f_idx + 1]);
-    f_idx += 2;
-  }
-
-  aom_clear_system_state();
-  for (int idx = 0; idx < f_idx; idx++) {
-    features[idx] = logf(1.0f + (float)int_features[idx]);
-  }
-
-  const MACROBLOCKD *xd = &x->e_mbd;
-  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
-
-  // Q_INDEX
-  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
-  features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
-
-  // Neighbor stuff
-  const int has_above = !!xd->above_mbmi;
-  const int has_left = !!xd->left_mbmi;
-  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize;
-  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->sb_type : bsize;
-  features[f_idx++] = (float)has_above;
-  features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
-  features[f_idx++] = (float)mi_size_high_log2[above_bsize];
-  features[f_idx++] = (float)has_left;
-  features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
-  features[f_idx++] = (float)mi_size_high_log2[left_bsize];
-}
-
-void av1_firstpass_simple_motion_search_early_term(AV1_COMP *const cpi,
-                                                   MACROBLOCK *x,
-                                                   PC_TREE *pc_tree, int mi_row,
-                                                   int mi_col, BLOCK_SIZE bsize,
-                                                   const RD_STATS *none_rdc,
-                                                   int *do_square_split) {
-  const NN_CONFIG *nn_config = NULL;
-  float thresh = 0.0f;
-  const float *ml_mean = NULL, *ml_std = NULL;
-  if (bsize == BLOCK_32X32) {
-    nn_config = &av1_fp_simple_motion_search_term_none_nn_config_32;
-    ml_mean = av1_fp_simple_motion_search_term_none_mean_32;
-    ml_std = av1_fp_simple_motion_search_term_none_std_32;
-    thresh = av1_fp_simple_motion_search_term_none_thresh_32;
-  } else if (bsize == BLOCK_16X16) {
-    nn_config = &av1_fp_simple_motion_search_term_none_nn_config_16;
-    ml_mean = av1_fp_simple_motion_search_term_none_mean_16;
-    ml_std = av1_fp_simple_motion_search_term_none_std_16;
-    thresh = av1_fp_simple_motion_search_term_none_thresh_16;
-  } else if (bsize == BLOCK_8X8) {
-    nn_config = &av1_fp_simple_motion_search_term_none_nn_config_8;
-    ml_mean = av1_fp_simple_motion_search_term_none_mean_8;
-    ml_std = av1_fp_simple_motion_search_term_none_std_8;
-    thresh = av1_fp_simple_motion_search_term_none_thresh_8;
-  } else {
-    assert(0 &&
-           "Unexpected bsize in firstpass_simple_motion_search_early_term");
-    return;
-  }
-
-  float ml_features[FEATURE_SIZE_FP_SMS_TERM_NONE] = { 0.0f };
-
-  firstpass_simple_motion_search_features(cpi, x, pc_tree, mi_row, mi_col,
-                                          bsize, ml_features);
-  int f_idx = 17;
-
-  ml_features[f_idx++] = logf(1.0f + (float)none_rdc->rate);
-  ml_features[f_idx++] = logf(1.0f + (float)none_rdc->dist);
-  ml_features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost);
-
-  for (f_idx = 0; f_idx < 20; f_idx++) {
-    ml_features[f_idx] = (ml_features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
-  }
-
-  // Get probabilities
-  float score = 0.0f;
-
-  av1_nn_predict(ml_features, nn_config, &score);
-  aom_clear_system_state();
-
-  // Determine if we should prune square partitions.
-  if (score < thresh) {
-    *do_square_split = 0;
-  }
-}
-
 void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
                                         int mi_row, int mi_col,
                                         float *features) {
@@ -616,14 +661,13 @@
       const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2);
       unsigned int sse = 0;
       unsigned int var = 0;
-      const MV ref_mv_full = { .row = 0, .col = 0 };
-
-      av1_simple_motion_sse_var(cpi, x, this_mi_row, this_mi_col, mb_size,
-                                ref_mv_full, 0, &sse, &var);
+      const FULLPEL_MV start_mv = kZeroFullMv;
+      int_mv best_mv = av1_simple_motion_sse_var(
+          cpi, x, this_mi_row, this_mi_col, mb_size, start_mv, 0, &sse, &var);
 
       aom_clear_system_state();
-      const float mv_row = (float)(x->best_mv.as_mv.row / 8);
-      const float mv_col = (float)(x->best_mv.as_mv.col / 8);
+      const float mv_row = (float)(best_mv.as_mv.row / 8);
+      const float mv_col = (float)(best_mv.as_mv.col / 8);
       const float log_sse = logf(1.0f + (float)sse);
       const float abs_mv_row = fabsf(mv_row);
       const float abs_mv_col = fabsf(mv_col);
@@ -676,14 +720,16 @@
         probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
   const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config;
 
-  assert(cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE);
+  assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+         NOT_IN_USE);
 
   aom_clear_system_state();
-  av1_nn_predict(features, nn_config, scores);
+  av1_nn_predict(features, nn_config, 1, scores);
   av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
 
   int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
-  if (cpi->sf.auto_max_partition_based_on_simple_motion == DIRECT_PRED) {
+  if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+      DIRECT_PRED) {
     result = 0;
     float max_prob = probs[0];
     for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) {
@@ -692,7 +738,7 @@
         result = i;
       }
     }
-  } else if (cpi->sf.auto_max_partition_based_on_simple_motion ==
+  } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
              RELAXED_PRED) {
     for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
          --result) {
@@ -701,7 +747,8 @@
       }
       if (probs[result] > 0.2) break;
     }
-  } else if (cpi->sf.auto_max_partition_based_on_simple_motion == ADAPT_PRED) {
+  } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+             ADAPT_PRED) {
     const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
     MACROBLOCKD *const xd = &x->e_mbd;
     // TODO(debargha): x->source_variance is unavailable at this point,
@@ -725,3 +772,517 @@
 
   return (BLOCK_SIZE)((result + 2) * 3);
 }
+
+// Get the minimum partition block width and height(in log scale) under a
+// PC_TREE.
+static AOM_INLINE void get_min_bsize(const PC_TREE *pc_tree, int *min_bw,
+                                     int *min_bh) {
+  if (!pc_tree) return;
+
+  const BLOCK_SIZE bsize = pc_tree->block_size;
+  if (bsize == BLOCK_4X4) {
+    *min_bw = 0;
+    *min_bh = 0;
+    return;
+  }
+
+  PARTITION_TYPE part_type = pc_tree->partitioning;
+  if (part_type == PARTITION_INVALID) return;
+
+  if (part_type == PARTITION_SPLIT) {
+    for (int i = 0; i < 4; ++i) {
+      get_min_bsize(pc_tree->split[i], min_bw, min_bh);
+    }
+  } else {
+    if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B ||
+        part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B)
+      part_type = PARTITION_SPLIT;
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type);
+    if (subsize != BLOCK_INVALID) {
+      *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]);
+      *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]);
+    }
+  }
+}
+
+static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features,
+                                  int *feature_idx) {
+  const int rd_valid = rd > 0 && rd < INT64_MAX;
+  const float rd_ratio = rd_valid ? (float)rd / best_rd : 1.0f;
+  features[(*feature_idx)++] = (float)rd_valid;
+  features[(*feature_idx)++] = rd_ratio;
+}
+
+#define FEATURES 31
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                   PC_TREE *const pc_tree, BLOCK_SIZE bsize,
+                                   int64_t best_rd, int64_t part_none_rd,
+                                   int64_t part_split_rd,
+                                   int64_t *split_block_rd, int mi_row,
+                                   int mi_col,
+                                   int *const terminate_partition_search) {
+  if (best_rd <= 0 || best_rd == INT64_MAX || *terminate_partition_search)
+    return;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  const NN_CONFIG *nn_config = NULL;
+  float thresh = -1e6;
+  switch (bsize) {
+    case BLOCK_128X128: break;
+    case BLOCK_64X64:
+      nn_config = &av1_early_term_after_split_nnconfig_64;
+      thresh = is_480p_or_larger ? -2.0f : -1.2f;
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_early_term_after_split_nnconfig_32;
+      thresh = is_480p_or_larger ? -2.6f : -2.3f;
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_early_term_after_split_nnconfig_16;
+      thresh = is_480p_or_larger ? -2.0f : -2.4f;
+      break;
+    case BLOCK_8X8:
+      nn_config = &av1_early_term_after_split_nnconfig_8;
+      thresh = is_480p_or_larger ? -1.0f : -1.4f;
+      break;
+    case BLOCK_4X4: break;
+    default:
+      assert(0 && "Invalid block size in av1_ml_early_term_after_split().");
+      break;
+  }
+  if (!nn_config) return;
+
+  // Use more conservative threshold for level 1.
+  if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  const int bs = block_size_wide[bsize];
+  int f_idx = 0;
+  float features[FEATURES] = { 0.0f };
+
+  aom_clear_system_state();
+
+  features[f_idx++] = logf(1.0f + (float)dc_q / 4.0f);
+  features[f_idx++] = logf(1.0f + (float)best_rd / bs / bs / 1024.0f);
+
+  add_rd_feature(part_none_rd, best_rd, features, &f_idx);
+  add_rd_feature(part_split_rd, best_rd, features, &f_idx);
+
+  for (int i = 0; i < 4; ++i) {
+    add_rd_feature(split_block_rd[i], best_rd, features, &f_idx);
+    int min_bw = MAX_SB_SIZE_LOG2;
+    int min_bh = MAX_SB_SIZE_LOG2;
+    get_min_bsize(pc_tree->split[i], &min_bw, &min_bh);
+    features[f_idx++] = (float)min_bw;
+    features[f_idx++] = (float)min_bh;
+  }
+
+  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                           bsize, NULL,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
+
+  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_none_feat[1]);
+
+  features[f_idx++] = logf(1.0f + (float)pc_tree->split[0]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)pc_tree->split[1]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)pc_tree->split[2]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)pc_tree->split[3]->sms_none_feat[1]);
+
+  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[3]);
+  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[5]);
+  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[7]);
+
+  assert(f_idx == FEATURES);
+
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, 1, &score);
+  // Score is indicator of confidence that we should NOT terminate.
+  if (score < thresh) *terminate_partition_search = 1;
+}
+#undef FEATURES
+
+void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
+                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                 int64_t best_rd, int64_t none_rd,
+                                 int64_t *split_rd, int *const dst_prune_horz,
+                                 int *const dst_prune_vert) {
+  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+  best_rd = AOMMAX(best_rd, 1);
+  const NN_CONFIG *nn_config = NULL;
+  const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f };
+  float cur_thresh = 0.0f;
+  switch (bsize) {
+    case BLOCK_8X8:
+      nn_config = &av1_rect_partition_nnconfig_8;
+      cur_thresh = prob_thresholds[0];
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_rect_partition_nnconfig_16;
+      cur_thresh = prob_thresholds[1];
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_rect_partition_nnconfig_32;
+      cur_thresh = prob_thresholds[2];
+      break;
+    case BLOCK_64X64:
+      nn_config = &av1_rect_partition_nnconfig_64;
+      cur_thresh = prob_thresholds[3];
+      break;
+    case BLOCK_128X128:
+      nn_config = &av1_rect_partition_nnconfig_128;
+      cur_thresh = prob_thresholds[4];
+      break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config) return;
+  aom_clear_system_state();
+
+  // 1. Compute input features
+  float features[9];
+
+  // RD cost ratios
+  for (int i = 0; i < 5; i++) features[i] = 1.0f;
+  if (none_rd > 0 && none_rd < 1000000000)
+    features[0] = (float)none_rd / (float)best_rd;
+  for (int i = 0; i < 4; i++) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      features[1 + i] = (float)split_rd[i] / (float)best_rd;
+  }
+
+  // Variance ratios
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  int whole_block_variance;
+  if (is_cur_buf_hbd(xd)) {
+    whole_block_variance = av1_high_get_sby_perpixel_variance(
+        cpi, &x->plane[0].src, bsize, xd->bd);
+  } else {
+    whole_block_variance =
+        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+  whole_block_variance = AOMMAX(whole_block_variance, 1);
+
+  int split_variance[4];
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  struct buf_2d buf;
+  buf.stride = x->plane[0].src.stride;
+  const int bw = block_size_wide[bsize];
+  for (int i = 0; i < 4; ++i) {
+    const int x_idx = (i & 1) * bw / 2;
+    const int y_idx = (i >> 1) * bw / 2;
+    buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
+    if (is_cur_buf_hbd(xd)) {
+      split_variance[i] =
+          av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
+    } else {
+      split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize);
+    }
+  }
+
+  for (int i = 0; i < 4; i++)
+    features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
+
+  // 2. Do the prediction and prune 0-2 partitions based on their probabilities
+  float raw_scores[3] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, raw_scores);
+  aom_clear_system_state();
+  float probs[3] = { 0.0f };
+  av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability of the fact that both rectangular partitions
+  // are worse than current best_rd
+  if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1;
+  if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1;
+}
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
+                               int64_t best_rd, int64_t horz_rd[2],
+                               int64_t vert_rd[2], int64_t split_rd[4],
+                               int *const horza_partition_allowed,
+                               int *const horzb_partition_allowed,
+                               int *const verta_partition_allowed,
+                               int *const vertb_partition_allowed) {
+  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+  const NN_CONFIG *nn_config = NULL;
+  switch (bsize) {
+    case BLOCK_8X8: nn_config = NULL; break;
+    case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break;
+    case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break;
+    case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break;
+    case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config) return;
+
+  aom_clear_system_state();
+
+  // Generate features.
+  float features[10];
+  int feature_index = 0;
+  features[feature_index++] = (float)part_ctx;
+  features[feature_index++] = (float)var_ctx;
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < 2; ++i) {
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 4; ++i) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features[feature_index++] = rd_ratio;
+  }
+  assert(feature_index == 10);
+
+  // Calculate scores using the NN model.
+  float score[16] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, score);
+  aom_clear_system_state();
+  int int_score[16];
+  int max_score = -1000;
+  for (int i = 0; i < 16; ++i) {
+    int_score[i] = (int)(100 * score[i]);
+    max_score = AOMMAX(int_score[i], max_score);
+  }
+
+  // Make decisions based on the model scores.
+  int thresh = max_score;
+  switch (bsize) {
+    case BLOCK_16X16: thresh -= 150; break;
+    case BLOCK_32X32: thresh -= 100; break;
+    default: break;
+  }
+  *horza_partition_allowed = 0;
+  *horzb_partition_allowed = 0;
+  *verta_partition_allowed = 0;
+  *vertb_partition_allowed = 0;
+  for (int i = 0; i < 16; ++i) {
+    if (int_score[i] >= thresh) {
+      if ((i >> 0) & 1) *horza_partition_allowed = 1;
+      if ((i >> 1) & 1) *horzb_partition_allowed = 1;
+      if ((i >> 2) & 1) *verta_partition_allowed = 1;
+      if ((i >> 3) & 1) *vertb_partition_allowed = 1;
+    }
+  }
+}
+
+#define FEATURES 18
+#define LABELS 4
+// Use a ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE bsize, int part_ctx, int64_t best_rd,
+                              int64_t horz_rd[2], int64_t vert_rd[2],
+                              int64_t split_rd[4],
+                              int *const partition_horz4_allowed,
+                              int *const partition_vert4_allowed,
+                              unsigned int pb_source_variance, int mi_row,
+                              int mi_col) {
+  if (best_rd >= 1000000000) return;
+  const NN_CONFIG *nn_config = NULL;
+  switch (bsize) {
+    case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
+    case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
+    case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config) return;
+
+  aom_clear_system_state();
+
+  // Generate features.
+  float features[FEATURES];
+  int feature_index = 0;
+  features[feature_index++] = (float)part_ctx;
+  features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
+
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < 2; ++i) {
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 4; ++i) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features[feature_index++] = rd_ratio;
+  }
+
+  // Get variance of the 1:4 and 4:1 sub-blocks.
+  unsigned int horz_4_source_var[4] = { 0 };
+  unsigned int vert_4_source_var[4] = { 0 };
+  {
+    BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+                         av1_num_planes(&cpi->common), bsize);
+    const int src_stride = x->plane[0].src.stride;
+    uint8_t *src = x->plane[0].src.buf;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+
+    struct buf_2d horz_4_src, vert_4_src;
+    horz_4_src.stride = src_stride;
+    vert_4_src.stride = src_stride;
+
+    for (int i = 0; i < 4; ++i) {
+      horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+      vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+      if (is_cur_buf_hbd(xd)) {
+        horz_4_source_var[i] = av1_high_get_sby_perpixel_variance(
+            cpi, &horz_4_src, horz_4_bs, xd->bd);
+        vert_4_source_var[i] = av1_high_get_sby_perpixel_variance(
+            cpi, &vert_4_src, vert_4_bs, xd->bd);
+      } else {
+        horz_4_source_var[i] =
+            av1_get_sby_perpixel_variance(cpi, &horz_4_src, horz_4_bs);
+        vert_4_source_var[i] =
+            av1_get_sby_perpixel_variance(cpi, &vert_4_src, vert_4_bs);
+      }
+    }
+  }
+
+  const float denom = (float)(pb_source_variance + 1);
+  const float low_b = 0.1f;
+  const float high_b = 10.0f;
+  for (int i = 0; i < 4; ++i) {
+    // Ratio between the 4:1 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features[feature_index++] = var_ratio;
+  }
+  for (int i = 0; i < 4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features[feature_index++] = var_ratio;
+  }
+  assert(feature_index == FEATURES);
+
+  // Calculate scores using the NN model.
+  float score[LABELS] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, score);
+  aom_clear_system_state();
+  int int_score[LABELS];
+  int max_score = -1000;
+  for (int i = 0; i < LABELS; ++i) {
+    int_score[i] = (int)(100 * score[i]);
+    max_score = AOMMAX(int_score[i], max_score);
+  }
+
+  // Make decisions based on the model scores.
+  int thresh = max_score;
+  switch (bsize) {
+    case BLOCK_16X16: thresh -= 500; break;
+    case BLOCK_32X32: thresh -= 500; break;
+    case BLOCK_64X64: thresh -= 200; break;
+    default: break;
+  }
+  *partition_horz4_allowed = 0;
+  *partition_vert4_allowed = 0;
+  for (int i = 0; i < LABELS; ++i) {
+    if (int_score[i] >= thresh) {
+      if ((i >> 0) & 1) *partition_horz4_allowed = 1;
+      if ((i >> 1) & 1) *partition_vert4_allowed = 1;
+    }
+  }
+}
+#undef FEATURES
+#undef LABELS
+
+#define FEATURES 4
+int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                            const MACROBLOCK *const x,
+                            const RD_STATS *const rd_stats,
+                            unsigned int pb_source_variance) {
+  const NN_CONFIG *nn_config = NULL;
+  int thresh = 0;
+  switch (bsize) {
+    case BLOCK_8X8:
+      nn_config = &av1_partition_breakout_nnconfig_8;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0];
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_partition_breakout_nnconfig_16;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1];
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_partition_breakout_nnconfig_32;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2];
+      break;
+    case BLOCK_64X64:
+      nn_config = &av1_partition_breakout_nnconfig_64;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3];
+      break;
+    case BLOCK_128X128:
+      nn_config = &av1_partition_breakout_nnconfig_128;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4];
+      break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config || thresh < 0) return 0;
+
+  // Generate feature values.
+  float features[FEATURES];
+  int feature_index = 0;
+  aom_clear_system_state();
+
+  const int num_pels_log2 = num_pels_log2_lookup[bsize];
+  float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+  rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+           rate_f;
+  features[feature_index++] = rate_f;
+
+  const float dist_f =
+      (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+  features[feature_index++] = dist_f;
+
+  features[feature_index++] = (float)pb_source_variance;
+
+  const int dc_q = (int)x->plane[0].dequant_QTX[0];
+  features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
+  assert(feature_index == FEATURES);
+
+  // Calculate score using the NN model.
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, 1, &score);
+  aom_clear_system_state();
+
+  // Make decision.
+  return (int)(score * 100) >= thresh;
+}
+#undef FEATURES
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/libaom/av1/encoder/partition_strategy.h b/libaom/av1/encoder/partition_strategy.h
index 36b1e95..f9b4d8b 100644
--- a/libaom/av1/encoder/partition_strategy.h
+++ b/libaom/av1/encoder/partition_strategy.h
@@ -16,54 +16,66 @@
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encoder.h"
 
+#define FEATURE_SIZE_SMS_SPLIT_FAST 6
+#define FEATURE_SIZE_SMS_SPLIT 17
 #define FEATURE_SIZE_SMS_PRUNE_PART 25
 #define FEATURE_SIZE_SMS_TERM_NONE 28
 #define FEATURE_SIZE_FP_SMS_TERM_NONE 20
 #define FEATURE_SIZE_MAX_MIN_PART_PRED 13
 #define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
 
+#define FEATURE_SMS_NONE_FLAG 1
+#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
+#define FEATURE_SMS_RECT_FLAG (1 << 2)
+
+#define FEATURE_SMS_PRUNE_PART_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
+#define FEATURE_SMS_SPLIT_MODEL_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
+
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int bsize, int label_idx,
+                                  int *partition_none_allowed,
+                                  int *partition_horz_allowed,
+                                  int *partition_vert_allowed,
+                                  int *do_rectangular_split,
+                                  int *do_square_split);
+
 // Performs a simple_motion_search with a single reference frame and extract
 // the variance of residues. Then use the features to determine whether we want
 // to go straight to splitting without trying PARTITION_NONE
 void av1_simple_motion_search_based_split(
-    AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col,
-    BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed,
-    int *partition_vert_allowed, int *do_rectangular_split,
-    int *do_square_split);
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+    int *partition_horz_allowed, int *partition_vert_allowed,
+    int *do_rectangular_split, int *do_square_split);
 
 // Performs a simple_motion_search with two reference frames and extract
 // the variance of residues. Then use the features to determine whether we want
 // to prune some partitions.
-void av1_simple_motion_search_prune_part(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
-    int *partition_horz_allowed, int *partition_vert_allowed,
-    int *do_square_split, int *do_rectangular_split, int *prune_horz,
-    int *prune_vert, float *features, int *valid);
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         PC_TREE *pc_tree, int mi_row,
+                                         int mi_col, BLOCK_SIZE bsize,
+                                         int *partition_horz_allowed,
+                                         int *partition_vert_allowed,
+                                         int *prune_horz, int *prune_vert);
 
+#if !CONFIG_REALTIME_ONLY
 // Early terminates PARTITION_NONE using simple_motion_search features and the
 // rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
 //  - The frame is a show frame
 //  - The frame is not intra only
 //  - The current bsize is > BLOCK_8X8
 //  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
-void av1_simple_motion_search_early_term_none(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
-    int *early_terminate, float *simple_motion_features,
-    int *simple_motion_features_are_valid);
-
-// Early terminates after PARTITION_NONE in firstpass of two pass partition
-// search.
-void av1_firstpass_simple_motion_search_early_term(AV1_COMP *const cpi,
-                                                   MACROBLOCK *x,
-                                                   PC_TREE *pc_tree, int mi_row,
-                                                   int mi_col, BLOCK_SIZE bsize,
-                                                   const RD_STATS *none_rdc,
-                                                   int *do_square_split);
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+                                              MACROBLOCK *x, PC_TREE *pc_tree,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize,
+                                              const RD_STATS *none_rdc,
+                                              int *early_terminate);
 
 // Get the features for selecting the max and min partition size. Currently this
-// performs simple_motion_search on 16X16 subblocks of the currnet superblock,
+// performs simple_motion_search on 16X16 subblocks of the current superblock,
 // and then extract the statistics of sse and motion vectors as features.
 void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
                                         int mi_row, int mi_col,
@@ -73,6 +85,54 @@
 BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
                                      const float *features);
 
+// Attempts an early termination after PARTITION_SPLIT.
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                   PC_TREE *const pc_tree, BLOCK_SIZE bsize,
+                                   int64_t best_rd, int64_t part_none_rd,
+                                   int64_t part_split_rd,
+                                   int64_t *split_block_rd, int mi_row,
+                                   int mi_col,
+                                   int *const terminate_partition_search);
+
+// Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
+// PARTITION_VERT.
+// TODO(chiyotsai@google.com): Currently this model does not use q value and has
+// no information about rectangular partitions. Preliminary experiments suggest
+// that we can get better performance by adding in q_index and rectangular
+// sse/var from SMS. We should retrain and tune this model later.
+void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
+                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                 int64_t best_rd, int64_t none_rd,
+                                 int64_t *split_rd, int *const dst_prune_horz,
+                                 int *const dst_prune_vert);
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
+                               int64_t best_rd, int64_t horz_rd[2],
+                               int64_t vert_rd[2], int64_t split_rd[4],
+                               int *const horza_partition_allowed,
+                               int *const horzb_partition_allowed,
+                               int *const verta_partition_allowed,
+                               int *const vertb_partition_allowed);
+
+// Use a ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE bsize, int part_ctx, int64_t best_rd,
+                              int64_t horz_rd[2], int64_t vert_rd[2],
+                              int64_t split_rd[4],
+                              int *const partition_horz4_allowed,
+                              int *const partition_vert4_allowed,
+                              unsigned int pb_source_variance, int mi_row,
+                              int mi_col);
+
+// ML-based partition search breakout after PARTITION_NONE
+int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                            const MACROBLOCK *const x,
+                            const RD_STATS *const rd_stats,
+                            unsigned int pb_source_variance);
+#endif  // !CONFIG_REALTIME_ONLY
+
 // A simplified version of set_offsets meant to be used for
 // simple_motion_search.
 static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
@@ -80,12 +140,14 @@
                                                  int mi_row, int mi_col,
                                                  BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
 
-  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
 
   // Set up destination pointers.
   av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
@@ -93,33 +155,35 @@
 
   // Set up limit values for MV components.
   // Mv beyond the range do not produce new/different prediction block.
-  x->mv_limits.row_min =
-      -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
-  x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
-  x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
-  x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+  av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+                    mi_width, cpi->oxcf.border_in_pixels);
 
   set_plane_n4(xd, mi_width, mi_height, num_planes);
 
+  xd->mi_row = mi_row;
+  xd->mi_col = mi_col;
+
   // Set up distance of MB to edge of frame in 1/8th pel units.
   assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = ((cm->mi_rows - mi_height - mi_row) * MI_SIZE) * 8;
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ((cm->mi_cols - mi_width - mi_col) * MI_SIZE) * 8;
+  xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+  xd->mb_to_bottom_edge =
+      GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+  xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+  xd->mb_to_right_edge =
+      GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE);
 
   // Set up source buffers.
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-
-  // R/D setup.
-  x->rdmult = cpi->rd.RDMULT;
 }
 
 static INLINE void init_simple_motion_search_mvs(PC_TREE *pc_tree) {
-  for (int idx = 0; idx < REF_FRAMES; idx++) {
-    pc_tree->mv_ref_fulls[idx].row = 0;
-    pc_tree->mv_ref_fulls[idx].col = 0;
-  }
+  av1_zero(pc_tree->start_mvs);
+
+  av1_zero(pc_tree->sms_none_feat);
+  av1_zero(pc_tree->sms_rect_feat);
+  av1_zero(pc_tree->sms_none_valid);
+  av1_zero(pc_tree->sms_rect_valid);
+
   if (pc_tree->block_size >= BLOCK_8X8) {
     init_simple_motion_search_mvs(pc_tree->split[0]);
     init_simple_motion_search_mvs(pc_tree->split[1]);
@@ -128,27 +192,31 @@
   }
 }
 
-static INLINE int is_full_sb(AV1_COMMON *const cm, int mi_row, int mi_col,
-                             BLOCK_SIZE sb_size) {
+static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, BLOCK_SIZE sb_size) {
   const int sb_mi_wide = mi_size_wide[sb_size];
   const int sb_mi_high = mi_size_high[sb_size];
 
-  return (mi_row + sb_mi_high) <= cm->mi_rows &&
-         (mi_col + sb_mi_wide) <= cm->mi_cols;
+  return (mi_row + sb_mi_high) <= mi_params->mi_rows &&
+         (mi_col + sb_mi_wide) <= mi_params->mi_cols;
 }
 
+// Do not use this criterion for screen content videos:
+// screen content videos can often find good predictors, so the largest
+// block size is likely to be used.
 static INLINE int use_auto_max_partition(AV1_COMP *const cpi,
                                          BLOCK_SIZE sb_size, int mi_row,
                                          int mi_col) {
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
   AV1_COMMON *const cm = &cpi->common;
-
-  return !frame_is_intra_only(cm) &&
-         cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE &&
-         sb_size == BLOCK_128X128 && is_full_sb(cm, mi_row, mi_col, sb_size) &&
-         cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index] !=
-             OVERLAY_UPDATE &&
-         cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index] !=
-             INTNL_OVERLAY_UPDATE;
+  return !frame_is_intra_only(cm) && !cpi->is_screen_content_type &&
+         cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+             NOT_IN_USE &&
+         sb_size == BLOCK_128X128 &&
+         is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) &&
+         cpi->gf_group.update_type[cpi->gf_group.index] != OVERLAY_UPDATE &&
+         cpi->gf_group.update_type[cpi->gf_group.index] != INTNL_OVERLAY_UPDATE;
 }
 
 #endif  // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/libaom/av1/encoder/pass2_strategy.c b/libaom/av1/encoder/pass2_strategy.c
index ac22b68..6adc1fb 100644
--- a/libaom/av1/encoder/pass2_strategy.c
+++ b/libaom/av1/encoder/pass2_strategy.c
@@ -19,34 +19,46 @@
 
 #include "aom_ports/system_state.h"
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/use_flat_gop_model_params.h"
+#include "av1/encoder/encode_strategy.h"
+
+#define DEFAULT_KF_BOOST 2300
+#define DEFAULT_GF_BOOST 2000
+#define GROUP_ADAPTIVE_MAXQ 1
+static void init_gf_stats(GF_GROUP_STATS *gf_stats);
 
 // Calculate an active area of the image that discounts formatting
 // bars and partially discounts other 0 energy areas.
 #define MIN_ACTIVE_AREA 0.5
 #define MAX_ACTIVE_AREA 1.0
-double calculate_active_area(const AV1_COMP *cpi,
-                             const FIRSTPASS_STATS *this_frame) {
-  double active_pct;
-
-  active_pct =
+static double calculate_active_area(const FRAME_INFO *frame_info,
+                                    const FIRSTPASS_STATS *this_frame) {
+  const double active_pct =
       1.0 -
       ((this_frame->intra_skip_pct / 2) +
-       ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+       ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows));
   return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
 }
 
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
 #define ACT_AREA_CORRECTION 0.5
-double calculate_modified_err(const AV1_COMP *cpi, const TWO_PASS *twopass,
-                              const AV1EncoderConfig *oxcf,
-                              const FIRSTPASS_STATS *this_frame) {
-  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+static double calculate_modified_err(const FRAME_INFO *frame_info,
+                                     const TWO_PASS *twopass,
+                                     const AV1EncoderConfig *oxcf,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *const stats = twopass->stats_buf_ctx->total_stats;
+  if (stats == NULL) {
+    return 0;
+  }
   const double av_weight = stats->weight / stats->count;
   const double av_err = (stats->coded_error * av_weight) / stats->count;
   double modified_error =
@@ -60,7 +72,7 @@
   // 0.5N blocks of complexity 2X is a little easier than coding N
   // blocks of complexity X.
   modified_error *=
-      pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+      pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION);
 
   return fclamp(modified_error, twopass->modified_error_min,
                 twopass->modified_error_max);
@@ -73,17 +85,29 @@
 }
 
 static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
-  if (p->stats_in >= p->stats_in_end) return EOF;
+  if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
 
   *fps = *p->stats_in;
   ++p->stats_in;
   return 1;
 }
 
+static int input_stats_lap(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+  if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+
+  *fps = *p->stats_in;
+  /* Move old stats[0] out to accommodate the next frame's stats. */
+  memmove(p->frame_stats_arr[0], p->frame_stats_arr[1],
+          (p->stats_buf_ctx->stats_in_end - p->stats_in - 1) *
+              sizeof(FIRSTPASS_STATS));
+  p->stats_buf_ctx->stats_in_end--;
+  return 1;
+}
+
 // Read frame stats at an offset from the current position.
 static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
-  if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
-      (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
+  if ((offset >= 0 && p->stats_in + offset >= p->stats_buf_ctx->stats_in_end) ||
+      (offset < 0 && p->stats_in + offset < p->stats_buf_ctx->stats_in_start)) {
     return NULL;
   }
 
@@ -117,13 +141,6 @@
   section->duration -= frame->duration;
 }
 
-// Calculate the linear size relative to a baseline of 1080P
-#define BASE_SIZE 2073600.0  // 1920x1080
-static double get_linear_size_factor(const AV1_COMP *cpi) {
-  const double this_area = cpi->initial_width * cpi->initial_height;
-  return pow(this_area / BASE_SIZE, 0.5);
-}
-
 // This function returns the maximum target rate per frame.
 static int frame_max_bits(const RATE_CONTROL *rc,
                           const AV1EncoderConfig *oxcf) {
@@ -138,61 +155,63 @@
   return (int)max_bits;
 }
 
-static double calc_correction_factor(double err_per_mb, double err_divisor,
-                                     double pt_low, double pt_high, int q,
-                                     aom_bit_depth_t bit_depth) {
-  const double error_term = err_per_mb / err_divisor;
-
-  // Adjustment based on actual quantizer to power term.
+static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75,
+                                                            0.80, 0.85, 0.90,
+                                                            0.95, 0.95, 0.95 };
+#define ERR_DIVISOR 96.0
+static double calc_correction_factor(double err_per_mb, int q) {
+  const double error_term = err_per_mb / ERR_DIVISOR;
+  const int index = q >> 5;
+  // Adjustment to power term based on qindex
   const double power_term =
-      AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
-
-  // Calculate correction factor.
-  if (power_term < 1.0) assert(error_term >= 0.0);
-
+      q_pow_term[index] +
+      (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0);
+  assert(error_term >= 0.0);
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-#define ERR_DIVISOR 100.0
-#define FACTOR_PT_LOW 0.70
-#define FACTOR_PT_HIGH 0.90
+static void twopass_update_bpm_factor(TWO_PASS *twopass) {
+  // Based on recent history adjust expectations of bits per macroblock.
+  double last_group_rate_err =
+      (double)twopass->rolling_arf_group_actual_bits /
+      DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+  last_group_rate_err = AOMMAX(0.25, AOMMIN(4.0, last_group_rate_err));
+  twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+  twopass->bpm_factor = AOMMAX(0.25, AOMMIN(4.0, twopass->bpm_factor));
+}
+
+static int qbpm_enumerator(int rate_err_tol) {
+  return 1350000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75);
+}
 
 // Similar to find_qindex_by_rate() function in ratectrl.c, but includes
 // calculation of a correction_factor.
 static int find_qindex_by_rate_with_correction(
-    int desired_bits_per_mb, aom_bit_depth_t bit_depth, FRAME_TYPE frame_type,
-    double error_per_mb, double ediv_size_correction,
-    double group_weight_factor, int best_qindex, int worst_qindex) {
+    int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb,
+    double group_weight_factor, int rate_err_tol, int best_qindex,
+    int worst_qindex) {
   assert(best_qindex <= worst_qindex);
   int low = best_qindex;
   int high = worst_qindex;
+
   while (low < high) {
     const int mid = (low + high) >> 1;
-    const double mid_factor =
-        calc_correction_factor(error_per_mb, ERR_DIVISOR - ediv_size_correction,
-                               FACTOR_PT_LOW, FACTOR_PT_HIGH, mid, bit_depth);
-    const int mid_bits_per_mb = av1_rc_bits_per_mb(
-        frame_type, mid, mid_factor * group_weight_factor, bit_depth);
+    const double mid_factor = calc_correction_factor(error_per_mb, mid);
+    const double q = av1_convert_qindex_to_q(mid, bit_depth);
+    const int enumerator = qbpm_enumerator(rate_err_tol);
+    const int mid_bits_per_mb =
+        (int)((enumerator * mid_factor * group_weight_factor) / q);
+
     if (mid_bits_per_mb > desired_bits_per_mb) {
       low = mid + 1;
     } else {
       high = mid;
     }
   }
-#if CONFIG_DEBUG
-  assert(low == high);
-  const double low_factor =
-      calc_correction_factor(error_per_mb, ERR_DIVISOR - ediv_size_correction,
-                             FACTOR_PT_LOW, FACTOR_PT_HIGH, low, bit_depth);
-  const int low_bits_per_mb = av1_rc_bits_per_mb(
-      frame_type, low, low_factor * group_weight_factor, bit_depth);
-  assert(low_bits_per_mb <= desired_bits_per_mb || low == worst_qindex);
-#endif  // CONFIG_DEBUG
   return low;
 }
 
-static int get_twopass_worst_quality(const AV1_COMP *cpi,
-                                     const double section_err,
+static int get_twopass_worst_quality(AV1_COMP *cpi, const double section_err,
                                      double inactive_zone,
                                      int section_target_bandwidth,
                                      double group_weight_factor) {
@@ -206,30 +225,22 @@
   } else {
     const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
                             ? cpi->initial_mbs
-                            : cpi->common.MBs;
+                            : cpi->common.mi_params.MBs;
     const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
     const double av_err_per_mb = section_err / active_mbs;
     const int target_norm_bits_per_mb =
         (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
         active_mbs;
+    int rate_err_tol =
+        AOMMIN(cpi->oxcf.under_shoot_pct, cpi->oxcf.over_shoot_pct);
 
-    // Larger image formats are expected to be a little harder to code
-    // relatively given the same prediction error score. This in part at
-    // least relates to the increased size and hence coding overheads of
-    // motion vectors. Some account of this is made through adjustment of
-    // the error divisor.
-    double ediv_size_correction =
-        AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
-    if (ediv_size_correction < 1.0)
-      ediv_size_correction = -(1.0 / ediv_size_correction);
-    ediv_size_correction *= 4.0;
-
+    twopass_update_bpm_factor(&cpi->twopass);
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     int q = find_qindex_by_rate_with_correction(
-        target_norm_bits_per_mb, cpi->common.seq_params.bit_depth, INTER_FRAME,
-        av_err_per_mb, ediv_size_correction, group_weight_factor,
-        rc->best_quality, rc->worst_quality);
+        target_norm_bits_per_mb, cpi->common.seq_params.bit_depth,
+        av_err_per_mb, group_weight_factor, rate_err_tol, rc->best_quality,
+        rc->worst_quality);
 
     // Restriction on active max q for constrained quality mode.
     if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
@@ -245,10 +256,9 @@
 #define SR_DIFF_MAX 128.0
 #define NCOUNT_FRAME_II_THRESH 5.0
 
-static double get_sr_decay_rate(const AV1_COMP *cpi,
+static double get_sr_decay_rate(const FRAME_INFO *frame_info,
                                 const FIRSTPASS_STATS *frame) {
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : cpi->common.MBs;
+  const int num_mbs = frame_info->num_mbs;
   double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
   double sr_decay = 1.0;
   double modified_pct_inter;
@@ -274,18 +284,18 @@
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const AV1_COMP *cpi,
+static double get_zero_motion_factor(const FRAME_INFO *frame_info,
                                      const FIRSTPASS_STATS *frame) {
   const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
-  double sr_decay = get_sr_decay_rate(cpi, frame);
+  double sr_decay = get_sr_decay_rate(frame_info, frame);
   return AOMMIN(sr_decay, zero_motion_pct);
 }
 
 #define ZM_POWER_FACTOR 0.75
 
-static double get_prediction_decay_rate(const AV1_COMP *cpi,
+static double get_prediction_decay_rate(const FRAME_INFO *frame_info,
                                         const FIRSTPASS_STATS *next_frame) {
-  const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
+  const double sr_decay_rate = get_sr_decay_rate(frame_info, next_frame);
   const double zero_motion_factor =
       (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
                   ZM_POWER_FACTOR));
@@ -297,39 +307,35 @@
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
-                                      int still_interval,
-                                      double loop_decay_rate,
-                                      double last_decay_rate) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  RATE_CONTROL *const rc = &cpi->rc;
-
+static int detect_transition_to_still(TWO_PASS *const twopass,
+                                      const int min_gf_interval,
+                                      const int frame_interval,
+                                      const int still_interval,
+                                      const double loop_decay_rate,
+                                      const double last_decay_rate) {
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
   // instead of a clean scene cut.
-  if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
+  if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
     int j;
-
     // Look ahead a few frames to see if static condition persists...
     for (j = 0; j < still_interval; ++j) {
       const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
-      if (stats >= twopass->stats_in_end) break;
+      if (stats >= twopass->stats_buf_ctx->stats_in_end) break;
 
       if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
     }
-
     // Only if it does do we signal a transition to still.
     return j == still_interval;
   }
-
   return 0;
 }
 
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
 // reflect this.
-static int detect_flash(const TWO_PASS *twopass, int offset) {
+static int detect_flash(const TWO_PASS *twopass, const int offset) {
   const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
 
   // What we are looking for here is a situation where there is a
@@ -344,16 +350,13 @@
 
 // Update the motion related elements to the GF arf boost calculation.
 static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
-                                          double *mv_in_out,
-                                          double *mv_in_out_accumulator,
-                                          double *abs_mv_in_out_accumulator,
-                                          double *mv_ratio_accumulator) {
+                                          GF_GROUP_STATS *gf_stats) {
   const double pct = stats->pcnt_motion;
 
   // Accumulate Motion In/Out of frame stats.
-  *mv_in_out = stats->mv_in_out_count * pct;
-  *mv_in_out_accumulator += *mv_in_out;
-  *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+  gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * pct;
+  gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out;
+  gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out);
 
   // Accumulate a measure of how uniform (or conversely how random) the motion
   // field is (a ratio of abs(mv) / mv).
@@ -363,30 +366,152 @@
     const double mvc_ratio =
         fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
 
-    *mv_ratio_accumulator +=
+    gf_stats->mv_ratio_accumulator +=
         pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
-    *mv_ratio_accumulator +=
+    gf_stats->mv_ratio_accumulator +=
         pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
   }
 }
 
-#define BASELINE_ERR_PER_MB 1000.0
-#define BOOST_FACTOR 12.5
+static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats,
+                                        const double mod_frame_err,
+                                        GF_GROUP_STATS *gf_stats) {
+  gf_stats->gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+  gf_stats->gf_group_raw_error += stats->coded_error;
+#endif
+  gf_stats->gf_group_skip_pct += stats->intra_skip_pct;
+  gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows;
+}
 
-static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+static void accumulate_next_frame_stats(
+    const FIRSTPASS_STATS *stats, const FRAME_INFO *frame_info,
+    TWO_PASS *const twopass, const int flash_detected,
+    const int frames_since_key, const int cur_idx, const int can_disable_arf,
+    const int min_gf_interval, GF_GROUP_STATS *gf_stats) {
+  accumulate_frame_motion_stats(stats, gf_stats);
+  // sum up the metric values of current gf group
+  gf_stats->avg_sr_coded_error += stats->sr_coded_error;
+  gf_stats->avg_tr_coded_error += stats->tr_coded_error;
+  gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref;
+  gf_stats->avg_pcnt_third_ref += stats->pcnt_third_ref;
+  gf_stats->avg_new_mv_count += stats->new_mv_count;
+  gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy;
+  if (fabs(stats->raw_error_stdev) > 0.000001) {
+    gf_stats->non_zero_stdev_count++;
+    gf_stats->avg_raw_err_stdev += stats->raw_error_stdev;
+  }
+
+  // Accumulate the effect of prediction quality decay
+  if (!flash_detected) {
+    gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate;
+    gf_stats->loop_decay_rate = get_prediction_decay_rate(frame_info, stats);
+
+    gf_stats->decay_accumulator =
+        gf_stats->decay_accumulator * gf_stats->loop_decay_rate;
+
+    // Monitor for static sections.
+    if ((frames_since_key + cur_idx - 1) > 1) {
+      gf_stats->zero_motion_accumulator =
+          AOMMIN(gf_stats->zero_motion_accumulator,
+                 get_zero_motion_factor(frame_info, stats));
+    }
+
+    // Break clause to detect very still sections after motion. For example,
+    // a static image after a fade or other transition.
+    if (can_disable_arf &&
+        detect_transition_to_still(twopass, min_gf_interval, cur_idx, 5,
+                                   gf_stats->loop_decay_rate,
+                                   gf_stats->last_loop_decay_rate)) {
+      gf_stats->allow_alt_ref = 0;
+    }
+  }
+}
+
+static void average_gf_stats(const int total_frame,
+                             const FIRSTPASS_STATS *last_stat,
+                             GF_GROUP_STATS *gf_stats) {
+  if (total_frame) {
+    gf_stats->avg_sr_coded_error /= total_frame;
+    gf_stats->avg_tr_coded_error /= total_frame;
+    gf_stats->avg_pcnt_second_ref /= total_frame;
+    if (total_frame - 1) {
+      gf_stats->avg_pcnt_third_ref_nolast =
+          (gf_stats->avg_pcnt_third_ref - last_stat->pcnt_third_ref) /
+          (total_frame - 1);
+    } else {
+      gf_stats->avg_pcnt_third_ref_nolast =
+          gf_stats->avg_pcnt_third_ref / total_frame;
+    }
+    gf_stats->avg_pcnt_third_ref /= total_frame;
+    gf_stats->avg_new_mv_count /= total_frame;
+    gf_stats->avg_wavelet_energy /= total_frame;
+  }
+
+  if (gf_stats->non_zero_stdev_count)
+    gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count;
+}
+
+static void get_features_from_gf_stats(const GF_GROUP_STATS *gf_stats,
+                                       const GF_FRAME_STATS *first_frame,
+                                       const GF_FRAME_STATS *last_frame,
+                                       const int num_mbs,
+                                       const int constrained_gf_group,
+                                       const int kf_zeromotion_pct,
+                                       const int num_frames, float *features) {
+  *features++ = (float)gf_stats->abs_mv_in_out_accumulator;
+  *features++ = (float)(gf_stats->avg_new_mv_count / num_mbs);
+  *features++ = (float)gf_stats->avg_pcnt_second_ref;
+  *features++ = (float)gf_stats->avg_pcnt_third_ref;
+  *features++ = (float)gf_stats->avg_pcnt_third_ref_nolast;
+  *features++ = (float)(gf_stats->avg_sr_coded_error / num_mbs);
+  *features++ = (float)(gf_stats->avg_tr_coded_error / num_mbs);
+  *features++ = (float)(gf_stats->avg_wavelet_energy / num_mbs);
+  *features++ = (float)(constrained_gf_group);
+  *features++ = (float)gf_stats->decay_accumulator;
+  *features++ = (float)(first_frame->frame_coded_error / num_mbs);
+  *features++ = (float)(first_frame->frame_sr_coded_error / num_mbs);
+  *features++ = (float)(first_frame->frame_tr_coded_error / num_mbs);
+  *features++ = (float)(first_frame->frame_err / num_mbs);
+  *features++ = (float)(kf_zeromotion_pct);
+  *features++ = (float)(last_frame->frame_coded_error / num_mbs);
+  *features++ = (float)(last_frame->frame_sr_coded_error / num_mbs);
+  *features++ = (float)(last_frame->frame_tr_coded_error / num_mbs);
+  *features++ = (float)num_frames;
+  *features++ = (float)gf_stats->mv_ratio_accumulator;
+  *features++ = (float)gf_stats->non_zero_stdev_count;
+}
+
+#define BOOST_FACTOR 12.5
+static double baseline_err_per_mb(const FRAME_INFO *frame_info) {
+  unsigned int screen_area = frame_info->frame_height * frame_info->frame_width;
+
+  // Use a different error per mb factor for calculating boost for
+  //  different formats.
+  if (screen_area <= 640 * 360) {
+    return 500.0;
+  } else {
+    return 1000.0;
+  }
+}
+
+static double calc_frame_boost(const RATE_CONTROL *rc,
+                               const FRAME_INFO *frame_info,
+                               const FIRSTPASS_STATS *this_frame,
                                double this_frame_mv_in_out, double max_boost) {
   double frame_boost;
-  const double lq = av1_convert_qindex_to_q(
-      cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
+  const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME],
+                                            frame_info->bit_depth);
   const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
-  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                       : cpi->common.MBs;
+  const double active_area = calculate_active_area(frame_info, this_frame);
+  int num_mbs = frame_info->num_mbs;
 
   // Correct for any inactive region in the image
-  num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+  num_mbs = (int)AOMMAX(1, num_mbs * active_area);
 
   // Underlying boost factor is based on inter error ratio.
-  frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+                       this_frame->intra_error * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
   frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
 
@@ -402,22 +527,77 @@
   return AOMMIN(frame_boost, max_boost * boost_q_correction);
 }
 
-#define GF_MAX_BOOST 90.0
-#define MIN_ARF_GF_BOOST 240
-#define MIN_DECAY_FACTOR 0.01
+static double calc_kf_frame_boost(const RATE_CONTROL *rc,
+                                  const FRAME_INFO *frame_info,
+                                  const FIRSTPASS_STATS *this_frame,
+                                  double *sr_accumulator, double max_boost) {
+  double frame_boost;
+  const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME],
+                                            frame_info->bit_depth);
+  const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
+  const double active_area = calculate_active_area(frame_info, this_frame);
+  int num_mbs = frame_info->num_mbs;
 
-static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
-                          int *f_boost, int *b_boost) {
-  TWO_PASS *const twopass = &cpi->twopass;
+  // Correct for any inactive region in the image
+  num_mbs = (int)AOMMAX(1, num_mbs * active_area);
+
+  // Underlying boost factor is based on inter error ratio.
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+                       this_frame->intra_error * active_area) /
+                DOUBLE_DIVIDE_CHECK(
+                    (this_frame->coded_error + *sr_accumulator) * active_area);
+
+  // Update the accumulator for second ref error difference.
+  // This is intended to give an indication of how much the coded error is
+  // increasing over time.
+  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
+  *sr_accumulator = AOMMAX(0.0, *sr_accumulator);
+
+  // Q correction and scaling
+  // The 40.0 value here is an experimentally derived baseline minimum.
+  // This value is in line with the minimum per frame boost in the alt_ref
+  // boost calculation.
+  frame_boost = ((frame_boost + 40.0) * boost_q_correction);
+
+  return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost,
+                                   int frames_to_project,
+                                   int num_stats_used_for_gfu_boost) {
+  /*
+   * If frames_to_project is equal to num_stats_used_for_gfu_boost,
+   * it means that gfu_boost was calculated over frames_to_project to
+   * begin with (i.e., all stats required were available), hence return
+   * the original boost.
+   */
+  if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
+
+  double min_boost_factor = sqrt(rc->baseline_gf_interval);
+  // Get the current tpl factor (number of frames = frames_to_project).
+  double tpl_factor = av1_get_gfu_boost_projection_factor(
+      min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
+  // Get the tpl factor when number of frames = num_stats_used_for_gfu_boost.
+  double tpl_factor_num_stats = av1_get_gfu_boost_projection_factor(
+      min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost);
+  int projected_gfu_boost =
+      (int)rint((tpl_factor * gfu_boost) / tpl_factor_num_stats);
+  return projected_gfu_boost;
+}
+
+#define GF_MAX_BOOST 90.0
+#define MIN_DECAY_FACTOR 0.01
+int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
+                       FRAME_INFO *frame_info, int offset, int f_frames,
+                       int b_frames, int *num_fpstats_used,
+                       int *num_fpstats_required) {
   int i;
-  double boost_score = 0.0;
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
+  GF_GROUP_STATS gf_stats;
+  init_gf_stats(&gf_stats);
+  double boost_score = (double)NORMAL_BOOST;
   int arf_boost;
   int flash_detected = 0;
+  if (num_fpstats_used) *num_fpstats_used = 0;
 
   // Search forward from the proposed arf/next gf position.
   for (i = 0; i < f_frames; ++i) {
@@ -425,9 +605,7 @@
     if (this_frame == NULL) break;
 
     // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(
-        this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
-        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+    accumulate_frame_motion_stats(this_frame, &gf_stats);
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
@@ -436,36 +614,32 @@
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
-      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                              ? MIN_DECAY_FACTOR
-                              : decay_accumulator;
+      gf_stats.decay_accumulator *=
+          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+                                       ? MIN_DECAY_FACTOR
+                                       : gf_stats.decay_accumulator;
     }
 
     boost_score +=
-        decay_accumulator *
-        calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+        gf_stats.decay_accumulator *
+        calc_frame_boost(rc, frame_info, this_frame,
+                         gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+    if (num_fpstats_used) (*num_fpstats_used)++;
   }
 
-  *f_boost = (int)boost_score;
+  arf_boost = (int)boost_score;
 
   // Reset for backward looking loop.
   boost_score = 0.0;
-  mv_ratio_accumulator = 0.0;
-  decay_accumulator = 1.0;
-  this_frame_mv_in_out = 0.0;
-  mv_in_out_accumulator = 0.0;
-  abs_mv_in_out_accumulator = 0.0;
-
+  init_gf_stats(&gf_stats);
   // Search backward towards last gf position.
   for (i = -1; i >= -b_frames; --i) {
     const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
     if (this_frame == NULL) break;
 
     // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(
-        this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
-        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+    accumulate_frame_motion_stats(this_frame, &gf_stats);
 
     // We want to discount the the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
@@ -474,22 +648,31 @@
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
-      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                              ? MIN_DECAY_FACTOR
-                              : decay_accumulator;
+      gf_stats.decay_accumulator *=
+          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+                                       ? MIN_DECAY_FACTOR
+                                       : gf_stats.decay_accumulator;
     }
 
     boost_score +=
-        decay_accumulator *
-        calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+        gf_stats.decay_accumulator *
+        calc_frame_boost(rc, frame_info, this_frame,
+                         gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+    if (num_fpstats_used) (*num_fpstats_used)++;
   }
-  *b_boost = (int)boost_score;
+  arf_boost += (int)boost_score;
 
-  arf_boost = (*f_boost + *b_boost);
-  if (arf_boost < ((b_frames + f_frames) * 20))
-    arf_boost = ((b_frames + f_frames) * 20);
-  arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
+  if (num_fpstats_required) {
+    *num_fpstats_required = f_frames + b_frames;
+    if (num_fpstats_used) {
+      arf_boost = get_projected_gfu_boost(rc, arf_boost, *num_fpstats_required,
+                                          *num_fpstats_used);
+    }
+  }
+
+  if (arf_boost < ((b_frames + f_frames) * 50))
+    arf_boost = ((b_frames + f_frames) * 50);
 
   return arf_boost;
 }
@@ -543,13 +726,15 @@
   return total_group_bits;
 }
 
-// Calculate the number bits extra to assign to boosted frames in a group.
+// Calculate the number of bits to assign to boosted frames in a group.
 static int calculate_boost_bits(int frame_count, int boost,
                                 int64_t total_group_bits) {
   int allocation_chunks;
 
   // return 0 for invalid inputs (could arise e.g. through rounding errors)
-  if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
+  if (!boost || (total_group_bits <= 0)) return 0;
+
+  if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX));
 
   allocation_chunks = (frame_count * 100) + boost;
 
@@ -565,30 +750,152 @@
                 0);
 }
 
-#define LEAF_REDUCTION_FACTOR 0.75
-static double lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
-  { 1.0, 0.0, 0.0 }, { 0.6, 0.4, 0 }, { 0.45, 0.35, 0.20 }
-};
-static void allocate_gf_group_bits(
-    AV1_COMP *cpi, int64_t gf_group_bits, double group_error, int gf_arf_bits,
-    const EncodeFrameParams *const frame_params) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  const int key_frame = (frame_params->frame_type == KEY_FRAME);
-  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
-  int64_t total_group_bits = gf_group_bits;
+// Calculate the boost factor based on the number of bits assigned, i.e. the
+// inverse of calculate_boost_bits().
+static int calculate_boost_factor(int frame_count, int bits,
+                                  int64_t total_group_bits) {
+  aom_clear_system_state();
+  return (int)(100.0 * frame_count * bits / (total_group_bits - bits));
+}
 
-  // Check if GF group has any internal arfs.
-  int has_internal_arfs = 0;
-  for (int i = 0; i < gf_group->size; ++i) {
-    if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
-      has_internal_arfs = 1;
-      break;
+// Reduce the number of bits assigned to keyframe or arf if necessary, to
+// prevent bitrate spikes that may break level constraints.
+// frame_type: 0: keyframe; 1: arf.
+static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi,
+                                              RATE_CONTROL *const rc,
+                                              int bits_assigned,
+                                              int64_t group_bits,
+                                              int frame_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int temporal_layer_id = cm->temporal_layer_id;
+  const int spatial_layer_id = cm->spatial_layer_id;
+  for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1;
+       ++index) {
+    if (!is_in_operating_point(seq_params->operating_point_idc[index],
+                               temporal_layer_id, spatial_layer_id)) {
+      continue;
+    }
+
+    const AV1_LEVEL target_level =
+        cpi->level_params.target_seq_level_idx[index];
+    if (target_level >= SEQ_LEVELS) continue;
+
+    assert(is_valid_seq_level_idx(target_level));
+
+    const double level_bitrate_limit = av1_get_max_bitrate_for_level(
+        target_level, seq_params->tier[0], seq_params->profile);
+    const int target_bits_per_frame =
+        (int)(level_bitrate_limit / cpi->framerate);
+    if (frame_type == 0) {
+      // Maximum bits for keyframe is 8 times the target_bits_per_frame.
+      const int level_enforced_max_kf_bits = target_bits_per_frame * 8;
+      if (bits_assigned > level_enforced_max_kf_bits) {
+        const int frames = rc->frames_to_key - 1;
+        rc->kf_boost = calculate_boost_factor(
+            frames, level_enforced_max_kf_bits, group_bits);
+        bits_assigned = calculate_boost_bits(frames, rc->kf_boost, group_bits);
+      }
+    } else if (frame_type == 1) {
+      // Maximum bits for arf is 4 times the target_bits_per_frame.
+      const int level_enforced_max_arf_bits = target_bits_per_frame * 4;
+      if (bits_assigned > level_enforced_max_arf_bits) {
+        rc->gfu_boost = calculate_boost_factor(
+            rc->baseline_gf_interval, level_enforced_max_arf_bits, group_bits);
+        bits_assigned = calculate_boost_bits(rc->baseline_gf_interval,
+                                             rc->gfu_boost, group_bits);
+      }
+    } else {
+      assert(0);
     }
   }
 
+  return bits_assigned;
+}
+
+// Compile time switch on alternate algorithm to allocate bits in ARF groups
+// #define ALT_ARF_ALLOCATION
+#ifdef ALT_ARF_ALLOCATION
+double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0,  0.70, 0.55, 0.60,
+                                              0.60, 1.0,  1.0 };
+static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
+                                   int64_t gf_group_bits, int gf_arf_bits,
+                                   int key_frame, int use_arf) {
+  int64_t total_group_bits = gf_group_bits;
+  int base_frame_bits;
+  const int gf_group_size = gf_group->size;
+  int layer_frames[MAX_ARF_LAYERS + 1] = { 0 };
+
+  // Subtract the extra bits set aside for ARF frames from the Group Total
+  if (use_arf || !key_frame) total_group_bits -= gf_arf_bits;
+
+  if (rc->baseline_gf_interval)
+    base_frame_bits = (int)(total_group_bits / rc->baseline_gf_interval);
+  else
+    base_frame_bits = (int)1;
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  // === [frame_index == 0] ===
+  int frame_index = 0;
+  if (!key_frame) {
+    if (rc->source_alt_ref_active)
+      gf_group->bit_allocation[frame_index] = 0;
+    else
+      gf_group->bit_allocation[frame_index] =
+          base_frame_bits + (int)(gf_arf_bits * layer_fraction[1]);
+  }
+  frame_index++;
+
+  // Check the number of frames in each layer in case we have a
+  // non standard group length.
+  int max_arf_layer = gf_group->max_layer_depth - 1;
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    if ((gf_group->update_type[idx] == ARF_UPDATE) ||
+        (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
+      // max_arf_layer = AOMMAX(max_arf_layer, gf_group->layer_depth[idx]);
+      layer_frames[gf_group->layer_depth[idx]]++;
+    }
+  }
+
+  // Allocate extra bits to each ARF layer
+  int i;
+  int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 };
+  for (i = 1; i <= max_arf_layer; ++i) {
+    double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i];
+    layer_extra_bits[i] =
+        (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i]));
+    gf_arf_bits -= (int)(gf_arf_bits * fraction);
+  }
+
+  // Now combine ARF layer and baseline bits to give total bits for each frame.
+  int arf_extra_bits;
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    switch (gf_group->update_type[idx]) {
+      case ARF_UPDATE:
+      case INTNL_ARF_UPDATE:
+        arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]];
+        gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits;
+        break;
+      case INTNL_OVERLAY_UPDATE:
+      case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break;
+      default: gf_group->bit_allocation[idx] = base_frame_bits; break;
+    }
+  }
+
+  // Set the frame following the current GOP to 0 bit allocation. For ARF
+  // groups, this next frame will be overlay frame, which is the first frame
+  // in the next GOP. For GF group, next GOP will overwrite the rate allocation.
+  // Setting this frame to use 0 bits (out of the current GOP budget) will
+  // simplify logic in reference frame management.
+  gf_group->bit_allocation[gf_group_size] = 0;
+}
+#else
+static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
+                                   int64_t gf_group_bits, int gf_arf_bits,
+                                   int key_frame, int use_arf) {
+  int64_t total_group_bits = gf_group_bits;
+
   // For key frames the frame target rate is already set and it
   // is also the golden frame.
   // === [frame_index == 0] ===
@@ -598,122 +905,70 @@
       gf_group->bit_allocation[frame_index] = 0;
     else
       gf_group->bit_allocation[frame_index] = gf_arf_bits;
-
-    // Step over the golden frame / overlay frame
-    FIRSTPASS_STATS frame_stats;
-    if (EOF == input_stats(twopass, &frame_stats)) return;
   }
 
   // Deduct the boost bits for arf (or gf if it is not a key frame)
   // from the group total.
-  if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+  if (use_arf || !key_frame) total_group_bits -= gf_arf_bits;
 
   frame_index++;
 
   // Store the bits to spend on the ARF if there is one.
   // === [frame_index == 1] ===
-  if (rc->source_alt_ref_pending) {
+  if (use_arf) {
     gf_group->bit_allocation[frame_index] = gf_arf_bits;
-
     ++frame_index;
+  }
 
-    // Skip all the internal ARFs right after ARF at the starting segment of
-    // the current GF group.
-    if (has_internal_arfs) {
-      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) {
-        ++frame_index;
-      }
+  const int gf_group_size = gf_group->size;
+  int arf_depth_bits[MAX_ARF_LAYERS + 1] = { 0 };
+  int arf_depth_count[MAX_ARF_LAYERS + 1] = { 0 };
+  int arf_depth_boost[MAX_ARF_LAYERS + 1] = { 0 };
+  int total_arfs = 0;
+  int total_overlays = rc->source_alt_ref_active;
+
+  for (int idx = 0; idx < gf_group_size; ++idx) {
+    if (gf_group->update_type[idx] == ARF_UPDATE ||
+        gf_group->update_type[idx] == INTNL_ARF_UPDATE ||
+        gf_group->update_type[idx] == LF_UPDATE) {
+      arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->arf_boost[idx];
+      ++arf_depth_count[gf_group->layer_depth[idx]];
     }
   }
 
-  // Save.
-  const int tmp_frame_index = frame_index;
-  int budget_reduced_from_leaf_level = 0;
+  for (int idx = 2; idx <= MAX_ARF_LAYERS; ++idx) {
+    arf_depth_bits[idx] =
+        calculate_boost_bits(rc->baseline_gf_interval - total_arfs -
+                                 total_overlays - arf_depth_count[idx],
+                             arf_depth_boost[idx], total_group_bits);
+    total_group_bits -= arf_depth_bits[idx];
+    total_arfs += arf_depth_count[idx];
+  }
 
-  // Allocate bits to frames other than first frame, which is either a keyframe,
-  // overlay frame or golden frame.
-  const int normal_frames = rc->baseline_gf_interval - 1;
-
-  for (int i = 0; i < normal_frames; ++i) {
-    FIRSTPASS_STATS frame_stats;
-    if (EOF == input_stats(twopass, &frame_stats)) break;
-
-    const double modified_err =
-        calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
-    const double err_fraction =
-        (group_error > 0) ? modified_err / DOUBLE_DIVIDE_CHECK(group_error)
-                          : 0.0;
-    const int target_frame_size =
-        clamp((int)((double)total_group_bits * err_fraction), 0,
-              AOMMIN(max_bits, (int)total_group_bits));
-
-    if (gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
-      assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
-             "non-valid height for a pyramid structure");
-
-      const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
-      gf_group->bit_allocation[frame_index] = 0;
-
-      gf_group->bit_allocation[arf_pos] = target_frame_size;
-      // Note: Boost, if needed, is added in the next loop.
-    } else {
-      assert(gf_group->update_type[frame_index] == LF_UPDATE);
-      gf_group->bit_allocation[frame_index] = target_frame_size;
-      if (has_internal_arfs) {
-        const int this_budget_reduction =
-            (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
-        gf_group->bit_allocation[frame_index] -= this_budget_reduction;
-        budget_reduced_from_leaf_level += this_budget_reduction;
-      }
-    }
-
-    ++frame_index;
-
-    // Skip all the internal ARFs.
-    if (has_internal_arfs) {
-      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
-        ++frame_index;
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    switch (gf_group->update_type[idx]) {
+      case ARF_UPDATE:
+      case INTNL_ARF_UPDATE:
+      case LF_UPDATE:
+        gf_group->bit_allocation[idx] =
+            (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] *
+                   gf_group->arf_boost[idx]) /
+                  arf_depth_boost[gf_group->layer_depth[idx]]);
+        break;
+      case INTNL_OVERLAY_UPDATE:
+      case OVERLAY_UPDATE:
+      default: gf_group->bit_allocation[idx] = 0; break;
     }
   }
 
-  if (budget_reduced_from_leaf_level > 0) {
-    assert(has_internal_arfs);
-    // Restore.
-    frame_index = tmp_frame_index;
-
-    // Re-distribute this extra budget to overlay frames in the group.
-    for (int i = 0; i < normal_frames; ++i) {
-      if (gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
-        assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
-               "non-valid height for a pyramid structure");
-        const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
-        const int this_lvl = gf_group->pyramid_level[arf_pos];
-        const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
-        const double lvl_boost_factor =
-            lvl_budget_factor[gf_group->pyramid_height - 2][dist2top];
-        const int extra_size =
-            (int)(budget_reduced_from_leaf_level * lvl_boost_factor /
-                  gf_group->pyramid_lvl_nodes[this_lvl]);
-        gf_group->bit_allocation[arf_pos] += extra_size;
-      }
-      ++frame_index;
-
-      // Skip all the internal ARFs.
-      if (has_internal_arfs) {
-        while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) {
-          ++frame_index;
-        }
-      }
-    }
-  }
+  // Set the frame following the current GOP to 0 bit allocation. For ARF
+  // groups, this next frame will be overlay frame, which is the first frame
+  // in the next GOP. For GF group, next GOP will overwrite the rate allocation.
+  // Setting this frame to use 0 bits (out of the current GOP budget) will
+  // simplify the logic in reference frame management.
+  gf_group->bit_allocation[gf_group_size] = 0;
 }
-
-// Given the maximum allowed height of the pyramid structure, return the fixed
-// GF length to be used.
-static INLINE int get_fixed_gf_length(int max_pyr_height) {
-  (void)max_pyr_height;
-  return MAX_GF_INTERVAL;
-}
+#endif
 
 // Returns true if KF group and GF group both are almost completely static.
 static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
@@ -722,53 +977,591 @@
 }
 
 #define ARF_ABS_ZOOM_THRESH 4.4
-#define GROUP_ADAPTIVE_MAXQ 1
+static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
+                                int flash_detected, int active_max_gf_interval,
+                                int active_min_gf_interval,
+                                GF_GROUP_STATS *gf_stats) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  // Motion breakout threshold for loop below depends on image size.
+  const double mv_ratio_accumulator_thresh =
+      (cpi->initial_height + cpi->initial_width) / 4.0;
+
+  if (!flash_detected) {
+    // Break clause to detect very still sections after motion. For example,
+    // a static image after a fade or other transition.
+    if (detect_transition_to_still(
+            twopass, rc->min_gf_interval, frame_index - cur_start, 5,
+            gf_stats->loop_decay_rate, gf_stats->last_loop_decay_rate)) {
+      return 1;
+    }
+  }
+
+  // Some conditions to breakout after min interval.
+  if (frame_index - cur_start >= active_min_gf_interval &&
+      // If possible don't break very close to a kf
+      (rc->frames_to_key - frame_index >= rc->min_gf_interval) &&
+      ((frame_index - cur_start) & 0x01) && !flash_detected &&
+      (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+       gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+    return 1;
+  }
+
+  // If almost totally static, we will not use the max GF length later,
+  // so we can continue for more frames.
+  if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
+      !is_almost_static(gf_stats->zero_motion_accumulator,
+                        twopass->kf_zeromotion_pct)) {
+    return 1;
+  }
+  return 0;
+}
+
+#define MAX_PAD_GF_CHECK 6  // padding length to check for gf length
+#define AVG_SI_THRES 0.6    // thres for average silhouette
+#define GF_SHRINK_OUTPUT 0  // print output for gf length decision
+int determine_high_err_gf(double *errs, int *is_high, double *si, int len,
+                          double *ratio, int gf_start, int gf_end,
+                          int before_pad) {
+  (void)gf_start;
+  (void)gf_end;
+  (void)before_pad;
+  // alpha and beta controls the threshold placement
+  // e.g. a smaller alpha makes the lower group more rigid
+  const double alpha = 0.5;
+  const double beta = 1 - alpha;
+  double mean = 0;
+  double mean_low = 0;
+  double mean_high = 0;
+  double prev_mean_low = 0;
+  double prev_mean_high = 0;
+  int count_low = 0;
+  int count_high = 0;
+  // calculate mean of errs
+  for (int i = 0; i < len; i++) {
+    mean += errs[i];
+  }
+  mean /= len;
+  // separate into two initial groups with greater / lower than mean
+  for (int i = 0; i < len; i++) {
+    if (errs[i] <= mean) {
+      is_high[i] = 0;
+      count_low++;
+      prev_mean_low += errs[i];
+    } else {
+      is_high[i] = 1;
+      count_high++;
+      prev_mean_high += errs[i];
+    }
+  }
+  prev_mean_low /= count_low;
+  prev_mean_high /= count_high;
+  // kmeans to refine
+  int count = 0;
+  while (count < 10) {
+    // re-group
+    mean_low = 0;
+    mean_high = 0;
+    count_low = 0;
+    count_high = 0;
+    double thres = prev_mean_low * alpha + prev_mean_high * beta;
+    for (int i = 0; i < len; i++) {
+      if (errs[i] <= thres) {
+        is_high[i] = 0;
+        count_low++;
+        mean_low += errs[i];
+      } else {
+        is_high[i] = 1;
+        count_high++;
+        mean_high += errs[i];
+      }
+    }
+    mean_low /= count_low;
+    mean_high /= count_high;
+
+    // break if not changed much
+    if (fabs((mean_low - prev_mean_low) / (prev_mean_low + 0.00001)) <
+            0.00001 &&
+        fabs((mean_high - prev_mean_high) / (prev_mean_high + 0.00001)) <
+            0.00001)
+      break;
+
+    // update means
+    prev_mean_high = mean_high;
+    prev_mean_low = mean_low;
+
+    count++;
+  }
+
+  // count how many jumps of group changes
+  int num_change = 0;
+  for (int i = 0; i < len - 1; i++) {
+    if (is_high[i] != is_high[i + 1]) num_change++;
+  }
+
+  // get silhouette as a measure of the classification quality
+  double avg_si = 0;
+  // ai: avg dist of its own class, bi: avg dist to the other class
+  double ai, bi;
+  if (count_low > 1 && count_high > 1) {
+    for (int i = 0; i < len; i++) {
+      ai = 0;
+      bi = 0;
+      // calculate average distance to everyone in the same group
+      // and in the other group
+      for (int j = 0; j < len; j++) {
+        if (i == j) continue;
+        if (is_high[i] == is_high[j]) {
+          ai += fabs(errs[i] - errs[j]);
+        } else {
+          bi += fabs(errs[i] - errs[j]);
+        }
+      }
+      if (is_high[i] == 0) {
+        ai = ai / (count_low - 1);
+        bi = bi / count_high;
+      } else {
+        ai = ai / (count_high - 1);
+        bi = bi / count_low;
+      }
+      if (ai <= bi) {
+        si[i] = 1 - ai / (bi + 0.00001);
+      } else {
+        si[i] = bi / (ai + 0.00001) - 1;
+      }
+      avg_si += si[i];
+    }
+    avg_si /= len;
+  }
+
+  int reset = 0;
+  *ratio = mean_high / (mean_low + 0.00001);
+  // if the two groups too similar, or
+  // if too many numbers of changes, or
+  // silhouette is too small, not confident
+  // reset everything to 0 later so we fallback to the original decision
+  if (*ratio < 1.3 || num_change > AOMMAX(len / 3, 6) ||
+      avg_si < AVG_SI_THRES) {
+    reset = 1;
+  }
+
+#if GF_SHRINK_OUTPUT
+  printf("\n");
+  for (int i = 0; i < len; i++) {
+    printf("%d: err %.1f, ishigh %d, si %.2f, (i=%d)\n",
+           gf_start + i - before_pad, errs[i], is_high[i], si[i], gf_end);
+  }
+  printf(
+      "count: %d, mean_high: %.1f, mean_low: %.1f, avg_si: %.2f, num_change: "
+      "%d, ratio %.2f, reset: %d\n",
+      count, mean_high, mean_low, avg_si, num_change,
+      mean_high / (mean_low + 0.000001), reset);
+#endif
+
+  if (reset) {
+    memset(is_high, 0, sizeof(is_high[0]) * len);
+    memset(si, 0, sizeof(si[0]) * len);
+  }
+  return reset;
+}
+
 #if GROUP_ADAPTIVE_MAXQ
 #define RC_FACTOR_MIN 0.75
-#define RC_FACTOR_MAX 1.75
+#define RC_FACTOR_MAX 1.25
 #endif  // GROUP_ADAPTIVE_MAXQ
 #define MIN_FWD_KF_INTERVAL 8
+#define MIN_SHRINK_LEN 6      // the minimum length of gf if we are shrinking
+#define SI_HIGH AVG_SI_THRES  // high quality classification
+#define SI_LOW 0.3            // very unsure classification
+// This function finds a low-error frame prior to the current last frame
+// in the gf group, and sets the last frame to it.
+// The resulting last frame is then returned by *cur_last_ptr
+// *cur_start_ptr and cut_pos[n] could also change due to shrinking
+// previous gf groups
+void set_last_prev_low_err(int *cur_start_ptr, int *cur_last_ptr, int *cut_pos,
+                           int count_cuts, int before_pad, double ratio,
+                           int *is_high, double *si, int prev_lows) {
+  int n;
+  int cur_start = *cur_start_ptr;
+  int cur_last = *cur_last_ptr;
+  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
+    // try to find a point that is very probable to be good
+    if (is_high[n - cur_start + before_pad] == 0 &&
+        si[n - cur_start + before_pad] > SI_HIGH) {
+      *cur_last_ptr = n;
+      return;
+    }
+  }
+  // could not find a low-err point, then let's try to find an "unsure"
+  // point at least
+  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
+    if ((is_high[n - cur_start + before_pad] == 0) ||
+        (is_high[n - cur_start + before_pad] &&
+         si[n - cur_start + before_pad] < SI_LOW)) {
+      *cur_last_ptr = n;
+      return;
+    }
+  }
+  if (prev_lows) {
+    // try with shrinking previous all_zero interval
+    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
+      if (is_high[n - cur_start + before_pad] == 0 &&
+          si[n - cur_start + before_pad] > SI_HIGH) {
+        int tentative_start = n - MIN_SHRINK_LEN;
+        // check if the previous interval can shrink this much
+        int available =
+            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
+            cur_start - tentative_start < prev_lows;
+        // shrinking too agressively may worsen performance
+        // set stricter thres for shorter length
+        double ratio_thres =
+            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
+            1.0;
+
+        if (available && (ratio > ratio_thres)) {
+          cut_pos[count_cuts - 1] = tentative_start;
+          *cur_start_ptr = tentative_start;
+          *cur_last_ptr = n;
+          return;
+        }
+      }
+    }
+  }
+  if (prev_lows) {
+    // try with shrinking previous all_zero interval with unsure points
+    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
+      if ((is_high[n - cur_start + before_pad] == 0) ||
+          (is_high[n - cur_start + before_pad] &&
+           si[n - cur_start + before_pad] < SI_LOW)) {
+        int tentative_start = n - MIN_SHRINK_LEN;
+        // check if the previous interval can shrink this much
+        int available =
+            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
+            cur_start - tentative_start < prev_lows;
+        // shrinking too aggressively may worsen performance
+        double ratio_thres =
+            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
+            1.0;
+
+        if (available && (ratio > ratio_thres)) {
+          cut_pos[count_cuts - 1] = tentative_start;
+          *cur_start_ptr = tentative_start;
+          *cur_last_ptr = n;
+          return;
+        }
+      }
+    }
+  }  // prev_lows
+  return;
+}
+
+// This function decides the gf group length of future frames in batch
+// rc->gf_intervals is modified to store the group lengths
+static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
+                                int max_intervals) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+  FRAME_INFO *frame_info = &cpi->frame_info;
+  int i;
+
+  int flash_detected;
+
+  aom_clear_system_state();
+  av1_zero(next_frame);
+
+  if (has_no_stats_stage(cpi)) {
+    for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) {
+      rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length);
+    }
+    rc->cur_gf_index = 0;
+    rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS;
+    return;
+  }
+
+  // TODO(urvang): Try logic to vary min and max interval based on q.
+  const int active_min_gf_interval = rc->min_gf_interval;
+  const int active_max_gf_interval =
+      AOMMIN(rc->max_gf_interval, max_gop_length);
+
+  i = 0;
+  max_intervals = cpi->lap_enabled ? 1 : max_intervals;
+  int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { 0 };
+  int count_cuts = 1;
+  int cur_start = 0, cur_last;
+  int cut_here;
+  int prev_lows = 0;
+  GF_GROUP_STATS gf_stats;
+  init_gf_stats(&gf_stats);
+  while (count_cuts < max_intervals + 1) {
+    ++i;
+
+    // reaches next key frame, break here
+    if (i >= rc->frames_to_key) {
+      cut_pos[count_cuts] = i - 1;
+      count_cuts++;
+      break;
+    }
+
+    // reached maximum len, but nothing special yet (almost static)
+    // let's look at the next interval
+    if (i - cur_start >= rc->static_scene_max_gf_interval) {
+      cut_here = 1;
+    } else {
+      // reaches last frame, break
+      if (EOF == input_stats(twopass, &next_frame)) {
+        cut_pos[count_cuts] = i - 1;
+        count_cuts++;
+        break;
+      }
+      // Test for the case where there is a brief flash but the prediction
+      // quality back to an earlier frame is then restored.
+      flash_detected = detect_flash(twopass, 0);
+      // TODO(bohanli): remove redundant accumulations here, or unify
+      // this and the ones in define_gf_group
+      accumulate_next_frame_stats(&next_frame, frame_info, twopass,
+                                  flash_detected, rc->frames_since_key, i, 0,
+                                  rc->min_gf_interval, &gf_stats);
+
+      cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
+                               active_max_gf_interval, active_min_gf_interval,
+                               &gf_stats);
+    }
+    if (cut_here) {
+      cur_last = i - 1;  // the current last frame in the gf group
+      // only try shrinking if interval smaller than active_max_gf_interval
+      if (cur_last - cur_start <= active_max_gf_interval) {
+        // determine in the current decided gop the higher and lower errs
+        int n;
+        double ratio;
+
+        // load neighboring coded errs
+        int is_high[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        double errs[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        double si[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        int before_pad =
+            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_since_key - 1 + cur_start);
+        int after_pad =
+            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_to_key - cur_last - 1);
+        for (n = cur_start - before_pad; n <= cur_last + after_pad; n++) {
+          if (start_pos + n - 1 > twopass->stats_buf_ctx->stats_in_end) {
+            after_pad = n - cur_last - 1;
+            assert(after_pad >= 0);
+            break;
+          } else if (start_pos + n - 1 <
+                     twopass->stats_buf_ctx->stats_in_start) {
+            before_pad = cur_start - n - 1;
+            continue;
+          }
+          errs[n + before_pad - cur_start] = (start_pos + n - 1)->coded_error;
+        }
+        const int len = before_pad + after_pad + cur_last - cur_start + 1;
+        const int reset = determine_high_err_gf(
+            errs, is_high, si, len, &ratio, cur_start, cur_last, before_pad);
+
+        // if the current frame may have high error, try shrinking
+        if (is_high[cur_last - cur_start + before_pad] == 1 ||
+            (!reset && si[cur_last - cur_start + before_pad] < SI_LOW)) {
+          // try not to cut in high err area
+          set_last_prev_low_err(&cur_start, &cur_last, cut_pos, count_cuts,
+                                before_pad, ratio, is_high, si, prev_lows);
+        }  // if current frame high error
+        // count how many trailing lower error frames we have in this decided
+        // gf group
+        prev_lows = 0;
+        for (n = cur_last - 1; n > cur_start + MIN_SHRINK_LEN; n--) {
+          if (is_high[n - cur_start + before_pad] == 0 &&
+              (si[n - cur_start + before_pad] > SI_HIGH || reset)) {
+            prev_lows++;
+          } else {
+            break;
+          }
+        }
+      }
+      cut_pos[count_cuts] = cur_last;
+      count_cuts++;
+
+      // reset pointers to the shrunken location
+      twopass->stats_in = start_pos + cur_last;
+      cur_start = cur_last;
+      i = cur_last;
+
+      // reset accumulators
+      init_gf_stats(&gf_stats);
+    }
+  }
+
+  // save intervals
+  rc->intervals_till_gf_calculate_due = count_cuts - 1;
+  for (int n = 1; n < count_cuts; n++) {
+    rc->gf_intervals[n - 1] = cut_pos[n] + 1 - cut_pos[n - 1];
+  }
+  rc->cur_gf_index = 0;
+  twopass->stats_in = start_pos;
+
+#if GF_SHRINK_OUTPUT
+  printf("\nf_to_key: %d, count_cut: %d. ", rc->frames_to_key, count_cuts);
+  for (int n = 0; n < count_cuts; n++) {
+    printf("%d ", cut_pos[n]);
+  }
+  printf("\n");
+
+  for (int n = 0; n < rc->intervals_till_gf_calculate_due; n++) {
+    printf("%d ", rc->gf_intervals[n]);
+  }
+  printf("\n\n");
+#endif
+}
+
+static void correct_frames_to_key(AV1_COMP *cpi) {
+  int lookahead_size =
+      (int)av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage) + 1;
+  if (lookahead_size <
+      av1_lookahead_pop_sz(cpi->lookahead, cpi->compressor_stage)) {
+    cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size);
+  }
+}
+
+static void define_gf_group_pass0(AV1_COMP *cpi,
+                                  const EncodeFrameParams *const frame_params) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  int target;
+
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    av1_cyclic_refresh_set_golden_update(cpi);
+  } else {
+    rc->baseline_gf_interval = rc->gf_intervals[rc->cur_gf_index];
+    rc->intervals_till_gf_calculate_due--;
+    rc->cur_gf_index++;
+  }
+
+  // correct frames_to_key when lookahead queue is flushing
+  correct_frames_to_key(cpi);
+
+  if (rc->baseline_gf_interval > rc->frames_to_key)
+    rc->baseline_gf_interval = rc->frames_to_key;
+
+  rc->gfu_boost = DEFAULT_GF_BOOST;
+  rc->constrained_gf_group =
+      (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+
+  gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height;
+
+  // Rare case when the look-ahead is less than the target GOP length, can't
+  // generate ARF frame.
+  if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames ||
+      !is_altref_enabled(cpi) || rc->baseline_gf_interval < rc->min_gf_interval)
+    gf_group->max_layer_depth_allowed = 0;
+
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi, frame_params);
+
+  // Allocate bits to each of the frames in the GF group.
+  // TODO(sarahparker) Extend this to work with pyramid structure.
+  for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) {
+    const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index];
+    if (cpi->oxcf.rc_mode == AOM_CBR) {
+      if (cur_update_type == KEY_FRAME) {
+        target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
+      } else {
+        target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type);
+      }
+    } else {
+      if (cur_update_type == KEY_FRAME) {
+        target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
+      } else {
+        target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type);
+      }
+    }
+    gf_group->bit_allocation[cur_index] = target;
+  }
+}
+
+static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position,
+                                            int active_max_gf_interval,
+                                            int use_alt_ref,
+                                            int is_final_pass) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  // Set the interval until the next gf.
+  // If forward keyframes are enabled, ensure the final gf group obeys the
+  // MIN_FWD_KF_INTERVAL.
+  if (cpi->oxcf.fwd_kf_enabled && use_alt_ref &&
+      ((twopass->stats_in - arf_position + rc->frames_to_key) <
+       twopass->stats_buf_ctx->stats_in_end) &&
+      cpi->rc.next_is_fwd_key) {
+    if (arf_position == rc->frames_to_key) {
+      rc->baseline_gf_interval = arf_position;
+      // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
+    } else if ((rc->frames_to_key - arf_position <
+                AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
+               (rc->frames_to_key != arf_position)) {
+      // if possible, merge the last two gf groups
+      if (rc->frames_to_key <= active_max_gf_interval) {
+        rc->baseline_gf_interval = rc->frames_to_key;
+        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+        // if merging the last two gf groups creates a group that is too long,
+        // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
+      } else {
+        rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+      }
+    } else {
+      rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending;
+    }
+  } else {
+    rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending;
+  }
+}
+
+// initialize GF_GROUP_STATS
+static void init_gf_stats(GF_GROUP_STATS *gf_stats) {
+  gf_stats->gf_group_err = 0.0;
+  gf_stats->gf_group_raw_error = 0.0;
+  gf_stats->gf_group_skip_pct = 0.0;
+  gf_stats->gf_group_inactive_zone_rows = 0.0;
+
+  gf_stats->mv_ratio_accumulator = 0.0;
+  gf_stats->decay_accumulator = 1.0;
+  gf_stats->zero_motion_accumulator = 1.0;
+  gf_stats->loop_decay_rate = 1.0;
+  gf_stats->last_loop_decay_rate = 1.0;
+  gf_stats->this_frame_mv_in_out = 0.0;
+  gf_stats->mv_in_out_accumulator = 0.0;
+  gf_stats->abs_mv_in_out_accumulator = 0.0;
+
+  gf_stats->avg_sr_coded_error = 0.0;
+  gf_stats->avg_tr_coded_error = 0.0;
+  gf_stats->avg_pcnt_second_ref = 0.0;
+  gf_stats->avg_pcnt_third_ref = 0.0;
+  gf_stats->avg_pcnt_third_ref_nolast = 0.0;
+  gf_stats->avg_new_mv_count = 0.0;
+  gf_stats->avg_wavelet_energy = 0.0;
+  gf_stats->avg_raw_err_stdev = 0.0;
+  gf_stats->non_zero_stdev_count = 0;
+
+  gf_stats->allow_alt_ref = 0;
+}
 
 // Analyse and define a gf/arf group.
+#define MAX_GF_BOOST 5400
 static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
-                            const EncodeFrameParams *const frame_params) {
+                            const EncodeFrameParams *const frame_params,
+                            int max_gop_length, int is_final_pass) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   AV1EncoderConfig *const oxcf = &cpi->oxcf;
   TWO_PASS *const twopass = &cpi->twopass;
   FIRSTPASS_STATS next_frame;
   const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+  GF_GROUP *gf_group = &cpi->gf_group;
+  FRAME_INFO *frame_info = &cpi->frame_info;
   int i;
 
-  double boost_score = 0.0;
-  double gf_group_err = 0.0;
-#if GROUP_ADAPTIVE_MAXQ
-  double gf_group_raw_error = 0.0;
-#endif
-  double gf_group_skip_pct = 0.0;
-  double gf_group_inactive_zone_rows = 0.0;
-  double gf_first_frame_err = 0.0;
-  double mod_frame_err = 0.0;
-
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double zero_motion_accumulator = 1.0;
-
-  double loop_decay_rate = 1.00;
-  double last_loop_decay_rate = 1.00;
-
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
-
-  unsigned int allow_alt_ref = is_altref_enabled(cpi);
-
-  int f_boost = 0;
-  int b_boost = 0;
   int flash_detected;
   int64_t gf_group_bits;
-  double gf_group_error_left;
-  int gf_arf_bits;
   const int is_intra_only = frame_params->frame_type == KEY_FRAME ||
                             frame_params->frame_type == INTRA_ONLY_FRAME;
   const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active;
@@ -778,144 +1571,140 @@
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
   if (!is_intra_only) {
-    av1_zero(twopass->gf_group);
+    av1_zero(cpi->gf_group);
   }
 
   aom_clear_system_state();
   av1_zero(next_frame);
 
+  if (has_no_stats_stage(cpi)) {
+    define_gf_group_pass0(cpi, frame_params);
+    return;
+  }
+
+  // correct frames_to_key when lookahead queue is emptying
+  if (cpi->lap_enabled) {
+    correct_frames_to_key(cpi);
+  }
+
+  GF_GROUP_STATS gf_stats;
+  init_gf_stats(&gf_stats);
+  GF_FRAME_STATS first_frame_stats, last_frame_stats;
+
+  gf_stats.allow_alt_ref = is_altref_enabled(cpi);
+  const int can_disable_arf = (oxcf->gf_min_pyr_height == MIN_PYRAMID_LVL);
+
   // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  double mod_frame_err =
+      calculate_modified_err(frame_info, twopass, oxcf, this_frame);
 
   // Note the error of the frame at the start of the group. This will be
   // the GF frame error if we code a normal gf.
-  gf_first_frame_err = mod_frame_err;
+  first_frame_stats.frame_err = mod_frame_err;
+  first_frame_stats.frame_coded_error = this_frame->coded_error;
+  first_frame_stats.frame_sr_coded_error = this_frame->sr_coded_error;
+  first_frame_stats.frame_tr_coded_error = this_frame->tr_coded_error;
 
   // If this is a key frame or the overlay from a previous arf then
   // the error score / cost of this frame has already been accounted for.
   if (arf_active_or_kf) {
-    gf_group_err -= gf_first_frame_err;
+    gf_stats.gf_group_err -= first_frame_stats.frame_err;
 #if GROUP_ADAPTIVE_MAXQ
-    gf_group_raw_error -= this_frame->coded_error;
+    gf_stats.gf_group_raw_error -= this_frame->coded_error;
 #endif
-    gf_group_skip_pct -= this_frame->intra_skip_pct;
-    gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+    gf_stats.gf_group_skip_pct -= this_frame->intra_skip_pct;
+    gf_stats.gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
   }
-  // Motion breakout threshold for loop below depends on image size.
-  const double mv_ratio_accumulator_thresh =
-      (cpi->initial_height + cpi->initial_width) / 4.0;
 
   // TODO(urvang): Try logic to vary min and max interval based on q.
   const int active_min_gf_interval = rc->min_gf_interval;
   const int active_max_gf_interval =
-      AOMMIN(rc->max_gf_interval, get_fixed_gf_length(oxcf->gf_max_pyr_height));
-
-  double avg_sr_coded_error = 0;
-  double avg_raw_err_stdev = 0;
-  int non_zero_stdev_count = 0;
+      AOMMIN(rc->max_gf_interval, max_gop_length);
 
   i = 0;
-  while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+  // get the determined gf group length from rc->gf_intervals
+  while (i < rc->gf_intervals[rc->cur_gf_index]) {
     ++i;
-
     // Accumulate error score of frames in this gf group.
-    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
-    gf_group_err += mod_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
-    gf_group_raw_error += this_frame->coded_error;
-#endif
-    gf_group_skip_pct += this_frame->intra_skip_pct;
-    gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+    mod_frame_err =
+        calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+    // accumulate stats for this frame
+    accumulate_this_frame_stats(this_frame, mod_frame_err, &gf_stats);
 
+    // read in the next frame
     if (EOF == input_stats(twopass, &next_frame)) break;
 
     // Test for the case where there is a brief flash but the prediction
     // quality back to an earlier frame is then restored.
     flash_detected = detect_flash(twopass, 0);
 
-    // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(
-        &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
-        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-    // sum up the metric values of current gf group
-    avg_sr_coded_error += next_frame.sr_coded_error;
-    if (fabs(next_frame.raw_error_stdev) > 0.000001) {
-      non_zero_stdev_count++;
-      avg_raw_err_stdev += next_frame.raw_error_stdev;
-    }
+    // accumulate stats for next frame
+    accumulate_next_frame_stats(
+        &next_frame, frame_info, twopass, flash_detected, rc->frames_since_key,
+        i, can_disable_arf, rc->min_gf_interval, &gf_stats);
 
-    // Accumulate the effect of prediction quality decay.
-    if (!flash_detected) {
-      last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-
-      // Monitor for static sections.
-      if ((rc->frames_since_key + i - 1) > 1) {
-        zero_motion_accumulator = AOMMIN(
-            zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
-      }
-
-      // Break clause to detect very still sections after motion. For example,
-      // a static image after a fade or other transition.
-      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
-                                     last_loop_decay_rate)) {
-        allow_alt_ref = 0;
-        break;
-      }
-    }
-
-    // Calculate a boost number for this frame.
-    boost_score +=
-        decay_accumulator *
-        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-    // If almost totally static, we will not use the the max GF length later,
-    // so we can continue for more frames.
-    if ((i >= active_max_gf_interval + 1) &&
-        !is_almost_static(zero_motion_accumulator,
-                          twopass->kf_zeromotion_pct)) {
-      break;
-    }
-
-    // Some conditions to breakout after min interval.
-    if (i >= active_min_gf_interval &&
-        // If possible don't break very close to a kf
-        (rc->frames_to_key - i >= rc->min_gf_interval) && (i & 0x01) &&
-        !flash_detected &&
-        (mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
-         abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
-      break;
-    }
     *this_frame = next_frame;
   }
+  // save the errors for the last frame
+  last_frame_stats.frame_coded_error = next_frame.coded_error;
+  last_frame_stats.frame_sr_coded_error = next_frame.sr_coded_error;
+  last_frame_stats.frame_tr_coded_error = next_frame.tr_coded_error;
+
+  if (is_final_pass) {
+    rc->intervals_till_gf_calculate_due--;
+    rc->cur_gf_index++;
+  }
 
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : cpi->common.MBs;
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                          ? cpi->initial_mbs
+                          : cm->mi_params.MBs;
   assert(num_mbs > 0);
-  if (i) avg_sr_coded_error /= i;
 
-  if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
+  average_gf_stats(i, &next_frame, &gf_stats);
 
   // Disable internal ARFs for "still" gf groups.
   //   zero_motion_accumulator: minimum percentage of (0,0) motion;
   //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
   //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
   //                            motion error per block of each frame.
-  if (zero_motion_accumulator > MIN_ZERO_MOTION &&
-      avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
-      avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
+  const int can_disable_internal_arfs =
+      (oxcf->gf_min_pyr_height <= MIN_PYRAMID_LVL + 1);
+  if (can_disable_internal_arfs &&
+      gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
+      gf_stats.avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+      gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
     cpi->internal_altref_allowed = 0;
   }
 
-  const int use_alt_ref =
-      !is_almost_static(zero_motion_accumulator, twopass->kf_zeromotion_pct) &&
-      allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
-      (i >= rc->min_gf_interval) &&
-      (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+  int use_alt_ref;
+  if (can_disable_arf) {
+    use_alt_ref = !is_almost_static(gf_stats.zero_motion_accumulator,
+                                    twopass->kf_zeromotion_pct) &&
+                  gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+                  (i >= MIN_GF_INTERVAL) &&
+                  (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+
+    // TODO(urvang): Improve and use model for VBR, CQ etc as well.
+    if (use_alt_ref && cpi->oxcf.rc_mode == AOM_Q &&
+        cpi->oxcf.cq_level <= 200) {
+      aom_clear_system_state();
+      float features[21];
+      get_features_from_gf_stats(
+          &gf_stats, &first_frame_stats, &last_frame_stats, num_mbs,
+          rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features);
+      // Infer using ML model.
+      float score;
+      av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score);
+      use_alt_ref = (score <= 0.0);
+    }
+  } else {
+    assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+    use_alt_ref =
+        gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i > 2);
+  }
 
 #define REDUCE_GF_LENGTH_THRESH 4
 #define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
@@ -950,49 +1739,50 @@
       if (i - roll_back >= active_min_gf_interval + 1) {
         alt_offset = -roll_back;
         i -= roll_back;
+        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
       }
     }
   }
 
   // Should we use the alternate reference frame.
   if (use_alt_ref) {
-    // Calculate the boost for alt ref.
-    rc->gfu_boost =
-        calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
     rc->source_alt_ref_pending = 1;
+    gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height;
+    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
+                             is_final_pass);
 
-    // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF
-    cpi->preserve_arf_as_gld = 1;
+    const int forward_frames = (rc->frames_to_key - i >= i - 1)
+                                   ? i - 1
+                                   : AOMMAX(0, rc->frames_to_key - i);
+
+    // Calculate the boost for alt ref.
+    rc->gfu_boost = av1_calc_arf_boost(
+        twopass, rc, frame_info, alt_offset, forward_frames, (i - 1),
+        cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
+        cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL);
   } else {
-    rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
+    reset_fpf_position(twopass, start_pos);
     rc->source_alt_ref_pending = 0;
-    cpi->preserve_arf_as_gld = 0;
+    gf_group->max_layer_depth_allowed = 0;
+    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
+                             is_final_pass);
+
+    rc->gfu_boost = AOMMIN(
+        MAX_GF_BOOST,
+        av1_calc_arf_boost(
+            twopass, rc, frame_info, alt_offset, (i - 1), 0,
+            cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
+            cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL));
   }
 
-  // Set the interval until the next gf.
-  // If forward keyframes are enabled, ensure the final gf group obeys the
-  // MIN_FWD_KF_INTERVAL.
-  if (cpi->oxcf.fwd_kf_enabled &&
-      ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
-    if (i == rc->frames_to_key) {
-      rc->baseline_gf_interval = i;
-      // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
-    } else if ((rc->frames_to_key - i <
-                AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
-               (rc->frames_to_key != i)) {
-      // if possible, merge the last two gf groups
-      if (rc->frames_to_key <= active_max_gf_interval) {
-        rc->baseline_gf_interval = rc->frames_to_key;
-        // if merging the last two gf groups creates a group that is too long,
-        // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
-      } else {
-        rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
-      }
-    } else {
-      rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
-    }
-  } else {
-    rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
+  // rc->gf_intervals assumes alt_ref is used, so one overlay frame is added
+  // to the next gf group. If no alt_ref is used, subtract 1 frame from the
+  // next gf group.
+  // TODO(bohanli): should incorporate the usage of alt_ref into
+  // calculate_gf_length
+  if (is_final_pass && rc->source_alt_ref_pending == 0 &&
+      rc->intervals_till_gf_calculate_due > 0) {
+    rc->gf_intervals[rc->cur_gf_index]--;
   }
 
 #define LAST_ALR_BOOST_FACTOR 0.2f
@@ -1011,7 +1801,8 @@
   reset_fpf_position(twopass, start_pos);
 
   // Calculate the bits to be allocated to the gf/arf group as a whole
-  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats.gf_group_err);
+  rc->gf_group_bits = gf_group_bits;
 
 #if GROUP_ADAPTIVE_MAXQ
   // Calculate an estimate of the maxq needed for the group.
@@ -1022,67 +1813,85 @@
   if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
     const int vbr_group_bits_per_frame =
         (int)(gf_group_bits / rc->baseline_gf_interval);
-    const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+    const double group_av_err =
+        gf_stats.gf_group_raw_error / rc->baseline_gf_interval;
     const double group_av_skip_pct =
-        gf_group_skip_pct / rc->baseline_gf_interval;
+        gf_stats.gf_group_skip_pct / rc->baseline_gf_interval;
     const double group_av_inactive_zone =
-        ((gf_group_inactive_zone_rows * 2) /
-         (rc->baseline_gf_interval * (double)cm->mb_rows));
+        ((gf_stats.gf_group_inactive_zone_rows * 2) /
+         (rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
 
     int tmp_q;
     // rc factor is a weight factor that corrects for local rate control drift.
     double rc_factor = 1.0;
-    if (rc->rate_error_estimate > 0) {
-      rc_factor = AOMMAX(RC_FACTOR_MIN,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    } else {
-      rc_factor = AOMMIN(RC_FACTOR_MAX,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
+    int64_t bits = cpi->oxcf.target_bandwidth;
+
+    if (bits > 0) {
+      int rate_error;
+
+      rate_error = (int)((rc->vbr_bits_off_target * 100) / bits);
+      rate_error = clamp(rate_error, -100, 100);
+      if (rate_error > 0) {
+        rc_factor = AOMMAX(RC_FACTOR_MIN, (double)(100 - rate_error) / 100.0);
+      } else {
+        rc_factor = AOMMIN(RC_FACTOR_MAX, (double)(100 - rate_error) / 100.0);
+      }
     }
+
     tmp_q = get_twopass_worst_quality(
         cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
-        vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
-    twopass->active_worst_quality =
-        AOMMAX(tmp_q, twopass->active_worst_quality >> 1);
+        vbr_group_bits_per_frame, rc_factor);
+    rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1);
   }
 #endif
 
-  // Calculate the extra bits to be used for boosted frame(s)
-  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
-                                     gf_group_bits);
-
   // Adjust KF group bits and error remaining.
-  twopass->kf_group_error_left -= (int64_t)gf_group_err;
-
-  // If this is an arf update we want to remove the score for the overlay
-  // frame at the end which will usually be very cheap to code.
-  // The overlay frame has already, in effect, been coded so we want to spread
-  // the remaining bits among the other frames.
-  // For normal GFs remove the score for the GF itself unless this is
-  // also a key frame in which case it has already been accounted for.
-  if (rc->source_alt_ref_pending) {
-    gf_group_error_left = gf_group_err - mod_frame_err;
-  } else if (!is_intra_only) {
-    gf_group_error_left = gf_group_err - gf_first_frame_err;
-  } else {
-    gf_group_error_left = gf_group_err;
-  }
+  if (is_final_pass)
+    twopass->kf_group_error_left -= (int64_t)gf_stats.gf_group_err;
 
   // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
   av1_gop_setup_structure(cpi, frame_params);
 
-  // Allocate bits to each of the frames in the GF group.
-  allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits,
-                         frame_params);
-
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
   // Calculate a section intra ratio used in setting max loop filter.
   if (frame_params->frame_type != KEY_FRAME) {
     twopass->section_intra_rating = calculate_section_intra_ratio(
-        start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
+        start_pos, twopass->stats_buf_ctx->stats_in_end,
+        rc->baseline_gf_interval);
   }
+
+  // Reset rolling actual and target bits counters for ARF groups.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+
+  av1_gop_bit_allocation(cpi, rc, gf_group,
+                         frame_params->frame_type == KEY_FRAME, use_alt_ref,
+                         gf_group_bits);
+}
+
+// #define FIXED_ARF_BITS
+#ifdef FIXED_ARF_BITS
+#define ARF_BITS_FRACTION 0.75
+#endif
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits) {
+  // Calculate the extra bits to be used for boosted frame(s)
+#ifdef FIXED_ARF_BITS
+  int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
+#else
+  int gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                         rc->gfu_boost, gf_group_bits);
+#endif
+
+  gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits,
+                                                   gf_group_bits, 1);
+
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(gf_group, rc, gf_group_bits, gf_arf_bits, is_key_frame,
+                         use_arf);
 }
 
 // Minimum % intra coding observed in first pass (1.0 = 100%)
@@ -1128,7 +1937,7 @@
                              const FIRSTPASS_STATS *last_frame,
                              const FIRSTPASS_STATS *this_frame,
                              const FIRSTPASS_STATS *next_frame,
-                             int frame_count_so_far) {
+                             int frame_count_so_far, enum aom_rc_mode rc_mode) {
   int is_viable_kf = 0;
   double pcnt_intra = 1.0 - this_frame->pcnt_inter;
   double modified_pcnt_inter =
@@ -1139,7 +1948,8 @@
   // Does the frame satisfy the primary criteria of a key frame?
   // See above for an explanation of the test criteria.
   // If so, then examine how well it predicts subsequent frames.
-  if ((this_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+  if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) &&
+      (this_frame->pcnt_second_ref < second_ref_usage_thresh) &&
       (next_frame->pcnt_second_ref < second_ref_usage_thresh) &&
       ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
        ((pcnt_intra > MIN_INTRA_LEVEL) &&
@@ -1164,7 +1974,7 @@
     double decay_accumulator = 1.0;
 
     // Examine how well the key frame predicts subsequent frames.
-    for (i = 0; i < 16; ++i) {
+    for (i = 0; i < SCENE_CUT_KEY_TEST_INTERVAL; ++i) {
       double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
                              DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
 
@@ -1213,78 +2023,99 @@
 #define FRAMES_TO_CHECK_DECAY 8
 #define KF_MIN_FRAME_BOOST 80.0
 #define KF_MAX_FRAME_BOOST 128.0
-#define MIN_KF_BOOST 300          // Minimum boost for non-static KF interval
+#define MIN_KF_BOOST 600  // Minimum boost for non-static KF interval
+#define MAX_KF_BOOST 3200
 #define MIN_STATIC_KF_BOOST 5400  // Minimum boost for static KF interval
 
-static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int i, j;
-  RATE_CONTROL *const rc = &cpi->rc;
+static int detect_app_forced_key(AV1_COMP *cpi) {
+  if (cpi->oxcf.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1;
+  int num_frames_to_app_forced_key = is_forced_keyframe_pending(
+      cpi->lookahead, cpi->lookahead->max_sz, cpi->compressor_stage);
+  if (num_frames_to_app_forced_key != -1) cpi->rc.next_is_fwd_key = 0;
+  return num_frames_to_app_forced_key;
+}
+
+static int get_projected_kf_boost(AV1_COMP *cpi) {
+  /*
+   * If num_stats_used_for_kf_boost >= frames_to_key, then
+   * all stats needed for prior boost calculation are available.
+   * Hence projecting the prior boost is not needed in this case.
+   */
+  if (cpi->rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key)
+    return cpi->rc.kf_boost;
+
+  // Get the current tpl factor (number of frames = frames_to_key).
+  double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key);
+  // Get the tpl factor when number of frames = num_stats_used_for_kf_boost.
+  double tpl_factor_num_stats =
+      av1_get_kf_boost_projection_factor(cpi->rc.num_stats_used_for_kf_boost);
+  int projected_kf_boost =
+      (int)rint((tpl_factor * cpi->rc.kf_boost) / tpl_factor_num_stats);
+  return projected_kf_boost;
+}
+
+static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
+                              double *kf_group_err,
+                              int num_frames_to_detect_scenecut) {
   TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
+  RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const FIRSTPASS_STATS first_frame = *this_frame;
-  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
-  FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS last_frame;
-  int kf_bits = 0;
-  int loop_decay_counter = 0;
-  double decay_accumulator = 1.0;
-  double av_decay_accumulator = 0.0;
-  double zero_motion_accumulator = 1.0;
-  double boost_score = 0.0;
-  double kf_mod_err = 0.0;
-  double kf_group_err = 0.0;
   double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+  FIRSTPASS_STATS last_frame;
+  double decay_accumulator = 1.0;
+  int i = 0, j;
+  int frames_to_key = 1;
+  int frames_since_key = rc->frames_since_key + 1;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  int num_stats_used_for_kf_boost = 1;
+  int scenecut_detected = 0;
 
-  av1_zero(next_frame);
+  int num_frames_to_next_key = detect_app_forced_key(cpi);
 
-  rc->frames_since_key = 0;
+  if (num_frames_to_detect_scenecut == 0) {
+    if (num_frames_to_next_key != -1)
+      return num_frames_to_next_key;
+    else
+      return rc->frames_to_key;
+  }
 
-  // Reset the GF group data structures.
-  av1_zero(*gf_group);
-
-  // Is this a forced key frame by interval.
-  rc->this_key_frame_forced = rc->next_key_frame_forced;
-
-  // Clear the alt ref active flag and last group multi arf flags as they
-  // can never be set for a key frame.
-  rc->source_alt_ref_active = 0;
-
-  // KF is always a GF so clear frames till next gf counter.
-  rc->frames_till_gf_update_due = 0;
-
-  rc->frames_to_key = 1;
-
-  twopass->kf_group_bits = 0;        // Total bits available to kf group
-  twopass->kf_group_error_left = 0;  // Group modified error score.
-
-  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  if (num_frames_to_next_key != -1)
+    num_frames_to_detect_scenecut =
+        AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key);
 
   // Initialize the decay rates for the recent frames to check
   for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
 
-  // Find the next keyframe.
   i = 0;
-  while (twopass->stats_in < twopass->stats_in_end &&
-         rc->frames_to_key < cpi->oxcf.key_freq) {
+  while (twopass->stats_in < twopass->stats_buf_ctx->stats_in_end &&
+         frames_to_key < num_frames_to_detect_scenecut) {
+    // Accumulate total number of stats available till next key frame
+    num_stats_used_for_kf_boost++;
+
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    if (kf_group_err != NULL)
+      *kf_group_err +=
+          calculate_modified_err(frame_info, twopass, oxcf, this_frame);
 
     // Load the next frame's stats.
     last_frame = *this_frame;
     input_stats(twopass, this_frame);
 
     // Provided that we are not at the end of the file...
-    if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+    if (cpi->rc.enable_scenecut_detection && cpi->oxcf.auto_key &&
+        twopass->stats_in < twopass->stats_buf_ctx->stats_in_end) {
       double loop_decay_rate;
 
       // Check for a scene cut.
       if (test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in,
-                            rc->frames_to_key))
+                            frames_since_key, oxcf->rc_mode)) {
+        scenecut_detected = 1;
         break;
+      }
 
       // How fast is the prediction quality decaying?
-      loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+      loop_decay_rate =
+          get_prediction_decay_rate(frame_info, twopass->stats_in);
 
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concerned with decay in prediction
@@ -1296,22 +2127,105 @@
 
       // Special check for transition or high motion followed by a
       // static scene.
-      if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
-                                     loop_decay_rate, decay_accumulator))
+      if (detect_transition_to_still(twopass, rc->min_gf_interval, i,
+                                     cpi->oxcf.key_freq - i, loop_decay_rate,
+                                     decay_accumulator)) {
+        scenecut_detected = 1;
         break;
+      }
 
       // Step on to the next frame.
-      ++rc->frames_to_key;
+      ++frames_to_key;
+      ++frames_since_key;
 
       // If we don't have a real key frame within the next two
       // key_freq intervals then break out of the loop.
-      if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+      if (frames_to_key >= 2 * cpi->oxcf.key_freq) break;
     } else {
-      ++rc->frames_to_key;
+      ++frames_to_key;
+      ++frames_since_key;
     }
     ++i;
   }
 
+  if (kf_group_err != NULL)
+    rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost;
+
+  if (cpi->lap_enabled && !scenecut_detected)
+    frames_to_key = num_frames_to_next_key;
+
+  return frames_to_key;
+}
+
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const FIRSTPASS_STATS first_frame = *this_frame;
+  FIRSTPASS_STATS next_frame;
+  av1_zero(next_frame);
+
+  rc->frames_since_key = 0;
+
+  // Reset the GF group data structures.
+  av1_zero(*gf_group);
+
+  // Clear the alt ref active flag and last group multi arf flags as they
+  // can never be set for a key frame.
+  rc->source_alt_ref_active = 0;
+
+  // KF is always a GF so clear frames till next gf counter.
+  rc->frames_till_gf_update_due = 0;
+
+  rc->frames_to_key = 1;
+
+  if (has_no_stats_stage(cpi)) {
+    int num_frames_to_app_forced_key = detect_app_forced_key(cpi);
+    rc->this_key_frame_forced =
+        current_frame->frame_number != 0 && rc->frames_to_key == 0;
+    if (num_frames_to_app_forced_key != -1)
+      rc->frames_to_key = num_frames_to_app_forced_key;
+    else
+      rc->frames_to_key = AOMMAX(1, cpi->oxcf.key_freq);
+    correct_frames_to_key(cpi);
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+    gf_group->update_type[0] = KF_UPDATE;
+    return;
+  }
+  int i;
+  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  int kf_bits = 0;
+  double zero_motion_accumulator = 1.0;
+  double boost_score = 0.0;
+  double kf_raw_err = 0.0;
+  double kf_mod_err = 0.0;
+  double kf_group_err = 0.0;
+  double sr_accumulator = 0.0;
+  int frames_to_key;
+  // Is this a forced key frame by interval.
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+  twopass->kf_group_bits = 0;        // Total bits available to kf group
+  twopass->kf_group_error_left = 0;  // Group modified error score.
+
+  kf_raw_err = this_frame->intra_error;
+  kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+
+  frames_to_key =
+      define_kf_interval(cpi, this_frame, &kf_group_err, oxcf->key_freq);
+
+  if (frames_to_key != -1)
+    rc->frames_to_key = AOMMIN(oxcf->key_freq, frames_to_key);
+  else
+    rc->frames_to_key = oxcf->key_freq;
+
+  if (cpi->lap_enabled) correct_frames_to_key(cpi);
+
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
   // This code centers the extra kf if the actual natural interval
@@ -1328,11 +2242,13 @@
 
     // Rescan to get the correct error data for the forced kf group.
     for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
-      input_stats(twopass, &tmp_frame);
+      kf_group_err +=
+          calculate_modified_err(frame_info, twopass, oxcf, &tmp_frame);
+      if (EOF == input_stats(twopass, &tmp_frame)) break;
     }
     rc->next_key_frame_forced = 1;
-  } else if (twopass->stats_in == twopass->stats_in_end ||
+  } else if ((twopass->stats_in == twopass->stats_buf_ctx->stats_in_end &&
+              is_stat_consumption_stage_twopass(cpi)) ||
              rc->frames_to_key >= cpi->oxcf.key_freq) {
     rc->next_key_frame_forced = 1;
   } else {
@@ -1340,9 +2256,10 @@
   }
 
   // Special case for the last key frame of the file.
-  if (twopass->stats_in >= twopass->stats_in_end) {
+  if (twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) {
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    kf_group_err +=
+        calculate_modified_err(frame_info, twopass, oxcf, this_frame);
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
@@ -1372,7 +2289,6 @@
 
   // Scan through the kf group collating various stats used to determine
   // how many bits to spend on it.
-  decay_accumulator = 1.0;
   boost_score = 0.0;
   const double kf_max_boost =
       cpi->oxcf.rc_mode == AOM_Q
@@ -1385,32 +2301,28 @@
     // Monitor for static sections.
     // For the first frame in kf group, the second ref indicator is invalid.
     if (i > 0) {
-      zero_motion_accumulator = AOMMIN(
-          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+      zero_motion_accumulator =
+          AOMMIN(zero_motion_accumulator,
+                 get_zero_motion_factor(frame_info, &next_frame));
     } else {
       zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion;
     }
 
     // Not all frames in the group are necessarily used in calculating boost.
-    if ((i <= rc->max_gf_interval) ||
-        ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
-      const double frame_boost =
-          calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
+    if ((sr_accumulator < (kf_raw_err * 1.50)) &&
+        (i <= rc->max_gf_interval * 2)) {
+      double frame_boost;
+      double zm_factor;
 
-      // How fast is prediction quality decaying.
-      if (!detect_flash(twopass, 0)) {
-        const double loop_decay_rate =
-            get_prediction_decay_rate(cpi, &next_frame);
-        decay_accumulator *= loop_decay_rate;
-        decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
-        av_decay_accumulator += decay_accumulator;
-        ++loop_decay_counter;
-      }
-      boost_score += (decay_accumulator * frame_boost);
+      // Factor 0.75-1.25 based on how much of frame is static.
+      zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
+
+      if (i < 2) sr_accumulator = 0.0;
+      frame_boost = calc_kf_frame_boost(rc, frame_info, &next_frame,
+                                        &sr_accumulator, kf_max_boost);
+      boost_score += frame_boost * zm_factor;
     }
   }
-  if (loop_decay_counter > 0)
-    av_decay_accumulator /= (double)loop_decay_counter;
 
   reset_fpf_position(twopass, start_position);
 
@@ -1419,9 +2331,13 @@
 
   // Calculate a section intra ratio used in setting max loop filter.
   twopass->section_intra_rating = calculate_section_intra_ratio(
-      start_position, twopass->stats_in_end, rc->frames_to_key);
+      start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key);
 
-  rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+  rc->kf_boost = (int)boost_score;
+
+  if (cpi->lap_enabled) {
+    rc->kf_boost = get_projected_kf_boost(cpi);
+  }
 
   // Special case for static / slide show content but don't apply
   // if the kf group is very short.
@@ -1432,6 +2348,9 @@
     // Apply various clamps for min and max boost
     rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
     rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+#ifdef STRICT_RC
+    rc->kf_boost = AOMMIN(rc->kf_boost, MAX_KF_BOOST);
+#endif
   }
 
   // Work out how many bits to allocate for the key frame itself.
@@ -1439,16 +2358,8 @@
                                  twopass->kf_group_bits);
   // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
   //        kf_bits, twopass->kf_zeromotion_pct);
-
-  // Work out the fraction of the kf group bits reserved for the inter frames
-  // within the group after discounting the bits for the kf itself.
-  if (twopass->kf_group_bits) {
-    twopass->kfgroup_inter_fraction =
-        (double)(twopass->kf_group_bits - kf_bits) /
-        (double)twopass->kf_group_bits;
-  } else {
-    twopass->kfgroup_inter_fraction = 1.0;
-  }
+  kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
+                                               twopass->kf_group_bits, 0);
 
   twopass->kf_group_bits -= kf_bits;
 
@@ -1466,6 +2377,7 @@
 }
 
 static int is_skippable_frame(const AV1_COMP *cpi) {
+  if (has_no_stats_stage(cpi)) return 0;
   // If the current frame does not have non-zero motion vector detected in the
   // first  pass, and so do its previous and forward frames, then this frame
   // can be skipped for partition check, and the partition size is assigned
@@ -1473,8 +2385,8 @@
   const TWO_PASS *const twopass = &cpi->twopass;
 
   return (!frame_is_intra_only(&cpi->common) &&
-          twopass->stats_in - 2 > twopass->stats_in_start &&
-          twopass->stats_in < twopass->stats_in_end &&
+          twopass->stats_in - 2 > twopass->stats_buf_ctx->stats_in_start &&
+          twopass->stats_in < twopass->stats_buf_ctx->stats_in_end &&
           (twopass->stats_in - 1)->pcnt_inter -
                   (twopass->stats_in - 1)->pcnt_motion ==
               1 &&
@@ -1490,70 +2402,45 @@
 #endif
 #define DEFAULT_GRP_WEIGHT 1.0
 
-void av1_get_second_pass_params(AV1_COMP *cpi,
-                                EncodeFrameParams *const frame_params,
-                                unsigned int frame_flags) {
+static void process_first_pass_stats(AV1_COMP *cpi,
+                                     FIRSTPASS_STATS *this_frame) {
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  int frames_left;
-  FIRSTPASS_STATS this_frame;
 
-  int target_rate;
-
-  frames_left = (int)(twopass->total_stats.count - current_frame->frame_number);
-
-  if (!twopass->stats_in) return;
-
-  // If this is an arf frame then we dont want to read the stats file or
-  // advance the input pointer as we already have what we need.
-  if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
-      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
-    target_rate = gf_group->bit_allocation[gf_group->index];
-    target_rate = av1_rc_clamp_pframe_target_size(
-        cpi, target_rate, gf_group->update_type[gf_group->index]);
-    rc->base_frame_target = target_rate;
-
-    if (cpi->no_show_kf) {
-      assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
-      frame_params->frame_type = KEY_FRAME;
-    } else {
-      frame_params->frame_type = INTER_FRAME;
+  if (cpi->oxcf.rc_mode != AOM_Q && current_frame->frame_number == 0 &&
+      cpi->twopass.stats_buf_ctx->total_stats &&
+      cpi->twopass.stats_buf_ctx->total_left_stats) {
+    if (cpi->lap_enabled) {
+      /*
+       * Accumulate total_stats using available limited number of stats,
+       * and assign it to total_left_stats.
+       */
+      *cpi->twopass.stats_buf_ctx->total_left_stats =
+          *cpi->twopass.stats_buf_ctx->total_stats;
     }
+    const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
+                                  current_frame->frame_number);
 
-    // Do the firstpass stats indicate that this frame is skippable for the
-    // partition search?
-    if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
-      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-    }
-
-    return;
-  }
-
-  aom_clear_system_state();
-
-  if (cpi->oxcf.rc_mode == AOM_Q) {
-    twopass->active_worst_quality = cpi->oxcf.cq_level;
-  } else if (current_frame->frame_number == 0) {
     // Special case code for first frame.
     const int section_target_bandwidth =
         (int)(twopass->bits_left / frames_left);
-    const double section_length = twopass->total_left_stats.count;
+    const double section_length =
+        twopass->stats_buf_ctx->total_left_stats->count;
     const double section_error =
-        twopass->total_left_stats.coded_error / section_length;
+        twopass->stats_buf_ctx->total_left_stats->coded_error / section_length;
     const double section_intra_skip =
-        twopass->total_left_stats.intra_skip_pct / section_length;
+        twopass->stats_buf_ctx->total_left_stats->intra_skip_pct /
+        section_length;
     const double section_inactive_zone =
-        (twopass->total_left_stats.inactive_zone_rows * 2) /
-        ((double)cm->mb_rows * section_length);
+        (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) /
+        ((double)cm->mi_params.mb_rows * section_length);
     const int tmp_q = get_twopass_worst_quality(
         cpi, section_error, section_intra_skip + section_inactive_zone,
         section_target_bandwidth, DEFAULT_GRP_WEIGHT);
 
-    twopass->active_worst_quality = tmp_q;
-    twopass->baseline_active_worst_quality = tmp_q;
+    rc->active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
     rc->last_q[INTER_FRAME] = tmp_q;
     rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
@@ -1562,14 +2449,104 @@
     rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
   }
 
-  av1_zero(this_frame);
-  if (EOF == input_stats(twopass, &this_frame)) return;
+  int err = 0;
+  if (cpi->lap_enabled) {
+    err = input_stats_lap(twopass, this_frame);
+  } else {
+    err = input_stats(twopass, this_frame);
+  }
+  if (err == EOF) return;
+
+  {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cm->mi_params.MBs;
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy = log((this_frame->intra_error / num_mbs) + 1.0);
+    twopass->frame_avg_haar_energy =
+        log((this_frame->frame_avg_wavelet_energy / num_mbs) + 1.0);
+  }
+
+  // Update the total stats remaining structure.
+  if (twopass->stats_buf_ctx->total_left_stats)
+    subtract_stats(twopass->stats_buf_ctx->total_left_stats, this_frame);
 
   // Set the frame content type flag.
-  if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+  if (this_frame->intra_skip_pct >= FC_ANIMATION_THRESH)
     twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
   else
     twopass->fr_content_type = FC_NORMAL;
+}
+
+static void setup_target_rate(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+
+  int target_rate = gf_group->bit_allocation[gf_group->index];
+
+  if (has_no_stats_stage(cpi)) {
+    av1_rc_set_frame_target(cpi, target_rate, cpi->common.width,
+                            cpi->common.height);
+  }
+
+  rc->base_frame_target = target_rate;
+}
+
+void av1_get_second_pass_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                const EncodeFrameInput *const frame_input,
+                                unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  AV1_COMMON *cm = &cpi->common;
+
+  if (frame_is_intra_only(cm)) {
+    FeatureFlags *const features = &cm->features;
+    av1_set_screen_content_options(cpi, features);
+    cpi->is_screen_content_type = features->allow_screen_content_tools;
+  }
+
+  if (is_stat_consumption_stage(cpi) && !twopass->stats_in) return;
+
+  if (rc->frames_till_gf_update_due > 0 && !(frame_flags & FRAMEFLAGS_KEY)) {
+    assert(gf_group->index < gf_group->size);
+    const int update_type = gf_group->update_type[gf_group->index];
+
+    setup_target_rate(cpi);
+
+    // If this is an arf frame then we dont want to read the stats file or
+    // advance the input pointer as we already have what we need.
+    if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
+      if (cpi->no_show_kf) {
+        assert(update_type == ARF_UPDATE);
+        frame_params->frame_type = KEY_FRAME;
+      } else {
+        frame_params->frame_type = INTER_FRAME;
+      }
+
+      // Do the firstpass stats indicate that this frame is skippable for the
+      // partition search?
+      if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+        cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+      }
+
+      return;
+    }
+  }
+
+  aom_clear_system_state();
+
+  if (cpi->oxcf.rc_mode == AOM_Q) rc->active_worst_quality = cpi->oxcf.cq_level;
+  FIRSTPASS_STATS this_frame;
+  av1_zero(this_frame);
+  // call above fn
+  if (is_stat_consumption_stage(cpi)) {
+    process_first_pass_stats(cpi, &this_frame);
+  } else {
+    rc->active_worst_quality = cpi->oxcf.cq_level;
+  }
 
   // Keyframe and section processing.
   if (rc->frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) {
@@ -1581,20 +2558,95 @@
     this_frame = this_frame_copy;
   } else {
     frame_params->frame_type = INTER_FRAME;
+    const int altref_enabled = is_altref_enabled(cpi);
+    const int sframe_dist = cpi->oxcf.sframe_dist;
+    const int sframe_mode = cpi->oxcf.sframe_mode;
+    const int sframe_enabled = cpi->oxcf.sframe_enabled;
+    const int update_type = gf_group->update_type[gf_group->index];
+    CurrentFrame *const current_frame = &cpi->common.current_frame;
+    if (sframe_enabled) {
+      if (altref_enabled) {
+        if (sframe_mode == 1) {
+          // sframe_mode == 1: insert sframe if it matches altref frame.
+          if (current_frame->frame_number % sframe_dist == 0 &&
+              current_frame->frame_number != 0 && update_type == ARF_UPDATE) {
+            frame_params->frame_type = S_FRAME;
+          }
+        } else {
+          // sframe_mode != 1: if sframe will be inserted at the next available
+          // altref frame
+          if (current_frame->frame_number % sframe_dist == 0 &&
+              current_frame->frame_number != 0) {
+            rc->sframe_due = 1;
+          }
+          if (rc->sframe_due && update_type == ARF_UPDATE) {
+            frame_params->frame_type = S_FRAME;
+            rc->sframe_due = 0;
+          }
+        }
+      } else {
+        if (current_frame->frame_number % sframe_dist == 0 &&
+            current_frame->frame_number != 0) {
+          frame_params->frame_type = S_FRAME;
+        }
+      }
+    }
   }
 
   // Define a new GF/ARF group. (Should always enter here for key frames).
   if (rc->frames_till_gf_update_due == 0) {
-    define_gf_group(cpi, &this_frame, frame_params);
+    assert(cpi->common.current_frame.frame_number == 0 ||
+           gf_group->index == gf_group->size);
+    const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+    int num_frames_to_detect_scenecut, frames_to_key;
+    if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection)
+      num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+    else
+      num_frames_to_detect_scenecut = 0;
+    frames_to_key = define_kf_interval(cpi, &this_frame, NULL,
+                                       num_frames_to_detect_scenecut);
+    reset_fpf_position(twopass, start_position);
+    if (frames_to_key != -1)
+      rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
 
+    int max_gop_length = (cpi->oxcf.lag_in_frames >= 32 &&
+                          is_stat_consumption_stage_twopass(cpi))
+                             ? MAX_GF_INTERVAL
+                             : MAX_GF_LENGTH_LAP;
+    if (rc->intervals_till_gf_calculate_due == 0) {
+      calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+    }
+
+    if (max_gop_length > 16) {
+      if (rc->gf_intervals[rc->cur_gf_index] - 1 > 16) {
+        // The calculate_gf_length function is previously used with
+        // max_gop_length = 32 with look-ahead gf intervals.
+        define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
+        if (!av1_tpl_setup_stats(cpi, 1, frame_params, frame_input)) {
+          // Tpl decides that a shorter gf interval is better.
+          // TODO(jingning): Remove redundant computations here.
+          max_gop_length = 16;
+          calculate_gf_length(cpi, max_gop_length, 1);
+        }
+      } else {
+        // Even based on 32 we still decide to use a short gf interval.
+        // Better to re-decide based on 16 then
+        max_gop_length = 16;
+        calculate_gf_length(cpi, max_gop_length, 1);
+      }
+    }
+    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    cpi->num_gf_group_show_frames = 0;
+    assert(gf_group->index == 0);
 
 #if ARF_STATS_OUTPUT
     {
       FILE *fpfile;
       fpfile = fopen("arf.stt", "a");
       ++arf_count;
-      fprintf(fpfile, "%10d %10d %10d %10d %10d\n", current_frame->frame_number,
+      fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
+              cpi->common.current_frame.frame_number,
               rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
               rc->gfu_boost);
 
@@ -1602,54 +2654,30 @@
     }
 #endif
   }
+  assert(gf_group->index < gf_group->size);
 
   // Do the firstpass stats indicate that this frame is skippable for the
   // partition search?
-  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+  if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
     cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
   }
 
-  target_rate = gf_group->bit_allocation[gf_group->index];
-
-  if (frame_params->frame_type == KEY_FRAME) {
-    target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
-  } else {
-    target_rate = av1_rc_clamp_pframe_target_size(
-        cpi, target_rate, gf_group->update_type[gf_group->index]);
-  }
-
-  rc->base_frame_target = target_rate;
-
-  {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    // The multiplication by 256 reverses a scaling factor of (>> 8)
-    // applied when combining MB error values for the frame.
-    twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
-    twopass->frame_avg_haar_energy =
-        log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
-  }
-
-  // Update the total stats remaining structure.
-  subtract_stats(&twopass->total_left_stats, &this_frame);
+  setup_target_rate(cpi);
 }
 
 void av1_init_second_pass(AV1_COMP *cpi) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   TWO_PASS *const twopass = &cpi->twopass;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
   double frame_rate;
   FIRSTPASS_STATS *stats;
 
-  av1_twopass_zero_stats(&twopass->total_stats);
-  av1_twopass_zero_stats(&twopass->total_left_stats);
+  if (!twopass->stats_buf_ctx->stats_in_end) return;
 
-  if (!twopass->stats_in_end) return;
+  stats = twopass->stats_buf_ctx->total_stats;
 
-  stats = &twopass->total_stats;
-
-  *stats = *twopass->stats_in_end;
-  twopass->total_left_stats = *stats;
+  *stats = *twopass->stats_buf_ctx->stats_in_end;
+  *twopass->stats_buf_ctx->total_left_stats = *stats;
 
   frame_rate = 10000000.0 * stats->count / stats->duration;
   // Each frame can have a different duration, as the frame rate in the source
@@ -1675,8 +2703,9 @@
         (avg_error * oxcf->two_pass_vbrmin_section) / 100;
     twopass->modified_error_max =
         (avg_error * oxcf->two_pass_vbrmax_section) / 100;
-    while (s < twopass->stats_in_end) {
-      modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+    while (s < twopass->stats_buf_ctx->stats_in_end) {
+      modified_error_total +=
+          calculate_modified_err(frame_info, twopass, oxcf, s);
       ++s;
     }
     twopass->modified_error_left = modified_error_total;
@@ -1691,6 +2720,44 @@
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
   twopass->last_kfgroup_zeromotion_pct = 100;
+
+  // Initialize bits per macro_block estimate correction factor.
+  twopass->bpm_factor = 1.0;
+  // Initialize actual and target bits counters for ARF groups so that
+  // at the start we have a neutral bpm adjustment.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+}
+
+void av1_init_single_pass_lap(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  twopass->bits_left = 0;
+  twopass->modified_error_min = 0.0;
+  twopass->modified_error_max = 0.0;
+  twopass->modified_error_left = 0.0;
+
+  // Reset the vbr bits off target counters
+  cpi->rc.vbr_bits_off_target = 0;
+  cpi->rc.vbr_bits_off_target_fast = 0;
+
+  cpi->rc.rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+
+  // Initialize bits per macro_block estimate correction factor.
+  twopass->bpm_factor = 1.0;
+  // Initialize actual and target bits counters for ARF groups so that
+  // at the start we have a neutral bpm adjustment.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
 }
 
 #define MINQ_ADJ_LIMIT 48
@@ -1709,6 +2776,10 @@
   rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
   twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
 
+  // Target vs actual bits for this arf group.
+  twopass->rolling_arf_group_target_bits += rc->this_frame_target;
+  twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
   // Calculate the pct rc error.
   if (rc->total_actual_bits) {
     rc->rate_error_estimate =
@@ -1718,6 +2789,44 @@
     rc->rate_error_estimate = 0;
   }
 
+  // Update the active best quality pyramid.
+  if (!rc->is_src_frame_alt_ref) {
+    const int pyramid_level = cpi->gf_group.layer_depth[cpi->gf_group.index];
+    int i;
+    for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
+      rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
+      // if (pyramid_level >= 2) {
+      //   rc->active_best_quality[pyramid_level] =
+      //     AOMMAX(rc->active_best_quality[pyramid_level],
+      //            cpi->common.base_qindex);
+      // }
+    }
+  }
+
+#if 0
+  {
+    AV1_COMMON *cm = &cpi->common;
+    FILE *fpfile;
+    fpfile = fopen("details.stt", "a");
+    fprintf(fpfile,
+            "%10d %10d %10d %10" PRId64 " %10" PRId64
+            " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n",
+            cm->current_frame.frame_number, rc->base_frame_target,
+            rc->projected_frame_size, rc->total_actual_bits,
+            rc->vbr_bits_off_target, rc->rate_error_estimate,
+            twopass->rolling_arf_group_target_bits,
+            twopass->rolling_arf_group_actual_bits,
+            (double)twopass->rolling_arf_group_actual_bits /
+                (double)twopass->rolling_arf_group_target_bits,
+            twopass->bpm_factor,
+            av1_convert_qindex_to_q(quant_params->base_qindex,
+                                    cm->seq_params.bit_depth),
+            av1_convert_qindex_to_q(rc->active_worst_quality,
+                                    cm->seq_params.bit_depth));
+    fclose(fpfile);
+  }
+#endif
+
   if (cpi->common.current_frame.frame_type != KEY_FRAME) {
     twopass->kf_group_bits -= bits_used;
     twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
@@ -1726,8 +2835,7 @@
 
   // If the rate control is drifting consider adjustment to min or maxq.
   if ((cpi->oxcf.rc_mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
-    const int maxq_adj_limit =
-        rc->worst_quality - twopass->active_worst_quality;
+    const int maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
     const int minq_adj_limit =
         (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
 
diff --git a/libaom/av1/encoder/pass2_strategy.h b/libaom/av1/encoder/pass2_strategy.h
index bf37746..437fb8f 100644
--- a/libaom/av1/encoder/pass2_strategy.h
+++ b/libaom/av1/encoder/pass2_strategy.h
@@ -18,15 +18,57 @@
 
 struct AV1_COMP;
 struct EncodeFrameParams;
+// structure of accumulated stats and features in a gf group
+typedef struct {
+  double gf_group_err;
+  double gf_group_raw_error;
+  double gf_group_skip_pct;
+  double gf_group_inactive_zone_rows;
+
+  double mv_ratio_accumulator;
+  double decay_accumulator;
+  double zero_motion_accumulator;
+  double loop_decay_rate;
+  double last_loop_decay_rate;
+  double this_frame_mv_in_out;
+  double mv_in_out_accumulator;
+  double abs_mv_in_out_accumulator;
+
+  double avg_sr_coded_error;
+  double avg_tr_coded_error;
+  double avg_pcnt_second_ref;
+  double avg_pcnt_third_ref;
+  double avg_pcnt_third_ref_nolast;
+  double avg_new_mv_count;
+  double avg_wavelet_energy;
+  double avg_raw_err_stdev;
+  int non_zero_stdev_count;
+
+  unsigned int allow_alt_ref;
+} GF_GROUP_STATS;
+
+typedef struct {
+  double frame_err;
+  double frame_coded_error;
+  double frame_sr_coded_error;
+  double frame_tr_coded_error;
+} GF_FRAME_STATS;
 
 void av1_init_second_pass(struct AV1_COMP *cpi);
 
+void av1_init_single_pass_lap(AV1_COMP *cpi);
+
 void av1_get_second_pass_params(struct AV1_COMP *cpi,
                                 struct EncodeFrameParams *const frame_params,
+                                const EncodeFrameInput *const frame_input,
                                 unsigned int frame_flags);
 
 void av1_twopass_postencode_update(struct AV1_COMP *cpi);
 
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/pickcdef.c b/libaom/av1/encoder/pickcdef.c
index fb07056..a1092fd 100644
--- a/libaom/av1/encoder/pickcdef.c
+++ b/libaom/av1/encoder/pickcdef.c
@@ -15,24 +15,46 @@
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/cdef.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
 #include "av1/encoder/encoder.h"
 
-#define REDUCED_PRI_STRENGTHS 8
-#define REDUCED_TOTAL_STRENGTHS (REDUCED_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+#define REDUCED_PRI_STRENGTHS_LVL1 8
+#define REDUCED_PRI_STRENGTHS_LVL2 5
+
+#define REDUCED_TOTAL_STRENGTHS_LVL1 \
+  (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL2 \
+  (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
 #define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
 
-static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 5, 7, 10, 13 };
+static const int priconv_lvl1[REDUCED_TOTAL_STRENGTHS_LVL1] = { 0, 1, 2,  3,
+                                                                5, 7, 10, 13 };
+static const int priconv_lvl2[REDUCED_TOTAL_STRENGTHS_LVL2] = { 0, 2, 4, 8,
+                                                                14 };
+static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
+  TOTAL_STRENGTHS, REDUCED_TOTAL_STRENGTHS_LVL1, REDUCED_TOTAL_STRENGTHS_LVL2,
+  TOTAL_STRENGTHS
+};
+
+// Get primary strength value for the given index and search method
+static INLINE int get_pri_strength(CDEF_PICK_METHOD pick_method, int pri_idx) {
+  switch (pick_method) {
+    case CDEF_FAST_SEARCH_LVL1: return priconv_lvl1[pri_idx];
+    case CDEF_FAST_SEARCH_LVL2: return priconv_lvl2[pri_idx];
+    default: assert(0 && "Invalid CDEF primary index"); return -1;
+  }
+}
 
 /* Search for the best strength to add as an option, knowing we
    already selected nb_strengths options. */
 static uint64_t search_one(int *lev, int nb_strengths,
                            uint64_t mse[][TOTAL_STRENGTHS], int sb_count,
-                           int fast) {
+                           CDEF_PICK_METHOD pick_method) {
   uint64_t tot_mse[TOTAL_STRENGTHS];
-  const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+  const int total_strengths = nb_cdef_strengths[pick_method];
   int i, j;
   uint64_t best_tot_mse = (uint64_t)1 << 63;
   int best_id = 0;
@@ -67,13 +89,13 @@
    already selected nb_strengths options. */
 static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
                                 uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
-                                int fast) {
+                                CDEF_PICK_METHOD pick_method) {
   uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
   int i, j;
   uint64_t best_tot_mse = (uint64_t)1 << 63;
   int best_id0 = 0;
   int best_id1 = 0;
-  const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+  const int total_strengths = nb_cdef_strengths[pick_method];
   memset(tot_mse, 0, sizeof(tot_mse));
   for (i = 0; i < sb_count; i++) {
     int gi;
@@ -116,13 +138,16 @@
 /* Search for the set of strengths that minimizes mse. */
 static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
                                       uint64_t mse[][TOTAL_STRENGTHS],
-                                      int sb_count, int fast) {
+                                      int sb_count,
+                                      CDEF_PICK_METHOD pick_method) {
   uint64_t best_tot_mse;
+  int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 ||
+              pick_method == CDEF_FAST_SEARCH_LVL2);
   int i;
   best_tot_mse = (uint64_t)1 << 63;
   /* Greedy search: add one strength options at a time. */
   for (i = 0; i < nb_strengths; i++) {
-    best_tot_mse = search_one(best_lev, i, mse, sb_count, fast);
+    best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method);
   }
   /* Trying to refine the greedy search by reconsidering each
      already-selected option. */
@@ -131,7 +156,7 @@
       int j;
       for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
       best_tot_mse =
-          search_one(best_lev, nb_strengths - 1, mse, sb_count, fast);
+          search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method);
     }
   }
   return best_tot_mse;
@@ -141,14 +166,15 @@
 static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
                                            int nb_strengths,
                                            uint64_t (**mse)[TOTAL_STRENGTHS],
-                                           int sb_count, int fast) {
+                                           int sb_count,
+                                           CDEF_PICK_METHOD pick_method) {
   uint64_t best_tot_mse;
   int i;
   best_tot_mse = (uint64_t)1 << 63;
   /* Greedy search: add one strength options at a time. */
   for (i = 0; i < nb_strengths; i++) {
     best_tot_mse =
-        search_one_dual(best_lev0, best_lev1, i, mse, sb_count, fast);
+        search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method);
   }
   /* Trying to refine the greedy search by reconsidering each
      already-selected option. */
@@ -159,58 +185,47 @@
       best_lev1[j] = best_lev1[j + 1];
     }
     best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse,
-                                   sb_count, fast);
+                                   sb_count, pick_method);
   }
   return best_tot_mse;
 }
 
-/* FIXME: SSE-optimize this. */
-static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src,
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const void *src,
+                          int src_voffset, int src_hoffset, int sstride,
+                          int vsize, int hsize);
+typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
+                                        cdef_list *dlist, int cdef_count,
+                                        BLOCK_SIZE bsize, int coeff_shift,
+                                        int row, int col);
+
+static void copy_sb16_16_highbd(uint16_t *dst, int dstride, const void *src,
+                                int src_voffset, int src_hoffset, int sstride,
+                                int vsize, int hsize) {
+  int r;
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR((uint8_t *)src);
+  const uint16_t *base = &src16[src_voffset * sstride + src_hoffset];
+  for (r = 0; r < vsize; r++)
+    memcpy(dst + r * dstride, base + r * sstride, hsize * sizeof(*base));
+}
+
+static void copy_sb16_16(uint16_t *dst, int dstride, const void *src,
                          int src_voffset, int src_hoffset, int sstride,
                          int vsize, int hsize) {
   int r, c;
-  const uint16_t *base = &src[src_voffset * sstride + src_hoffset];
-  for (r = 0; r < vsize; r++) {
-    for (c = 0; c < hsize; c++) {
-      dst[r * dstride + c] = base[r * sstride + c];
-    }
-  }
+  const uint8_t *src8 = (uint8_t *)src;
+  const uint8_t *base = &src8[src_voffset * sstride + src_hoffset];
+  for (r = 0; r < vsize; r++)
+    for (c = 0; c < hsize; c++)
+      dst[r * dstride + c] = (uint16_t)base[r * sstride + c];
 }
 
-static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
-                                      int sstride, int coeff_shift) {
-  uint64_t svar = 0;
-  uint64_t dvar = 0;
-  uint64_t sum_s = 0;
-  uint64_t sum_d = 0;
-  uint64_t sum_s2 = 0;
-  uint64_t sum_d2 = 0;
-  uint64_t sum_sd = 0;
-  int i, j;
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 8; j++) {
-      sum_s += src[i * sstride + j];
-      sum_d += dst[i * dstride + j];
-      sum_s2 += src[i * sstride + j] * src[i * sstride + j];
-      sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
-      sum_sd += src[i * sstride + j] * dst[i * dstride + j];
-    }
-  }
-  /* Compute the variance -- the calculation cannot go negative. */
-  svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
-  dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
-  return (uint64_t)floor(
-      .5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
-               (svar + dvar + (400 << 2 * coeff_shift)) /
-               (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
-}
-
-static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
-                                     int sstride) {
+static INLINE uint64_t mse_wxh_16bit_highbd(uint16_t *dst, int dstride,
+                                            uint16_t *src, int sstride, int w,
+                                            int h) {
   uint64_t sum = 0;
   int i, j;
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 8; j++) {
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
       int e = dst[i * dstride + j] - src[i * sstride + j];
       sum += e * e;
     }
@@ -218,310 +233,355 @@
   return sum;
 }
 
-static INLINE uint64_t mse_4x4_16bit(uint16_t *dst, int dstride, uint16_t *src,
-                                     int sstride) {
+static INLINE uint64_t mse_wxh_16bit(uint8_t *dst, int dstride, uint16_t *src,
+                                     int sstride, int w, int h) {
   uint64_t sum = 0;
   int i, j;
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      int e = dst[i * dstride + j] - src[i * sstride + j];
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
       sum += e * e;
     }
   }
   return sum;
 }
 
+static INLINE void init_src_params(int *src_stride, int *width, int *height,
+                                   int *width_log2, int *height_log2,
+                                   BLOCK_SIZE bsize) {
+  *src_stride = block_size_wide[bsize];
+  *width = block_size_wide[bsize];
+  *height = block_size_high[bsize];
+  *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
+  *height_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
+}
+
 /* Compute MSE only on the blocks we filtered. */
-uint64_t compute_cdef_dist(uint16_t *dst, int dstride, uint16_t *src,
-                           cdef_list *dlist, int cdef_count, BLOCK_SIZE bsize,
-                           int coeff_shift, int pli) {
+static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src,
+                                         cdef_list *dlist, int cdef_count,
+                                         BLOCK_SIZE bsize, int coeff_shift,
+                                         int row, int col) {
+  assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+         bsize == BLOCK_8X8);
   uint64_t sum = 0;
   int bi, bx, by;
-  if (bsize == BLOCK_8X8) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      if (pli == 0) {
-        sum += dist_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
-                              &src[bi << (3 + 3)], 8, coeff_shift);
-      } else {
-        sum += mse_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
-                             &src[bi << (3 + 3)], 8);
-      }
-    }
-  } else if (bsize == BLOCK_4X8) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      sum += mse_4x4_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
-                           &src[bi << (3 + 2)], 4);
-      sum += mse_4x4_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)], dstride,
-                           &src[(bi << (3 + 2)) + 4 * 4], 4);
-    }
-  } else if (bsize == BLOCK_8X4) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
-                           &src[bi << (2 + 3)], 8);
-      sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride,
-                           &src[(bi << (2 + 3)) + 4], 8);
-    }
-  } else {
-    assert(bsize == BLOCK_4X4);
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
-                           &src[bi << (2 + 2)], 4);
-    }
+  uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst);
+  uint16_t *dst_buff = &dst16[row * dstride + col];
+  int src_stride, width, height, width_log2, height_log2;
+  init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
+                  bsize);
+  for (bi = 0; bi < cdef_count; bi++) {
+    by = dlist[bi].by;
+    bx = dlist[bi].bx;
+    sum += mse_wxh_16bit_highbd(
+        &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
+        &src[bi << (height_log2 + width_log2)], src_stride, width, height);
   }
   return sum >> 2 * coeff_shift;
 }
 
-void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
-                     AV1_COMMON *cm, MACROBLOCKD *xd, int fast) {
+static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
+                                  cdef_list *dlist, int cdef_count,
+                                  BLOCK_SIZE bsize, int coeff_shift, int row,
+                                  int col) {
+  assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+         bsize == BLOCK_8X8);
+  uint64_t sum = 0;
+  int bi, bx, by;
+  uint8_t *dst8 = (uint8_t *)dst;
+  uint8_t *dst_buff = &dst8[row * dstride + col];
+  int src_stride, width, height, width_log2, height_log2;
+  init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
+                  bsize);
+  for (bi = 0; bi < cdef_count; bi++) {
+    by = dlist[bi].by;
+    bx = dlist[bi].bx;
+    sum += mse_wxh_16bit(
+        &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
+        &src[bi << (height_log2 + width_log2)], src_stride, width, height);
+  }
+  return sum >> 2 * coeff_shift;
+}
+
+static int sb_all_skip(const CommonModeInfoParams *const mi_params, int mi_row,
+                       int mi_col) {
+  const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
+  const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64);
+  const int stride = mi_params->mi_stride;
+  MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col;
+  for (int r = 0; r < maxr; ++r, mbmi += stride) {
+    for (int c = 0; c < maxc; ++c) {
+      if (!mbmi[c]->skip) return 0;
+    }
+  }
+  return 1;
+}
+
+static void pick_cdef_from_qp(AV1_COMMON *const cm) {
+  const int bd = cm->seq_params.bit_depth;
+  const int q =
+      av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8);
   CdefInfo *const cdef_info = &cm->cdef_info;
-  int r, c;
-  int fbr, fbc;
-  uint16_t *src[3];
-  uint16_t *ref_coeff[3];
-  static cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
+  cdef_info->cdef_bits = 0;
+  cdef_info->nb_cdef_strengths = 1;
+  cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6);
+
+  int predicted_y_f1 = 0;
+  int predicted_y_f2 = 0;
+  int predicted_uv_f1 = 0;
+  int predicted_uv_f2 = 0;
+  aom_clear_system_state();
+  if (!frame_is_intra_only(cm)) {
+    predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f +
+                                       q * 0.0068615186f + 0.02709886f),
+                           0, 15);
+    predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f +
+                                       q * 0.0013993345f + 0.03831067f),
+                           0, 3);
+    predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f +
+                                        q * 0.0034628846f + 0.00887099f),
+                            0, 15);
+    predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f +
+                                        q * 0.00028223585f + 0.05576307f),
+                            0, 3);
+  } else {
+    predicted_y_f1 = clamp(
+        (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f),
+        0, 15);
+    predicted_y_f2 = clamp(
+        (int)roundf(q * q * 0.0000029167343f + q * 0.0027798624f + 0.0079405f),
+        0, 3);
+    predicted_uv_f1 = clamp(
+        (int)roundf(q * q * -0.0000130790995f + q * 0.012892405f - 0.00748388f),
+        0, 15);
+    predicted_uv_f2 = clamp((int)roundf(q * q * 0.0000032651783f +
+                                        q * 0.00035520183f + 0.00228092f),
+                            0, 3);
+  }
+  cdef_info->cdef_strengths[0] =
+      predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2;
+  cdef_info->cdef_uv_strengths[0] =
+      predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2;
+
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  MB_MODE_INFO **mbmi = mi_params->mi_grid_base;
+  for (int r = 0; r < nvfb; ++r) {
+    for (int c = 0; c < nhfb; ++c) {
+      mbmi[MI_SIZE_64X64 * c]->cdef_strength = 0;
+    }
+    mbmi += MI_SIZE_64X64 * mi_params->mi_stride;
+  }
+}
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                     AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
+                     int rdmult) {
+  if (pick_method == CDEF_PICK_FROM_Q) {
+    pick_cdef_from_qp(cm);
+    return;
+  }
+
+  cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
   int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
   int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
-  int stride[3];
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
+  const int damping = 3 + (cm->quant_params.base_qindex >> 6);
+  const int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 ||
+                    pick_method == CDEF_FAST_SEARCH_LVL2);
+  const int total_strengths = nb_cdef_strengths[pick_method];
+  DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+  const int num_planes = av1_num_planes(cm);
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+                       num_planes);
+  uint64_t(*mse[2])[TOTAL_STRENGTHS];
+  mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
+  mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
+
   int bsize[3];
   int mi_wide_l2[3];
   int mi_high_l2[3];
   int xdec[3];
   int ydec[3];
-  int pli;
-  int cdef_count;
-  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
-  uint64_t best_tot_mse = (uint64_t)1 << 63;
-  uint64_t tot_mse;
-  int sb_count;
-  int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
-  int *selected_strength = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
-  uint64_t(*mse[2])[TOTAL_STRENGTHS];
-  int pri_damping = 3 + (cm->base_qindex >> 6);
-  int sec_damping = 3 + (cm->base_qindex >> 6);
-  int i;
-  int nb_strengths;
-  int nb_strength_bits;
-  int quantizer;
-  double lambda;
-  const int num_planes = av1_num_planes(cm);
-  const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
-  DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
-  uint16_t *in;
-  DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
-  quantizer = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth) >>
-              (cm->seq_params.bit_depth - 8);
-  lambda = .12 * quantizer * quantizer / 256.;
+  uint8_t *ref_buffer[3] = { ref->y_buffer, ref->u_buffer, ref->v_buffer };
+  int ref_stride[3] = { ref->y_stride, ref->uv_stride, ref->uv_stride };
 
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
-                       num_planes);
-  mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
-  mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
-  for (pli = 0; pli < num_planes; pli++) {
-    uint8_t *ref_buffer;
-    int ref_stride;
-    switch (pli) {
-      case 0:
-        ref_buffer = ref->y_buffer;
-        ref_stride = ref->y_stride;
-        break;
-      case 1:
-        ref_buffer = ref->u_buffer;
-        ref_stride = ref->uv_stride;
-        break;
-      case 2:
-        ref_buffer = ref->v_buffer;
-        ref_stride = ref->uv_stride;
-        break;
-    }
-    src[pli] = aom_memalign(
-        32, sizeof(*src) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
-    ref_coeff[pli] = aom_memalign(
-        32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
+  for (int pli = 0; pli < num_planes; pli++) {
     xdec[pli] = xd->plane[pli].subsampling_x;
     ydec[pli] = xd->plane[pli].subsampling_y;
     bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
                            : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
-    stride[pli] = cm->mi_cols << MI_SIZE_LOG2;
     mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
     mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
-
-    const int frame_height =
-        (cm->mi_rows * MI_SIZE) >> xd->plane[pli].subsampling_y;
-    const int frame_width =
-        (cm->mi_cols * MI_SIZE) >> xd->plane[pli].subsampling_x;
-
-    for (r = 0; r < frame_height; ++r) {
-      for (c = 0; c < frame_width; ++c) {
-        if (cm->seq_params.use_highbitdepth) {
-          src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
-              xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
-          ref_coeff[pli][r * stride[pli] + c] =
-              CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
-        } else {
-          src[pli][r * stride[pli] + c] =
-              xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
-          ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
-        }
-      }
-    }
   }
-  in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
-  sb_count = 0;
-  for (fbr = 0; fbr < nvfb; ++fbr) {
-    for (fbc = 0; fbc < nhfb; ++fbc) {
-      int nvb, nhb;
-      int gi;
-      int dirinit = 0;
-      nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
-      nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
-      int hb_step = 1;
-      int vb_step = 1;
-      BLOCK_SIZE bs = BLOCK_64X64;
-      MB_MODE_INFO *const mbmi =
-          cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
-                              MI_SIZE_64X64 * fbc];
+
+  copy_fn_t copy_fn;
+  compute_cdef_dist_t compute_cdef_dist_fn;
+
+  if (cm->seq_params.use_highbitdepth) {
+    copy_fn = copy_sb16_16_highbd;
+    compute_cdef_dist_fn = compute_cdef_dist_highbd;
+  } else {
+    copy_fn = copy_sb16_16;
+    compute_cdef_dist_fn = compute_cdef_dist;
+  }
+
+  DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
+  uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
+  const int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+  int sb_count = 0;
+  for (int fbr = 0; fbr < nvfb; ++fbr) {
+    for (int fbc = 0; fbc < nhfb; ++fbc) {
+      // No filtering if the entire filter block is skipped
+      if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
+        continue;
+
+      const MB_MODE_INFO *const mbmi =
+          mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                                  MI_SIZE_64X64 * fbc];
       if (((fbc & 1) &&
            (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) ||
           ((fbr & 1) &&
            (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128)))
         continue;
+
+      int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+      int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+      int hb_step = 1;
+      int vb_step = 1;
+      BLOCK_SIZE bs;
       if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 ||
-          mbmi->sb_type == BLOCK_64X128)
+          mbmi->sb_type == BLOCK_64X128) {
         bs = mbmi->sb_type;
-      if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
-        nhb = AOMMIN(MI_SIZE_128X128, cm->mi_cols - MI_SIZE_64X64 * fbc);
-        hb_step = 2;
+        if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+          nhb =
+              AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+          hb_step = 2;
+        }
+        if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+          nvb =
+              AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+          vb_step = 2;
+        }
+      } else {
+        bs = BLOCK_64X64;
       }
-      if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
-        nvb = AOMMIN(MI_SIZE_128X128, cm->mi_rows - MI_SIZE_64X64 * fbr);
-        vb_step = 2;
-      }
-      // No filtering if the entire filter block is skipped
-      if (sb_all_skip(cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) continue;
-      cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
-                                        fbc * MI_SIZE_64X64, dlist, bs);
-      for (pli = 0; pli < num_planes; pli++) {
-        for (i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
-        for (gi = 0; gi < total_strengths; gi++) {
-          int threshold;
-          uint64_t curr_mse;
-          int sec_strength;
-          threshold = gi / CDEF_SEC_STRENGTHS;
-          if (fast) threshold = priconv[threshold];
-          /* We avoid filtering the pixels for which some of the pixels to
-             average
-             are outside the frame. We could change the filter instead, but it
-             would add special cases for any future vectorization. */
-          int yoff = CDEF_VBORDER * (fbr != 0);
-          int xoff = CDEF_HBORDER * (fbc != 0);
-          int ysize = (nvb << mi_high_l2[pli]) +
-                      CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
-          int xsize = (nhb << mi_wide_l2[pli]) +
-                      CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
-          sec_strength = gi % CDEF_SEC_STRENGTHS;
-          copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
-                       src[pli],
-                       (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
-                       (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
-                       stride[pli], ysize, xsize);
-          cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
-                         dir, &dirinit, var, pli, dlist, cdef_count, threshold,
-                         sec_strength + (sec_strength == 3), pri_damping,
-                         sec_damping, coeff_shift);
-          curr_mse = compute_cdef_dist(
-              ref_coeff[pli] +
-                  (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
-                  (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
-              stride[pli], tmp_dst, dlist, cdef_count, bsize[pli], coeff_shift,
-              pli);
+
+      const int cdef_count = av1_cdef_compute_sb_list(
+          mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
+
+      const int yoff = CDEF_VBORDER * (fbr != 0);
+      const int xoff = CDEF_HBORDER * (fbc != 0);
+      int dirinit = 0;
+      for (int pli = 0; pli < num_planes; pli++) {
+        for (int i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
+        /* We avoid filtering the pixels for which some of the pixels to
+           average are outside the frame. We could change the filter instead,
+           but it would add special cases for any future vectorization. */
+        const int ysize = (nvb << mi_high_l2[pli]) +
+                          CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
+        const int xsize = (nhb << mi_wide_l2[pli]) +
+                          CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
+        const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
+        const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
+        for (int gi = 0; gi < total_strengths; gi++) {
+          int pri_strength = gi / CDEF_SEC_STRENGTHS;
+          if (fast) pri_strength = get_pri_strength(pick_method, pri_strength);
+          const int sec_strength = gi % CDEF_SEC_STRENGTHS;
+          copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+                  xd->plane[pli].dst.buf, row - yoff, col - xoff,
+                  xd->plane[pli].dst.stride, ysize, xsize);
+          av1_cdef_filter_fb(
+              NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli], dir,
+              &dirinit, var, pli, dlist, cdef_count, pri_strength,
+              sec_strength + (sec_strength == 3), damping, coeff_shift);
+          const uint64_t curr_mse = compute_cdef_dist_fn(
+              ref_buffer[pli], ref_stride[pli], tmp_dst, dlist, cdef_count,
+              bsize[pli], coeff_shift, row, col);
           if (pli < 2)
             mse[pli][sb_count][gi] = curr_mse;
           else
             mse[1][sb_count][gi] += curr_mse;
-          sb_index[sb_count] =
-              MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc;
         }
       }
-      sb_count++;
+      sb_index[sb_count++] =
+          MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;
     }
   }
-  nb_strength_bits = 0;
+
   /* Search for different number of signalling bits. */
-  for (i = 0; i <= 3; i++) {
-    int j;
+  int nb_strength_bits = 0;
+  uint64_t best_rd = UINT64_MAX;
+  CdefInfo *const cdef_info = &cm->cdef_info;
+  for (int i = 0; i <= 3; i++) {
     int best_lev0[CDEF_MAX_STRENGTHS];
     int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
-    nb_strengths = 1 << i;
-    if (num_planes >= 3)
+    const int nb_strengths = 1 << i;
+    uint64_t tot_mse;
+    if (num_planes > 1) {
       tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
-                                           mse, sb_count, fast);
-    else
+                                           mse, sb_count, pick_method);
+    } else {
       tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count,
-                                      fast);
-    /* Count superblock signalling cost. */
-    tot_mse += (uint64_t)(sb_count * lambda * i);
-    /* Count header signalling cost. */
-    tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
-    if (tot_mse < best_tot_mse) {
-      best_tot_mse = tot_mse;
+                                      pick_method);
+    }
+
+    const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS *
+                                              (num_planes > 1 ? 2 : 1);
+    const int rate_cost = av1_cost_literal(total_bits);
+    const uint64_t dist = tot_mse * 16;
+    const uint64_t rd = RDCOST(rdmult, rate_cost, dist);
+    if (rd < best_rd) {
+      best_rd = rd;
       nb_strength_bits = i;
-      for (j = 0; j < 1 << nb_strength_bits; j++) {
-        cdef_info->cdef_strengths[j] = best_lev0[j];
-        cdef_info->cdef_uv_strengths[j] = best_lev1[j];
+      memcpy(cdef_info->cdef_strengths, best_lev0,
+             nb_strengths * sizeof(best_lev0[0]));
+      if (num_planes > 1) {
+        memcpy(cdef_info->cdef_uv_strengths, best_lev1,
+               nb_strengths * sizeof(best_lev1[0]));
       }
     }
   }
-  nb_strengths = 1 << nb_strength_bits;
 
   cdef_info->cdef_bits = nb_strength_bits;
-  cdef_info->nb_cdef_strengths = nb_strengths;
-  for (i = 0; i < sb_count; i++) {
-    int gi;
-    int best_gi;
-    uint64_t best_mse = (uint64_t)1 << 63;
-    best_gi = 0;
-    for (gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) {
+  cdef_info->nb_cdef_strengths = 1 << nb_strength_bits;
+  for (int i = 0; i < sb_count; i++) {
+    uint64_t best_mse = UINT64_MAX;
+    int best_gi = 0;
+    for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) {
       uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]];
-      if (num_planes >= 3) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]];
+      if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]];
       if (curr < best_mse) {
         best_gi = gi;
         best_mse = curr;
       }
     }
-    selected_strength[i] = best_gi;
-    cm->mi_grid_visible[sb_index[i]]->cdef_strength = best_gi;
+    mi_params->mi_grid_base[sb_index[i]]->cdef_strength = best_gi;
   }
 
   if (fast) {
-    for (int j = 0; j < nb_strengths; j++) {
-      cdef_info->cdef_strengths[j] =
-          priconv[cm->cdef_info.cdef_strengths[j] / CDEF_SEC_STRENGTHS] *
-              CDEF_SEC_STRENGTHS +
-          (cdef_info->cdef_strengths[j] % CDEF_SEC_STRENGTHS);
-      cdef_info->cdef_uv_strengths[j] =
-          priconv[cdef_info->cdef_uv_strengths[j] / CDEF_SEC_STRENGTHS] *
-              CDEF_SEC_STRENGTHS +
-          (cdef_info->cdef_uv_strengths[j] % CDEF_SEC_STRENGTHS);
+    for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
+      const int luma_strength = cdef_info->cdef_strengths[j];
+      const int chroma_strength = cdef_info->cdef_uv_strengths[j];
+      int pri_strength;
+      pri_strength =
+          get_pri_strength(pick_method, luma_strength / CDEF_SEC_STRENGTHS);
+      cdef_info->cdef_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
+                                     (luma_strength % CDEF_SEC_STRENGTHS);
+      pri_strength =
+          get_pri_strength(pick_method, chroma_strength / CDEF_SEC_STRENGTHS);
+      cdef_info->cdef_uv_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
+                                        (chroma_strength % CDEF_SEC_STRENGTHS);
     }
   }
-  cdef_info->cdef_pri_damping = pri_damping;
-  cdef_info->cdef_sec_damping = sec_damping;
+
+  cdef_info->cdef_damping = damping;
+
   aom_free(mse[0]);
   aom_free(mse[1]);
-  for (pli = 0; pli < num_planes; pli++) {
-    aom_free(src[pli]);
-    aom_free(ref_coeff[pli]);
-  }
   aom_free(sb_index);
-  aom_free(selected_strength);
 }
diff --git a/libaom/av1/encoder/picklpf.c b/libaom/av1/encoder/picklpf.c
index aca089c..17c9965 100644
--- a/libaom/av1/encoder/picklpf.c
+++ b/libaom/av1/encoder/picklpf.c
@@ -19,8 +19,8 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/av1_loopfilter.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/quant_common.h"
 
 #include "av1/encoder/av1_quantize.h"
@@ -38,7 +38,7 @@
 }
 
 int av1_get_max_filter_level(const AV1_COMP *cpi) {
-  if (cpi->oxcf.pass == 2) {
+  if (is_stat_consumption_stage_twopass(cpi)) {
     return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
                                                  : MAX_LOOP_FILTER;
   } else {
@@ -57,7 +57,7 @@
   if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1];
   if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0];
 
-  // set base filters for use of get_filter_level when in DELTA_Q_LF mode
+  // set base filters for use of av1_get_filter_level when in DELTA_LF mode
   switch (plane) {
     case 0:
       cm->lf.filter_level[0] = filter_level[0];
@@ -72,13 +72,13 @@
   if (cpi->num_workers > 1)
     av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
                              plane + 1, partial_frame,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                              0,
 #endif
                              cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
   else
     av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
                           0,
 #endif
                           plane, plane + 1, partial_frame);
@@ -142,11 +142,12 @@
     // Bias against raising loop filter in favor of lowering it.
     int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
-    if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+    if ((is_stat_consumption_stage_twopass(cpi)) &&
+        (cpi->twopass.section_intra_rating < 20))
       bias = (bias * cpi->twopass.section_intra_rating) / 20;
 
     // yx, bias less for large block size
-    if (cm->tx_mode != ONLY_4X4) bias >>= 1;
+    if (cm->features.tx_mode != ONLY_4X4) bias >>= 1;
 
     if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
@@ -212,7 +213,8 @@
   } else if (method >= LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
-    const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth);
+    const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
+                                   cm->seq_params.bit_depth);
     // based on tests result for rtc test set
     // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point
     const int strength_boost_q_treshold = 700;
@@ -262,12 +264,14 @@
     lf->filter_level[0] = lf->filter_level[1] =
         search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
                             last_frame_filter_level, NULL, 0, 2);
-    lf->filter_level[0] =
-        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                            last_frame_filter_level, NULL, 0, 0);
-    lf->filter_level[1] =
-        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                            last_frame_filter_level, NULL, 0, 1);
+    if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) {
+      lf->filter_level[0] =
+          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                              last_frame_filter_level, NULL, 0, 0);
+      lf->filter_level[1] =
+          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                              last_frame_filter_level, NULL, 0, 1);
+    }
 
     if (num_planes > 1) {
       lf->filter_level_u =
diff --git a/libaom/av1/encoder/pickrst.c b/libaom/av1/encoder/pickrst.c
index 1b4f26c..ccbe1cc 100644
--- a/libaom/av1/encoder/pickrst.c
+++ b/libaom/av1/encoder/pickrst.c
@@ -23,7 +23,7 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/restoration.h"
 
@@ -46,7 +46,16 @@
 // Working precision for Wiener filter coefficients
 #define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16)
 
-const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 };
+#define SGRPROJ_EP_GRP1_START_IDX 0
+#define SGRPROJ_EP_GRP1_END_IDX 9
+#define SGRPROJ_EP_GRP1_SEARCH_COUNT 4
+#define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2
+static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6,
+                                                                       9 };
+static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = {
+  { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 },
+  { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 }
+};
 
 typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
                                       const YV12_BUFFER_CONFIG *b);
@@ -54,13 +63,28 @@
                                            const YV12_BUFFER_CONFIG *b,
                                            int hstart, int width, int vstart,
                                            int height);
+typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+                                            int hstart, int width, int vstart,
+                                            int height);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 #define NUM_EXTRACTORS (3 * (1 + 1))
-
+#else
+#define NUM_EXTRACTORS 3
+#endif
 static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {
   aom_get_y_sse_part,        aom_get_u_sse_part,
-  aom_get_v_sse_part,        aom_highbd_get_y_sse_part,
-  aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part,
+  aom_get_v_sse_part,
+#if CONFIG_AV1_HIGHBITDEPTH
+  aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part,
+  aom_highbd_get_v_sse_part,
+#endif
+};
+static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = {
+  aom_get_y_var,        aom_get_u_var,        aom_get_v_var,
+#if CONFIG_AV1_HIGHBITDEPTH
+  aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var,
+#endif
 };
 
 static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
@@ -72,6 +96,14 @@
       limits->v_start, limits->v_end - limits->v_start);
 }
 
+static uint64_t var_restoration_unit(const RestorationTileLimits *limits,
+                                     const YV12_BUFFER_CONFIG *src, int plane,
+                                     int highbd) {
+  return var_part_extractors[3 * highbd + plane](
+      src, limits->h_start, limits->h_end - limits->h_start, limits->v_start,
+      limits->v_end - limits->v_start);
+}
+
 typedef struct {
   // The best coefficients for Wiener or Sgrproj restoration
   WienerInfo wiener;
@@ -83,6 +115,10 @@
   // The rtype to use for this unit given a frame rtype as
   // index. Indices: WIENER, SGRPROJ, SWITCHABLE.
   RestorationType best_rtype[RESTORE_TYPES - 1];
+
+  // This flag will be set based on the speed feature
+  // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning.
+  uint8_t skip_sgr_eval;
 } RestUnitSearchInfo;
 
 typedef struct {
@@ -116,22 +152,23 @@
   AV1PixelRect tile_rect;
 } RestSearchCtxt;
 
-static void rsc_on_tile(void *priv) {
+static AOM_INLINE void rsc_on_tile(void *priv) {
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   set_default_sgrproj(&rsc->sgrproj);
   set_default_wiener(&rsc->wiener);
   rsc->tile_stripe0 = 0;
 }
 
-static void reset_rsc(RestSearchCtxt *rsc) {
+static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) {
   rsc->sse = 0;
   rsc->bits = 0;
 }
 
-static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm,
-                     const MACROBLOCK *x, const SPEED_FEATURES *sf, int plane,
-                     RestUnitSearchInfo *rusi, YV12_BUFFER_CONFIG *dst,
-                     RestSearchCtxt *rsc) {
+static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
+                                const AV1_COMMON *cm, const MACROBLOCK *x,
+                                const SPEED_FEATURES *sf, int plane,
+                                RestUnitSearchInfo *rusi,
+                                YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) {
   rsc->src = src;
   rsc->dst = dst;
   rsc->cm = cm;
@@ -251,6 +288,7 @@
   return err;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
                                       int height, int src_stride,
                                       const uint8_t *dat8, int dat_stride,
@@ -324,6 +362,7 @@
   }
   return err;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
                                     int src_stride, const uint8_t *dat8,
@@ -332,16 +371,25 @@
                                     int32_t *flt1, int flt1_stride, int *xqd,
                                     const sgr_params_type *params) {
   int xq[2];
-  decode_xq(xqd, xq, params);
-  if (!use_highbitdepth) {
-    return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                      dat_stride, flt0, flt0_stride, flt1,
-                                      flt1_stride, xq, params);
-  } else {
+  av1_decode_xq(xqd, xq, params);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_highbitdepth) {
     return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8,
                                        dat_stride, flt0, flt0_stride, flt1,
                                        flt1_stride, xq, params);
+
+  } else {
+    return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                      dat_stride, flt0, flt0_stride, flt1,
+                                      flt1_stride, xq, params);
   }
+#else
+  (void)use_highbitdepth;
+  return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                    dat_stride, flt0, flt0_stride, flt1,
+                                    flt1_stride, xq, params);
+#endif
 }
 
 #define USE_SGRPROJ_REFINEMENT_SEARCH 1
@@ -413,60 +461,25 @@
     return (dividend + divisor / 2) / divisor;
 }
 
-static void get_proj_subspace(const uint8_t *src8, int width, int height,
-                              int src_stride, const uint8_t *dat8,
-                              int dat_stride, int use_highbitdepth,
-                              int32_t *flt0, int flt0_stride, int32_t *flt1,
-                              int flt1_stride, int *xq,
-                              const sgr_params_type *params) {
-  int i, j;
-  int64_t H[2][2] = { { 0, 0 }, { 0, 0 } };
-  int64_t C[2] = { 0, 0 };
+static AOM_INLINE void calc_proj_params_r0_r1_c(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
   const int size = width * height;
-
-  // Default values to be returned if the problem becomes ill-posed
-  xq[0] = 0;
-  xq[1] = 0;
-
-  if (!use_highbitdepth) {
-    const uint8_t *src = src8;
-    const uint8_t *dat = dat8;
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j < width; ++j) {
-        const int32_t u =
-            (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
-        const int32_t s =
-            (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
-        const int32_t f1 =
-            (params->r[0] > 0) ? (int32_t)flt0[i * flt0_stride + j] - u : 0;
-        const int32_t f2 =
-            (params->r[1] > 0) ? (int32_t)flt1[i * flt1_stride + j] - u : 0;
-        H[0][0] += (int64_t)f1 * f1;
-        H[1][1] += (int64_t)f2 * f2;
-        H[0][1] += (int64_t)f1 * f2;
-        C[0] += (int64_t)f1 * s;
-        C[1] += (int64_t)f2 * s;
-      }
-    }
-  } else {
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-    const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j < width; ++j) {
-        const int32_t u =
-            (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
-        const int32_t s =
-            (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
-        const int32_t f1 =
-            (params->r[0] > 0) ? (int32_t)flt0[i * flt0_stride + j] - u : 0;
-        const int32_t f2 =
-            (params->r[1] > 0) ? (int32_t)flt1[i * flt1_stride + j] - u : 0;
-        H[0][0] += (int64_t)f1 * f1;
-        H[1][1] += (int64_t)f2 * f2;
-        H[0][1] += (int64_t)f1 * f2;
-        C[0] += (int64_t)f1 * s;
-        C[1] += (int64_t)f2 * s;
-      }
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+      const int32_t s =
+          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+      const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+      H[0][0] += (int64_t)f1 * f1;
+      H[1][1] += (int64_t)f2 * f2;
+      H[0][1] += (int64_t)f1 * f2;
+      C[0] += (int64_t)f1 * s;
+      C[1] += (int64_t)f2 * s;
     }
   }
   H[0][0] /= size;
@@ -475,6 +488,196 @@
   H[1][0] = H[0][1];
   C[0] /= size;
   C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+      const int32_t s =
+          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+      const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+      H[0][0] += (int64_t)f1 * f1;
+      H[1][1] += (int64_t)f2 * f2;
+      H[0][1] += (int64_t)f1 * f2;
+      C[0] += (int64_t)f1 * s;
+      C[1] += (int64_t)f2 * s;
+    }
+  }
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width,
+                                             int height, int src_stride,
+                                             const uint8_t *dat8,
+                                             int dat_stride, int32_t *flt0,
+                                             int flt0_stride, int64_t H[2][2],
+                                             int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+      const int32_t s =
+          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+      H[0][0] += (int64_t)f1 * f1;
+      C[0] += (int64_t)f1 * s;
+    }
+  }
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_high_bd_c(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+      const int32_t s =
+          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+      H[0][0] += (int64_t)f1 * f1;
+      C[0] += (int64_t)f1 * s;
+    }
+  }
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width,
+                                             int height, int src_stride,
+                                             const uint8_t *dat8,
+                                             int dat_stride, int32_t *flt1,
+                                             int flt1_stride, int64_t H[2][2],
+                                             int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+      const int32_t s =
+          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+      H[1][1] += (int64_t)f2 * f2;
+      C[1] += (int64_t)f2 * s;
+    }
+  }
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_high_bd_c(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+      const int32_t s =
+          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+      H[1][1] += (int64_t)f2 * f2;
+      C[1] += (int64_t)f2 * s;
+    }
+  }
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// The function calls 3 subfunctions for the following cases :
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height,
+                            int src_stride, const uint8_t *dat8, int dat_stride,
+                            int32_t *flt0, int flt0_stride, int32_t *flt1,
+                            int flt1_stride, int64_t H[2][2], int64_t C[2],
+                            const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, flt1, flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride,
+                          flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride,
+                          flt1, flt1_stride, H, C);
+  }
+}
+
+static AOM_INLINE void av1_calc_proj_params_high_bd_c(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2],
+    const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, flt1,
+                                     flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8,
+                                  dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8,
+                                  dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
+static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width,
+                                         int height, int src_stride,
+                                         const uint8_t *dat8, int dat_stride,
+                                         int use_highbitdepth, int32_t *flt0,
+                                         int flt0_stride, int32_t *flt1,
+                                         int flt1_stride, int *xq,
+                                         const sgr_params_type *params) {
+  int64_t H[2][2] = { { 0, 0 }, { 0, 0 } };
+  int64_t C[2] = { 0, 0 };
+
+  // Default values to be returned if the problem becomes ill-posed
+  xq[0] = 0;
+  xq[1] = 0;
+
+  if (!use_highbitdepth) {
+    if ((width & 0x7) == 0) {
+      av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride,
+                           flt0, flt0_stride, flt1, flt1_stride, H, C, params);
+    } else {
+      av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, flt1, flt1_stride, H, C,
+                             params);
+    }
+  } else {
+    av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+                                   dat_stride, flt0, flt0_stride, flt1,
+                                   flt1_stride, H, C, params);
+  }
+
   if (params->r[0] == 0) {
     // H matrix is now only the scalar H[1][1]
     // C vector is now only the scalar C[1]
@@ -510,7 +713,8 @@
   }
 }
 
-static void encode_xq(int *xq, int *xqd, const sgr_params_type *params) {
+static AOM_INLINE void encode_xq(int *xq, int *xqd,
+                                 const sgr_params_type *params) {
   if (params->r[0] == 0) {
     xqd[0] = 0;
     xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1,
@@ -527,10 +731,11 @@
 }
 
 // Apply the self-guided filter across an entire restoration unit.
-static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width,
-                      int height, int dat_stride, int use_highbd, int bit_depth,
-                      int pu_width, int pu_height, int32_t *flt0, int32_t *flt1,
-                      int flt_stride) {
+static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8,
+                                 int width, int height, int dat_stride,
+                                 int use_highbd, int bit_depth, int pu_width,
+                                 int pu_height, int32_t *flt0, int32_t *flt1,
+                                 int flt_stride) {
   for (int i = 0; i < height; i += pu_height) {
     const int h = AOMMIN(pu_height, height - i);
     int32_t *flt0_row = flt0 + i * flt_stride;
@@ -549,13 +754,45 @@
   }
 }
 
+static AOM_INLINE void compute_sgrproj_err(
+    const uint8_t *dat8, const int width, const int height,
+    const int dat_stride, const uint8_t *src8, const int src_stride,
+    const int use_highbitdepth, const int bit_depth, const int pu_width,
+    const int pu_height, const int ep, int32_t *flt0, int32_t *flt1,
+    const int flt_stride, int *exqd, int64_t *err) {
+  int exq[2];
+  apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
+            pu_width, pu_height, flt0, flt1, flt_stride);
+  aom_clear_system_state();
+  const sgr_params_type *const params = &av1_sgr_params[ep];
+  get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
+                    use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
+                    params);
+  aom_clear_system_state();
+  encode_xq(exq, exqd, params);
+  *err = finer_search_pixel_proj_error(
+      src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+      flt_stride, flt1, flt_stride, 2, exqd, params);
+}
+
+static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err,
+                                      const int *exqd, int *bestxqd,
+                                      int *bestep, const int ep) {
+  if (*besterr == -1 || err < *besterr) {
+    *bestep = ep;
+    *besterr = err;
+    bestxqd[0] = exqd[0];
+    bestxqd[1] = exqd[1];
+  }
+}
+
 static SgrprojInfo search_selfguided_restoration(
     const uint8_t *dat8, int width, int height, int dat_stride,
     const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
-    int pu_width, int pu_height, int32_t *rstbuf) {
+    int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning) {
   int32_t *flt0 = rstbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
-  int ep, bestep = 0;
+  int ep, idx, bestep = 0;
   int64_t besterr = -1;
   int exqd[2], bestxqd[2] = { 0, 0 };
   int flt_stride = ((width + 7) & ~7) + 8;
@@ -563,26 +800,43 @@
          pu_width == RESTORATION_PROC_UNIT_SIZE);
   assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
          pu_height == RESTORATION_PROC_UNIT_SIZE);
-
-  for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
-    int exq[2];
-    apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
-              pu_width, pu_height, flt0, flt1, flt_stride);
-    aom_clear_system_state();
-    const sgr_params_type *const params = &sgr_params[ep];
-    get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
-                      use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
-                      params);
-    aom_clear_system_state();
-    encode_xq(exq, exqd, params);
-    int64_t err = finer_search_pixel_proj_error(
-        src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth,
-        flt0, flt_stride, flt1, flt_stride, 2, exqd, params);
-    if (besterr == -1 || err < besterr) {
-      bestep = ep;
-      besterr = err;
-      bestxqd[0] = exqd[0];
-      bestxqd[1] = exqd[1];
+  if (!enable_sgr_ep_pruning) {
+    for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+      int64_t err;
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+  } else {
+    // evaluate first four seed ep in first group
+    for (idx = 0; idx < SGRPROJ_EP_GRP1_SEARCH_COUNT; idx++) {
+      ep = sgproj_ep_grp1_seed[idx];
+      int64_t err;
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+    // evaluate left and right ep of winner in seed ep
+    int bestep_ref = bestep;
+    for (ep = bestep_ref - 1; ep < bestep_ref + 2; ep += 2) {
+      if (ep < SGRPROJ_EP_GRP1_START_IDX || ep > SGRPROJ_EP_GRP1_END_IDX)
+        continue;
+      int64_t err;
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+    // evaluate last two group
+    for (idx = 0; idx < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; idx++) {
+      ep = sgproj_ep_grp2_3[idx][bestep];
+      int64_t err;
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
   }
 
@@ -596,7 +850,7 @@
 static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
                               SgrprojInfo *ref_sgrproj_info) {
   int bits = SGRPROJ_PARAMS_BITS;
-  const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+  const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
   if (params->r[0] > 0)
     bits += aom_count_primitive_refsubexpfin(
         SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
@@ -610,10 +864,11 @@
   return bits;
 }
 
-static void search_sgrproj(const RestorationTileLimits *limits,
-                           const AV1PixelRect *tile, int rest_unit_idx,
-                           void *priv, int32_t *tmpbuf,
-                           RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
+                                      const AV1PixelRect *tile,
+                                      int rest_unit_idx, void *priv,
+                                      int32_t *tmpbuf,
+                                      RestorationLineBuffers *rlbs) {
   (void)rlbs;
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
@@ -623,6 +878,16 @@
   const int highbd = cm->seq_params.use_highbitdepth;
   const int bit_depth = cm->seq_params.bit_depth;
 
+  const int64_t bits_none = x->sgrproj_restore_cost[0];
+  // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
+  if (rusi->skip_sgr_eval) {
+    rsc->bits += bits_none;
+    rsc->sse += rusi->sse[RESTORE_NONE];
+    rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE;
+    rusi->sse[RESTORE_SGRPROJ] = INT64_MAX;
+    return;
+  }
+
   uint8_t *dgd_start =
       rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
   const uint8_t *src_start =
@@ -638,7 +903,7 @@
       dgd_start, limits->h_end - limits->h_start,
       limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
       rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
-      tmpbuf);
+      tmpbuf, rsc->sf->lpf_sf.enable_sgr_ep_pruning);
 
   RestorationUnitInfo rui;
   rui.restoration_type = RESTORE_SGRPROJ;
@@ -646,7 +911,6 @@
 
   rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui);
 
-  const int64_t bits_none = x->sgrproj_restore_cost[0];
   const int64_t bits_sgr = x->sgrproj_restore_cost[1] +
                            (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj)
                             << AV1_PROB_COST_SHIFT);
@@ -656,7 +920,8 @@
   double cost_sgr =
       RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]);
   if (rusi->sgrproj.ep < 10)
-    cost_sgr *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level);
+    cost_sgr *=
+        (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level);
 
   RestorationType rtype =
       (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
@@ -708,6 +973,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8,
                                 const uint8_t *src8, int h_start, int h_end,
                                 int v_start, int v_end, int dgd_stride,
@@ -761,6 +1027,7 @@
     }
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static INLINE int wrap_index(int i, int wiener_win) {
   const int wiener_halfwin1 = (wiener_win >> 1) + 1;
@@ -812,8 +1079,8 @@
 }
 
 // Fix vector b, update vector a
-static void update_a_sep_sym(int wiener_win, int64_t **Mc, int64_t **Hc,
-                             int32_t *a, int32_t *b) {
+static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
+                                        int64_t **Hc, int32_t *a, int32_t *b) {
   int i, j;
   int32_t S[WIENER_WIN];
   int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
@@ -868,8 +1135,8 @@
 }
 
 // Fix vector a, update vector b
-static void update_b_sep_sym(int wiener_win, int64_t **Mc, int64_t **Hc,
-                             int32_t *a, int32_t *b) {
+static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
+                                        int64_t **Hc, int32_t *a, int32_t *b) {
   int i, j;
   int32_t S[WIENER_WIN];
   int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
@@ -1001,7 +1268,8 @@
   return Score - iScore;
 }
 
-static void finalize_sym_filter(int wiener_win, int32_t *f, InterpKernel fi) {
+static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f,
+                                           InterpKernel fi) {
   int i;
   const int wiener_halfwin = (wiener_win >> 1);
 
@@ -1174,75 +1442,117 @@
       } while (1);
     }
   }
-// printf("err post = %"PRId64"\n", err);
+  // printf("err post = %"PRId64"\n", err);
 #endif  // USE_WIENER_REFINEMENT_SEARCH
   return err;
 }
 
-static void search_wiener(const RestorationTileLimits *limits,
-                          const AV1PixelRect *tile_rect, int rest_unit_idx,
-                          void *priv, int32_t *tmpbuf,
-                          RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
+                                     const AV1PixelRect *tile_rect,
+                                     int rest_unit_idx, void *priv,
+                                     int32_t *tmpbuf,
+                                     RestorationLineBuffers *rlbs) {
   (void)tmpbuf;
   (void)rlbs;
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
+  const MACROBLOCK *const x = rsc->x;
+  const int64_t bits_none = x->wiener_restore_cost[0];
+
+  // Skip Wiener search for low variance contents
+  if (rsc->sf->lpf_sf.prune_wiener_based_on_src_var) {
+    const int scale[3] = { 0, 1, 2 };
+    // Obtain the normalized Qscale
+    const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0,
+                                    rsc->cm->seq_params.bit_depth) >>
+                   3;
+    // Derive threshold as sqr(normalized Qscale) * scale / 16,
+    const uint64_t thresh =
+        (qs * qs * scale[rsc->sf->lpf_sf.prune_wiener_based_on_src_var]) >> 4;
+    const int highbd = rsc->cm->seq_params.use_highbitdepth;
+    const uint64_t src_var =
+        var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
+    // Do not perform Wiener search if source variance is lower than threshold
+    // or if the reconstruction error is zero
+    int prune_wiener = (src_var < thresh) || (rusi->sse[RESTORE_NONE] == 0);
+    if (prune_wiener) {
+      rsc->bits += bits_none;
+      rsc->sse += rusi->sse[RESTORE_NONE];
+      rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+      rusi->sse[RESTORE_WIENER] = INT64_MAX;
+      if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2)
+        rusi->skip_sgr_eval = 1;
+      return;
+    }
+  }
+
   const int wiener_win =
       (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
 
+  int reduced_wiener_win = wiener_win;
+  if (rsc->sf->lpf_sf.reduce_wiener_window_size) {
+    reduced_wiener_win =
+        (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA;
+  }
+
   int64_t M[WIENER_WIN2];
   int64_t H[WIENER_WIN2 * WIENER_WIN2];
   int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN];
 
+#if CONFIG_AV1_HIGHBITDEPTH
   const AV1_COMMON *const cm = rsc->cm;
   if (cm->seq_params.use_highbitdepth) {
-    av1_compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
-                             limits->h_start, limits->h_end, limits->v_start,
-                             limits->v_end, rsc->dgd_stride, rsc->src_stride, M,
-                             H, cm->seq_params.bit_depth);
+    av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
+                             rsc->src_buffer, limits->h_start, limits->h_end,
+                             limits->v_start, limits->v_end, rsc->dgd_stride,
+                             rsc->src_stride, M, H, cm->seq_params.bit_depth);
   } else {
-    av1_compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+    av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
                       limits->h_start, limits->h_end, limits->v_start,
                       limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
   }
+#else
+  av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+                    limits->h_start, limits->h_end, limits->v_start,
+                    limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
+#endif
 
-  const MACROBLOCK *const x = rsc->x;
-  const int64_t bits_none = x->wiener_restore_cost[0];
-
-  if (!wiener_decompose_sep_sym(wiener_win, M, H, vfilter, hfilter)) {
+  if (!wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter)) {
     rsc->bits += bits_none;
     rsc->sse += rusi->sse[RESTORE_NONE];
     rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
     rusi->sse[RESTORE_WIENER] = INT64_MAX;
+    if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
     return;
   }
 
   RestorationUnitInfo rui;
   memset(&rui, 0, sizeof(rui));
   rui.restoration_type = RESTORE_WIENER;
-  finalize_sym_filter(wiener_win, vfilter, rui.wiener_info.vfilter);
-  finalize_sym_filter(wiener_win, hfilter, rui.wiener_info.hfilter);
+  finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter);
+  finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter);
 
   // Filter score computes the value of the function x'*A*x - x'*b for the
   // learned filter and compares it against identity filer. If there is no
   // reduction in the function, the filter is reverted back to identity
-  if (compute_score(wiener_win, M, H, rui.wiener_info.vfilter,
+  if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter,
                     rui.wiener_info.hfilter) > 0) {
     rsc->bits += bits_none;
     rsc->sse += rusi->sse[RESTORE_NONE];
     rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
     rusi->sse[RESTORE_WIENER] = INT64_MAX;
+    if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
     return;
   }
 
   aom_clear_system_state();
 
-  rusi->sse[RESTORE_WIENER] =
-      finer_tile_search_wiener(rsc, limits, tile_rect, &rui, wiener_win);
+  rusi->sse[RESTORE_WIENER] = finer_tile_search_wiener(
+      rsc, limits, tile_rect, &rui, reduced_wiener_win);
   rusi->wiener = rui.wiener_info;
 
-  if (wiener_win != WIENER_WIN) {
+  if (reduced_wiener_win != WIENER_WIN) {
     assert(rui.wiener_info.vfilter[0] == 0 &&
            rui.wiener_info.vfilter[WIENER_WIN - 1] == 0);
     assert(rui.wiener_info.hfilter[0] == 0 &&
@@ -1263,15 +1573,24 @@
       (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
   rusi->best_rtype[RESTORE_WIENER - 1] = rtype;
 
+  // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and
+  // RESTORE_NONE or based on best_rtype
+  if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 1) {
+    rusi->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
+  } else if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) {
+    rusi->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
+  }
+
   rsc->sse += rusi->sse[rtype];
   rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none;
   if (cost_wiener < cost_none) rsc->wiener = rusi->wiener;
 }
 
-static void search_norestore(const RestorationTileLimits *limits,
-                             const AV1PixelRect *tile_rect, int rest_unit_idx,
-                             void *priv, int32_t *tmpbuf,
-                             RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_norestore(const RestorationTileLimits *limits,
+                                        const AV1PixelRect *tile_rect,
+                                        int rest_unit_idx, void *priv,
+                                        int32_t *tmpbuf,
+                                        RestorationLineBuffers *rlbs) {
   (void)tile_rect;
   (void)tmpbuf;
   (void)rlbs;
@@ -1286,10 +1605,11 @@
   rsc->sse += rusi->sse[RESTORE_NONE];
 }
 
-static void search_switchable(const RestorationTileLimits *limits,
-                              const AV1PixelRect *tile_rect, int rest_unit_idx,
-                              void *priv, int32_t *tmpbuf,
-                              RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_switchable(const RestorationTileLimits *limits,
+                                         const AV1PixelRect *tile_rect,
+                                         int rest_unit_idx, void *priv,
+                                         int32_t *tmpbuf,
+                                         RestorationLineBuffers *rlbs) {
   (void)limits;
   (void)tile_rect;
   (void)tmpbuf;
@@ -1332,7 +1652,8 @@
     const int64_t bits = x->switchable_restore_cost[r] + coeff_bits;
     double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse);
     if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
-      cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level);
+      cost *=
+          (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level);
     if (r == 0 || cost < best_cost) {
       best_cost = cost;
       best_bits = bits;
@@ -1348,9 +1669,9 @@
   if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj;
 }
 
-static void copy_unit_info(RestorationType frame_rtype,
-                           const RestUnitSearchInfo *rusi,
-                           RestorationUnitInfo *rui) {
+static AOM_INLINE void copy_unit_info(RestorationType frame_rtype,
+                                      const RestUnitSearchInfo *rusi,
+                                      RestorationUnitInfo *rui) {
   assert(frame_rtype > 0);
   rui->restoration_type = rusi->best_rtype[frame_rtype - 1];
   if (rui->restoration_type == RESTORE_WIENER)
@@ -1380,7 +1701,7 @@
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  assert(!cm->all_lossless);
+  assert(!cm->features.all_lossless);
 
   int ntiles[2];
   for (int is_uv = 0; is_uv < 2; ++is_uv)
@@ -1413,10 +1734,10 @@
     RestorationType best_rtype = RESTORE_NONE;
 
     const int highbd = rsc.cm->seq_params.use_highbitdepth;
-    if (!cpi->sf.disable_loop_restoration_chroma || !plane) {
-      extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
-                   rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
-                   highbd);
+    if (!cpi->sf.lpf_sf.disable_loop_restoration_chroma || !plane) {
+      av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
+                       rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
+                       highbd);
 
       for (RestorationType r = 0; r < num_rtypes; ++r) {
         if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
diff --git a/libaom/av1/encoder/pickrst.h b/libaom/av1/encoder/pickrst.h
index f34359c..eee3055 100644
--- a/libaom/av1/encoder/pickrst.h
+++ b/libaom/av1/encoder/pickrst.h
@@ -42,6 +42,7 @@
   return (uint8_t)avg;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start,
                                            int h_end, int v_start, int v_end,
                                            int stride) {
@@ -54,6 +55,7 @@
   uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start));
   return (uint16_t)avg;
 }
+#endif
 
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
 
diff --git a/libaom/av1/encoder/pustats.h b/libaom/av1/encoder/pustats.h
index 40dd467..2e87101 100644
--- a/libaom/av1/encoder/pustats.h
+++ b/libaom/av1/encoder/pustats.h
@@ -43,8 +43,8 @@
       -1.0533f, -0.3566f, 0.5294f,   -0.4335f,  0.1626f,
     };
 
-static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
-    {
+static const float
+    av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = {
       10.5266f, 5.3268f, -1.0678f, 7.7411f,  8.7164f,  -0.3235f,
       7.3028f,  9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
     };
@@ -72,8 +72,8 @@
       -2.7566f,
     };
 
-static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
-    {
+static const float
+    av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = {
       13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
       6.1715f,  0.5094f,  7.6433f,  -0.3992f, -1.3555f,
     };
@@ -124,8 +124,8 @@
       -0.0806f, 0.5231f,  0.3928f,  0.4146f,  2.0956f,
     };
 
-static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
-    {
+static const float
+    av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = {
       1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
       0.f,     1.1485f, 2.7085f,  -4.7897f, 1.4093f,  -1.657f,
     };
@@ -153,8 +153,8 @@
       -0.4164f,
     };
 
-static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
-    {
+static const float
+    av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = {
       -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
       2.7149f,  -2.5649f, 2.7765f, 2.9617f,  2.7684f,
     };
diff --git a/libaom/av1/encoder/ransac.c b/libaom/av1/encoder/ransac.c
index 6a8854c..07e1a5f 100644
--- a/libaom/av1/encoder/ransac.c
+++ b/libaom/av1/encoder/ransac.c
@@ -34,14 +34,13 @@
 typedef int (*FindTransformationFunc)(int points, double *points1,
                                       double *points2, double *params);
 typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points,
-                                        double *proj, const int n,
-                                        const int stride_points,
-                                        const int stride_proj);
+                                        double *proj, int n, int stride_points,
+                                        int stride_proj);
 
 static void project_points_double_translation(double *mat, double *points,
-                                              double *proj, const int n,
-                                              const int stride_points,
-                                              const int stride_proj) {
+                                              double *proj, int n,
+                                              int stride_points,
+                                              int stride_proj) {
   int i;
   for (i = 0; i < n; ++i) {
     const double x = *(points++), y = *(points++);
@@ -53,9 +52,8 @@
 }
 
 static void project_points_double_rotzoom(double *mat, double *points,
-                                          double *proj, const int n,
-                                          const int stride_points,
-                                          const int stride_proj) {
+                                          double *proj, int n,
+                                          int stride_points, int stride_proj) {
   int i;
   for (i = 0; i < n; ++i) {
     const double x = *(points++), y = *(points++);
@@ -67,9 +65,8 @@
 }
 
 static void project_points_double_affine(double *mat, double *points,
-                                         double *proj, const int n,
-                                         const int stride_points,
-                                         const int stride_proj) {
+                                         double *proj, int n, int stride_points,
+                                         int stride_proj) {
   int i;
   for (i = 0; i < n; ++i) {
     const double x = *(points++), y = *(points++);
@@ -265,8 +262,10 @@
 }
 
 static int find_affine(int np, double *pts1, double *pts2, double *mat) {
+  assert(np > 0);
   const int np2 = np * 2;
   double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 7 + 42));
+  if (a == NULL) return 1;
   double *b = a + np2 * 6;
   double *temp = b + np2;
   int i;
@@ -367,12 +366,12 @@
   motion->num_inliers = 0;
   motion->variance = kInfiniteVariance;
   memset(motion->inlier_indices, 0,
-         sizeof(*motion->inlier_indices * num_points));
+         sizeof(*motion->inlier_indices) * num_points);
 }
 
 static int ransac(const int *matched_points, int npoints,
-                  int *num_inliers_by_motion, double *params_by_motion,
-                  int num_desired_motions, const int minpts,
+                  int *num_inliers_by_motion, MotionModel *params_by_motion,
+                  int num_desired_motions, int minpts,
                   IsDegenerateFunc is_degenerate,
                   FindTransformationFunc find_transformation,
                   ProjectPointsDoubleFunc projectpoints) {
@@ -526,9 +525,13 @@
                              motions[i].num_inliers);
 
       find_transformation(motions[i].num_inliers, points1, points2,
-                          params_by_motion + (MAX_PARAMDIM - 1) * i);
+                          params_by_motion[i].params);
+
+      params_by_motion[i].num_inliers = motions[i].num_inliers;
+      memcpy(params_by_motion[i].inliers, motions[i].inlier_indices,
+             sizeof(*motions[i].inlier_indices) * npoints);
+      num_inliers_by_motion[i] = motions[i].num_inliers;
     }
-    num_inliers_by_motion[i] = motions[i].num_inliers;
   }
 
 finish_ransac:
@@ -548,8 +551,9 @@
 
 static int ransac_double_prec(const double *matched_points, int npoints,
                               int *num_inliers_by_motion,
-                              double *params_by_motion, int num_desired_motions,
-                              const int minpts, IsDegenerateFunc is_degenerate,
+                              MotionModel *params_by_motion,
+                              int num_desired_motions, int minpts,
+                              IsDegenerateFunc is_degenerate,
                               FindTransformationFunc find_transformation,
                               ProjectPointsDoubleFunc projectpoints) {
   int trial_count = 0;
@@ -702,7 +706,9 @@
                              motions[i].num_inliers);
 
       find_transformation(motions[i].num_inliers, points1, points2,
-                          params_by_motion + (MAX_PARAMDIM - 1) * i);
+                          params_by_motion[i].params);
+      memcpy(params_by_motion[i].inliers, motions[i].inlier_indices,
+             sizeof(*motions[i].inlier_indices) * npoints);
     }
     num_inliers_by_motion[i] = motions[i].num_inliers;
   }
@@ -737,55 +743,78 @@
   return is_collinear3(p, p + 2, p + 4);
 }
 
-int ransac_translation(int *matched_points, int npoints,
-                       int *num_inliers_by_motion, double *params_by_motion,
-                       int num_desired_motions) {
+static int ransac_translation(int *matched_points, int npoints,
+                              int *num_inliers_by_motion,
+                              MotionModel *params_by_motion,
+                              int num_desired_motions) {
   return ransac(matched_points, npoints, num_inliers_by_motion,
                 params_by_motion, num_desired_motions, 3,
                 is_degenerate_translation, find_translation,
                 project_points_double_translation);
 }
 
-int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
-                   double *params_by_motion, int num_desired_motions) {
+static int ransac_rotzoom(int *matched_points, int npoints,
+                          int *num_inliers_by_motion,
+                          MotionModel *params_by_motion,
+                          int num_desired_motions) {
   return ransac(matched_points, npoints, num_inliers_by_motion,
                 params_by_motion, num_desired_motions, 3, is_degenerate_affine,
                 find_rotzoom, project_points_double_rotzoom);
 }
 
-int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
-                  double *params_by_motion, int num_desired_motions) {
+static int ransac_affine(int *matched_points, int npoints,
+                         int *num_inliers_by_motion,
+                         MotionModel *params_by_motion,
+                         int num_desired_motions) {
   return ransac(matched_points, npoints, num_inliers_by_motion,
                 params_by_motion, num_desired_motions, 3, is_degenerate_affine,
                 find_affine, project_points_double_affine);
 }
 
-int ransac_translation_double_prec(double *matched_points, int npoints,
-                                   int *num_inliers_by_motion,
-                                   double *params_by_motion,
-                                   int num_desired_motions) {
+RansacFunc av1_get_ransac_type(TransformationType type) {
+  switch (type) {
+    case AFFINE: return ransac_affine;
+    case ROTZOOM: return ransac_rotzoom;
+    case TRANSLATION: return ransac_translation;
+    default: assert(0); return NULL;
+  }
+}
+
+static int ransac_translation_double_prec(double *matched_points, int npoints,
+                                          int *num_inliers_by_motion,
+                                          MotionModel *params_by_motion,
+                                          int num_desired_motions) {
   return ransac_double_prec(matched_points, npoints, num_inliers_by_motion,
                             params_by_motion, num_desired_motions, 3,
                             is_degenerate_translation, find_translation,
                             project_points_double_translation);
 }
 
-int ransac_rotzoom_double_prec(double *matched_points, int npoints,
-                               int *num_inliers_by_motion,
-                               double *params_by_motion,
-                               int num_desired_motions) {
+static int ransac_rotzoom_double_prec(double *matched_points, int npoints,
+                                      int *num_inliers_by_motion,
+                                      MotionModel *params_by_motion,
+                                      int num_desired_motions) {
   return ransac_double_prec(matched_points, npoints, num_inliers_by_motion,
                             params_by_motion, num_desired_motions, 3,
                             is_degenerate_affine, find_rotzoom,
                             project_points_double_rotzoom);
 }
 
-int ransac_affine_double_prec(double *matched_points, int npoints,
-                              int *num_inliers_by_motion,
-                              double *params_by_motion,
-                              int num_desired_motions) {
+static int ransac_affine_double_prec(double *matched_points, int npoints,
+                                     int *num_inliers_by_motion,
+                                     MotionModel *params_by_motion,
+                                     int num_desired_motions) {
   return ransac_double_prec(matched_points, npoints, num_inliers_by_motion,
                             params_by_motion, num_desired_motions, 3,
                             is_degenerate_affine, find_affine,
                             project_points_double_affine);
 }
+
+RansacFuncDouble av1_get_ransac_double_prec_type(TransformationType type) {
+  switch (type) {
+    case AFFINE: return ransac_affine_double_prec;
+    case ROTZOOM: return ransac_rotzoom_double_prec;
+    case TRANSLATION: return ransac_translation_double_prec;
+    default: assert(0); return NULL;
+  }
+}
diff --git a/libaom/av1/encoder/ransac.h b/libaom/av1/encoder/ransac.h
index b754bac..583d971 100644
--- a/libaom/av1/encoder/ransac.h
+++ b/libaom/av1/encoder/ransac.h
@@ -18,33 +18,14 @@
 #include <memory.h>
 
 #include "av1/common/warped_motion.h"
+#include "av1/encoder/global_motion.h"
 
 typedef int (*RansacFunc)(int *matched_points, int npoints,
-                          int *num_inliers_by_motion, double *params_by_motion,
-                          int num_motions);
+                          int *num_inliers_by_motion,
+                          MotionModel *params_by_motion, int num_motions);
 typedef int (*RansacFuncDouble)(double *matched_points, int npoints,
                                 int *num_inliers_by_motion,
-                                double *params_by_motion, int num_motions);
-
-/* Each of these functions fits a motion model from a set of
-   corresponding points in 2 frames using RANSAC. */
-int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
-                  double *params_by_motion, int num_motions);
-int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
-                   double *params_by_motion, int num_motions);
-int ransac_translation(int *matched_points, int npoints,
-                       int *num_inliers_by_motion, double *params_by_motion,
-                       int num_motions);
-int ransac_translation_double_prec(double *matched_points, int npoints,
-                                   int *num_inliers_by_motion,
-                                   double *params_by_motion,
-                                   int num_desired_motions);
-int ransac_rotzoom_double_prec(double *matched_points, int npoints,
-                               int *num_inliers_by_motion,
-                               double *params_by_motion,
-                               int num_desired_motions);
-int ransac_affine_double_prec(double *matched_points, int npoints,
-                              int *num_inliers_by_motion,
-                              double *params_by_motion,
-                              int num_desired_motions);
+                                MotionModel *params_by_motion, int num_motions);
+RansacFunc av1_get_ransac_type(TransformationType type);
+RansacFuncDouble av1_get_ransac_double_prec_type(TransformationType type);
 #endif  // AOM_AV1_ENCODER_RANSAC_H_
diff --git a/libaom/av1/encoder/rate_distortion_model_params.h b/libaom/av1/encoder/rate_distortion_model_params.h
deleted file mode 100644
index 7cd0962..0000000
--- a/libaom/av1/encoder/rate_distortion_model_params.h
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
-#define AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "av1/encoder/ml.h"
-
-// 22 float features +
-// 2 categorical features with 4 possible values, converted to one-hot vectors.
-// So, total 22 + 2 * 4 = 30 features.
-#define NUM_FEATURES 30
-#define NUM_HIDDEN_LAYERS 1
-#define NUM_HIDDEN_NODES 96
-#define NUM_OUTPUTS 1
-
-//------------------------------------------------------------------------------
-// RDCost model
-
-static const float
-    av1_rdcost_model_nn_weights_layer0[NUM_FEATURES * NUM_HIDDEN_NODES] = {
-      -0.0699f,   0.2790f,    0.1915f,    0.2669f,    0.4637f,    0.4095f,
-      0.2129f,    0.0634f,    0.2306f,    -0.2232f,   -0.5711f,   -0.6493f,
-      -0.7406f,   -0.8440f,   0.4105f,    0.1392f,    0.5218f,    -0.1618f,
-      -0.1719f,   0.3409f,    0.1111f,    -0.3609f,   -0.2929f,   0.3869f,
-      -0.5373f,   0.0700f,    0.2572f,    0.2483f,    -0.0314f,   0.5228f,
-      0.0169f,    -0.1357f,   0.0419f,    -0.1722f,   0.1303f,    0.1198f,
-      -0.0013f,   0.1309f,    0.0293f,    -0.1941f,   0.0668f,    -0.0643f,
-      -0.0381f,   0.1249f,    -0.0731f,   -0.1649f,   0.0964f,    0.0270f,
-      0.1354f,    0.0538f,    -0.2064f,   -0.2067f,   -0.0569f,   0.0449f,
-      0.1680f,    -0.0732f,   -0.0785f,   0.1884f,    -0.2137f,   -0.0189f,
-      0.2976f,    0.2818f,    -0.0222f,   0.2658f,    0.0488f,    0.2778f,
-      -0.1110f,   0.2069f,    -0.0072f,   -0.0095f,   -0.1105f,   -0.1365f,
-      -0.4245f,   -0.4751f,   -0.0736f,   0.2333f,    0.0653f,    -0.0249f,
-      0.0055f,    -0.0838f,   -0.0489f,   -0.2597f,   0.2621f,    -0.0251f,
-      -0.0545f,   0.0816f,    -0.0816f,   0.3396f,    -0.1047f,   0.3678f,
-      0.1487f,    -0.0270f,   0.2574f,    0.1018f,    0.2560f,    -0.0598f,
-      -0.0446f,   -0.1792f,   0.5336f,    -0.1590f,   -0.9820f,   -0.6514f,
-      -0.6304f,   -0.8359f,   -0.0699f,   0.0295f,    -0.0057f,   -0.3088f,
-      -0.1466f,   0.2220f,    -0.1980f,   -0.3400f,   -0.1228f,   0.2667f,
-      -0.4816f,   0.0155f,    -0.0194f,   0.2051f,    0.0513f,    0.1575f,
-      -121.4240f, -126.6840f, -124.1106f, -127.6184f, -85.0333f,  -26.6396f,
-      2.7020f,    102.0452f,  -85.5128f,  0.0076f,    122.2206f,  107.5265f,
-      108.3773f,  93.4847f,   20.3705f,   -89.6993f,  -176.9070f, -41.7543f,
-      -123.0293f, -91.6437f,  -205.7099f, -62.5346f,  -83.2987f,  21.3830f,
-      56.6341f,   -120.8647f, -127.7562f, -121.6688f, -127.4225f, -74.8045f,
-      -15.9247f,  -14.6468f,  -14.7788f,  -15.4498f,  -18.5514f,  -11.1579f,
-      -5.8164f,   -3.4318f,   0.8100f,    0.0642f,    203.5111f,  189.6872f,
-      190.4776f,  176.4784f,  -4.9427f,   -12.5324f,  -7.6861f,   21.9182f,
-      -6.7864f,   -7.1906f,   -8.1292f,   21.4780f,   -7.8016f,   -5.2653f,
-      61.8526f,   -15.5105f,  -14.6900f,  -14.1459f,  -15.4350f,  -19.1379f,
-      -0.7876f,   -1.8558f,   -4.6035f,   -6.8405f,   -0.2904f,   2.3202f,
-      1.8127f,    -2.9397f,   -0.8187f,   -0.6098f,   22.6173f,   10.3668f,
-      12.9363f,   2.4541f,    6.6700f,    0.3804f,    -3.3117f,   8.5464f,
-      -25.8348f,  1.8698f,    -9.5753f,   8.5558f,    -16.3576f,  7.2217f,
-      35.3115f,   -1.1447f,   -2.6530f,   -4.7027f,   -5.7024f,   -0.9513f,
-      0.8393f,    0.7085f,    0.7879f,    0.3728f,    3.0574f,    1.1360f,
-      26.0531f,   4.1084f,    -1.7340f,   0.1683f,    -450.7927f, -444.5818f,
-      -442.5239f, -438.1168f, 2.4924f,    -0.0147f,   -0.0797f,   -47.5322f,
-      -1.7638f,   -0.8608f,   -0.6500f,   -44.4326f,  -0.9027f,   2.5560f,
-      -267.6517f, 0.2642f,    0.9457f,    0.7944f,    0.3609f,    3.2742f,
-      -74.3400f,  -81.6894f,  -76.2162f,  -69.2979f,  -90.2476f,  -39.7389f,
-      2.2545f,    36.5095f,   -60.1129f,  -1.0383f,   87.0348f,   83.9940f,
-      83.7199f,   80.8609f,   14.9075f,   -78.7405f,  -74.3549f,  -4.2382f,
-      -23.9739f,  -91.8469f,  -67.2654f,  -21.5293f,  -9.9857f,   11.8391f,
-      35.8223f,   -74.2551f,  -81.0729f,  -73.8347f,  -70.3798f,  -86.8052f,
-      0.1701f,    -0.1136f,   0.0060f,    -0.0496f,   -0.1727f,   0.0195f,
-      -0.1040f,   0.1027f,    0.0467f,    -0.2538f,   -0.1322f,   0.0860f,
-      0.0093f,    -0.2801f,   -0.0958f,   0.0497f,    -0.0582f,   -0.0311f,
-      0.1840f,    0.0752f,    0.0282f,    0.0297f,    0.0607f,    0.0650f,
-      0.0893f,    0.1297f,    0.0373f,    0.0040f,    -0.0973f,   0.0248f,
-      -0.1419f,   0.0322f,    -0.0712f,   0.0860f,    -0.0426f,   -0.1989f,
-      0.1393f,    -0.1183f,   0.0735f,    -0.1895f,   0.1447f,    -0.0056f,
-      -0.1833f,   0.0884f,    0.0949f,    0.0476f,    0.0551f,    0.2125f,
-      -0.1537f,   -0.0141f,   -0.2182f,   0.1567f,    0.0457f,    -0.1485f,
-      -0.1177f,   0.0391f,    0.1982f,    -0.1288f,   0.1165f,    -0.2019f,
-      0.4550f,    0.5179f,    0.4311f,    0.1861f,    0.6199f,    0.4542f,
-      0.2034f,    0.1128f,    1.3489f,    -0.2525f,   -2.1139f,   -2.2444f,
-      -2.3679f,   -2.3378f,   0.5682f,    0.1348f,    0.3032f,    -1.5835f,
-      0.2883f,    0.1693f,    0.0439f,    -1.4556f,   0.3818f,    0.4875f,
-      -1.8899f,   0.2510f,    0.6450f,    0.6082f,    0.5962f,    0.8131f,
-      12.0281f,   13.3899f,   13.6249f,   15.8068f,   -1.5453f,   6.7456f,
-      -6.0877f,   26.2596f,   6.2223f,    -0.5922f,   134.1428f,  128.8985f,
-      128.7538f,  123.0920f,  1.3207f,    18.3069f,   15.7436f,   46.5230f,
-      24.7455f,   15.0688f,   19.9965f,   34.7236f,   19.7171f,   1.2018f,
-      49.7274f,   11.8957f,   13.1578f,   14.0451f,   15.3544f,   -3.5601f,
-      1.0048f,    0.9479f,    1.1832f,    2.0635f,    -2.9808f,   2.0803f,
-      -7.5815f,   8.4733f,    -4.2008f,   0.1217f,    226.5257f,  210.7018f,
-      211.6235f,  195.2605f,  0.8283f,    1.0977f,    1.4858f,    41.1242f,
-      1.5822f,    0.8742f,    2.0440f,    33.6213f,   1.6177f,    0.9661f,
-      65.0014f,   1.4197f,    1.0109f,    1.3153f,    1.5470f,    -3.2833f,
-      2.0858f,    2.0012f,    2.1088f,    2.5593f,    -0.9422f,   1.8554f,
-      -6.5378f,   0.6780f,    2.3186f,    0.0506f,    218.3285f,  203.4055f,
-      204.0362f,  188.7854f,  0.3701f,    2.5257f,    3.5172f,    28.8144f,
-      2.1511f,    3.4676f,    2.6337f,    28.5113f,   2.4254f,    -0.0548f,
-      59.4511f,   2.0757f,    2.1551f,    2.2271f,    2.5300f,    -1.4173f,
-      91.9240f,   88.2142f,   83.6155f,   82.2482f,   -9.2566f,   10.9654f,
-      -2.6974f,   62.6750f,   -3.6298f,   -0.1245f,   69.6721f,   67.1340f,
-      66.9162f,   64.1994f,   -83.6778f,  76.8107f,   69.7832f,   64.9261f,
-      68.4901f,   76.3615f,   70.8108f,   63.5435f,   69.1973f,   -83.6034f,
-      24.8275f,   90.1923f,   87.6831f,   82.9783f,   81.8558f,   -7.1010f,
-      95.1656f,   88.3853f,   80.5835f,   79.5990f,   -3.0720f,   8.1290f,
-      -0.6151f,   63.6425f,   -4.5833f,   -0.0063f,   70.1861f,   66.6250f,
-      66.6148f,   63.0886f,   -89.2863f,  74.7684f,   64.8897f,   60.4134f,
-      62.5241f,   78.7076f,   61.7234f,   60.1688f,   61.9509f,   -89.4098f,
-      30.3361f,   92.9144f,   88.5954f,   79.6336f,   79.2453f,   -0.4101f,
-      0.6287f,    0.8050f,    0.4417f,    0.5419f,    0.5972f,    1.3037f,
-      0.4316f,    -0.0013f,   -0.3673f,   -0.4952f,   6.1773f,    5.7825f,
-      6.1705f,    5.3848f,    1.7607f,    -0.0152f,   -0.2924f,   0.8199f,
-      1.3326f,    0.7197f,    -0.6332f,   1.1127f,    1.0472f,    1.8468f,
-      3.4419f,    0.8233f,    0.7175f,    0.8514f,    0.6372f,    0.9472f,
-      -0.0813f,   -0.0197f,   -0.0096f,   -0.2015f,   0.1133f,    -0.0305f,
-      0.0578f,    0.1375f,    -0.0750f,   -0.1702f,   0.1246f,    -0.1782f,
-      0.2017f,    0.0425f,    -0.0602f,   0.1837f,    0.1044f,    -0.1273f,
-      -0.1431f,   0.0672f,    -0.1807f,   -0.1045f,   -0.1355f,   -0.0497f,
-      -0.0561f,   -0.0633f,   0.1907f,    -0.0777f,   0.1203f,    0.0754f,
-      0.4079f,    0.2001f,    0.0558f,    0.0622f,    0.2951f,    0.6541f,
-      -0.0068f,   0.1070f,    0.4469f,    -0.1266f,   -1.3035f,   -1.3324f,
-      -1.3612f,   -0.9966f,   0.7986f,    0.3192f,    -0.5028f,   -0.3844f,
-      -0.4079f,   0.6690f,    -0.5109f,   -0.2719f,   -0.4958f,   1.0310f,
-      -0.8044f,   0.1447f,    0.4221f,    0.3194f,    0.3063f,    0.5520f,
-      0.4667f,    -5.7238f,   -0.5602f,   12.6339f,   -15.1865f,  -14.9035f,
-      -3.0726f,   9.5347f,    -24.6225f,  -2.7086f,   89.8557f,   95.0657f,
-      93.8693f,   99.1085f,   -35.9483f,  -18.0363f,  -1.6298f,   25.3484f,
-      39.3975f,   -15.3199f,  5.7664f,    17.2367f,   25.2788f,   -36.5648f,
-      29.1426f,   0.3857f,    -5.2117f,   0.0533f,    12.1707f,   -11.1735f,
-      0.2673f,    0.0090f,    0.1574f,    0.0904f,    0.0281f,    0.1144f,
-      0.1123f,    -0.0061f,   0.0954f,    -0.0094f,   -0.4387f,   -0.5006f,
-      -0.2560f,   -0.2326f,   -0.1769f,   0.0465f,    0.1273f,    -0.1627f,
-      0.2987f,    -0.3041f,   0.1131f,    -0.3620f,   0.0932f,    -0.0649f,
-      -0.4597f,   0.2535f,    -0.0994f,   0.1390f,    0.1279f,    0.4207f,
-      -39.1159f,  -42.6382f,  -38.4225f,  -31.2301f,  -28.2382f,  -28.1176f,
-      -9.5822f,   1.1886f,    -1.2964f,   -0.7908f,   154.9819f,  147.1914f,
-      147.0482f,  138.7535f,  -21.7014f,  -35.7117f,  -28.8802f,  -3.8968f,
-      -21.5007f,  -28.2213f,  -28.4878f,  -3.7558f,   -26.8317f,  -22.8491f,
-      50.9464f,   -37.0918f,  -42.8811f,  -39.3079f,  -32.1904f,  -26.6354f,
-      -72.5346f,  -75.5751f,  -72.6896f,  -71.3671f,  -35.3279f,  -21.6077f,
-      -5.8259f,   38.7516f,   -6.8012f,   0.0172f,    170.0685f,  157.4452f,
-      158.2334f,  145.0102f,  10.0653f,   -45.1775f,  -56.4571f,  -5.1165f,
-      -75.8980f,  -46.8672f,  -55.3642f,  -6.5631f,   -81.0258f,  10.1348f,
-      55.9786f,   -70.8124f,  -75.7040f,  -73.9831f,  -70.8786f,  -34.9723f,
-      88.6239f,   86.5330f,   80.9333f,   79.6833f,   -10.0096f,  10.6312f,
-      -4.2350f,   62.6230f,   -3.2991f,   -0.0843f,   75.8659f,   72.7886f,
-      72.5301f,   68.8265f,   -81.8276f,  70.3025f,   62.9511f,   62.5706f,
-      69.1842f,   69.3637f,   65.4820f,   65.4357f,   71.5347f,   -82.1064f,
-      24.1925f,   86.2418f,   85.4985f,   80.4091f,   79.5378f,   -9.3877f,
-      -7.6594f,   -4.9581f,   -10.6385f,  -20.2307f,  -44.2261f,  -13.7557f,
-      -4.5344f,   18.1793f,   -10.5522f,  -1.5878f,   110.3187f,  102.4945f,
-      102.3305f,  94.1324f,   -25.2665f,  9.8172f,    -4.4791f,   69.4972f,
-      -6.7571f,   5.8378f,    -11.6101f,  70.7066f,   -4.9327f,   -24.0513f,
-      41.4598f,   -7.0600f,   -7.0940f,   -10.2478f,  -18.9616f,  -46.7505f,
-      90.9365f,   86.0260f,   73.2934f,   69.3406f,   3.3863f,    3.8524f,
-      0.6536f,    63.2150f,   -10.6304f,  0.0291f,    73.0071f,   69.7660f,
-      69.0457f,   65.5611f,   -92.3379f,  74.2756f,   54.5025f,   84.3183f,
-      53.7481f,   73.5624f,   55.3827f,   82.3242f,   53.5432f,   -92.5355f,
-      25.3457f,   89.1858f,   84.4763f,   72.9840f,   69.1889f,   4.6719f,
-      -0.0129f,   0.1995f,    0.2069f,    0.0358f,    0.1209f,    -0.1185f,
-      -0.1217f,   -0.1456f,   0.0125f,    -0.1354f,   0.0510f,    -0.0572f,
-      0.1397f,    0.1453f,    -0.0086f,   0.0107f,    0.0232f,    0.1508f,
-      0.0884f,    -0.0967f,   -0.1786f,   0.1361f,    -0.1399f,   -0.2021f,
-      -0.0242f,   -0.2169f,   0.0133f,    0.0116f,    -0.1489f,   -0.0093f,
-      -0.0796f,   0.1507f,    0.0906f,    0.0228f,    -0.0166f,   -0.1875f,
-      0.0471f,    0.1184f,    -0.0007f,   -0.2732f,   -0.1386f,   -0.2057f,
-      -0.0213f,   -0.1699f,   0.0996f,    0.1562f,    0.1850f,    -0.0362f,
-      -0.2059f,   0.0258f,    -0.0135f,   -0.1276f,   0.0034f,    0.2023f,
-      0.0857f,    -0.0085f,   -0.1955f,   -0.1666f,   -0.0920f,   0.0971f,
-      -0.0292f,   -0.0512f,   -0.0753f,   -0.0739f,   -0.0873f,   -0.1200f,
-      0.0220f,    -0.1359f,   0.2013f,    -0.0445f,   0.1143f,    -0.1484f,
-      -0.1556f,   -0.0003f,   0.1711f,    -0.0724f,   -0.0531f,   0.1126f,
-      0.0476f,    -0.0057f,   0.0088f,    0.0792f,    -0.0438f,   -0.1118f,
-      -0.0244f,   0.0712f,    0.0930f,    -0.0203f,   0.1662f,    -0.0695f,
-      -12.3872f,  -18.7022f,  -13.4237f,  -1.4731f,   -18.6843f,  -14.1515f,
-      -7.5057f,   40.2090f,   -2.7774f,   -1.8433f,   123.6006f,  119.0557f,
-      118.2758f,  113.6423f,  -32.6216f,  -19.5865f,  -16.2897f,  17.2068f,
-      6.3559f,    -17.8742f,  0.7098f,    11.5970f,   -10.1104f,  -33.1830f,
-      39.5617f,   -10.5499f,  -17.8137f,  -14.7185f,  -2.6172f,   -14.6004f,
-      0.3893f,    0.4443f,    0.5305f,    0.3049f,    0.8316f,    0.8679f,
-      0.2265f,    0.2393f,    1.1970f,    -0.2891f,   -1.8666f,   -1.8266f,
-      -1.6984f,   -1.8787f,   0.8706f,    0.4208f,    0.5076f,    -0.8436f,
-      -0.1623f,   0.8008f,    0.1512f,    -1.0839f,   -0.3002f,   0.9263f,
-      -1.3031f,   0.5964f,    0.3413f,    0.5551f,    0.2618f,    0.7018f,
-      -0.1320f,   -0.1944f,   -0.0209f,   -0.0877f,   0.0721f,    -0.0840f,
-      0.0589f,    0.1019f,    0.1927f,    -0.2011f,   -0.1117f,   0.1575f,
-      0.1080f,    -0.0516f,   0.2154f,    -0.1231f,   0.0426f,    -0.0522f,
-      -0.1824f,   -0.1923f,   -0.1206f,   -0.1724f,   -0.0798f,   0.0401f,
-      -0.2170f,   0.0293f,    -0.0853f,   0.1517f,    0.2128f,    -0.1934f,
-      0.0406f,    0.0517f,    0.0822f,    -0.0150f,   0.0943f,    -0.0989f,
-      -0.1802f,   -0.1453f,   -0.1967f,   -0.1797f,   0.1545f,    -0.1217f,
-      0.1755f,    -0.1604f,   -0.0515f,   0.0509f,    0.0310f,    -0.1220f,
-      -0.1770f,   -0.0157f,   0.1989f,    -0.0069f,   0.1766f,    0.1267f,
-      -0.0517f,   -0.0396f,   0.0346f,    0.1946f,    0.1162f,    -0.1345f,
-      -106.6179f, -110.5917f, -107.5476f, -108.0601f, -61.1687f,  -22.4247f,
-      2.6632f,    109.5208f,  -66.1177f,  0.0062f,    159.9339f,  144.7755f,
-      145.5032f,  128.9872f,  18.9180f,   -75.3569f,  -105.0866f, -52.0704f,
-      -119.1299f, -74.7543f,  -109.9468f, -59.0682f,  -104.5754f, 19.2878f,
-      67.2573f,   -104.8061f, -111.8610f, -106.6751f, -107.3537f, -56.4758f,
-      -0.6967f,   -0.8495f,   -0.9586f,   -1.0461f,   1.4522f,    -0.2762f,
-      28.2828f,   2.9157f,    -2.1062f,   0.1566f,    -467.2388f, -461.0685f,
-      -459.0092f, -453.8370f, 1.5422f,    -0.8186f,   -0.4884f,   -53.0399f,
-      -2.0255f,   -1.1348f,   -1.1039f,   -50.2489f,  -1.4821f,   1.8021f,
-      -258.0319f, -1.0865f,   -0.5542f,   -1.0443f,   -1.2732f,   1.8413f,
-      0.2377f,    0.1937f,    -0.0116f,   0.0935f,    -0.0599f,   0.0118f,
-      -0.0875f,   0.0455f,    -0.1301f,   -0.1081f,   -0.2622f,   -0.1960f,
-      0.0393f,    -0.1490f,   0.1852f,    -0.0964f,   -0.0741f,   0.0419f,
-      0.1162f,    -0.0274f,   0.1200f,    -0.0333f,   -0.1337f,   0.2141f,
-      0.0664f,    0.1044f,    -0.1744f,   0.1060f,    -0.1468f,   0.0679f,
-      0.0218f,    0.0494f,    0.1064f,    0.1363f,    0.0013f,    0.1331f,
-      -0.2095f,   0.2088f,    -0.0399f,   -0.1811f,   0.0678f,    -0.1974f,
-      0.1855f,    -0.0968f,   -0.2008f,   0.0162f,    -0.0096f,   -0.1493f,
-      0.2170f,    -0.1248f,   -0.2055f,   0.1276f,    -0.0269f,   -0.1697f,
-      -0.0662f,   0.1073f,    -0.0029f,   -0.1051f,   -0.1573f,   0.2106f,
-      -0.2020f,   -0.1565f,   0.0335f,    -0.1818f,   -0.1665f,   0.2169f,
-      0.1974f,    -0.1470f,   -0.1738f,   -0.2038f,   0.0558f,    -0.0441f,
-      0.0065f,    -0.1485f,   -0.1366f,   -0.2131f,   0.1042f,    0.0349f,
-      -0.1804f,   -0.1361f,   -0.0116f,   -0.1012f,   -0.0860f,   0.0606f,
-      -0.2077f,   0.1826f,    -0.1014f,   -0.0721f,   -0.1517f,   0.1022f,
-      -0.1110f,   -0.0186f,   0.1505f,    0.1797f,    0.0911f,    0.0340f,
-      0.1702f,    -0.1404f,   -0.0566f,   -0.2744f,   -0.1943f,   -0.1871f,
-      0.0046f,    0.0306f,    -0.0436f,   0.1625f,    -0.1302f,   0.0175f,
-      0.1570f,    -0.1425f,   0.0779f,    0.1398f,    0.0929f,    0.0897f,
-      0.0458f,    -0.0936f,   0.1321f,    -0.1355f,   0.0974f,    0.0457f,
-      -73.3516f,  -75.0655f,  -72.1062f,  -72.4624f,  -34.8640f,  -14.3727f,
-      -4.4720f,   66.4982f,   -18.8358f,  0.0397f,    174.2172f,  160.4959f,
-      161.1034f,  147.3250f,  9.5507f,    -45.0180f,  -73.1609f,  -1.5230f,
-      -74.8677f,  -43.8559f,  -68.7622f,  -4.8971f,   -82.1922f,  9.6490f,
-      64.7115f,   -71.8566f,  -75.3879f,  -72.5479f,  -71.7161f,  -34.8056f,
-      0.1442f,    0.1558f,    0.1267f,    -0.1261f,   -0.0506f,   -0.0823f,
-      -0.1807f,   -0.0889f,   -0.2098f,   -0.1295f,   -0.2046f,   -0.1749f,
-      -0.1197f,   -0.1380f,   0.0799f,    -0.0889f,   -0.1209f,   0.1919f,
-      0.1947f,    -0.2086f,   -0.1042f,   -0.0468f,   0.0232f,    0.1052f,
-      -0.0535f,   0.1398f,    0.1713f,    -0.1522f,   0.1453f,    0.0286f,
-      -64.8503f,  -67.6746f,  -63.6497f,  -60.4614f,  -35.6091f,  -20.1605f,
-      -3.6082f,   84.2801f,   -37.8552f,  -2.2371f,   132.4947f,  123.5057f,
-      123.5776f,  113.9060f,  -14.8772f,  -40.7130f,  -79.1391f,  -10.7024f,
-      -65.7831f,  -43.6078f,  -79.6847f,  -13.0743f,  -69.2533f,  -16.0171f,
-      50.4868f,   -64.3678f,  -68.7061f,  -64.0823f,  -59.3413f,  -28.9405f,
-      77.1601f,   75.4899f,   69.8696f,   67.8764f,   -22.7548f,  5.9814f,
-      -3.2826f,   57.9754f,   -5.9500f,   -0.0014f,   77.2251f,   74.0737f,
-      73.7004f,   70.5072f,   -80.9661f,  69.3065f,   55.8337f,   76.8831f,
-      57.9902f,   63.4765f,   56.4748f,   70.0282f,   61.0874f,   -81.3960f,
-      26.2594f,   76.0367f,   74.9115f,   69.2361f,   66.9262f,   -20.1637f,
-      0.1886f,    -0.1108f,   0.1262f,    0.0189f,    0.1382f,    0.0859f,
-      -0.1874f,   -0.1986f,   -0.0171f,   -0.1400f,   -0.2944f,   -0.0750f,
-      -0.0395f,   -0.2092f,   -0.0878f,   0.1216f,    -0.0870f,   -0.1613f,
-      0.2495f,    0.0754f,    0.0244f,    -0.1205f,   -0.0196f,   -0.1729f,
-      0.1170f,    0.1585f,    0.1482f,    -0.1705f,   -0.1337f,   0.0199f,
-      13.0897f,   9.1111f,    6.7413f,    6.3907f,    -28.1187f,  0.4556f,
-      -5.3116f,   30.7293f,   -16.3644f,  -0.0365f,   118.9118f,  111.6125f,
-      111.3227f,  103.4680f,  -30.1883f,  8.9328f,    -4.1876f,   79.3936f,
-      -9.0522f,   12.7861f,   -1.2736f,   78.0446f,   -5.9485f,   -30.5716f,
-      27.8951f,   13.9613f,   6.7173f,    5.2345f,    8.3271f,    -27.3705f,
-      1.0488f,    1.0864f,    1.0710f,    1.7332f,    -3.0561f,   1.1622f,
-      -7.6688f,   3.0491f,    -1.3865f,   0.0769f,    222.5451f,  207.8170f,
-      208.1767f,  193.1396f,  0.4447f,    2.1654f,    1.8929f,    35.1469f,
-      1.1783f,    2.6199f,    1.1611f,    26.2989f,   3.4446f,    0.1551f,
-      65.6529f,   1.2229f,    0.9851f,    1.0241f,    1.4373f,    -3.3421f,
-      0.1388f,    0.0756f,    0.2047f,    0.1140f,    0.0945f,    0.2038f,
-      0.1038f,    -0.2068f,   -0.0626f,   -0.1937f,   0.1347f,    -0.0464f,
-      -0.0866f,   0.0250f,    0.0264f,    -0.1556f,   -0.1625f,   0.1028f,
-      -0.1255f,   -0.0854f,   0.1033f,    0.0008f,    -0.2133f,   -0.0317f,
-      0.1725f,    -0.1054f,   -0.1900f,   0.0383f,    0.0440f,    -0.1900f,
-      -30.0811f,  -30.9929f,  -29.3194f,  -26.8347f,  -20.5957f,  -4.1595f,
-      -1.9066f,   42.4707f,   -9.0435f,   0.0064f,    175.7328f,  163.1350f,
-      163.5085f,  151.1648f,  4.4620f,    -20.6011f,  -19.3402f,  1.5468f,
-      -32.0920f,  -25.4581f,  -12.3706f,  -2.1636f,   -32.4569f,  3.9365f,
-      61.0117f,   -28.4195f,  -31.0837f,  -30.2749f,  -27.5522f,  -22.8688f,
-      -0.3000f,   0.0092f,    -0.3675f,   -0.4113f,   0.0033f,    0.1138f,
-      0.2182f,    -0.5803f,   0.7507f,    -0.2529f,   -1.7724f,   -1.4702f,
-      -1.5805f,   -1.4294f,   0.1435f,    -0.0168f,   0.2356f,    -0.4373f,
-      -0.4500f,   -0.4803f,   -0.0041f,   -0.3878f,   0.1321f,    0.2761f,
-      -1.1975f,   -0.3509f,   -0.0465f,   -0.4050f,   -0.1110f,   0.2233f,
-      0.0950f,    0.0974f,    -0.1600f,   -0.1753f,   -0.0328f,   0.0741f,
-      -0.0706f,   0.1839f,    -0.0833f,   -0.1367f,   -0.1094f,   -0.1739f,
-      -0.1069f,   0.0370f,    -0.1404f,   0.1631f,    -0.1570f,   0.2117f,
-      -0.1891f,   0.0395f,    0.1081f,    0.1760f,    0.0997f,    0.0853f,
-      -0.1018f,   0.1306f,    -0.0924f,   -0.2078f,   0.0801f,    -0.0949f,
-      0.5803f,    0.5578f,    0.4089f,    0.1912f,    0.6774f,    0.3145f,
-      0.3992f,    -0.1316f,   1.3142f,    -0.2457f,   -2.3536f,   -2.4939f,
-      -2.3165f,   -2.4879f,   0.2321f,    0.1901f,    0.1789f,    -1.5215f,
-      0.2645f,    0.2231f,    0.2411f,    -1.2361f,   0.2971f,    0.1421f,
-      -1.6715f,   0.3158f,    0.2476f,    0.3596f,    0.3029f,    0.9297f,
-      -88.8401f,  -89.5209f,  -86.1926f,  -87.4196f,  -39.6504f,  -17.9684f,
-      -4.2702f,   80.2017f,   -29.1676f,  -0.4190f,   150.2820f,  138.4751f,
-      139.1087f,  126.6569f,  13.7188f,   -57.0739f,  -80.3383f,  -18.8351f,
-      -87.4103f,  -56.0072f,  -82.7707f,  -23.1871f,  -93.6787f,  13.9287f,
-      59.6213f,   -87.4843f,  -90.4227f,  -86.2635f,  -86.6841f,  -37.9086f,
-      0.1184f,    -0.2169f,   -0.1915f,   0.0543f,    0.1253f,    -0.1370f,
-      0.0836f,    -0.1198f,   0.1544f,    -0.2004f,   -0.1118f,   -0.0786f,
-      0.1517f,    -0.1000f,   -0.1055f,   0.0936f,    -0.1579f,   0.1098f,
-      -0.0234f,   -0.0499f,   0.0951f,    -0.1711f,   0.0186f,    -0.2008f,
-      0.1777f,    0.1386f,    -0.1495f,   -0.0684f,   -0.2149f,   -0.1198f,
-      -0.6205f,   -0.7209f,   -0.5487f,   -0.9080f,   1.3400f,    0.0085f,
-      28.2837f,   3.2217f,    -1.8463f,   0.1620f,    -464.3599f, -458.4327f,
-      -455.9967f, -451.0393f, 1.6619f,    -0.6944f,   -0.3167f,   -52.3630f,
-      -1.6971f,   -0.7340f,   -0.8923f,   -49.2771f,  -1.1177f,   1.8810f,
-      -258.9386f, -1.0765f,   -0.7279f,   -0.5208f,   -0.8839f,   1.8175f,
-      -78.8510f,  -80.5740f,  -77.8843f,  -77.9798f,  -36.5560f,  -16.0818f,
-      -5.5362f,   66.4228f,   -16.8150f,  0.0036f,    181.8365f,  167.7181f,
-      168.2344f,  153.9725f,  11.2659f,   -47.5786f,  -92.6978f,  6.7573f,
-      -68.7704f,  -48.3850f,  -95.3637f,  8.8888f,    -76.9497f,  11.2243f,
-      60.9020f,   -77.6515f,  -80.7610f,  -78.4537f,  -77.4659f,  -36.2872f,
-      -0.0936f,   0.1966f,    -0.2121f,   0.0193f,    0.0489f,    -0.1445f,
-      0.0060f,    0.0358f,    -0.0783f,   -0.0985f,   -0.2072f,   -0.0802f,
-      -0.0185f,   0.1868f,    -0.0631f,   0.1260f,    -0.0675f,   0.2167f,
-      -0.2174f,   -0.1085f,   0.1483f,    -0.1655f,   -0.1040f,   0.1605f,
-      -0.1673f,   -0.0148f,   -0.1856f,   -0.1454f,   0.1603f,    -0.1620f,
-      -0.9205f,   -1.2716f,   -3.6561f,   -5.0834f,   -0.7934f,   1.8710f,
-      2.2999f,    -2.9516f,   -1.7631f,   -0.3804f,   41.2998f,   26.2358f,
-      28.9763f,   15.7315f,   5.2164f,    3.2963f,    -5.4457f,   18.6310f,
-      -25.0076f,  5.4368f,    -12.0085f,  17.1462f,   -14.6992f,  5.6365f,
-      48.6207f,   -1.0921f,   -1.8723f,   -3.5354f,   -5.1774f,   -1.0200f,
-      -0.1065f,   -0.2021f,   0.0332f,    0.1692f,    -0.1239f,   0.1325f,
-      -0.0660f,   -0.0567f,   0.2107f,    -0.2084f,   -0.0263f,   0.1411f,
-      0.0178f,    0.0451f,    0.2024f,    -0.1756f,   -0.0771f,   -0.1690f,
-      -0.2097f,   -0.2130f,   0.0714f,    0.0172f,    -0.0310f,   0.0649f,
-      -0.1550f,   0.0701f,    0.0306f,    -0.1750f,   -0.1988f,   -0.2060f,
-      0.0005f,    -0.1325f,   -0.1823f,   -0.0900f,   -0.1291f,   -0.1817f,
-      0.0144f,    0.0951f,    -0.1954f,   -0.0171f,   -0.1985f,   0.0875f,
-      0.0901f,    -0.0857f,   0.1681f,    0.0465f,    0.1023f,    0.0985f,
-      -0.2152f,   -0.1723f,   -0.0825f,   0.0203f,    -0.1206f,   -0.1431f,
-      -0.1552f,   0.1344f,    0.0398f,    0.0169f,    0.2180f,    -0.1530f,
-      2.7964f,    2.7312f,    2.8831f,    3.4729f,    -3.1366f,   2.4043f,
-      -7.2004f,   1.4128f,    2.8648f,    0.0578f,    225.5640f,  210.3712f,
-      210.6907f,  195.0339f,  0.3140f,    1.8060f,    2.7355f,    33.6917f,
-      3.3542f,    3.3682f,    1.7371f,    31.2424f,   3.4094f,    -0.1192f,
-      63.0864f,   3.0562f,    2.8633f,    2.6777f,    3.5495f,    -4.2616f,
-      -1.4034f,   0.3930f,    -4.6756f,   -9.9870f,   -27.8511f,  5.6071f,
-      -1.0862f,   34.4907f,   -10.4831f,  -0.0281f,   117.2617f,  104.9590f,
-      106.1515f,  93.9707f,   -16.8801f,  5.3036f,    -21.7458f,  98.5306f,
-      -20.7596f,  6.4733f,    -17.6440f,  98.3097f,   -31.9540f,  -17.0600f,
-      27.4543f,   -0.6140f,   -1.6182f,   -4.9167f,   -8.9017f,   -26.2485f,
-      -0.1952f,   -0.0462f,   -0.1958f,   0.1679f,    -0.1592f,   -0.1634f,
-      -0.0507f,   -0.0542f,   0.0038f,    -0.0343f,   0.0567f,    -0.1983f,
-      0.0250f,    -0.0762f,   0.0902f,    -0.0343f,   0.1240f,    0.1161f,
-      0.1237f,    0.1870f,    0.0346f,    0.0340f,    0.0625f,    -0.0355f,
-      0.0278f,    -0.1043f,   0.1755f,    0.0253f,    0.1750f,    -0.2070f,
-      -5.5531f,   -5.3122f,   -4.9348f,   -4.4782f,   -7.5686f,   -1.5478f,
-      -5.4341f,   0.5087f,    -2.1382f,   0.0798f,    208.3677f,  194.0083f,
-      194.4168f,  179.3082f,  1.4443f,    -1.5038f,   -1.4021f,   25.9363f,
-      -4.0635f,   -2.6785f,   -1.6640f,   22.2589f,   -1.4910f,   1.4715f,
-      59.1972f,   -4.9638f,   -5.1920f,   -4.9193f,   -5.2649f,   -8.0556f,
-      20.1226f,   12.0195f,   9.7385f,    10.7058f,   -27.4201f,  8.4869f,
-      -5.0826f,   32.9212f,   -2.0674f,   -0.0290f,   120.5002f,  112.3222f,
-      112.3287f,  104.1107f,  -20.6293f,  14.8534f,   -0.8748f,   103.1141f,
-      -1.1368f,   15.3716f,   2.7653f,    91.7285f,   -0.5991f,   -20.7338f,
-      35.9363f,   20.5104f,   11.1988f,   9.0368f,    10.6355f,   -26.5309f,
-      -0.2058f,   -0.2176f,   0.1331f,    -0.1415f,   -0.0825f,   -0.0470f,
-      -0.0615f,   0.1274f,    0.0076f,    -0.0575f,   -0.2065f,   0.0866f,
-      0.2166f,    -0.1942f,   -0.1952f,   0.1323f,    -0.1016f,   0.1803f,
-      -0.0424f,   0.1555f,    0.1118f,    0.1559f,    0.0337f,    -0.0341f,
-      -0.0430f,   0.1988f,    -0.0553f,   -0.0255f,   0.1817f,    0.0608f,
-      0.1431f,    0.0686f,    -0.0245f,   -0.2107f,   0.2001f,    -0.0964f,
-      -0.0090f,   0.1151f,    -0.0365f,   -0.1986f,   0.1740f,    -0.2098f,
-      0.0013f,    0.1369f,    0.1910f,    0.1801f,    -0.2019f,   0.0348f,
-      -0.1175f,   0.0627f,    -0.1929f,   -0.0099f,   0.1349f,    0.1804f,
-      -0.1071f,   -0.1651f,   -0.1146f,   -0.0259f,   0.1626f,    -0.0271f,
-      0.1393f,    0.1304f,    -0.0200f,   0.0924f,    -0.0839f,   -0.0031f,
-      -0.1311f,   0.0350f,    -0.1330f,   -0.0911f,   0.1949f,    -0.0209f,
-      -0.1883f,   0.0269f,    0.2040f,    0.1552f,    0.1532f,    0.1157f,
-      -0.1102f,   -0.1220f,   -0.0808f,   -0.1050f,   0.1716f,    0.0846f,
-      -0.0180f,   -0.1037f,   0.2063f,    0.1237f,    0.1253f,    -0.0496f,
-      -0.0183f,   0.0491f,    0.1703f,    -0.0824f,   -0.0702f,   -0.1100f,
-      -0.0965f,   0.0130f,    -0.1222f,   -0.1081f,   0.0329f,    0.2115f,
-      -0.1438f,   0.0799f,    -0.1602f,   -0.0330f,   0.0501f,    0.1072f,
-      -0.0744f,   -0.1783f,   -0.0240f,   0.0777f,    -0.1944f,   0.0438f,
-      -0.0033f,   -0.1873f,   0.0984f,    -0.0318f,   0.0773f,    0.1489f,
-      0.3966f,    0.4711f,    0.3972f,    0.0623f,    0.5970f,    0.1018f,
-      0.1375f,    -0.1881f,   0.8921f,    -0.1854f,   -2.1138f,   -2.1178f,
-      -1.8295f,   -2.1703f,   0.5784f,    -0.1937f,   -0.0728f,   -0.9953f,
-      0.2442f,    -0.4074f,   -0.1591f,   -1.1660f,   0.4832f,    0.2203f,
-      -1.4957f,   0.1544f,    0.1810f,    0.2275f,    0.4075f,    0.8153f,
-      0.0715f,    0.0222f,    0.0463f,    -0.0201f,   0.0396f,    0.5951f,
-      -0.2779f,   -0.0306f,   0.7532f,    -0.1596f,   -4.1080f,   -3.7925f,
-      -3.8522f,   -3.2468f,   0.7728f,    0.0188f,    -0.1448f,   0.4084f,
-      -0.4666f,   -0.1036f,   -1.1469f,   0.4243f,    0.2778f,    0.9023f,
-      -3.0216f,   0.0384f,    -0.3348f,   -0.0314f,   -0.2788f,   0.0479f,
-      139.0773f,  131.6164f,  115.0392f,  111.1817f,  41.7596f,   9.5379f,
-      1.8542f,    46.9890f,   -12.8221f,  0.0241f,    52.9779f,   51.5268f,
-      50.8060f,   48.7028f,   -132.9665f, 118.3478f,  101.1239f,  81.4608f,
-      75.4251f,   121.0643f,  97.8947f,   86.8911f,   74.5576f,   -133.7606f,
-      29.2657f,   135.8916f,  131.3661f,  114.1687f,  111.0784f,  31.3790f,
-      -0.0807f,   -0.0657f,   -0.0027f,   0.0410f,    0.0765f,    0.1194f,
-      0.0953f,    -0.0060f,   0.1531f,    -0.2339f,   0.1488f,    -0.0615f,
-      -0.0579f,   0.0761f,    0.1250f,    -0.0469f,   0.1480f,    0.0683f,
-      -0.0049f,   0.1558f,    0.2168f,    -0.0736f,   0.1135f,    -0.1244f,
-      0.0725f,    -0.1297f,   -0.0215f,   -0.0412f,   -0.1632f,   -0.0200f,
-      -0.1346f,   -0.1954f,   0.0053f,    0.0151f,    0.1379f,    -0.1497f,
-      -0.0102f,   -0.0336f,   0.0900f,    -0.1706f,   -0.0932f,   -0.2084f,
-      0.1242f,    -0.2027f,   0.0849f,    -0.2139f,   -0.2015f,   0.0944f,
-      -0.0984f,   0.2082f,    0.1625f,    -0.0227f,   -0.1676f,   0.1021f,
-      0.1516f,    0.0245f,    0.0955f,    -0.1488f,   -0.0057f,   0.1783f,
-      -0.8568f,   -0.8175f,   -0.6282f,   -1.3107f,   1.5712f,    0.1044f,
-      28.2289f,   3.0885f,    -1.9829f,   0.1600f,    -465.9583f, -459.5893f,
-      -457.5055f, -452.7600f, 1.7229f,    -0.6620f,   -0.1065f,   -52.8017f,
-      -2.0293f,   -0.8224f,   -1.0389f,   -49.9049f,  -1.2250f,   1.7647f,
-      -259.2465f, -1.0978f,   -0.5169f,   -0.8721f,   -0.8197f,   1.9158f,
-      16.2234f,   15.8523f,   13.8343f,   9.8509f,    -21.4326f,  15.7650f,
-      -6.4451f,   34.8575f,   1.1387f,    -0.0223f,   117.7213f,  109.8494f,
-      109.7624f,  101.8532f,  -20.3275f,  16.0812f,   4.9165f,    92.4919f,
-      4.1615f,    13.8451f,   9.2112f,    97.1580f,   -8.7037f,   -20.4420f,
-      27.1105f,   17.4922f,   13.9998f,   12.3888f,   11.4705f,   -20.9568f,
-      0.5457f,    0.5322f,    0.2823f,    0.3581f,    0.5359f,    0.1576f,
-      0.1969f,    -0.0136f,   -0.2748f,   -0.3168f,   -0.3918f,   -0.2167f,
-      -0.1797f,   -0.1869f,   0.2986f,    -0.2116f,   -0.4226f,   -0.2022f,
-      0.9452f,    0.5474f,    -0.1218f,   0.2067f,    -0.1600f,   0.1937f,
-      0.0808f,    0.4877f,    0.5106f,    0.2626f,    0.5076f,    0.6228f,
-      0.5124f,    0.4044f,    0.4023f,    0.1222f,    2.5446f,    0.9623f,
-      24.9875f,   4.7442f,    -2.0551f,   0.1642f,    -449.9478f, -444.1841f,
-      -442.0153f, -437.1498f, 2.3209f,    -0.6986f,   -0.3456f,   -47.4074f,
-      -1.2374f,   -1.0939f,   -0.9112f,   -41.1851f,  -0.5064f,   2.4209f,
-      -263.4446f, -0.0433f,   0.3460f,    0.1475f,    0.3770f,    2.9154f,
-      0.2032f,    0.1527f,    0.2161f,    -0.1981f,   0.1893f,    -0.2003f,
-      0.1734f,    0.1713f,    0.1207f,    -0.2073f,   -0.1018f,   0.0770f,
-      0.0728f,    0.1665f,    0.0689f,    0.1884f,    -0.1399f,   -0.1326f,
-      -0.0518f,   -0.1948f,   0.1576f,    -0.1835f,   0.1436f,    0.0497f,
-      0.0883f,    -0.1253f,   -0.0417f,   -0.0507f,   -0.1555f,   0.2076f,
-      -2.4080f,   6.1616f,    -0.8564f,   -13.6773f,  -32.7238f,  -16.3144f,
-      -1.9828f,   20.5110f,   -17.0191f,  -1.7154f,   103.6642f,  95.3675f,
-      95.5662f,   86.9504f,   -35.5340f,  19.6681f,   -2.4900f,   65.0847f,
-      -15.8119f,  13.7256f,   -4.6753f,   63.4713f,   -6.5992f,   -34.2369f,
-      41.3959f,   -1.5528f,   3.8106f,    -0.7762f,   -12.3204f,  -35.1734f,
-      -83.9509f,  -87.4861f,  -83.5925f,  -81.5047f,  -54.1256f,  -45.7506f,
-      -13.5325f,  -6.0331f,   -8.5062f,   0.0261f,    189.9450f,  177.7870f,
-      178.6945f,  164.9762f,  9.8521f,    -68.0619f,  -68.6145f,  6.5056f,
-      -55.9651f,  -66.9540f,  -65.3349f,  -2.1954f,   -57.2408f,  8.6577f,
-      60.6966f,   -82.1056f,  -88.5245f,  -83.3057f,  -80.7283f,  -50.5285f,
-      -0.1397f,   0.1862f,    -0.0691f,   -0.0906f,   0.1560f,    0.1377f,
-      -0.0066f,   -0.0213f,   0.0708f,    -0.0386f,   -0.0015f,   -0.0020f,
-      -0.2122f,   0.0747f,    0.0795f,    0.0229f,    0.1923f,    -0.1661f,
-      0.0895f,    0.1176f,    0.1398f,    -0.0443f,   0.0934f,    0.0638f,
-      -0.1924f,   0.0602f,    0.0404f,    0.1597f,    0.1387f,    -0.0601f,
-      -28.3967f,  -21.8483f,  -25.5175f,  -29.9252f,  2.0161f,    -3.0092f,
-      7.7435f,    28.2367f,   -35.0188f,  -0.1578f,   105.0164f,  93.4495f,
-      94.9134f,   81.0315f,   4.3602f,    8.1303f,    -37.7665f,  -16.6986f,
-      -40.8902f,  8.2542f,    -33.3215f,  -2.0457f,   -69.0245f,  4.1016f,
-      47.2770f,   -25.8268f,  -23.6034f,  -26.4339f,  -27.8305f,  8.4468f,
-      13.8742f,   8.3874f,    4.2044f,    1.4619f,    -40.2909f,  -0.6358f,
-      -0.7982f,   36.1931f,   -17.3147f,  -0.3348f,   106.8135f,  96.5298f,
-      97.8829f,   86.9994f,   -25.8170f,  15.0652f,   -0.9181f,   85.8544f,
-      2.5475f,    9.8009f,    -3.5931f,   89.2017f,   -3.7252f,   -25.2986f,
-      22.5505f,   14.0434f,   7.0708f,    4.6646f,    1.5807f,    -39.4024f,
-      -0.1436f,   0.0256f,    0.0274f,    -0.2126f,   0.0401f,    0.0745f,
-      -0.0379f,   -0.0357f,   0.0777f,    -0.0709f,   -0.1093f,   -0.2047f,
-      -0.0713f,   -0.0478f,   -0.0908f,   0.1963f,    0.1282f,    0.0977f,
-      0.1304f,    0.2058f,    0.0700f,    0.0518f,    0.0239f,    0.0686f,
-      -0.1909f,   0.0828f,    -0.1243f,   -0.1920f,   0.1908f,    -0.0808f,
-      90.8028f,   89.2894f,   84.5339f,   83.3491f,   -13.3838f,  12.0240f,
-      -3.9443f,   63.0867f,   -2.5321f,   -0.0099f,   68.9140f,   66.3206f,
-      66.0278f,   63.1498f,   -83.7261f,  74.3448f,   73.4998f,   64.8477f,
-      69.7701f,   74.5878f,   71.0331f,   63.2116f,   74.3162f,   -83.9282f,
-      20.8163f,   89.6818f,   88.6452f,   83.7338f,   82.9360f,   -13.2357f,
-      0.1299f,    -0.1765f,   -0.0168f,   -0.1372f,   -0.1183f,   0.0472f,
-      0.1312f,    0.0267f,    0.0194f,    -0.1593f,   0.0059f,    0.1775f,
-      0.0668f,    -0.1239f,   -0.1982f,   -0.1415f,   -0.1659f,   -0.1148f,
-      0.0136f,    0.0913f,    -0.1254f,   -0.0357f,   0.0892f,    0.0835f,
-      -0.0554f,   0.1969f,    -0.0888f,   -0.0623f,   -0.0236f,   -0.1492f,
-      0.4196f,    0.3218f,    0.2287f,    0.5095f,    0.7210f,    0.2279f,
-      0.4523f,    -0.1832f,   1.3095f,    -0.2041f,   -2.1443f,   -2.1947f,
-      -1.9292f,   -2.1142f,   0.5840f,    0.1018f,    0.1011f,    -1.6565f,
-      0.4325f,    0.0424f,    0.2836f,    -1.7183f,   0.2595f,    0.2686f,
-      -1.8784f,   0.3891f,    0.3050f,    0.6195f,    0.2896f,    0.5905f,
-      -5.3024f,   -3.2518f,   -12.5192f,  -29.1732f,  1.6538f,    -1.8315f,
-      9.9788f,    10.5155f,   6.3234f,    -0.3460f,   76.9925f,   51.3785f,
-      55.7120f,   29.0432f,   5.5901f,    25.6578f,   -3.9565f,   13.0509f,
-      -106.0371f, 23.2124f,   -18.2004f,  8.4618f,    -69.3585f,  5.5651f,
-      80.0565f,   -6.4941f,   -5.3742f,   -14.4209f,  -24.1565f,  6.6801f,
-      -22.0585f,  -20.9909f,  -26.7939f,  -29.6890f,  -14.5085f,  2.1866f,
-      -4.2608f,   17.3977f,   -30.8824f,  -0.4017f,   135.6957f,  126.9320f,
-      127.0044f,  118.1835f,  -1.8768f,   -0.8629f,   -32.0882f,  44.7862f,
-      -23.9174f,  1.6485f,    -27.9940f,  51.9078f,   -48.5279f,  -1.7550f,
-      49.9230f,   -19.9785f,  -22.4647f,  -27.6911f,  -27.3197f,  -10.6545f,
-      -0.1922f,   -0.1999f,   -0.1396f,   0.1065f,    0.0085f,    -0.1940f,
-      0.0351f,    0.1285f,    -0.0292f,   -0.1296f,   0.1543f,    -0.2082f,
-      -0.1758f,   0.0719f,    0.0764f,    0.1394f,    -0.0255f,   -0.0370f,
-      0.1615f,    -0.0568f,   0.1920f,    -0.1631f,   0.0199f,    0.1884f,
-      0.0693f,    0.1074f,    -0.0273f,   0.1540f,    0.0098f,    0.2111f,
-      0.1805f,    -0.0555f,   0.1159f,    0.0469f,    0.1789f,    -0.1711f,
-      -0.1304f,   0.1912f,    -0.0737f,   -0.1408f,   0.1804f,    -0.2023f,
-      -0.0467f,   -0.1019f,   -0.0136f,   0.0691f,    0.1454f,    -0.0213f,
-      0.0929f,    -0.0958f,   0.1299f,    0.1137f,    0.1175f,    0.1042f,
-      -0.2081f,   -0.0737f,   0.0582f,    0.1640f,    0.2120f,    -0.0646f,
-      -0.0326f,   0.1976f,    0.1182f,    -0.1365f,   -0.1784f,   0.2113f,
-      0.0469f,    0.0763f,    -0.0197f,   -0.1902f,   0.1259f,    0.1598f,
-      -0.0180f,   -0.1339f,   -0.1675f,   -0.1884f,   -0.1973f,   0.1529f,
-      0.1160f,    0.2154f,    -0.1446f,   -0.1395f,   0.0355f,    0.1513f,
-      -0.2086f,   -0.1135f,   -0.1502f,   -0.0018f,   0.0486f,    -0.0110f,
-      -0.0843f,   -0.0716f,   -0.1367f,   0.0753f,    0.0114f,    0.0475f,
-      -0.0632f,   0.2045f,    -0.0512f,   -0.0906f,   -0.1071f,   -0.1957f,
-      0.1361f,    0.1821f,    -0.1684f,   -0.1383f,   0.1059f,    0.1579f,
-      -0.0064f,   -0.1205f,   -0.0718f,   -0.1323f,   -0.0174f,   -0.1092f,
-      -0.1915f,   0.1978f,    -0.1245f,   0.1297f,    -0.1542f,   0.1556f,
-      -0.1752f,   0.0718f,    -0.1020f,   -0.1970f,   0.0518f,    -0.0888f,
-      0.0541f,    -0.1922f,   -0.1467f,   -0.0653f,   -0.1940f,   -0.0800f,
-      -0.1096f,   -0.0796f,   -0.1310f,   0.0191f,    -0.1077f,   -0.0973f,
-      0.1566f,    0.0074f,    0.0500f,    -0.0415f,   -0.2116f,   0.0227f,
-      0.0895f,    0.1528f,    0.1404f,    0.0467f,    0.0462f,    -0.0973f,
-      -0.1669f,   0.0551f,    0.1167f,    -0.1470f,   -0.0542f,   -0.1006f,
-      0.2104f,    0.1039f,    -0.0211f,   -0.1726f,   -0.0694f,   -0.0270f,
-      0.0277f,    -0.0715f,   -0.2055f,   -0.1502f,   -0.1718f,   -0.0043f,
-      0.0174f,    0.1019f,    -0.0233f,   -0.1518f,   -0.1331f,   -0.0001f,
-      -0.1483f,   -0.2115f,   0.0666f,    0.0014f,    0.1601f,    -0.0690f,
-    };
-
-static const float av1_rdcost_model_nn_biases_layer0[NUM_HIDDEN_NODES] = {
-  0.156824f,   0.f,         0.130013f,   0.084482f,  -129.058197f, -15.090252f,
-  -3.859116f,  0.736356f,   -81.361557f, -0.001922f, -0.000713f,   0.440181f,
-  14.982646f,  1.282223f,   2.23122f,    94.26635f,  93.920929f,   0.614672f,
-  0.f,         0.315858f,   4.746014f,   0.116901f,  -35.661354f,  -75.148285f,
-  92.006989f,  -14.112332f, 86.673157f,  -0.000307f, -0.000544f,   0.f,
-  -7.851313f,  0.505186f,   0.f,         0.f,        -111.681091f, -0.937782f,
-  0.035789f,   0.f,         0.f,         -0.00102f,  -75.180527f,  0.f,
-  -63.821148f, 79.592392f,  0.085068f,   11.184906f, 1.25406f,     0.f,
-  -29.779242f, -0.181732f,  0.f,         0.425554f,  -90.78405f,   0.f,
-  -0.828326f,  -81.132179f, 0.f,         -2.757063f, 0.f,          0.f,
-  2.967951f,   -4.440599f,  0.f,         -5.105355f, 14.734543f,   0.f,
-  0.f,         0.f,         0.f,         0.295342f,  -0.026907f,   133.375412f,
-  -0.000855f,  0.f,         -0.875029f,  15.665165f, 0.437296f,    0.321257f,
-  -0.001932f,  -4.235782f,  -87.187782f, 0.f,        -28.84696f,   7.055514f,
-  0.f,         95.548302f,  -0.000425f,  0.38969f,   -13.88008f,   -27.347931f,
-  0.f,         0.f,         0.f,         -0.000026f, 0.f,          0.f,
-};
-
-static const float
-    av1_rdcost_model_nn_weights_layer1[NUM_HIDDEN_NODES * NUM_OUTPUTS] = {
-      -0.101706f,   -0.14411f,    -0.139118f,   -0.132945f,   118.811302f,
-      3.137232f,    -32.969776f,  -4.150725f,   26.263071f,   0.092841f,
-      0.174125f,    -0.028195f,   15.712872f,   17.722702f,   5.666006f,
-      -121.143929f, -131.933731f, -3.000318f,   -0.032063f,   -0.380065f,
-      -1.660653f,   -0.164802f,   7.177527f,    87.759155f,   -119.564224f,
-      -98.051651f,  -110.581116f, -0.069982f,   0.023906f,    0.183792f,
-      40.606274f,   -0.080804f,   -0.053744f,   -0.187848f,   157.44313f,
-      -4.820149f,   0.089499f,    0.070232f,    -0.043038f,   0.072996f,
-      93.347313f,   0.225259f,    103.223228f,  -110.682541f, 0.14314f,
-      -89.827538f,  6.505952f,    -0.076949f,   73.816132f,   -0.063416f,
-      -0.23736f,    -0.066059f,   116.049599f,  0.120871f,    -4.708246f,
-      107.501671f,  -0.206708f,   -32.688675f,  0.047608f,    -0.105907f,
-      6.505825f,    -75.461891f,  -0.160341f,   6.532121f,    -84.868111f,
-      -0.065622f,   0.044756f,    0.008672f,    0.017155f,    0.046108f,
-      -0.218818f,   -126.507957f, 0.028271f,    0.180625f,    -4.707376f,
-      -121.524307f, -0.03853f,    -4.103166f,   -0.018947f,   -95.768463f,
-      15.941695f,   0.147154f,    -102.863029f, -72.521698f,  -0.037133f,
-      -138.1492f,   0.210016f,    -0.084692f,   -68.693665f,  -52.523472f,
-      -0.133385f,   -0.17438f,    0.008654f,    -0.035642f,   -0.145202f,
-      0.211135f,
-    };
-
-static const float av1_rdcost_model_nn_biases_layer1[NUM_OUTPUTS] = {
-  0.251909f
-};
-
-static const NN_CONFIG av1_rdcost_model_nnconfig = {
-  NUM_FEATURES,
-  NUM_OUTPUTS,
-  NUM_HIDDEN_LAYERS,
-  {
-      NUM_HIDDEN_NODES,
-  },
-  {
-      av1_rdcost_model_nn_weights_layer0,
-      av1_rdcost_model_nn_weights_layer1,
-  },
-  {
-      av1_rdcost_model_nn_biases_layer0,
-      av1_rdcost_model_nn_biases_layer1,
-  },
-};
-
-//------------------------------------------------------------------------------
-
-#undef NUM_FEATURES
-#undef NUM_HIDDEN_LAYERS
-#undef NUM_HIDDEN_NODES
-#undef NUM_OUTPUTS
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
diff --git a/libaom/av1/encoder/ratectrl.c b/libaom/av1/encoder/ratectrl.c
index 861c737..433163f 100644
--- a/libaom/av1/encoder/ratectrl.c
+++ b/libaom/av1/encoder/ratectrl.c
@@ -34,17 +34,20 @@
 #include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
 
+#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0
+
 // Max rate target for 1080P and below encodes under normal circumstances
 // (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
 #define MAX_MB_RATE 250
 #define MAXRATE_1080P 2025000
 
-#define DEFAULT_KF_BOOST 2000
-#define DEFAULT_GF_BOOST 2000
-
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2
+#define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0
+
 #define FRAME_OVERHEAD_BITS 200
 #define ASSIGN_MINQ_TABLE(bit_depth, name)                   \
   do {                                                       \
@@ -81,9 +84,13 @@
 static int inter_minq_12[QINDEX_RANGE];
 static int rtc_minq_12[QINDEX_RANGE];
 
-static int gf_high = 2000;
-static int gf_low = 400;
+static int gf_high = 2400;
+static int gf_low = 300;
+#ifdef STRICT_RC
+static int kf_high = 3200;
+#else
 static int kf_high = 5000;
+#endif
 static int kf_low = 400;
 
 // How many times less pixels there are to encode given the current scaling.
@@ -140,9 +147,9 @@
 double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
   // Convert the index to a real Q value (scaled down to match old Q values)
   switch (bit_depth) {
-    case AOM_BITS_8: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 4.0;
-    case AOM_BITS_10: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 16.0;
-    case AOM_BITS_12: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 64.0;
+    case AOM_BITS_8: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 4.0;
+    case AOM_BITS_10: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 16.0;
+    case AOM_BITS_12: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 64.0;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1.0;
@@ -152,13 +159,12 @@
 int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
                        double correction_factor, aom_bit_depth_t bit_depth) {
   const double q = av1_convert_qindex_to_q(qindex, bit_depth);
-  int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000;
+  int enumerator = frame_type == KEY_FRAME ? 2000000 : 1500000;
 
   assert(correction_factor <= MAX_BPB_FACTOR &&
          correction_factor >= MIN_BPB_FACTOR);
 
   // q based adjustment to baseline enumerator
-  enumerator += (int)(enumerator * q) >> 12;
   return (int)(enumerator * correction_factor / q);
 }
 
@@ -212,6 +218,24 @@
   return target;
 }
 
+// Update the buffer level for higher temporal layers, given the encoded current
+// temporal layer.
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
+  const int current_temporal_layer = svc->temporal_layer_id;
+  for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers;
+       ++i) {
+    const int layer =
+        LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    RATE_CONTROL *lrc = &lc->rc;
+    lrc->bits_off_target +=
+        (int)(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
+    // Clip buffer level to maximum buffer size for the layer.
+    lrc->bits_off_target =
+        AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+    lrc->buffer_level = lrc->bits_off_target;
+  }
+}
 // Update the buffer level: leaky bucket model.
 static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -226,6 +250,8 @@
   // Clip the buffer level to the maximum specified buffer size.
   rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
   rc->buffer_level = rc->bits_off_target;
+
+  if (cpi->use_svc) update_layer_buffer_level(&cpi->svc, encoded_frame_size);
 }
 
 int av1_rc_get_default_min_gf_interval(int width, int height,
@@ -347,6 +373,42 @@
   }
 }
 
+static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1_COMMON *const cm = &cpi->common;
+  const int max_delta = 16;
+  const int change_avg_frame_bandwidth =
+      abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
+      0.1 * (rc->avg_frame_bandwidth);
+  // If resolution changes or avg_frame_bandwidth significantly changed,
+  // then set this flag to indicate change in target bits per macroblock.
+  const int change_target_bits_mb =
+      cm->prev_frame &&
+      (cm->width != cm->prev_frame->width ||
+       cm->height != cm->prev_frame->height || change_avg_frame_bandwidth);
+  // Apply some control/clamp to QP under certain conditions.
+  if (cm->current_frame.frame_type != KEY_FRAME && !cpi->use_svc &&
+      rc->frames_since_key > 1 && !change_target_bits_mb &&
+      (!cpi->oxcf.gf_cbr_boost_pct ||
+       !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame))) {
+    // Make sure q is between oscillating Qs to prevent resonance.
+    if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
+        rc->q_1_frame != rc->q_2_frame) {
+      q = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
+                AOMMAX(rc->q_1_frame, rc->q_2_frame));
+    }
+    // Limit the decrease in Q from previous frame.
+    if (rc->q_1_frame - q > max_delta) q = rc->q_1_frame - max_delta;
+  }
+  // For single spatial layer: if resolution has increased push q closer
+  // to the active_worst to avoid excess overshoot.
+  if (cpi->svc.number_spatial_layers <= 1 && cm->prev_frame &&
+      (cm->width * cm->height >
+       1.5 * cm->prev_frame->width * cm->prev_frame->height))
+    q = (q + active_worst_quality) >> 1;
+  return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
+}
+
 static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = {
   KF_STD,        // KF_UPDATE
   INTER_NORMAL,  // LF_UPDATE
@@ -370,13 +432,12 @@
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
     rcf = rc->rate_correction_factors[KF_STD];
-  } else if (cpi->oxcf.pass == 2) {
-    const RATE_FACTOR_LEVEL rf_lvl =
-        get_rate_factor_level(&cpi->twopass.gf_group);
+  } else if (is_stat_consumption_stage(cpi)) {
+    const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
     rcf = rc->rate_correction_factors[rf_lvl];
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !rc->is_src_frame_alt_ref &&
+        !rc->is_src_frame_alt_ref && !cpi->use_svc &&
         (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
       rcf = rc->rate_correction_factors[GF_ARF_STD];
     else
@@ -397,13 +458,12 @@
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
     rc->rate_correction_factors[KF_STD] = factor;
-  } else if (cpi->oxcf.pass == 2) {
-    const RATE_FACTOR_LEVEL rf_lvl =
-        get_rate_factor_level(&cpi->twopass.gf_group);
+  } else if (is_stat_consumption_stage(cpi)) {
+    const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
     rc->rate_correction_factors[rf_lvl] = factor;
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !rc->is_src_frame_alt_ref &&
+        !rc->is_src_frame_alt_ref && !cpi->use_svc &&
         (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
       rc->rate_correction_factors[GF_ARF_STD] = factor;
     else
@@ -436,7 +496,7 @@
         av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
   } else {
     projected_size_based_on_q = av1_estimate_bits_at_q(
-        cpi->common.current_frame.frame_type, cm->base_qindex, MBs,
+        cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs,
         rate_correction_factor, cm->seq_params.bit_depth);
   }
   // Work out a size correction factor.
@@ -454,7 +514,7 @@
   }
 
   cpi->rc.q_2_frame = cpi->rc.q_1_frame;
-  cpi->rc.q_1_frame = cm->base_qindex;
+  cpi->rc.q_1_frame = cm->quant_params.base_qindex;
   cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
   if (correction_factor > 110)
     cpi->rc.rc_1_frame = -1;
@@ -503,8 +563,8 @@
                                        const AV1_COMP *cpi,
                                        double correction_factor,
                                        int best_qindex, int worst_qindex) {
-  const int use_cyclic_refresh =
-      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled;
+  const int use_cyclic_refresh = cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+                                 cpi->cyclic_refresh->apply_cyclic_refresh;
 
   // Find 'qindex' based on 'desired_bits_per_mb'.
   assert(best_qindex <= worst_qindex);
@@ -556,20 +616,14 @@
   const double correction_factor =
       get_rate_correction_factor(cpi, width, height);
   const int target_bits_per_mb =
-      (int)((uint64_t)(target_bits_per_frame) << BPER_MB_NORMBITS) / MBs;
+      (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs);
 
   int q =
       find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor,
                                   active_best_quality, active_worst_quality);
+  if (cpi->oxcf.rc_mode == AOM_CBR && has_no_stats_stage(cpi))
+    return adjust_q_cbr(cpi, q, active_worst_quality);
 
-  // In CBR mode, this makes sure q is between oscillating Qs to prevent
-  // resonance.
-  if (cpi->oxcf.rc_mode == AOM_CBR &&
-      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
-      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
-    q = clamp(q, AOMMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
-              AOMMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
-  }
   return q;
 }
 
@@ -624,7 +678,7 @@
         curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2;
   } else {
     if (!rc->is_src_frame_alt_ref &&
-        (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+        (cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame ||
          cpi->refresh_alt_ref_frame)) {
       active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
                                              : rc->last_q[INTER_FRAME];
@@ -735,7 +789,8 @@
       active_best_quality +=
           av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
     }
-  } else if (!rc->is_src_frame_alt_ref &&
+  } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
+             cpi->oxcf.gf_cbr_boost_pct &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
@@ -804,26 +859,32 @@
   return q;
 }
 
-static int gf_group_pyramid_level(const AV1_COMP *cpi) {
-  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  int this_height = gf_group->pyramid_level[gf_group->index];
-  return this_height;
+static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) {
+  return gf_group->layer_depth[gf_index];
 }
 
 static int get_active_cq_level(const RATE_CONTROL *rc,
                                const AV1EncoderConfig *const oxcf,
-                               int intra_only, int superres_denom) {
+                               int intra_only, SUPERRES_MODE superres_mode,
+                               int superres_denom) {
   static const double cq_adjust_threshold = 0.1;
   int active_cq_level = oxcf->cq_level;
   (void)intra_only;
   if (oxcf->rc_mode == AOM_CQ || oxcf->rc_mode == AOM_Q) {
     // printf("Superres %d %d %d = %d\n", superres_denom, intra_only,
     //        rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1));
-    if (oxcf->superres_mode == SUPERRES_QTHRESH &&
-        superres_denom != SCALE_NUMERATOR &&
-        !(intra_only && rc->frames_to_key <= 1)) {
-      active_cq_level =
-          AOMMAX(active_cq_level - ((superres_denom - SCALE_NUMERATOR) * 4), 0);
+    if ((superres_mode == SUPERRES_QTHRESH || superres_mode == SUPERRES_AUTO) &&
+        superres_denom != SCALE_NUMERATOR) {
+      int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO;
+      if (intra_only && rc->frames_to_key <= 1) {
+        mult = 0;
+      } else if (intra_only) {
+        mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME;
+      } else {
+        mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME;
+      }
+      active_cq_level = AOMMAX(
+          active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0);
     }
   }
   if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) {
@@ -835,6 +896,44 @@
   return active_cq_level;
 }
 
+static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf,
+                                     const RATE_CONTROL *const rc,
+                                     const GF_GROUP *const gf_group,
+                                     int gf_index, int cq_level,
+                                     int bit_depth) {
+  assert(oxcf->use_fixed_qp_offsets);
+  assert(oxcf->rc_mode == AOM_Q);
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_index];
+
+  int offset_idx = -1;
+  if (update_type == KF_UPDATE) {
+    if (rc->frames_to_key == 1) {
+      // Image / intra-only coding: ignore offsets.
+      return cq_level;
+    }
+    offset_idx = 0;
+  } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE) {
+    offset_idx = 1;
+  } else if (update_type == INTNL_ARF_UPDATE) {
+    offset_idx =
+        AOMMIN(gf_group->layer_depth[gf_index], FIXED_QP_OFFSET_COUNT - 1);
+  } else {  // Leaf level / overlay frame.
+    assert(update_type == LF_UPDATE || update_type == OVERLAY_UPDATE ||
+           update_type == INTNL_OVERLAY_UPDATE);
+    return cq_level;  // Directly Return worst quality allowed.
+  }
+  assert(offset_idx >= 0 && offset_idx < FIXED_QP_OFFSET_COUNT);
+  assert(oxcf->fixed_qp_offsets[offset_idx] >= 0);
+
+  // Get qindex offset, by first converting to 'q' and then back.
+  const double q_val_orig = av1_convert_qindex_to_q(cq_level, bit_depth);
+  const double q_val_target =
+      AOMMAX(q_val_orig - oxcf->fixed_qp_offsets[offset_idx], 0.0);
+  const int delta_qindex =
+      av1_compute_qdelta(rc, q_val_orig, q_val_target, bit_depth);
+  return AOMMAX(cq_level + delta_qindex, 0);
+}
+
 static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
                                              int height, int *bottom_index,
                                              int *top_index) {
@@ -842,13 +941,20 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   const CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm),
-                                           cm->superres_scale_denominator);
+  const int cq_level =
+      get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+                          cm->superres_scale_denominator);
+  const int bit_depth = cm->seq_params.bit_depth;
+
+  if (oxcf->use_fixed_qp_offsets) {
+    return get_q_using_fixed_offsets(oxcf, rc, &cpi->gf_group,
+                                     cpi->gf_group.index, cq_level, bit_depth);
+  }
+
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
   int q;
   int *inter_minq;
-  const int bit_depth = cm->seq_params.bit_depth;
   ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
@@ -987,196 +1093,153 @@
 
 static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
   1.00,  // INTER_NORMAL
-  1.25,  // GF_ARF_LOW
+  1.50,  // GF_ARF_LOW
   2.00,  // GF_ARF_STD
   2.00,  // KF_STD
 };
 
 int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
-  const RATE_FACTOR_LEVEL rf_lvl =
-      get_rate_factor_level(&cpi->twopass.gf_group);
+  const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
   const FRAME_TYPE frame_type = (rf_lvl == KF_STD) ? KEY_FRAME : INTER_FRAME;
-  return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q,
-                                    rate_factor_deltas[rf_lvl],
+  double rate_factor;
+
+  rate_factor = rate_factor_deltas[rf_lvl];
+  if (rf_lvl == GF_ARF_LOW) {
+    rate_factor -= (cpi->gf_group.layer_depth[cpi->gf_group.index] - 2) * 0.1;
+    rate_factor = AOMMAX(rate_factor, 1.0);
+  }
+  return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor,
                                     cpi->common.seq_params.bit_depth);
 }
 
-#define STATIC_MOTION_THRESH 95
-static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
-                                         int height, int *bottom_index,
-                                         int *top_index, int *arf_q) {
+// This unrestricted Q selection on CQ mode is useful when testing new features,
+// but may lead to Q being out of range on current RC restrictions
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+static int rc_pick_q_and_bounds_one_pass_cq(const AV1_COMP *cpi, int width,
+                                            int height, int *bottom_index,
+                                            int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
   const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm),
                                            cm->superres_scale_denominator);
-  int active_best_quality;
-  int active_worst_quality = cpi->twopass.active_worst_quality;
-  int q;
-  int *inter_minq;
   const int bit_depth = cm->seq_params.bit_depth;
-  ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+  const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
+  (void)width;
+  (void)height;
+  *top_index = q;
+  *bottom_index = q;
 
-  const int is_intrl_arf_boost =
-      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+  return q;
+}
+#endif  // USE_UNRESTRICTED_Q_IN_CQ_MODE
 
-  if (frame_is_intra_only(cm)) {
-    if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
-      // If the next frame is also a key frame or the current frame is the
-      // only frame in the sequence in AOM_Q mode, just use the cq_level
-      // as q.
-      active_best_quality = cq_level;
-      active_worst_quality = cq_level;
-    } else if (cm->current_frame.frame_type == KEY_FRAME &&
-               cm->show_frame == 0) {
-      // Handle the special case for forward reference key frames.
-      // Increase the boost because this keyframe is used as a forward and
-      // backward reference.
-      const int qindex = rc->last_boosted_qindex;
-      const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
-      const int delta_qindex = av1_compute_qdelta(
-          rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
+#define STATIC_MOTION_THRESH 95
+static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+                                            int height, int *active_best,
+                                            int *active_worst, int cq_level,
+                                            int is_fwd_kf) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  int active_best_quality;
+  int active_worst_quality = *active_worst;
+  const int bit_depth = cm->seq_params.bit_depth;
+
+  if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
+    // If the next frame is also a key frame or the current frame is the
+    // only frame in the sequence in AOM_Q mode, just use the cq_level
+    // as q.
+    active_best_quality = cq_level;
+    active_worst_quality = cq_level;
+  } else if (is_fwd_kf) {
+    // Handle the special case for forward reference key frames.
+    // Increase the boost because this keyframe is used as a forward and
+    // backward reference.
+    const int qindex = rc->last_boosted_qindex;
+    const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+    const int delta_qindex = av1_compute_qdelta(
+        rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
+    active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+  } else if (rc->this_key_frame_forced) {
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    double last_boosted_q;
+    int delta_qindex;
+    int qindex;
+
+    if (is_stat_consumption_stage_twopass(cpi) &&
+        cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+      active_best_quality = qindex;
+      last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+      delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+                                        last_boosted_q * 1.25, bit_depth);
+      active_worst_quality =
+          AOMMIN(qindex + delta_qindex, active_worst_quality);
+    } else {
+      qindex = rc->last_boosted_qindex;
+      last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+      delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+                                        last_boosted_q * 0.50, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
-      // Update the arf_q since the forward keyframe is replacing the ALTREF
-      *arf_q = active_best_quality;
-    } else if (rc->this_key_frame_forced) {
-      // Handle the special case for key frames forced when we have reached
-      // the maximum key frame interval. Here force the Q to a range
-      // based on the ambient Q to reduce the risk of popping.
-      double last_boosted_q;
-      int delta_qindex;
-      int qindex;
-
-      if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-        qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
-        active_best_quality = qindex;
-        last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
-        delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 1.25, bit_depth);
-        active_worst_quality =
-            AOMMIN(qindex + delta_qindex, active_worst_quality);
-      } else {
-        qindex = rc->last_boosted_qindex;
-        last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
-        delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 0.50, bit_depth);
-        active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
-      }
-    } else {
-      // Not forced keyframe.
-      double q_adj_factor = 1.0;
-      double q_val;
-
-      // Baseline value derived from cpi->active_worst_quality and kf boost.
-      active_best_quality =
-          get_kf_active_quality(rc, active_worst_quality, bit_depth);
-
-      if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
-        active_best_quality /= 3;
-      }
-
-      // Allow somewhat lower kf minq with small image formats.
-      if ((width * height) <= (352 * 288)) {
-        q_adj_factor -= 0.25;
-      }
-
-      // Make a further adjustment based on the kf zero motion measure.
-      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
-
-      // Convert the adjustment factor to a qindex delta
-      // on active_best_quality.
-      q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
-      active_best_quality +=
-          av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
-    }
-  } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || is_intrl_arf_boost ||
-              cpi->refresh_alt_ref_frame)) {
-    // Use the lower of active_worst_quality and recent
-    // average Q as basis for GF/ARF best Q limit unless last frame was
-    // a key frame.
-    if (rc->frames_since_key > 1 &&
-        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
-      q = rc->avg_frame_qindex[INTER_FRAME];
-    } else {
-      q = active_worst_quality;
-    }
-    // For constrained quality dont allow Q less than the cq level
-    if (oxcf->rc_mode == AOM_CQ) {
-      if (q < cq_level) q = cq_level;
-
-      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-
-      // Constrained quality use slightly lower active best.
-      active_best_quality = active_best_quality * 15 / 16;
-
-      if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
-        const int min_boost = get_gf_high_motion_quality(q, bit_depth);
-        const int boost = min_boost - active_best_quality;
-
-        active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-        *arf_q = active_best_quality;
-      } else if (is_intrl_arf_boost) {
-        assert(rc->arf_q >= 0);  // Ensure it is set to a valid value.
-        active_best_quality = rc->arf_q;
-        int this_height = gf_group_pyramid_level(cpi);
-        while (this_height < gf_group->pyramid_height) {
-          active_best_quality = (active_best_quality + cq_level + 1) / 2;
-          ++this_height;
-        }
-      }
-    } else if (oxcf->rc_mode == AOM_Q) {
-      if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
-        active_best_quality = cq_level;
-      } else {
-        if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
-          active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-          const int min_boost = get_gf_high_motion_quality(q, bit_depth);
-          const int boost = min_boost - active_best_quality;
-
-          active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-          *arf_q = active_best_quality;
-        } else {
-          assert(rc->arf_q >= 0);  // Ensure it is set to a valid value.
-          assert(is_intrl_arf_boost);
-          active_best_quality = rc->arf_q;
-          int this_height = gf_group_pyramid_level(cpi);
-          while (this_height < gf_group->pyramid_height) {
-            active_best_quality = (active_best_quality + cq_level + 1) / 2;
-            ++this_height;
-          }
-        }
-      }
-    } else {
-      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-      const int min_boost = get_gf_high_motion_quality(q, bit_depth);
-      const int boost = min_boost - active_best_quality;
-
-      active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-      if (is_intrl_arf_boost) {
-        int this_height = gf_group_pyramid_level(cpi);
-        while (this_height < gf_group->pyramid_height) {
-          active_best_quality =
-              (active_best_quality + active_worst_quality + 1) / 2;
-          ++this_height;
-        }
-      }
     }
   } else {
-    if (oxcf->rc_mode == AOM_Q) {
-      active_best_quality = cq_level;
-    } else {
-      active_best_quality = inter_minq[active_worst_quality];
+    // Not forced keyframe.
+    double q_adj_factor = 1.0;
+    double q_val;
 
-      // For the constrained quality mode we don't want
-      // q to fall below the cq level.
-      if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
-        active_best_quality = cq_level;
-      }
+    // Baseline value derived from cpi->active_worst_quality and kf boost.
+    active_best_quality =
+        get_kf_active_quality(rc, active_worst_quality, bit_depth);
+
+    if (is_stat_consumption_stage_twopass(cpi) &&
+        cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+      active_best_quality /= 3;
+    }
+
+    // Allow somewhat lower kf minq with small image formats.
+    if ((width * height) <= (352 * 288)) {
+      q_adj_factor -= 0.25;
+    }
+
+    // Make a further adjustment based on the kf zero motion measure.
+    if (is_stat_consumption_stage_twopass(cpi))
+      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+    // Convert the adjustment factor to a qindex delta
+    // on active_best_quality.
+    q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+    active_best_quality +=
+        av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+
+    // Tweak active_best_quality for AOM_Q mode when superres is on, as this
+    // will be used directly as 'q' later.
+    if (oxcf->rc_mode == AOM_Q &&
+        (cpi->superres_mode == SUPERRES_QTHRESH ||
+         cpi->superres_mode == SUPERRES_AUTO) &&
+        cm->superres_scale_denominator != SCALE_NUMERATOR) {
+      active_best_quality =
+          AOMMAX(active_best_quality -
+                     ((cm->superres_scale_denominator - SCALE_NUMERATOR) *
+                      SUPERRES_QADJ_PER_DENOM_KEYFRAME),
+                 0);
     }
   }
+  *active_best = active_best_quality;
+  *active_worst = active_worst_quality;
+}
 
+static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
+                                                 const int is_intrl_arf_boost,
+                                                 int *active_worst,
+                                                 int *active_best) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const int bit_depth = cpi->common.seq_params.bit_depth;
+  int active_best_quality = *active_best;
+  int active_worst_quality = *active_worst;
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
   if (cpi->oxcf.rc_mode != AOM_Q) {
@@ -1195,6 +1258,7 @@
   }
 
   aom_clear_system_state();
+#ifndef STRICT_RC
   // Static forced key frames Q restrictions dealt with elsewhere.
   if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
       (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
@@ -1202,6 +1266,7 @@
     active_worst_quality =
         AOMMAX(active_worst_quality + qdelta, active_best_quality);
   }
+#endif
 
   // Modify active_best_quality for downscaled normal frames.
   if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
@@ -1216,7 +1281,18 @@
   active_worst_quality =
       clamp(active_worst_quality, active_best_quality, rc->worst_quality);
 
-  if (oxcf->rc_mode == AOM_Q ||
+  *active_best = active_best_quality;
+  *active_worst = active_worst_quality;
+}
+
+static int get_q(const AV1_COMP *cpi, const int width, const int height,
+                 const int active_worst_quality,
+                 const int active_best_quality) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int q;
+
+  if (cpi->oxcf.rc_mode == AOM_Q ||
       (frame_is_intra_only(cm) && !rc->this_key_frame_forced &&
        cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
        rc->frames_to_key > 1)) {
@@ -1230,52 +1306,196 @@
       q = AOMMIN(rc->last_boosted_qindex,
                  (active_best_quality + active_worst_quality) / 2);
     }
+    q = clamp(q, active_best_quality, active_worst_quality);
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
                           active_worst_quality, width, height);
     if (q > active_worst_quality) {
       // Special case when we are targeting the max allowed rate.
-      if (rc->this_frame_target >= rc->max_frame_bandwidth)
-        active_worst_quality = q;
-      else
+      if (rc->this_frame_target < rc->max_frame_bandwidth) {
         q = active_worst_quality;
+      }
+    }
+    q = AOMMAX(q, active_best_quality);
+  }
+  return q;
+}
+
+// Returns |active_best_quality| for an inter frame.
+// The |active_best_quality| depends on different rate control modes:
+// VBR, Q, CQ, CBR.
+// The returning active_best_quality could further be adjusted in
+// adjust_active_best_and_worst_quality().
+static int get_active_best_quality(const AV1_COMP *const cpi,
+                                   const int active_worst_quality,
+                                   const int cq_level, const int gf_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bit_depth = cm->seq_params.bit_depth;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  const int rc_mode = oxcf->rc_mode;
+  int *inter_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+  int active_best_quality = 0;
+  const int is_intrl_arf_boost =
+      gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+  const int is_leaf_frame = !(cpi->refresh_golden_frame ||
+                              cpi->refresh_alt_ref_frame || is_intrl_arf_boost);
+  const int is_overlay_frame = rc->is_src_frame_alt_ref;
+
+  if (is_leaf_frame || is_overlay_frame) {
+    if (rc_mode == AOM_Q) return cq_level;
+
+    active_best_quality = inter_minq[active_worst_quality];
+    // For the constrained quality mode we don't want
+    // q to fall below the cq level.
+    if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+      active_best_quality = cq_level;
+    }
+    return active_best_quality;
+  }
+
+  // TODO(chengchen): can we remove this condition?
+  if (rc_mode == AOM_Q && !cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
+    return cq_level;
+  }
+
+  // Determine active_best_quality for frames that are not leaf or overlay.
+  int q = active_worst_quality;
+  // Use the lower of active_worst_quality and recent
+  // average Q as basis for GF/ARF best Q limit unless last frame was
+  // a key frame.
+  if (rc->frames_since_key > 1 &&
+      rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+    q = rc->avg_frame_qindex[INTER_FRAME];
+  }
+  if (rc_mode == AOM_CQ && q < cq_level) q = cq_level;
+  active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+  // Constrained quality use slightly lower active best.
+  if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16;
+  const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+  const int boost = min_boost - active_best_quality;
+  active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+  if (!is_intrl_arf_boost) return active_best_quality;
+
+  if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = rc->arf_q;
+  int this_height = gf_group_pyramid_level(gf_group, gf_index);
+  while (this_height > 1) {
+    active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+    --this_height;
+  }
+  return active_best_quality;
+}
+
+static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+                                         int height, int gf_index,
+                                         int *bottom_index, int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  const int cq_level =
+      get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+                          cm->superres_scale_denominator);
+  const int bit_depth = cm->seq_params.bit_depth;
+
+  if (oxcf->use_fixed_qp_offsets) {
+    return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_group->index,
+                                     cq_level, bit_depth);
+  }
+
+  int active_best_quality = 0;
+  int active_worst_quality = rc->active_worst_quality;
+  int q;
+
+  const int is_intrl_arf_boost =
+      gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+  if (frame_is_intra_only(cm)) {
+    const int is_fwd_kf =
+        cm->current_frame.frame_type == KEY_FRAME && cm->show_frame == 0;
+    get_intra_q_and_bounds_two_pass(cpi, width, height, &active_best_quality,
+                                    &active_worst_quality, cq_level, is_fwd_kf);
+#ifdef STRICT_RC
+    active_best_quality = 0;
+#endif
+  } else {
+#ifdef STRICT_RC
+    //  Active best quality limited by previous layer.
+    const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
+    active_best_quality =
+        rc->active_best_quality[pyramid_level - 1] +
+        AOMMAX((rc->active_best_quality[pyramid_level - 1] / 10), 5);
+#else
+    active_best_quality =
+        get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+#endif
+
+    // For alt_ref and GF frames (including internal arf frames) adjust the
+    // worst allowed quality as well. This insures that even on hard
+    // sections we dont clamp the Q at the same value for arf frames and
+    // leaf (non arf) frames. This is important to the TPL model which assumes
+    // Q drops with each arf level.
+    if (!(rc->is_src_frame_alt_ref) &&
+        (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame ||
+         is_intrl_arf_boost)) {
+      active_worst_quality =
+          (active_best_quality + (3 * active_worst_quality) + 2) / 4;
     }
   }
-  clamp(q, active_best_quality, active_worst_quality);
 
+  adjust_active_best_and_worst_quality(
+      cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality);
+  q = get_q(cpi, width, height, active_worst_quality, active_best_quality);
+
+  // Special case when we are targeting the max allowed rate.
+  if (rc->this_frame_target >= rc->max_frame_bandwidth &&
+      q > active_worst_quality) {
+    active_worst_quality = q;
+  }
+
+#ifdef STRICT_RC
+  *top_index = rc->worst_quality;
+#else
   *top_index = active_worst_quality;
+#endif
   *bottom_index = active_best_quality;
 
   assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
   assert(*bottom_index <= rc->worst_quality &&
          *bottom_index >= rc->best_quality);
   assert(q <= rc->worst_quality && q >= rc->best_quality);
+
   return q;
 }
 
-int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height,
-                             int *bottom_index, int *top_index) {
+int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width,
+                             int height, int gf_index, int *bottom_index,
+                             int *top_index) {
   int q;
-  if (cpi->oxcf.pass == 0) {
+  // TODO(sarahparker) merge onepass vbr and altref q computation
+  // with two pass
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  if ((cpi->oxcf.rc_mode != AOM_Q ||
+       gf_group->update_type[gf_index] == ARF_UPDATE) &&
+      has_no_stats_stage(cpi)) {
     if (cpi->oxcf.rc_mode == AOM_CBR)
       q = rc_pick_q_and_bounds_one_pass_cbr(cpi, width, height, bottom_index,
                                             top_index);
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+    else if (cpi->oxcf.rc_mode == AOM_CQ)
+      q = rc_pick_q_and_bounds_one_pass_cq(cpi, width, height, bottom_index,
+                                           top_index);
+#endif  // USE_UNRESTRICTED_Q_IN_CQ_MODE
     else
       q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
                                             top_index);
   } else {
-    assert(cpi->oxcf.pass == 2 && "invalid encode pass");
-
-    GF_GROUP *gf_group = &cpi->twopass.gf_group;
-    int arf_q = -1;  // Initialize to invalid value, for sanity check later.
-
-    q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index,
-                                      top_index, &arf_q);
-
-    if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
-      cpi->rc.arf_q = arf_q;
-    }
+    q = rc_pick_q_and_bounds_two_pass(cpi, width, height, gf_index,
+                                      bottom_index, top_index);
   }
+  if (gf_group->update_type[gf_index] == ARF_UPDATE) rc->arf_q = q;
 
   return q;
 }
@@ -1289,15 +1509,15 @@
   } else {
     // For very small rate targets where the fractional adjustment
     // may be tiny make sure there is at least a minimum range.
-    const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
-    *frame_under_shoot_limit = AOMMAX(frame_target - tolerance - 200, 0);
+    const int tolerance =
+        AOMMAX(100, (cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+    *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
     *frame_over_shoot_limit =
-        AOMMIN(frame_target + tolerance + 200, cpi->rc.max_frame_bandwidth);
+        AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
   }
 }
 
-static void rc_set_frame_target(AV1_COMP *cpi, int target, int width,
-                                int height) {
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
   const AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
 
@@ -1310,7 +1530,7 @@
 
   // Target rate per SB64 (including partial SB64s.
   rc->sb64_target_rate =
-      (int)((int64_t)rc->this_frame_target * 64 * 64) / (width * height);
+      (int)(((int64_t)rc->this_frame_target << 12) / (width * height));
 }
 
 static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
@@ -1327,34 +1547,18 @@
 
 static void update_golden_frame_stats(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  const TWO_PASS *const twopass = &cpi->twopass;
-  const GF_GROUP *const gf_group = &twopass->gf_group;
-  const int is_intrnl_arf =
-      cpi->oxcf.pass == 2
-          ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
-          : cpi->refresh_alt2_ref_frame;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
 
   // Update the Golden frame usage counts.
-  // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
-  //                   only the virtual indices for the reference frame will be
-  //                   updated and cpi->refresh_golden_frame will still be zero.
   if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
-    // We will not use internal overlay frames to replace the golden frame
-    if (!rc->is_src_frame_internal_arf) {
-      // this frame refreshes means next frames don't unless specified by user
-      rc->frames_since_golden = 0;
-    }
+    rc->frames_since_golden = 0;
 
     // If we are not using alt ref in the up and coming group clear the arf
     // active flag. In multi arf group case, if the index is not 0 then
     // we are overlaying a mid group arf so should not reset the flag.
-    if (cpi->oxcf.pass == 2) {
-      if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
-        rc->source_alt_ref_active = 0;
-    } else if (!rc->source_alt_ref_pending) {
+    if (!rc->source_alt_ref_pending && (gf_group->index == 0))
       rc->source_alt_ref_active = 0;
-    }
-  } else if (!cpi->refresh_alt_ref_frame && !is_intrnl_arf) {
+  } else if (cpi->common.show_frame) {
     rc->frames_since_golden++;
   }
 }
@@ -1363,18 +1567,12 @@
   const AV1_COMMON *const cm = &cpi->common;
   const CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
-  const TWO_PASS *const twopass = &cpi->twopass;
-  const GF_GROUP *const gf_group = &twopass->gf_group;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+
   const int is_intrnl_arf =
-      cpi->oxcf.pass == 2
-          ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
-          : cpi->refresh_alt2_ref_frame;
+      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
 
-  const int qindex = cm->base_qindex;
-
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-    av1_cyclic_refresh_postencode(cpi);
-  }
+  const int qindex = cm->quant_params.base_qindex;
 
   // Update rate control heuristics
   rc->projected_frame_size = (int)(bytes_used << 3);
@@ -1388,9 +1586,10 @@
     rc->avg_frame_qindex[KEY_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
   } else {
-    if (!rc->is_src_frame_alt_ref &&
-        !(cpi->refresh_golden_frame || is_intrnl_arf ||
-          cpi->refresh_alt_ref_frame)) {
+    if ((cpi->use_svc && cpi->oxcf.rc_mode == AOM_CBR) ||
+        (!rc->is_src_frame_alt_ref &&
+         !(cpi->refresh_golden_frame || is_intrnl_arf ||
+           cpi->refresh_alt_ref_frame))) {
       rc->last_q[INTER_FRAME] = qindex;
       rc->avg_frame_qindex[INTER_FRAME] =
           ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
@@ -1419,6 +1618,7 @@
   if (current_frame->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
 
   update_buffer_level(cpi, rc->projected_frame_size);
+  rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
 
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
@@ -1427,13 +1627,13 @@
         (int)(rc->this_frame_target /
               resize_rate_factor(cpi, cm->width, cm->height));
   if (current_frame->frame_type != KEY_FRAME) {
-    rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+    rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
         rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
-    rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+    rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
         rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
-    rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+    rc->long_rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
         rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
-    rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+    rc->long_rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
         rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
   }
 
@@ -1469,227 +1669,6 @@
   cpi->rc.rc_1_frame = 0;
 }
 
-// Use this macro to turn on/off use of alt-refs in one-pass mode.
-#define USE_ALTREF_FOR_ONE_PASS 1
-
-static int calc_pframe_target_size_one_pass_vbr(
-    const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
-  static const int af_ratio = 10;
-  const RATE_CONTROL *const rc = &cpi->rc;
-  int target;
-#if USE_ALTREF_FOR_ONE_PASS
-  if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
-      frame_update_type == ARF_UPDATE) {
-    target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
-             (rc->baseline_gf_interval + af_ratio - 1);
-  } else {
-    target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
-             (rc->baseline_gf_interval + af_ratio - 1);
-  }
-#else
-  target = rc->avg_frame_bandwidth;
-#endif
-  return av1_rc_clamp_pframe_target_size(cpi, target, frame_update_type);
-}
-
-static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
-  static const int kf_ratio = 25;
-  const RATE_CONTROL *rc = &cpi->rc;
-  const int target = rc->avg_frame_bandwidth * kf_ratio;
-  return av1_rc_clamp_iframe_target_size(cpi, target);
-}
-
-void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi,
-                                    FRAME_UPDATE_TYPE *const frame_update_type,
-                                    EncodeFrameParams *const frame_params,
-                                    unsigned int frame_flags) {
-  AV1_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
-  CurrentFrame *const current_frame = &cm->current_frame;
-  int target;
-  int altref_enabled = is_altref_enabled(cpi);
-  int sframe_dist = cpi->oxcf.sframe_dist;
-  int sframe_mode = cpi->oxcf.sframe_mode;
-  int sframe_enabled = cpi->oxcf.sframe_enabled;
-  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
-  if (*frame_update_type != ARF_UPDATE &&
-      (current_frame->frame_number == 0 || (frame_flags & FRAMEFLAGS_KEY) ||
-       rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
-    frame_params->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
-        current_frame->frame_number != 0 && rc->frames_to_key == 0;
-    rc->frames_to_key = cpi->oxcf.key_freq;
-    rc->kf_boost = DEFAULT_KF_BOOST;
-    rc->source_alt_ref_active = 0;
-  } else {
-    frame_params->frame_type = INTER_FRAME;
-    if (sframe_enabled) {
-      if (altref_enabled) {
-        if (sframe_mode == 1) {
-          // sframe_mode == 1: insert sframe if it matches altref frame.
-
-          if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_number != 0 &&
-              *frame_update_type == ARF_UPDATE) {
-            frame_params->frame_type = S_FRAME;
-          }
-        } else {
-          // sframe_mode != 1: if sframe will be inserted at the next available
-          // altref frame
-
-          if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_number != 0) {
-            rc->sframe_due = 1;
-          }
-
-          if (rc->sframe_due && *frame_update_type == ARF_UPDATE) {
-            frame_params->frame_type = S_FRAME;
-            rc->sframe_due = 0;
-          }
-        }
-      } else {
-        if (current_frame->frame_number % sframe_dist == 0 &&
-            current_frame->frame_number != 0) {
-          frame_params->frame_type = S_FRAME;
-        }
-      }
-    }
-  }
-  if (rc->frames_till_gf_update_due == 0) {
-    rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
-    if (rc->frames_till_gf_update_due > rc->frames_to_key) {
-      rc->frames_till_gf_update_due = rc->frames_to_key;
-      rc->constrained_gf_group = 1;
-    } else {
-      rc->constrained_gf_group = 0;
-    }
-    if (*frame_update_type == LF_UPDATE) *frame_update_type = GF_UPDATE;
-    rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
-    rc->gfu_boost = DEFAULT_GF_BOOST;
-  }
-
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-    av1_cyclic_refresh_update_parameters(cpi);
-
-  if (frame_params->frame_type == KEY_FRAME)
-    target = calc_iframe_target_size_one_pass_vbr(cpi);
-  else
-    target = calc_pframe_target_size_one_pass_vbr(cpi, *frame_update_type);
-  rc_set_frame_target(cpi, target, cm->width, cm->height);
-}
-
-static int calc_pframe_target_size_one_pass_cbr(
-    const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  const RATE_CONTROL *rc = &cpi->rc;
-  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
-  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
-  int min_frame_target =
-      AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
-  int target;
-
-  if (oxcf->gf_cbr_boost_pct) {
-    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
-    if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
-      target =
-          (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
-          (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
-    } else {
-      target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
-               (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
-    }
-  } else {
-    target = rc->avg_frame_bandwidth;
-  }
-
-  if (diff > 0) {
-    // Lower the target bandwidth for this frame.
-    const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
-    target -= (target * pct_low) / 200;
-  } else if (diff < 0) {
-    // Increase the target bandwidth for this frame.
-    const int pct_high =
-        (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
-    target += (target * pct_high) / 200;
-  }
-  if (oxcf->rc_max_inter_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
-    target = AOMMIN(target, max_rate);
-  }
-  return AOMMAX(min_frame_target, target);
-}
-
-static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
-  const RATE_CONTROL *rc = &cpi->rc;
-  int target;
-  if (cpi->common.current_frame.frame_number == 0) {
-    target = ((rc->starting_buffer_level / 2) > INT_MAX)
-                 ? INT_MAX
-                 : (int)(rc->starting_buffer_level / 2);
-  } else {
-    int kf_boost = 32;
-    double framerate = cpi->framerate;
-
-    kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
-    if (rc->frames_since_key < framerate / 2) {
-      kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
-    }
-    target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
-  }
-  return av1_rc_clamp_iframe_target_size(cpi, target);
-}
-
-void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi,
-                                    FRAME_UPDATE_TYPE *const frame_update_type,
-                                    EncodeFrameParams *const frame_params,
-                                    unsigned int frame_flags) {
-  AV1_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
-  CurrentFrame *const current_frame = &cm->current_frame;
-  int target;
-  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
-  if ((current_frame->frame_number == 0 || (frame_flags & FRAMEFLAGS_KEY) ||
-       rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
-    frame_params->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
-        current_frame->frame_number != 0 && rc->frames_to_key == 0;
-    rc->frames_to_key = cpi->oxcf.key_freq;
-    rc->kf_boost = DEFAULT_KF_BOOST;
-    rc->source_alt_ref_active = 0;
-  } else {
-    frame_params->frame_type = INTER_FRAME;
-  }
-  if (rc->frames_till_gf_update_due == 0) {
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-      av1_cyclic_refresh_set_golden_update(cpi);
-    else
-      rc->baseline_gf_interval =
-          (rc->min_gf_interval + rc->max_gf_interval) / 2;
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
-    if (rc->frames_till_gf_update_due > rc->frames_to_key)
-      rc->frames_till_gf_update_due = rc->frames_to_key;
-    if (*frame_update_type == LF_UPDATE) *frame_update_type = GF_UPDATE;
-    rc->gfu_boost = DEFAULT_GF_BOOST;
-  }
-
-  // Any update/change of global cyclic refresh parameters (amount/delta-qp)
-  // should be done here, before the frame qp is selected.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-    av1_cyclic_refresh_update_parameters(cpi);
-
-  if (frame_params->frame_type == KEY_FRAME)
-    target = calc_iframe_target_size_one_pass_cbr(cpi);
-  else
-    target = calc_pframe_target_size_one_pass_cbr(cpi, *frame_update_type);
-
-  rc_set_frame_target(cpi, target, cm->width, cm->height);
-  // TODO(afergs): Decide whether to scale up, down, or not at all
-}
-
 int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
                     int best_qindex, int worst_qindex) {
   assert(best_qindex <= worst_qindex);
@@ -1768,7 +1747,7 @@
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
   // Special case code for 1 pass fixed Q mode tests
-  if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
+  if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
     rc->max_gf_interval = FIXED_GF_INTERVAL;
     rc->min_gf_interval = FIXED_GF_INTERVAL;
     rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
@@ -1782,9 +1761,15 @@
     if (rc->max_gf_interval == 0)
       rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
           cpi->framerate, rc->min_gf_interval);
-
-    // Extended max interval for genuinely static scenes like slide shows.
-    rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+    /*
+     * Extended max interval for genuinely static scenes like slide shows.
+     * The no.of.stats available in the case of LAP is limited,
+     * hence setting to max_gf_interval.
+     */
+    if (cpi->lap_enabled)
+      rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
+    else
+      rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
 
     if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
       rc->max_gf_interval = rc->static_scene_max_gf_interval;
@@ -1828,28 +1813,21 @@
 static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
   RATE_CONTROL *const rc = &cpi->rc;
   int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
-  int max_delta;
-  double position_factor = 1.0;
+  const int stats_count =
+      cpi->twopass.stats_buf_ctx->total_stats != NULL
+          ? (int)cpi->twopass.stats_buf_ctx->total_stats->count
+          : 0;
+  const int frame_window = AOMMIN(
+      16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
 
-  // How far through the clip are we.
-  // This number is used to damp the per frame rate correction.
-  // Range 0 - 1.0
-  if (cpi->twopass.total_stats.count != 0.) {
-    position_factor = sqrt((double)cpi->common.current_frame.frame_number /
-                           cpi->twopass.total_stats.count);
-  }
-  max_delta = (int)(position_factor *
-                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+  if (frame_window > 0) {
+    const int max_delta =
+        AOMMIN(abs((int)(vbr_bits_off_target / frame_window)),
+               (*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
 
-  // vbr_bits_off_target > 0 means we have extra bits to spend
-  if (vbr_bits_off_target > 0) {
-    *this_frame_target += (vbr_bits_off_target > max_delta)
-                              ? max_delta
-                              : (int)vbr_bits_off_target;
-  } else {
-    *this_frame_target -= (vbr_bits_off_target < -max_delta)
-                              ? max_delta
-                              : (int)-vbr_bits_off_target;
+    // vbr_bits_off_target > 0 means we have extra bits to spend
+    // vbr_bits_off_target < 0 we are currently overshooting
+    *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
   }
 
   // Fast redistribution of bits arising from massive local undershoot.
@@ -1874,5 +1852,266 @@
   // Correction to rate target based on prior over or under shoot.
   if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ)
     vbr_rate_correction(cpi, &target_rate);
-  rc_set_frame_target(cpi, target_rate, width, height);
+  av1_rc_set_frame_target(cpi, target_rate, width, height);
+}
+
+int av1_calc_pframe_target_size_one_pass_vbr(
+    const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
+  static const int af_ratio = 10;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int64_t target;
+#if USE_ALTREF_FOR_ONE_PASS
+  if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
+      frame_update_type == ARF_UPDATE) {
+    target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+              af_ratio) /
+             (rc->baseline_gf_interval + af_ratio - 1);
+  } else {
+    target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+             (rc->baseline_gf_interval + af_ratio - 1);
+  }
+  if (target > INT_MAX) target = INT_MAX;
+#else
+  target = rc->avg_frame_bandwidth;
+#endif
+  return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type);
+}
+
+int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+  static const int kf_ratio = 25;
+  const RATE_CONTROL *rc = &cpi->rc;
+  const int target = rc->avg_frame_bandwidth * kf_ratio;
+  return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+int av1_calc_pframe_target_size_one_pass_cbr(
+    const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const RATE_CONTROL *rc = &cpi->rc;
+  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+  int min_frame_target =
+      AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+  int target;
+
+  if (oxcf->gf_cbr_boost_pct) {
+    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+    if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
+      target =
+          (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+          (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+    } else {
+      target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+               (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+    }
+  } else {
+    target = rc->avg_frame_bandwidth;
+  }
+  if (cpi->use_svc) {
+    // Note that for layers, avg_frame_bandwidth is the cumulative
+    // per-frame-bandwidth. For the target size of this frame, use the
+    // layer average frame size (i.e., non-cumulative per-frame-bw).
+    int layer =
+        LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
+                         cpi->svc.number_temporal_layers);
+    const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+    target = lc->avg_frame_size;
+    min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+  }
+  if (diff > 0) {
+    // Lower the target bandwidth for this frame.
+    const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
+    // Increase the target bandwidth for this frame.
+    const int pct_high =
+        (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    target += (target * pct_high) / 200;
+  }
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate =
+        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    target = AOMMIN(target, max_rate);
+  }
+  return AOMMAX(min_frame_target, target);
+}
+
+int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  int target;
+  if (cpi->common.current_frame.frame_number == 0) {
+    target = ((rc->starting_buffer_level / 2) > INT_MAX)
+                 ? INT_MAX
+                 : (int)(rc->starting_buffer_level / 2);
+  } else {
+    int kf_boost = 32;
+    double framerate = cpi->framerate;
+
+    kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+    if (rc->frames_since_key < framerate / 2) {
+      kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+    }
+    target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+  }
+  return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+static void set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
+  AV1_COMMON *const cm = &cpi->common;
+  ExternalFlags *const ext_flags = &cpi->ext_flags;
+  SVC *const svc = &cpi->svc;
+  // Specify the reference prediction structure, for 1 layer nonrd mode.
+  // Current structure is to use 3 references (LAST, GOLDEN, ALTREF),
+  // where ALT_REF always behind current by lag_alt frames, and GOLDEN is
+  // either updated on LAST with period baseline_gf_interval (fixed slot)
+  // or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7).
+  const int gld_fixed_slot = 1;
+  const unsigned int lag_alt = 4;
+  int last_idx = 0;
+  int last_idx_refresh = 0;
+  int gld_idx = 0;
+  int alt_ref_idx = 0;
+  ext_flags->refresh_frame_flags_pending = 1;
+  svc->external_ref_frame_config = 1;
+  ext_flags->ref_frame_flags = 0;
+  ext_flags->refresh_last_frame = 1;
+  ext_flags->refresh_golden_frame = 0;
+  ext_flags->refresh_alt_ref_frame = 0;
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) svc->ref_idx[i] = 7;
+  for (int i = 0; i < REF_FRAMES; ++i) svc->refresh[i] = 0;
+  // Always reference LAST, GOLDEN, ALTREF
+  ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+  const int sh = 7 - gld_fixed_slot;
+  // Moving index slot for last: 0 - (sh - 1).
+  if (cm->current_frame.frame_number > 1)
+    last_idx = ((cm->current_frame.frame_number - 1) % sh);
+  // Moving index for refresh of last: one ahead for next frame.
+  last_idx_refresh = (cm->current_frame.frame_number % sh);
+  gld_idx = 6;
+  if (!gld_fixed_slot) {
+    gld_idx = 7;
+    const unsigned int lag_gld = 7;  // Must be <= 7.
+    // Moving index for gld_ref, lag behind current by gld_interval frames.
+    if (cm->current_frame.frame_number > lag_gld)
+      gld_idx = ((cm->current_frame.frame_number - lag_gld) % sh);
+  }
+  // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+  if (cm->current_frame.frame_number > lag_alt)
+    alt_ref_idx = ((cm->current_frame.frame_number - lag_alt) % sh);
+  svc->ref_idx[0] = last_idx;          // LAST
+  svc->ref_idx[1] = last_idx_refresh;  // LAST2 (for refresh of last).
+  svc->ref_idx[3] = gld_idx;           // GOLDEN
+  svc->ref_idx[6] = alt_ref_idx;       // ALT_REF
+  // Refresh this slot, which will become LAST on next frame.
+  svc->refresh[last_idx_refresh] = 1;
+  // Update GOLDEN on period for fixed slot case.
+  if (gld_fixed_slot && gf_update) {
+    ext_flags->refresh_golden_frame = 1;
+    svc->refresh[gld_idx] = 1;
+  }
+}
+
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+void av1_get_one_pass_rt_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  int gf_update = 0;
+  int target;
+  const int resize_pending =
+      (resize_pending_params->width && resize_pending_params->height &&
+       (cm->width != resize_pending_params->width ||
+        cm->height != resize_pending_params->height));
+  // Turn this on to explicitly set the reference structure rather than
+  // relying on internal/default structure.
+  const int set_reference_structure = 1;
+  if (cpi->use_svc) {
+    av1_update_temporal_layer_framerate(cpi);
+    av1_restore_layer_context(cpi);
+  }
+  if ((!cpi->use_svc && rc->frames_to_key == 0) ||
+      (cpi->use_svc && cpi->svc.spatial_layer_id == 0 &&
+       cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) ||
+      (frame_flags & FRAMEFLAGS_KEY)) {
+    frame_params->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced =
+        cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST_RT;
+    rc->source_alt_ref_active = 0;
+    gf_group->update_type[gf_group->index] = KF_UPDATE;
+    if (cpi->use_svc && cm->current_frame.frame_number > 0)
+      av1_svc_reset_temporal_layers(cpi, 1);
+  } else {
+    frame_params->frame_type = INTER_FRAME;
+    gf_group->update_type[gf_group->index] = LF_UPDATE;
+  }
+  // GF update based on frames_till_gf_update_due, also
+  // force update on resize pending frame.
+  if ((resize_pending || rc->frames_till_gf_update_due == 0) &&
+      cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      av1_cyclic_refresh_set_golden_update(cpi);
+    else
+      rc->baseline_gf_interval = MAX_GF_INTERVAL;
+    if (rc->baseline_gf_interval > rc->frames_to_key)
+      rc->baseline_gf_interval = rc->frames_to_key;
+    rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+    rc->constrained_gf_group =
+        (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    gf_group->index = 0;
+    // SVC does not use GF as periodic boost.
+    // TODO(marpan): Find better way to disable this for SVC.
+    if (cpi->use_svc) {
+      SVC *const svc = &cpi->svc;
+      rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
+      rc->gfu_boost = 1;
+      rc->constrained_gf_group = 0;
+      rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+      for (int layer = 0;
+           layer < svc->number_spatial_layers * svc->number_temporal_layers;
+           ++layer) {
+        LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+        lc->rc.baseline_gf_interval = rc->baseline_gf_interval;
+        lc->rc.gfu_boost = rc->gfu_boost;
+        lc->rc.constrained_gf_group = rc->constrained_gf_group;
+        lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
+        lc->group_index = 0;
+      }
+    }
+    gf_group->size = rc->baseline_gf_interval;
+    gf_group->update_type[0] =
+        (frame_params->frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+    gf_update = 1;
+  }
+  if (cpi->oxcf.rc_mode == AOM_CBR) {
+    if (frame_params->frame_type == KEY_FRAME) {
+      target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
+    } else {
+      target = av1_calc_pframe_target_size_one_pass_cbr(
+          cpi, gf_group->update_type[gf_group->index]);
+    }
+  } else {
+    if (frame_params->frame_type == KEY_FRAME) {
+      target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
+    } else {
+      target = av1_calc_pframe_target_size_one_pass_vbr(
+          cpi, gf_group->update_type[gf_group->index]);
+    }
+  }
+  av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
+  rc->base_frame_target = target;
+  if (set_reference_structure && cpi->oxcf.speed >= 6 &&
+      cm->number_spatial_layers == 1 && cm->number_temporal_layers == 1)
+    set_reference_structure_one_pass_rt(cpi, gf_update);
+  cm->current_frame.frame_type = frame_params->frame_type;
 }
diff --git a/libaom/av1/encoder/ratectrl.h b/libaom/av1/encoder/ratectrl.h
index 1cd5994..c463786 100644
--- a/libaom/av1/encoder/ratectrl.h
+++ b/libaom/av1/encoder/ratectrl.h
@@ -17,8 +17,8 @@
 
 #include "aom_ports/mem.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -27,6 +27,9 @@
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS 9
 
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 1
+
 // Threshold used to define if a KF group is static (e.g. a slide show).
 // Essentially, this means that no frame in the group has more than 1% of MBs
 // that are not marked as coded with 0,0 motion in the first pass.
@@ -42,8 +45,14 @@
 #define MAX_PYRAMID_LVL 4
 
 #define MIN_GF_INTERVAL 4
-#define MAX_GF_INTERVAL 16
+#define MAX_GF_INTERVAL 32
 #define FIXED_GF_INTERVAL 8  // Used in some testing modes only
+#define MAX_GF_LENGTH_LAP 16
+
+#define MAX_NUM_GF_INTERVALS 15
+
+#define MAX_ARF_LAYERS 6
+// #define STRICT_RC
 
 typedef struct {
   int resize_width;
@@ -59,11 +68,26 @@
   RATE_FACTOR_LEVELS
 } UENUM1BYTE(RATE_FACTOR_LEVEL);
 
+enum {
+  KF_UPDATE,
+  LF_UPDATE,
+  GF_UPDATE,
+  ARF_UPDATE,
+  OVERLAY_UPDATE,
+  INTNL_OVERLAY_UPDATE,  // Internal Overlay Frame
+  INTNL_ARF_UPDATE,      // Internal Altref Frame
+  FRAME_UPDATE_TYPES
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
+
 typedef struct {
   // Rate targetting variables
   int base_frame_target;  // A baseline frame target before adjustment
                           // for previous under or over shoot.
   int this_frame_target;  // Actual frame target after rc adjustment.
+
+  // gop bit budget
+  int64_t gf_group_bits;
+
   int projected_frame_size;
   int sb64_target_rate;
   int last_q[FRAME_TYPES];  // Separate values for Intra/Inter
@@ -77,6 +101,14 @@
 
   int frames_since_golden;
   int frames_till_gf_update_due;
+
+  // number of determined gf group length left
+  int intervals_till_gf_calculate_due;
+  // stores gf group length intervals
+  int gf_intervals[MAX_NUM_GF_INTERVALS];
+  // the current index in gf_intervals
+  int cur_gf_index;
+
   int min_gf_interval;
   int max_gf_interval;
   int static_scene_max_gf_interval;
@@ -89,12 +121,12 @@
   int source_alt_ref_pending;
   int source_alt_ref_active;
   int is_src_frame_alt_ref;
-  int is_src_frame_internal_arf;
   int sframe_due;
 
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame
   int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
+  int prev_avg_frame_bandwidth;
 
   int ni_av_qi;
   int ni_tot_qi;
@@ -142,6 +174,18 @@
   float_t arf_boost_factor;
   // Q index used for ALT frame
   int arf_q;
+  int active_worst_quality;
+  int active_best_quality[MAX_ARF_LAYERS + 1];
+  int base_layer_qp;
+
+  // Total number of stats used only for kf_boost calculation.
+  int num_stats_used_for_kf_boost;
+  // Total number of stats used only for gfu_boost calculation.
+  int num_stats_used_for_gfu_boost;
+  // Total number of stats required by gfu_boost calculation.
+  int num_stats_required_for_gfu_boost;
+  int next_is_fwd_key;
+  int enable_scenecut_detection;
 } RATE_CONTROL;
 
 struct AV1_COMP;
@@ -166,8 +210,6 @@
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
 // First call per frame, one of:
-//   av1_rc_get_one_pass_vbr_params()
-//   av1_rc_get_one_pass_cbr_params()
 //   av1_rc_get_first_pass_params()
 //   av1_rc_get_second_pass_params()
 // depending on the usage to set the rate control encode parameters desired.
@@ -187,12 +229,6 @@
 // Functions to set parameters for encoding before the actual
 // encode_frame_to_data_rate() function.
 struct EncodeFrameParams;
-void av1_rc_get_one_pass_vbr_params(
-    struct AV1_COMP *cpi, uint8_t *const frame_update_type,
-    struct EncodeFrameParams *const frame_params, unsigned int frame_flags);
-void av1_rc_get_one_pass_cbr_params(
-    struct AV1_COMP *cpi, uint8_t *const frame_update_type,
-    struct EncodeFrameParams *const frame_params, unsigned int frame_flags);
 
 // Post encode update of the rate control parameters based
 // on bytes used
@@ -216,7 +252,8 @@
                                       int *frame_over_shoot_limit);
 
 // Picks q and q bounds given the target for bits
-int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
+int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, RATE_CONTROL *rc,
+                             int width, int height, int gf_index,
                              int *bottom_index, int *top_index);
 
 // Estimates q to achieve a target bits per frame
@@ -263,6 +300,23 @@
 
 int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
 
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
+                             int height);
+
+int av1_calc_pframe_target_size_one_pass_vbr(
+    const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
+
+int av1_calc_pframe_target_size_one_pass_cbr(
+    const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
+
+void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
+                                struct EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/rd.c b/libaom/av1/encoder/rd.c
index d78e269..e48c771 100644
--- a/libaom/av1/encoder/rd.c
+++ b/libaom/av1/encoder/rd.c
@@ -52,20 +52,20 @@
   2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
 };
 
-static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][EXT_TX_SIZES] =
-    {
-      { 1, 1, 1, 1 },  // unused
-      { 1, 1, 0, 0 },
-      { 0, 0, 1, 0 },
-    };
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA]
+                                            [EXT_TX_SIZES] = {
+                                              { 1, 1, 1, 1 },  // unused
+                                              { 1, 1, 0, 0 },
+                                              { 0, 0, 1, 0 },
+                                            };
 
-static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][EXT_TX_SIZES] =
-    {
-      { 1, 1, 1, 1 },  // unused
-      { 1, 1, 0, 0 },
-      { 0, 0, 1, 0 },
-      { 0, 0, 0, 1 },
-    };
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER]
+                                            [EXT_TX_SIZES] = {
+                                              { 1, 1, 1, 1 },  // unused
+                                              { 1, 1, 0, 0 },
+                                              { 0, 0, 1, 0 },
+                                              { 0, 1, 1, 1 },
+                                            };
 
 static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
                                                       EXT_TX_SETS_INTER)] = {
@@ -278,7 +278,7 @@
       av1_cost_tokens_from_cdf(x->compound_type_cost[i],
                                fc->compound_type_cdf[i], NULL);
     for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
-      if (get_interinter_wedge_bits(i)) {
+      if (av1_is_wedge_used(i)) {
         av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i],
                                  NULL);
       }
@@ -312,14 +312,11 @@
 }
 
 // Values are now correlated to quantizer.
-static int sad_per_bit16lut_8[QINDEX_RANGE];
-static int sad_per_bit4lut_8[QINDEX_RANGE];
-static int sad_per_bit16lut_10[QINDEX_RANGE];
-static int sad_per_bit4lut_10[QINDEX_RANGE];
-static int sad_per_bit16lut_12[QINDEX_RANGE];
-static int sad_per_bit4lut_12[QINDEX_RANGE];
+static int sad_per_bit_lut_8[QINDEX_RANGE];
+static int sad_per_bit_lut_10[QINDEX_RANGE];
+static int sad_per_bit_lut_12[QINDEX_RANGE];
 
-static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+static void init_me_luts_bd(int *bit16lut, int range,
                             aom_bit_depth_t bit_depth) {
   int i;
   // Initialize the sad lut tables using a formulaic calculation for now.
@@ -328,27 +325,23 @@
   for (i = 0; i < range; i++) {
     const double q = av1_convert_qindex_to_q(i, bit_depth);
     bit16lut[i] = (int)(0.0418 * q + 2.4107);
-    bit4lut[i] = (int)(0.063 * q + 2.742);
   }
 }
 
 void av1_init_me_luts(void) {
-  init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
-                  AOM_BITS_8);
-  init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
-                  AOM_BITS_10);
-  init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
-                  AOM_BITS_12);
+  init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8);
+  init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10);
+  init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12);
 }
 
 static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
                                          8,  8,  4,  4,  2,  2,  1,  0 };
-static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
-  128, 144, 128, 128, 144, 144, 128
-};
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
+                                                              128, 144, 144,
+                                                              128 };
 
 int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) {
-  const int q = av1_dc_quant_Q3(qindex, 0, cpi->common.seq_params.bit_depth);
+  const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
   int rdmult = q * q;
   rdmult = rdmult * 3 + (rdmult * 2 / 3);
   switch (cpi->common.seq_params.bit_depth) {
@@ -364,9 +357,9 @@
 
 int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
   int64_t rdmult = av1_compute_rd_mult_based_on_qindex(cpi, qindex);
-  if (cpi->oxcf.pass == 2 &&
+  if (is_stat_consumption_stage(cpi) &&
       (cpi->common.current_frame.frame_type != KEY_FRAME)) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    const GF_GROUP *const gf_group = &cpi->gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
     const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
 
@@ -376,26 +369,46 @@
   return (int)rdmult;
 }
 
+int av1_get_deltaq_offset(const AV1_COMP *cpi, int qindex, double beta) {
+  assert(beta > 0.0);
+  int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
+  int newq = (int)rint(q / sqrt(beta));
+  int orig_qindex = qindex;
+  if (newq < q) {
+    do {
+      qindex--;
+      q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
+    } while (newq < q && qindex > 0);
+  } else {
+    do {
+      qindex++;
+      q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
+    } while (newq > q && qindex < MAXQ);
+  }
+  return qindex - orig_qindex;
+}
+
 int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) {
+  assert(beta > 0.0);
   const AV1_COMMON *cm = &cpi->common;
-  int64_t q =
-      av1_dc_quant_Q3(cm->base_qindex, 0, cpi->common.seq_params.bit_depth);
+  int64_t q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                               cm->seq_params.bit_depth);
   int64_t rdmult = 0;
 
-  switch (cpi->common.seq_params.bit_depth) {
+  switch (cm->seq_params.bit_depth) {
     case AOM_BITS_8: rdmult = (int)((88 * q * q / beta) / 24); break;
     case AOM_BITS_10:
       rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 4);
       break;
     default:
-      assert(cpi->common.seq_params.bit_depth == AOM_BITS_12);
+      assert(cm->seq_params.bit_depth == AOM_BITS_12);
       rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 8);
       break;
   }
 
-  if (cpi->oxcf.pass == 2 &&
-      (cpi->common.current_frame.frame_type != KEY_FRAME)) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  if (is_stat_consumption_stage(cpi) &&
+      (cm->current_frame.frame_type != KEY_FRAME)) {
+    const GF_GROUP *const gf_group = &cpi->gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
     const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
 
@@ -409,9 +422,13 @@
 static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
   double q;
   switch (bit_depth) {
-    case AOM_BITS_8: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_8) / 4.0; break;
-    case AOM_BITS_10: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_10) / 16.0; break;
-    case AOM_BITS_12: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_12) / 64.0; break;
+    case AOM_BITS_8: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_8) / 4.0; break;
+    case AOM_BITS_10:
+      q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_10) / 16.0;
+      break;
+    case AOM_BITS_12:
+      q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_12) / 64.0;
+      break;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
@@ -422,18 +439,9 @@
 
 void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
   switch (cpi->common.seq_params.bit_depth) {
-    case AOM_BITS_8:
-      x->sadperbit16 = sad_per_bit16lut_8[qindex];
-      x->sadperbit4 = sad_per_bit4lut_8[qindex];
-      break;
-    case AOM_BITS_10:
-      x->sadperbit16 = sad_per_bit16lut_10[qindex];
-      x->sadperbit4 = sad_per_bit4lut_10[qindex];
-      break;
-    case AOM_BITS_12:
-      x->sadperbit16 = sad_per_bit16lut_12[qindex];
-      x->sadperbit4 = sad_per_bit4lut_12[qindex];
-      break;
+    case AOM_BITS_8: x->sadperbit = sad_per_bit_lut_8[qindex]; break;
+    case AOM_BITS_10: x->sadperbit = sad_per_bit_lut_10[qindex]; break;
+    case AOM_BITS_12: x->sadperbit = sad_per_bit_lut_12[qindex]; break;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
   }
@@ -443,10 +451,10 @@
   int i, bsize, segment_id;
 
   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
-    const int qindex =
-        clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
-                  cm->y_dc_delta_q,
-              0, MAXQ);
+    const int qindex = clamp(
+        av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) +
+            cm->quant_params.y_dc_delta_q,
+        0, MAXQ);
     const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth);
 
     for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
@@ -525,8 +533,9 @@
         int br_rate[BR_CDF_SIZE];
         int prev_cost = 0;
         int i, j;
-        av1_cost_tokens_from_cdf(br_rate, fc->coeff_br_cdf[tx_size][plane][ctx],
-                                 NULL);
+        av1_cost_tokens_from_cdf(
+            br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx],
+            NULL);
         // printf("br_rate: ");
         // for(j = 0; j < BR_CDF_SIZE; j++)
         //  printf("%4d ", br_rate[j]);
@@ -555,15 +564,21 @@
   }
 }
 
-void av1_initialize_cost_tables(const AV1_COMMON *const cm, MACROBLOCK *x) {
-  if (cm->cur_frame_force_integer_mv) {
-    av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &cm->fc->nmvc,
+void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp,
+                       MACROBLOCK *x) {
+  x->nmvcost[0] = &x->nmv_costs[0][MV_MAX];
+  x->nmvcost[1] = &x->nmv_costs[1][MV_MAX];
+  x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX];
+  x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX];
+  if (integer_mv) {
+    av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &fc->nmvc,
                              MV_SUBPEL_NONE);
+    x->mv_cost_stack = (int **)&x->nmvcost;
   } else {
+    int *(*src)[2] = usehp ? &x->nmvcost_hp : &x->nmvcost;
+    x->mv_cost_stack = *src;
     av1_build_nmv_cost_table(
-        x->nmv_vec_cost,
-        cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc,
-        cm->allow_high_precision_mv);
+        x->nmv_vec_cost, usehp ? x->nmvcost_hp : x->nmvcost, &fc->nmvc, usehp);
   }
 }
 
@@ -574,29 +589,37 @@
 
   aom_clear_system_state();
 
-  rd->RDMULT = av1_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+  rd->RDMULT = av1_compute_rd_mult(
+      cpi, cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q);
 
   set_error_per_bit(x, rd->RDMULT);
 
   set_block_thresholds(cm, rd);
 
-  av1_initialize_cost_tables(cm, x);
+  if ((!cpi->sf.rt_sf.use_nonrd_pick_mode &&
+       cpi->oxcf.mv_cost_upd_freq != COST_UPD_OFF) ||
+      frame_is_intra_only(cm) || (cm->current_frame.frame_number & 0x07) == 1)
+    av1_fill_mv_costs(cm->fc, cm->features.cur_frame_force_integer_mv,
+                      cm->features.allow_high_precision_mv, x);
 
-  if (frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
-      cpi->oxcf.pass != 1) {
-    int *dvcost[2] = { &cpi->dv_cost[0][MV_MAX], &cpi->dv_cost[1][MV_MAX] };
-    av1_build_nmv_cost_table(cpi->dv_joint_cost, dvcost, &cm->fc->ndvc,
+  if (!cpi->sf.rt_sf.use_nonrd_pick_mode && frame_is_intra_only(cm) &&
+      cm->features.allow_screen_content_tools &&
+      !is_stat_generation_stage(cpi)) {
+    IntraBCMVCosts *const dv_costs = &cpi->dv_costs;
+    int *dvcost[2] = { &dv_costs->mv_component[0][MV_MAX],
+                       &dv_costs->mv_component[1][MV_MAX] };
+    av1_build_nmv_cost_table(dv_costs->joint_mv, dvcost, &cm->fc->ndvc,
                              MV_SUBPEL_NONE);
   }
 
-  if (cpi->oxcf.pass != 1) {
+  if (!is_stat_generation_stage(cpi)) {
     for (int i = 0; i < TRANS_TYPES; ++i)
       // IDENTITY: 1 bit
       // TRANSLATION: 3 bits
       // ROTZOOM: 2 bits
       // AFFINE: 3 bits
-      cpi->gmtype_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0))
-                            << AV1_PROB_COST_SHIFT;
+      cpi->gm_info.type_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0))
+                                  << AV1_PROB_COST_SHIFT;
   }
 }
 
@@ -710,7 +733,7 @@
 */
 
 static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = {
-  0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
+  0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3
 };
 
 static int sse_norm_curvfit_model_cat_lookup(double sse_norm) {
@@ -795,9 +818,9 @@
   },
 };
 
-static const double surffit_dist_params[7] = {
-  1.475844, 4.328362, -5.680233, -0.500994, 0.554585, 4.839478, -0.695837
-};
+static const double surffit_dist_params[7] = { 1.475844,  4.328362, -5.680233,
+                                               -0.500994, 0.554585, 4.839478,
+                                               -0.695837 };
 
 static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
                                              double *rpar) {
@@ -829,66 +852,66 @@
   {
       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
-      0.000000,    23.801499,   28.387688,   33.388795,   42.298282,
-      41.525408,   51.597692,   49.566271,   54.632979,   60.321507,
-      67.730678,   75.766165,   85.324032,   96.600012,   120.839562,
-      173.917577,  255.974908,  354.107573,  458.063476,  562.345966,
-      668.568424,  772.072881,  878.598490,  982.202274,  1082.708946,
-      1188.037853, 1287.702240, 1395.588773, 1490.825830, 1584.231230,
-      1691.386090, 1766.822555, 1869.630904, 1926.743565, 2002.949495,
-      2047.431137, 2138.486068, 2154.743767, 2209.242472, 2277.593051,
-      2290.996432, 2307.452938, 2343.567091, 2397.654644, 2469.425868,
-      2558.591037, 2664.860422, 2787.944296, 2927.552932, 3083.396602,
-      3255.185579, 3442.630134, 3645.440541, 3863.327072, 4096.000000,
+      0.000000,    118.257702,  120.210658,  121.434853,  122.100487,
+      122.377758,  122.436865,  72.290102,   96.974289,   101.652727,
+      126.830141,  140.417377,  157.644879,  184.315291,  215.823873,
+      262.300169,  335.919859,  420.624173,  519.185032,  619.854243,
+      726.053595,  827.663369,  933.127475,  1037.988755, 1138.839609,
+      1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052,
+      1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680,
+      2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011,
+      2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827,
+      2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773,
+      3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000,
   },
   {
       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
-      0.000000,    8.998436,    9.439592,    9.731837,    10.865931,
-      11.561347,   12.578139,   14.205101,   16.770584,   19.094853,
-      21.330863,   23.298907,   26.901921,   34.501017,   57.891733,
-      112.234763,  194.853189,  288.302032,  380.499422,  472.625309,
-      560.226809,  647.928463,  734.155122,  817.489721,  906.265783,
-      999.260562,  1094.489206, 1197.062998, 1293.296825, 1378.926484,
-      1472.760990, 1552.663779, 1635.196884, 1692.451951, 1759.741063,
-      1822.162720, 1916.515921, 1966.686071, 2031.647506, 2033.700134,
-      2087.847688, 2161.688858, 2242.536028, 2334.023491, 2436.337802,
-      2549.665519, 2674.193198, 2810.107395, 2957.594666, 3116.841567,
-      3288.034655, 3471.360486, 3667.005616, 3875.156602, 4096.000000,
+      0.000000,    13.087244,   15.919735,   25.930313,   24.412411,
+      28.567417,   29.924194,   30.857010,   32.742979,   36.382570,
+      39.210386,   42.265690,   47.378572,   57.014850,   82.740067,
+      137.346562,  219.968084,  316.781856,  415.643773,  516.706538,
+      614.914364,  714.303763,  815.512135,  911.210485,  1008.501528,
+      1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641,
+      1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309,
+      1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824,
+      2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694,
+      2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660,
+      3244.776927, 3435.274401, 3640.706076, 3860.978945, 4096.000000,
   },
   {
       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
-      0.000000,    2.377584,    2.557185,    2.732445,    2.851114,
-      3.281800,    3.765589,    4.342578,    5.145582,    5.611038,
-      6.642238,    7.945977,    11.800522,   17.346624,   37.501413,
-      87.216800,   165.860942,  253.865564,  332.039345,  408.518863,
-      478.120452,  547.268590,  616.067676,  680.022540,  753.863541,
-      834.529973,  919.489191,  1008.264989, 1092.230318, 1173.971886,
-      1249.514122, 1330.510941, 1399.523249, 1466.923387, 1530.533471,
-      1586.515722, 1695.197774, 1746.648696, 1837.136959, 1909.075485,
-      1975.074651, 2060.159200, 2155.335095, 2259.762505, 2373.710437,
-      2497.447898, 2631.243895, 2775.367434, 2930.087523, 3095.673170,
-      3272.393380, 3460.517161, 3660.313520, 3872.051464, 4096.000000,
+      0.000000,    4.656893,    5.123633,    5.594132,    6.162376,
+      6.918433,    7.768444,    8.739415,    10.105862,   11.477328,
+      13.236604,   15.421030,   19.093623,   25.801871,   46.724612,
+      98.841054,   181.113466,  272.586364,  359.499769,  445.546343,
+      525.944439,  605.188743,  681.793483,  756.668359,  838.486885,
+      926.950356,  1015.482542, 1113.353926, 1204.897193, 1288.871992,
+      1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771,
+      1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872,
+      2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216,
+      2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436,
+      3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000,
   },
   {
       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
-      0.000000,    0.296997,    0.342545,    0.403097,    0.472889,
-      0.614483,    0.842937,    1.050824,    1.326663,    1.717750,
-      2.530591,    3.582302,    6.995373,    9.973335,    24.042464,
-      56.598240,   113.680735,  180.018689,  231.050567,  266.101082,
-      294.957934,  323.326511,  349.434429,  380.443211,  408.171987,
-      441.214916,  475.716772,  512.900000,  551.186939,  592.364455,
-      624.527378,  661.940693,  679.185473,  724.800679,  764.781792,
-      873.050019,  950.299001,  939.292954,  1052.406153, 1033.893184,
-      1112.182406, 1219.174326, 1337.296681, 1471.648357, 1622.492809,
-      1790.093491, 1974.713858, 2176.617364, 2396.067465, 2633.327614,
-      2888.661266, 3162.331876, 3454.602899, 3765.737789, 4096.000000,
+      0.000000,    0.337370,    0.391916,    0.468839,    0.566334,
+      0.762564,    1.069225,    1.384361,    1.787581,    2.293948,
+      3.251909,    4.412991,    8.050068,    11.606073,   27.668092,
+      65.227758,   128.463938,  202.097653,  262.715851,  312.464873,
+      355.601398,  400.609054,  447.201352,  495.761568,  552.871938,
+      619.067625,  691.984883,  773.753288,  860.628503,  946.262808,
+      1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987,
+      1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823,
+      1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119,
+      2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754,
+      3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000,
   },
 };
 
-static const double interp_dgrid_curv[2][65] = {
+static const double interp_dgrid_curv[3][65] = {
   {
       16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770,
       15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870,
@@ -945,65 +968,62 @@
                                        const struct macroblockd_plane *pd,
                                        ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
                                        ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
-  const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
-  const ENTROPY_CONTEXT *const above = pd->above_context;
-  const ENTROPY_CONTEXT *const left = pd->left_context;
+  const int num_4x4_w = mi_size_wide[plane_bsize];
+  const int num_4x4_h = mi_size_high[plane_bsize];
+  const ENTROPY_CONTEXT *const above = pd->above_entropy_context;
+  const ENTROPY_CONTEXT *const left = pd->left_entropy_context;
 
   memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
   memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
 }
 
-void av1_get_entropy_contexts(BLOCK_SIZE bsize,
+void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize,
                               const struct macroblockd_plane *pd,
                               ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
                               ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  assert(plane_bsize < BLOCK_SIZES_ALL);
   get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left);
 }
 
 void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
                  int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
-  int i;
-  int zero_seen = 0;
-  int best_sad = INT_MAX;
-  int this_sad = INT_MAX;
-  int max_mv = 0;
-  uint8_t *src_y_ptr = x->plane[0].src.buf;
-  uint8_t *ref_y_ptr;
-  MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
-  int num_mv_refs = 0;
   const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
   const int_mv ref_mv =
       av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext);
   const int_mv ref_mv1 =
       av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext);
-
+  MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
+  int num_mv_refs = 0;
   pred_mv[num_mv_refs++] = ref_mv.as_mv;
   if (ref_mv.as_int != ref_mv1.as_int) {
     pred_mv[num_mv_refs++] = ref_mv1.as_mv;
   }
-  if (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size)
+  if (cpi->sf.mv_sf.adaptive_motion_search &&
+      block_size < x->max_partition_size) {
     pred_mv[num_mv_refs++] = x->pred_mv[ref_frame];
+  }
 
   assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
 
+  const uint8_t *const src_y_ptr = x->plane[0].src.buf;
+  int zero_seen = 0;
+  int best_sad = INT_MAX;
+  int max_mv = 0;
   // Get the sad for each candidate reference mv.
-  for (i = 0; i < num_mv_refs; ++i) {
+  for (int i = 0; i < num_mv_refs; ++i) {
     const MV *this_mv = &pred_mv[i];
-    int fp_row, fp_col;
-    fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
-    fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+    const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+    const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
     max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
 
     if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
     zero_seen |= (fp_row == 0 && fp_col == 0);
 
-    ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
+    const uint8_t *const ref_y_ptr =
+        &ref_y_buffer[ref_y_stride * fp_row + fp_col];
     // Find sad for current vector.
-    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
-                                           ref_y_ptr, ref_y_stride);
+    const int this_sad = cpi->fn_ptr[block_size].sdf(
+        src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride);
     // Note if it is the best so far.
     if (this_sad < best_sad) {
       best_sad = this_sad;
@@ -1017,19 +1037,19 @@
 
 void av1_setup_pred_block(const MACROBLOCKD *xd,
                           struct buf_2d dst[MAX_MB_PLANE],
-                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const YV12_BUFFER_CONFIG *src,
                           const struct scale_factors *scale,
                           const struct scale_factors *scale_uv,
                           const int num_planes) {
-  int i;
-
   dst[0].buf = src->y_buffer;
   dst[0].stride = src->y_stride;
   dst[1].buf = src->u_buffer;
   dst[2].buf = src->v_buffer;
   dst[1].stride = dst[2].stride = src->uv_stride;
 
-  for (i = 0; i < num_planes; ++i) {
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  for (int i = 0; i < num_planes; ++i) {
     setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf,
                      i ? src->uv_crop_width : src->y_crop_width,
                      i ? src->uv_crop_height : src->y_crop_height,
@@ -1038,20 +1058,6 @@
   }
 }
 
-int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
-                            int stride) {
-  const int bw = mi_size_wide_log2[plane_bsize];
-  const int y = 4 * (raster_block >> bw);
-  const int x = 4 * (raster_block & ((1 << bw) - 1));
-  return y * stride + x;
-}
-
-int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
-                                       int16_t *base) {
-  const int stride = block_size_wide[plane_bsize];
-  return base + av1_raster_block_offset(plane_bsize, raster_block, stride);
-}
-
 YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
                                              int ref_frame) {
   assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
@@ -1062,9 +1068,9 @@
                                                        : NULL;
 }
 
-int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
-                            const MACROBLOCKD *xd) {
-  if (cm->interp_filter == SWITCHABLE) {
+int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
+                            InterpFilter interp_filter) {
+  if (interp_filter == SWITCHABLE) {
     const MB_MODE_INFO *const mbmi = xd->mi[0];
     int inter_filter_cost = 0;
     int dir;
@@ -1082,234 +1088,231 @@
 }
 
 void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
-  int i;
   RD_OPT *const rd = &cpi->rd;
-  SPEED_FEATURES *const sf = &cpi->sf;
 
   // Set baseline threshold values.
-  for (i = 0; i < MAX_MODES; ++i) rd->thresh_mult[i] = cpi->oxcf.mode == 0;
+  av1_zero(rd->thresh_mult);
 
-  if (sf->adaptive_rd_thresh) {
-    rd->thresh_mult[THR_NEARESTMV] = 300;
-    rd->thresh_mult[THR_NEARESTL2] = 300;
-    rd->thresh_mult[THR_NEARESTL3] = 300;
-    rd->thresh_mult[THR_NEARESTB] = 300;
-    rd->thresh_mult[THR_NEARESTA2] = 300;
-    rd->thresh_mult[THR_NEARESTA] = 300;
-    rd->thresh_mult[THR_NEARESTG] = 300;
-  } else {
-    rd->thresh_mult[THR_NEARESTMV] = 0;
-    rd->thresh_mult[THR_NEARESTL2] = 0;
-    rd->thresh_mult[THR_NEARESTL3] = 100;
-    rd->thresh_mult[THR_NEARESTB] = 0;
-    rd->thresh_mult[THR_NEARESTA2] = 0;
-    rd->thresh_mult[THR_NEARESTA] = 0;
-    rd->thresh_mult[THR_NEARESTG] = 0;
-  }
+  rd->thresh_mult[THR_NEARESTMV] = 300;
+  rd->thresh_mult[THR_NEARESTL2] = 300;
+  rd->thresh_mult[THR_NEARESTL3] = 300;
+  rd->thresh_mult[THR_NEARESTB] = 300;
+  rd->thresh_mult[THR_NEARESTA2] = 300;
+  rd->thresh_mult[THR_NEARESTA] = 300;
+  rd->thresh_mult[THR_NEARESTG] = 300;
 
-  rd->thresh_mult[THR_NEWMV] += 1000;
-  rd->thresh_mult[THR_NEWL2] += 1000;
-  rd->thresh_mult[THR_NEWL3] += 1000;
-  rd->thresh_mult[THR_NEWB] += 1000;
+  rd->thresh_mult[THR_NEWMV] = 1000;
+  rd->thresh_mult[THR_NEWL2] = 1000;
+  rd->thresh_mult[THR_NEWL3] = 1000;
+  rd->thresh_mult[THR_NEWB] = 1000;
   rd->thresh_mult[THR_NEWA2] = 1100;
-  rd->thresh_mult[THR_NEWA] += 1000;
-  rd->thresh_mult[THR_NEWG] += 1000;
+  rd->thresh_mult[THR_NEWA] = 1000;
+  rd->thresh_mult[THR_NEWG] = 1000;
 
-  rd->thresh_mult[THR_NEARMV] += 1000;
-  rd->thresh_mult[THR_NEARL2] += 1000;
-  rd->thresh_mult[THR_NEARL3] += 1000;
-  rd->thresh_mult[THR_NEARB] += 1000;
+  rd->thresh_mult[THR_NEARMV] = 1000;
+  rd->thresh_mult[THR_NEARL2] = 1000;
+  rd->thresh_mult[THR_NEARL3] = 1000;
+  rd->thresh_mult[THR_NEARB] = 1000;
   rd->thresh_mult[THR_NEARA2] = 1000;
-  rd->thresh_mult[THR_NEARA] += 1000;
-  rd->thresh_mult[THR_NEARG] += 1000;
+  rd->thresh_mult[THR_NEARA] = 1000;
+  rd->thresh_mult[THR_NEARG] = 1000;
 
-  rd->thresh_mult[THR_GLOBALMV] += 2200;
-  rd->thresh_mult[THR_GLOBALL2] += 2000;
-  rd->thresh_mult[THR_GLOBALL3] += 2000;
-  rd->thresh_mult[THR_GLOBALB] += 2400;
+  rd->thresh_mult[THR_GLOBALMV] = 2200;
+  rd->thresh_mult[THR_GLOBALL2] = 2000;
+  rd->thresh_mult[THR_GLOBALL3] = 2000;
+  rd->thresh_mult[THR_GLOBALB] = 2400;
   rd->thresh_mult[THR_GLOBALA2] = 2000;
-  rd->thresh_mult[THR_GLOBALG] += 2000;
-  rd->thresh_mult[THR_GLOBALA] += 2400;
+  rd->thresh_mult[THR_GLOBALG] = 2000;
+  rd->thresh_mult[THR_GLOBALA] = 2400;
 
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1100;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 800;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 900;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000;
 
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 2000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 2000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 2000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 2000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1530;
-  rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1870;
-  rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2400;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2750;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530;
+  rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870;
+  rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 2750;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1870;
-  rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 1800;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870;
+  rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 3000;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1320;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 2040;
-  rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2250;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040;
+  rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1360;
-  rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2400;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2250;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360;
+  rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1870;
-  rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870;
+  rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1800;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLA2] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARL2A2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1440;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARL3A2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWL3A2] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARGA2] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2750;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2640;
-  rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640;
+  rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1800;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1760;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2400;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1760;
-  rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2640;
-  rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760;
+  rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640;
+  rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1600;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000;
-  rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1980;
-  rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2640;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEARBA] = 1980;
+  rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200;
 
-  rd->thresh_mult[THR_DC] += 1000;
-  rd->thresh_mult[THR_PAETH] += 1000;
-  rd->thresh_mult[THR_SMOOTH] += 2200;
-  rd->thresh_mult[THR_SMOOTH_V] += 2000;
-  rd->thresh_mult[THR_SMOOTH_H] += 2000;
-  rd->thresh_mult[THR_H_PRED] += 2000;
-  rd->thresh_mult[THR_V_PRED] += 1800;
-  rd->thresh_mult[THR_D135_PRED] += 2500;
-  rd->thresh_mult[THR_D203_PRED] += 2000;
-  rd->thresh_mult[THR_D157_PRED] += 2500;
-  rd->thresh_mult[THR_D67_PRED] += 2000;
-  rd->thresh_mult[THR_D113_PRED] += 2500;
-  rd->thresh_mult[THR_D45_PRED] += 2500;
+  rd->thresh_mult[THR_DC] = 1000;
+  rd->thresh_mult[THR_PAETH] = 1000;
+  rd->thresh_mult[THR_SMOOTH] = 2200;
+  rd->thresh_mult[THR_SMOOTH_V] = 2000;
+  rd->thresh_mult[THR_SMOOTH_H] = 2000;
+  rd->thresh_mult[THR_H_PRED] = 2000;
+  rd->thresh_mult[THR_V_PRED] = 1800;
+  rd->thresh_mult[THR_D135_PRED] = 2500;
+  rd->thresh_mult[THR_D203_PRED] = 2000;
+  rd->thresh_mult[THR_D157_PRED] = 2500;
+  rd->thresh_mult[THR_D67_PRED] = 2000;
+  rd->thresh_mult[THR_D113_PRED] = 2500;
+  rd->thresh_mult[THR_D45_PRED] = 2500;
 }
 
 void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
-                               int (*factor_buf)[MAX_MODES], int rd_thresh,
-                               int bsize, int best_mode_index) {
-  if (rd_thresh > 0) {
-    const int top_mode = MAX_MODES;
-    int mode;
-    for (mode = 0; mode < top_mode; ++mode) {
-      const BLOCK_SIZE min_size = AOMMAX(bsize - 1, BLOCK_4X4);
-      const BLOCK_SIZE max_size =
-          AOMMIN(bsize + 2, (int)cm->seq_params.sb_size);
-      BLOCK_SIZE bs;
-      for (bs = min_size; bs <= max_size; ++bs) {
-        int *const fact = &factor_buf[bs][mode];
-        if (mode == best_mode_index) {
-          *fact -= (*fact >> 4);
-        } else {
-          *fact = AOMMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
-        }
+                               int (*factor_buf)[MAX_MODES],
+                               int use_adaptive_rd_thresh, BLOCK_SIZE bsize,
+                               THR_MODES best_mode_index) {
+  assert(use_adaptive_rd_thresh > 0);
+  const THR_MODES top_mode = MAX_MODES;
+  const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+
+  const int bsize_is_1_to_4 = bsize > cm->seq_params.sb_size;
+  BLOCK_SIZE min_size, max_size;
+  if (bsize_is_1_to_4) {
+    // This part handles block sizes with 1:4 and 4:1 aspect ratios
+    // TODO(any): Experiment with threshold update for parent/child blocks
+    min_size = bsize;
+    max_size = bsize;
+  } else {
+    min_size = AOMMAX(bsize - 2, BLOCK_4X4);
+    max_size = AOMMIN(bsize + 2, (int)cm->seq_params.sb_size);
+  }
+
+  for (THR_MODES mode = 0; mode < top_mode; ++mode) {
+    for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) {
+      int *const fact = &factor_buf[bs][mode];
+      if (mode == best_mode_index) {
+        *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR);
+      } else {
+        *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor);
       }
     }
   }
@@ -1317,7 +1320,7 @@
 
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
                                aom_bit_depth_t bit_depth) {
-  const int q = av1_dc_quant_Q3(qindex, qdelta, bit_depth);
+  const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth);
   switch (bit_depth) {
     case AOM_BITS_8: return 20 * q;
     case AOM_BITS_10: return 5 * q;
diff --git a/libaom/av1/encoder/rd.h b/libaom/av1/encoder/rd.h
index ff46083..1addbae 100644
--- a/libaom/av1/encoder/rd.h
+++ b/libaom/av1/encoder/rd.h
@@ -31,6 +31,10 @@
   (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
    ((D) * (1 << RDDIV_BITS)))
 
+#define RDCOST_NEG_R(RM, R, D) \
+  (((D) * (1 << RDDIV_BITS)) - \
+   ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
+
 #define RDCOST_DBL(RM, R, D)                                       \
   (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
    ((double)(D) * (1 << RDDIV_BITS)))
@@ -40,242 +44,28 @@
 #define MV_COST_WEIGHT 108
 #define MV_COST_WEIGHT_SUB 120
 
-#define RD_THRESH_MAX_FACT 64
-#define RD_THRESH_INC 1
+// The fractional part of rd_thresh factor is stored with 5 bits. The maximum
+// factor that we allow is two, which is stored as 2 ** (5+1) = 64
+#define RD_THRESH_FAC_FRAC_BITS (5)
+#define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS))
+#define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1)
+#define RD_THRESH_LOG_DEC_FACTOR (4)
+#define RD_THRESH_INC (1)
 
 // Factor to weigh the rate for switchable interp filters.
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
-// This enumerator type needs to be kept aligned with the mode order in
-// const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
 enum {
-  THR_NEARESTMV,
-  THR_NEARESTL2,
-  THR_NEARESTL3,
-  THR_NEARESTB,
-  THR_NEARESTA2,
-  THR_NEARESTA,
-  THR_NEARESTG,
-
-  THR_NEWMV,
-  THR_NEWL2,
-  THR_NEWL3,
-  THR_NEWB,
-  THR_NEWA2,
-  THR_NEWA,
-  THR_NEWG,
-
-  THR_NEARMV,
-  THR_NEARL2,
-  THR_NEARL3,
-  THR_NEARB,
-  THR_NEARA2,
-  THR_NEARA,
-  THR_NEARG,
-
-  THR_GLOBALMV,
-  THR_GLOBALL2,
-  THR_GLOBALL3,
-  THR_GLOBALB,
-  THR_GLOBALA2,
-  THR_GLOBALA,
-  THR_GLOBALG,
-
-  THR_COMP_NEAREST_NEARESTLA,
-  THR_COMP_NEAREST_NEARESTL2A,
-  THR_COMP_NEAREST_NEARESTL3A,
-  THR_COMP_NEAREST_NEARESTGA,
-  THR_COMP_NEAREST_NEARESTLB,
-  THR_COMP_NEAREST_NEARESTL2B,
-  THR_COMP_NEAREST_NEARESTL3B,
-  THR_COMP_NEAREST_NEARESTGB,
-  THR_COMP_NEAREST_NEARESTLA2,
-  THR_COMP_NEAREST_NEARESTL2A2,
-  THR_COMP_NEAREST_NEARESTL3A2,
-  THR_COMP_NEAREST_NEARESTGA2,
-  THR_COMP_NEAREST_NEARESTLL2,
-  THR_COMP_NEAREST_NEARESTLL3,
-  THR_COMP_NEAREST_NEARESTLG,
-  THR_COMP_NEAREST_NEARESTBA,
-
-  THR_COMP_NEAR_NEARLA,
-  THR_COMP_NEW_NEARESTLA,
-  THR_COMP_NEAREST_NEWLA,
-  THR_COMP_NEW_NEARLA,
-  THR_COMP_NEAR_NEWLA,
-  THR_COMP_NEW_NEWLA,
-  THR_COMP_GLOBAL_GLOBALLA,
-
-  THR_COMP_NEAR_NEARL2A,
-  THR_COMP_NEW_NEARESTL2A,
-  THR_COMP_NEAREST_NEWL2A,
-  THR_COMP_NEW_NEARL2A,
-  THR_COMP_NEAR_NEWL2A,
-  THR_COMP_NEW_NEWL2A,
-  THR_COMP_GLOBAL_GLOBALL2A,
-
-  THR_COMP_NEAR_NEARL3A,
-  THR_COMP_NEW_NEARESTL3A,
-  THR_COMP_NEAREST_NEWL3A,
-  THR_COMP_NEW_NEARL3A,
-  THR_COMP_NEAR_NEWL3A,
-  THR_COMP_NEW_NEWL3A,
-  THR_COMP_GLOBAL_GLOBALL3A,
-
-  THR_COMP_NEAR_NEARGA,
-  THR_COMP_NEW_NEARESTGA,
-  THR_COMP_NEAREST_NEWGA,
-  THR_COMP_NEW_NEARGA,
-  THR_COMP_NEAR_NEWGA,
-  THR_COMP_NEW_NEWGA,
-  THR_COMP_GLOBAL_GLOBALGA,
-
-  THR_COMP_NEAR_NEARLB,
-  THR_COMP_NEW_NEARESTLB,
-  THR_COMP_NEAREST_NEWLB,
-  THR_COMP_NEW_NEARLB,
-  THR_COMP_NEAR_NEWLB,
-  THR_COMP_NEW_NEWLB,
-  THR_COMP_GLOBAL_GLOBALLB,
-
-  THR_COMP_NEAR_NEARL2B,
-  THR_COMP_NEW_NEARESTL2B,
-  THR_COMP_NEAREST_NEWL2B,
-  THR_COMP_NEW_NEARL2B,
-  THR_COMP_NEAR_NEWL2B,
-  THR_COMP_NEW_NEWL2B,
-  THR_COMP_GLOBAL_GLOBALL2B,
-
-  THR_COMP_NEAR_NEARL3B,
-  THR_COMP_NEW_NEARESTL3B,
-  THR_COMP_NEAREST_NEWL3B,
-  THR_COMP_NEW_NEARL3B,
-  THR_COMP_NEAR_NEWL3B,
-  THR_COMP_NEW_NEWL3B,
-  THR_COMP_GLOBAL_GLOBALL3B,
-
-  THR_COMP_NEAR_NEARGB,
-  THR_COMP_NEW_NEARESTGB,
-  THR_COMP_NEAREST_NEWGB,
-  THR_COMP_NEW_NEARGB,
-  THR_COMP_NEAR_NEWGB,
-  THR_COMP_NEW_NEWGB,
-  THR_COMP_GLOBAL_GLOBALGB,
-
-  THR_COMP_NEAR_NEARLA2,
-  THR_COMP_NEW_NEARESTLA2,
-  THR_COMP_NEAREST_NEWLA2,
-  THR_COMP_NEW_NEARLA2,
-  THR_COMP_NEAR_NEWLA2,
-  THR_COMP_NEW_NEWLA2,
-  THR_COMP_GLOBAL_GLOBALLA2,
-
-  THR_COMP_NEAR_NEARL2A2,
-  THR_COMP_NEW_NEARESTL2A2,
-  THR_COMP_NEAREST_NEWL2A2,
-  THR_COMP_NEW_NEARL2A2,
-  THR_COMP_NEAR_NEWL2A2,
-  THR_COMP_NEW_NEWL2A2,
-  THR_COMP_GLOBAL_GLOBALL2A2,
-
-  THR_COMP_NEAR_NEARL3A2,
-  THR_COMP_NEW_NEARESTL3A2,
-  THR_COMP_NEAREST_NEWL3A2,
-  THR_COMP_NEW_NEARL3A2,
-  THR_COMP_NEAR_NEWL3A2,
-  THR_COMP_NEW_NEWL3A2,
-  THR_COMP_GLOBAL_GLOBALL3A2,
-
-  THR_COMP_NEAR_NEARGA2,
-  THR_COMP_NEW_NEARESTGA2,
-  THR_COMP_NEAREST_NEWGA2,
-  THR_COMP_NEW_NEARGA2,
-  THR_COMP_NEAR_NEWGA2,
-  THR_COMP_NEW_NEWGA2,
-  THR_COMP_GLOBAL_GLOBALGA2,
-
-  THR_COMP_NEAR_NEARLL2,
-  THR_COMP_NEW_NEARESTLL2,
-  THR_COMP_NEAREST_NEWLL2,
-  THR_COMP_NEW_NEARLL2,
-  THR_COMP_NEAR_NEWLL2,
-  THR_COMP_NEW_NEWLL2,
-  THR_COMP_GLOBAL_GLOBALLL2,
-
-  THR_COMP_NEAR_NEARLL3,
-  THR_COMP_NEW_NEARESTLL3,
-  THR_COMP_NEAREST_NEWLL3,
-  THR_COMP_NEW_NEARLL3,
-  THR_COMP_NEAR_NEWLL3,
-  THR_COMP_NEW_NEWLL3,
-  THR_COMP_GLOBAL_GLOBALLL3,
-
-  THR_COMP_NEAR_NEARLG,
-  THR_COMP_NEW_NEARESTLG,
-  THR_COMP_NEAREST_NEWLG,
-  THR_COMP_NEW_NEARLG,
-  THR_COMP_NEAR_NEWLG,
-  THR_COMP_NEW_NEWLG,
-  THR_COMP_GLOBAL_GLOBALLG,
-
-  THR_COMP_NEAR_NEARBA,
-  THR_COMP_NEW_NEARESTBA,
-  THR_COMP_NEAREST_NEWBA,
-  THR_COMP_NEW_NEARBA,
-  THR_COMP_NEAR_NEWBA,
-  THR_COMP_NEW_NEWBA,
-  THR_COMP_GLOBAL_GLOBALBA,
-
-  THR_DC,
-  THR_PAETH,
-  THR_SMOOTH,
-  THR_SMOOTH_V,
-  THR_SMOOTH_H,
-  THR_H_PRED,
-  THR_V_PRED,
-  THR_D135_PRED,
-  THR_D203_PRED,
-  THR_D157_PRED,
-  THR_D67_PRED,
-  THR_D113_PRED,
-  THR_D45_PRED,
-
-  MAX_MODES,
-
-  LAST_SINGLE_REF_MODES = THR_GLOBALG,
-  MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1,
-  LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA,
-  MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1
-} UENUM1BYTE(THR_MODES);
-
-enum {
-  THR_LAST,
-  THR_LAST2,
-  THR_LAST3,
-  THR_BWDR,
-  THR_ALTR2,
-  THR_GOLD,
-  THR_ALTR,
-
-  THR_COMP_LA,
-  THR_COMP_L2A,
-  THR_COMP_L3A,
-  THR_COMP_GA,
-
-  THR_COMP_LB,
-  THR_COMP_L2B,
-  THR_COMP_L3B,
-  THR_COMP_GB,
-
-  THR_COMP_LA2,
-  THR_COMP_L2A2,
-  THR_COMP_L3A2,
-  THR_COMP_GA2,
-
-  THR_INTRA,
-
-  MAX_REFS
-} UENUM1BYTE(THR_MODES_SUB8X8);
+  // Default initialization when we are not using winner mode framework. e.g.
+  // intrabc
+  DEFAULT_EVAL = 0,
+  // Initialization for selecting winner mode
+  MODE_EVAL,
+  // Initialization for winner mode evaluation
+  WINNER_MODE_EVAL,
+  // All mode evaluation types
+  MODE_EVAL_TYPES,
+} UENUM1BYTE(MODE_EVAL_TYPE);
 
 typedef struct RD_OPT {
   // Thresh_mult is used to set a threshold for the rd score. A higher value
@@ -286,13 +76,26 @@
 
   int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
 
-  int64_t prediction_type_threshes[REF_FRAMES][REFERENCE_MODES];
-
   int RDMULT;
 
-  double r0;
+  double r0, arf_r0;
+  double mc_saved_base, mc_count_base;
 } RD_OPT;
 
+typedef struct {
+  // Cost of transmitting the actual motion vector.
+  // mv_component[0][i] is the cost of motion vector with horizontal component
+  // (mv_row) equal to i - MV_MAX.
+  // mv_component[1][i] is the cost of motion vector with vertical component
+  // (mv_col) equal to i - MV_MAX.
+  int mv_component[2][MV_VALS];
+
+  // joint_mv[i] is the cost of transmitting joint mv(MV_JOINT_TYPE) of
+  // type i.
+  // TODO(huisu@google.com): we can update dv_joint_cost per SB.
+  int joint_mv[MV_JOINTS];
+} IntraBCMVCosts;
+
 static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
 #if CONFIG_RD_DEBUG
   int plane;
@@ -303,8 +106,6 @@
   rd_stats->sse = 0;
   rd_stats->skip = 1;
   rd_stats->zero_rate = 0;
-  rd_stats->invalid_rate = 0;
-  rd_stats->ref_rdcost = INT64_MAX;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
   // encoded, as there will only be 1 plane
@@ -330,8 +131,6 @@
   rd_stats->sse = INT64_MAX;
   rd_stats->skip = 0;
   rd_stats->zero_rate = 0;
-  rd_stats->invalid_rate = 1;
-  rd_stats->ref_rdcost = INT64_MAX;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
   // encoded, as there will only be 1 plane
@@ -341,7 +140,7 @@
       int r, c;
       for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
         for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
-          rd_stats->txb_coeff_cost_map[plane][r][c] = INT_MAX;
+          rd_stats->txb_coeff_cost_map[plane][r][c] = INT16_MAX;
     }
   }
 #endif
@@ -349,20 +148,18 @@
 
 static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
                                       const RD_STATS *rd_stats_src) {
-#if CONFIG_RD_DEBUG
-  int plane;
-#endif
-  rd_stats_dst->rate += rd_stats_src->rate;
+  assert(rd_stats_dst->rate != INT_MAX && rd_stats_src->rate != INT_MAX);
+  rd_stats_dst->rate = (int)AOMMIN(
+      ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX);
   if (!rd_stats_dst->zero_rate)
     rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
   rd_stats_dst->dist += rd_stats_src->dist;
   rd_stats_dst->sse += rd_stats_src->sse;
   rd_stats_dst->skip &= rd_stats_src->skip;
-  rd_stats_dst->invalid_rate &= rd_stats_src->invalid_rate;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
   // encoded, as there will only be 1 plane
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
     {
       // TODO(angiebird): optimize this part
@@ -380,6 +177,49 @@
 #endif
 }
 
+static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist,
+                                           int rate, int skip, int64_t sse,
+                                           int zero_rate) {
+  assert(rd_stats->rate != INT_MAX && rate != INT_MAX);
+  rd_stats->rate += rate;
+  if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate;
+  rd_stats->dist += dist;
+  rd_stats->skip &= skip;
+  rd_stats->sse += sse;
+}
+
+static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) {
+  assert(mult >= 0);
+  if (rate >= 0) {
+    return RDCOST(mult, rate, dist);
+  }
+  return RDCOST_NEG_R(mult, -rate, dist);
+}
+
+static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) {
+  if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX &&
+      rd_cost->rdcost < INT64_MAX) {
+    rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist);
+  } else {
+    av1_invalid_rd_stats(rd_cost);
+  }
+}
+
+static INLINE void av1_rd_stats_subtraction(int mult,
+                                            const RD_STATS *const left,
+                                            const RD_STATS *const right,
+                                            RD_STATS *result) {
+  if (left->rate == INT_MAX || right->rate == INT_MAX ||
+      left->dist == INT64_MAX || right->dist == INT64_MAX ||
+      left->rdcost == INT64_MAX || right->rdcost == INT64_MAX) {
+    av1_invalid_rd_stats(result);
+  } else {
+    result->rate = left->rate - right->rate;
+    result->dist = left->dist - right->dist;
+    result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist);
+  }
+}
+
 struct TileInfo;
 struct TileDataEnc;
 struct AV1_COMP;
@@ -391,8 +231,6 @@
 
 void av1_initialize_rd_consts(struct AV1_COMP *cpi);
 
-void av1_initialize_cost_tables(const AV1_COMMON *const cm, MACROBLOCK *x);
-
 void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
                               int qindex);
 
@@ -404,14 +242,8 @@
 void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
                           double yl, double *rate_f, double *distbysse_f);
 
-int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
-                            const MACROBLOCKD *xd);
-
-int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
-                            int stride);
-
-int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
-                                       int16_t *base);
+int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
+                            InterpFilter interp_filter);
 
 YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
                                              int ref_frame);
@@ -420,7 +252,7 @@
 
 void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx);
 
-void av1_get_entropy_contexts(BLOCK_SIZE bsize,
+void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize,
                               const struct macroblockd_plane *pd,
                               ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
                               ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]);
@@ -428,8 +260,16 @@
 void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
 
 void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
-                               int (*fact)[MAX_MODES], int rd_thresh, int bsize,
-                               int best_mode_index);
+                               int (*fact)[MAX_MODES], int rd_thresh,
+                               BLOCK_SIZE bsize, THR_MODES best_mode_index);
+
+static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) {
+  for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+    for (int j = 0; j < MAX_MODES; ++j) {
+      x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL;
+    }
+  }
+}
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       int thresh_fact) {
@@ -445,9 +285,64 @@
   x->errorperbit += (x->errorperbit == 0);
 }
 
+// Get the threshold for R-D optimization of coefficients depending upon mode
+// decision/winner mode processing
+static INLINE uint32_t get_rd_opt_coeff_thresh(
+    const uint32_t *const coeff_opt_dist_threshold,
+    int enable_winner_mode_for_coeff_opt, int is_winner_mode) {
+  // Default initialization of threshold
+  uint32_t coeff_opt_thresh = coeff_opt_dist_threshold[DEFAULT_EVAL];
+  // TODO(any): Experiment with coeff_opt_dist_threshold values when
+  // enable_winner_mode_for_coeff_opt is ON
+  // TODO(any): Skip the winner mode processing for blocks with lower residual
+  // energy as R-D optimization of coefficients would have been enabled during
+  // mode decision
+  if (enable_winner_mode_for_coeff_opt) {
+    // Use conservative threshold during mode decision and perform R-D
+    // optimization of coeffs always for winner modes
+    if (is_winner_mode)
+      coeff_opt_thresh = coeff_opt_dist_threshold[WINNER_MODE_EVAL];
+    else
+      coeff_opt_thresh = coeff_opt_dist_threshold[MODE_EVAL];
+  }
+  return coeff_opt_thresh;
+}
+
+// Used to reset the state of tx/mb rd hash information
+static INLINE void reset_hash_records(MACROBLOCK *const x,
+                                      int use_inter_txb_hash) {
+  int32_t record_idx;
+
+  // Reset the state for use_inter_txb_hash
+  if (use_inter_txb_hash) {
+    for (record_idx = 0;
+         record_idx < ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)); record_idx++)
+      x->txb_rd_record_8X8[record_idx].num =
+          x->txb_rd_record_8X8[record_idx].index_start = 0;
+    for (record_idx = 0;
+         record_idx < ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)); record_idx++)
+      x->txb_rd_record_16X16[record_idx].num =
+          x->txb_rd_record_16X16[record_idx].index_start = 0;
+    for (record_idx = 0;
+         record_idx < ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)); record_idx++)
+      x->txb_rd_record_32X32[record_idx].num =
+          x->txb_rd_record_32X32[record_idx].index_start = 0;
+    for (record_idx = 0;
+         record_idx < ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)); record_idx++)
+      x->txb_rd_record_64X64[record_idx].num =
+          x->txb_rd_record_64X64[record_idx].index_start = 0;
+  }
+
+  // Reset the state for use_intra_txb_hash
+  x->txb_rd_record_intra.num = x->txb_rd_record_intra.index_start = 0;
+
+  // Reset the state for use_mb_rd_hash
+  x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+}
+
 void av1_setup_pred_block(const MACROBLOCKD *xd,
                           struct buf_2d dst[MAX_MB_PLANE],
-                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const YV12_BUFFER_CONFIG *src,
                           const struct scale_factors *scale,
                           const struct scale_factors *scale_uv,
                           const int num_planes);
@@ -461,8 +356,13 @@
 void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
                           const int num_planes);
 
+void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp,
+                       MACROBLOCK *x);
+
 int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta);
 
+int av1_get_deltaq_offset(const struct AV1_COMP *cpi, int qindex, double beta);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/rdopt.c b/libaom/av1/encoder/rdopt.c
index 5e6054e..02afcd1 100644
--- a/libaom/av1/encoder/rdopt.c
+++ b/libaom/av1/encoder/rdopt.c
@@ -13,6 +13,7 @@
 #include <math.h>
 #include <stdbool.h>
 
+#include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
@@ -23,6 +24,7 @@
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/cfl.h"
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
@@ -31,7 +33,6 @@
 #include "av1/common/idct.h"
 #include "av1/common/mvref_common.h"
 #include "av1/common/obmc.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
@@ -44,13 +45,19 @@
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/cost.h"
+#include "av1/encoder/compound_type.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/intra_mode_search.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/ml.h"
+#include "av1/encoder/mode_prune_model_weights.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/pustats.h"
 #include "av1/encoder/random.h"
@@ -59,629 +66,252 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tokenize.h"
-#include "av1/encoder/tx_prune_model_weights.h"
-
-// Set this macro as 1 to collect data about tx size selection.
-#define COLLECT_TX_SIZE_DATA 0
-
-#if COLLECT_TX_SIZE_DATA
-static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
-#endif
-
-typedef void (*model_rd_for_sb_type)(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
-typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
-                                       const MACROBLOCK *const x,
-                                       BLOCK_SIZE plane_bsize, int plane,
-                                       int64_t sse, int num_samples, int *rate,
-                                       int64_t *dist);
-
-static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                            MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
-                            int plane_to, int mi_row, int mi_col,
-                            int *out_rate_sum, int64_t *out_dist_sum,
-                            int *skip_txfm_sb, int64_t *skip_sse_sb,
-                            int *plane_rate, int64_t *plane_sse,
-                            int64_t *plane_dist);
-static void model_rd_for_sb_with_curvfit(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
-static void model_rd_for_sb_with_surffit(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
-static void model_rd_for_sb_with_dnn(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
-static void model_rd_for_sb_with_fullrdy(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
-static void model_rd_from_sse(const AV1_COMP *const cpi,
-                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
-                              int plane, int64_t sse, int num_samples,
-                              int *rate, int64_t *dist);
-static void model_rd_with_dnn(const AV1_COMP *const cpi,
-                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
-                              int plane, int64_t sse, int num_samples,
-                              int *rate, int64_t *dist);
-static void model_rd_with_curvfit(const AV1_COMP *const cpi,
-                                  const MACROBLOCK *const x,
-                                  BLOCK_SIZE plane_bsize, int plane,
-                                  int64_t sse, int num_samples, int *rate,
-                                  int64_t *dist);
-static void model_rd_with_surffit(const AV1_COMP *const cpi,
-                                  const MACROBLOCK *const x,
-                                  BLOCK_SIZE plane_bsize, int plane,
-                                  int64_t sse, int num_samples, int *rate,
-                                  int64_t *dist);
-
-enum {
-  MODELRD_LEGACY,
-  MODELRD_CURVFIT,
-  MODELRD_SUFFIT,
-  MODELRD_DNN,
-  MODELRD_FULLRDY,
-  MODELRD_TYPES
-} UENUM1BYTE(ModelRdType);
-
-static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
-  model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
-  model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy
-};
-
-static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
-  model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit,
-  model_rd_with_dnn, NULL
-};
-
-// 0: Legacy model
-// 1: Curve fit model
-// 2: Surface fit model
-// 3: DNN regression model
-// 4: Full rd model
-#define MODELRD_TYPE_INTERP_FILTER 1
-#define MODELRD_TYPE_TX_SEARCH_PRUNE 1
-#define MODELRD_TYPE_MASKED_COMPOUND 1
-#define MODELRD_TYPE_INTERINTRA 1
-#define MODELRD_TYPE_INTRA 1
-#define MODELRD_TYPE_DIST_WTD_COMPOUND 1
-#define MODELRD_TYPE_MOTION_MODE_RD 1
-
-#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
-static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
-  0x00000000, 0x00010000, 0x00020000,  // y = 0
-  0x00000001, 0x00010001, 0x00020001,  // y = 1
-  0x00000002, 0x00010002, 0x00020002,  // y = 2
-};
-
-static const double ADST_FLIP_SVM[8] = {
-  /* vertical */
-  -6.6623, -2.8062, -3.2531, 3.1671,
-  /* horizontal */
-  -7.7051, -3.2234, -3.6193, 3.4533
-};
-
-typedef struct {
-  PREDICTION_MODE mode;
-  MV_REFERENCE_FRAME ref_frame[2];
-} MODE_DEFINITION;
-
-enum {
-  FTXS_NONE = 0,
-  FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
-  FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
-  FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
-} UENUM1BYTE(FAST_TX_SEARCH_MODE);
-
-struct rdcost_block_args {
-  const AV1_COMP *cpi;
-  MACROBLOCK *x;
-  ENTROPY_CONTEXT t_above[MAX_MIB_SIZE];
-  ENTROPY_CONTEXT t_left[MAX_MIB_SIZE];
-  RD_STATS rd_stats;
-  int64_t this_rd;
-  int64_t best_rd;
-  int exit_early;
-  int incomplete_exit;
-  int use_fast_coef_costing;
-  FAST_TX_SEARCH_MODE ftxs_mode;
-  int skip_trellis;
-};
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
 
 #define LAST_NEW_MV_INDEX 6
-static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
-  { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
-  { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
-  { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
-  { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
-  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
-  { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
-  { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
 
-  { NEWMV, { LAST_FRAME, NONE_FRAME } },
-  { NEWMV, { LAST2_FRAME, NONE_FRAME } },
-  { NEWMV, { LAST3_FRAME, NONE_FRAME } },
-  { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
-  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
-  { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
-  { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
-
-  { NEARMV, { LAST_FRAME, NONE_FRAME } },
-  { NEARMV, { LAST2_FRAME, NONE_FRAME } },
-  { NEARMV, { LAST3_FRAME, NONE_FRAME } },
-  { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
-  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
-  { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
-  { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
-
-  { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
-  { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
-  { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
-  { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
-  { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
-  { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
-  { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
-
-  // TODO(zoeliu): May need to reconsider the order on the modes to check
-
-  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-
-  { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
-
-  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
-
-  { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
-
-  { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
-
-  { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
-
-  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-
-  { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
-
-  { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
-
-  { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
-
-  { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
-
-  // intra modes
-  { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { H_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { V_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+// Mode_threshold multiplication factor table for prune_inter_modes_if_skippable
+// The values are kept in Q12 format and equation used to derive is
+// (2.5 - ((float)x->qindex / MAXQ) * 1.5)
+#define MODE_THRESH_QBITS 12
+static const int mode_threshold_mul_factor[QINDEX_RANGE] = {
+  10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999,
+  9975,  9951,  9927,  9903,  9879,  9854,  9830,  9806,  9782,  9758,  9734,
+  9710,  9686,  9662,  9638,  9614,  9589,  9565,  9541,  9517,  9493,  9469,
+  9445,  9421,  9397,  9373,  9349,  9324,  9300,  9276,  9252,  9228,  9204,
+  9180,  9156,  9132,  9108,  9083,  9059,  9035,  9011,  8987,  8963,  8939,
+  8915,  8891,  8867,  8843,  8818,  8794,  8770,  8746,  8722,  8698,  8674,
+  8650,  8626,  8602,  8578,  8553,  8529,  8505,  8481,  8457,  8433,  8409,
+  8385,  8361,  8337,  8312,  8288,  8264,  8240,  8216,  8192,  8168,  8144,
+  8120,  8096,  8072,  8047,  8023,  7999,  7975,  7951,  7927,  7903,  7879,
+  7855,  7831,  7806,  7782,  7758,  7734,  7710,  7686,  7662,  7638,  7614,
+  7590,  7566,  7541,  7517,  7493,  7469,  7445,  7421,  7397,  7373,  7349,
+  7325,  7301,  7276,  7252,  7228,  7204,  7180,  7156,  7132,  7108,  7084,
+  7060,  7035,  7011,  6987,  6963,  6939,  6915,  6891,  6867,  6843,  6819,
+  6795,  6770,  6746,  6722,  6698,  6674,  6650,  6626,  6602,  6578,  6554,
+  6530,  6505,  6481,  6457,  6433,  6409,  6385,  6361,  6337,  6313,  6289,
+  6264,  6240,  6216,  6192,  6168,  6144,  6120,  6096,  6072,  6048,  6024,
+  5999,  5975,  5951,  5927,  5903,  5879,  5855,  5831,  5807,  5783,  5758,
+  5734,  5710,  5686,  5662,  5638,  5614,  5590,  5566,  5542,  5518,  5493,
+  5469,  5445,  5421,  5397,  5373,  5349,  5325,  5301,  5277,  5253,  5228,
+  5204,  5180,  5156,  5132,  5108,  5084,  5060,  5036,  5012,  4987,  4963,
+  4939,  4915,  4891,  4867,  4843,  4819,  4795,  4771,  4747,  4722,  4698,
+  4674,  4650,  4626,  4602,  4578,  4554,  4530,  4506,  4482,  4457,  4433,
+  4409,  4385,  4361,  4337,  4313,  4289,  4265,  4241,  4216,  4192,  4168,
+  4144,  4120,  4096
 };
 
-static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = {
-  THR_DC,         // DC_PRED,
-  THR_V_PRED,     // V_PRED,
-  THR_H_PRED,     // H_PRED,
-  THR_D45_PRED,   // D45_PRED,
-  THR_D135_PRED,  // D135_PRED,
-  THR_D113_PRED,  // D113_PRED,
-  THR_D157_PRED,  // D157_PRED,
-  THR_D203_PRED,  // D203_PRED,
-  THR_D67_PRED,   // D67_PRED,
-  THR_SMOOTH,     // SMOOTH_PRED,
-  THR_SMOOTH_V,   // SMOOTH_V_PRED,
-  THR_SMOOTH_H,   // SMOOTH_H_PRED,
-  THR_PAETH,      // PAETH_PRED,
+static const THR_MODES av1_default_mode_order[MAX_MODES] = {
+  THR_NEARESTMV,
+  THR_NEARESTL2,
+  THR_NEARESTL3,
+  THR_NEARESTB,
+  THR_NEARESTA2,
+  THR_NEARESTA,
+  THR_NEARESTG,
+
+  THR_NEWMV,
+  THR_NEWL2,
+  THR_NEWL3,
+  THR_NEWB,
+  THR_NEWA2,
+  THR_NEWA,
+  THR_NEWG,
+
+  THR_NEARMV,
+  THR_NEARL2,
+  THR_NEARL3,
+  THR_NEARB,
+  THR_NEARA2,
+  THR_NEARA,
+  THR_NEARG,
+
+  THR_GLOBALMV,
+  THR_GLOBALL2,
+  THR_GLOBALL3,
+  THR_GLOBALB,
+  THR_GLOBALA2,
+  THR_GLOBALA,
+  THR_GLOBALG,
+
+  THR_COMP_NEAREST_NEARESTLA,
+  THR_COMP_NEAREST_NEARESTL2A,
+  THR_COMP_NEAREST_NEARESTL3A,
+  THR_COMP_NEAREST_NEARESTGA,
+  THR_COMP_NEAREST_NEARESTLB,
+  THR_COMP_NEAREST_NEARESTL2B,
+  THR_COMP_NEAREST_NEARESTL3B,
+  THR_COMP_NEAREST_NEARESTGB,
+  THR_COMP_NEAREST_NEARESTLA2,
+  THR_COMP_NEAREST_NEARESTL2A2,
+  THR_COMP_NEAREST_NEARESTL3A2,
+  THR_COMP_NEAREST_NEARESTGA2,
+  THR_COMP_NEAREST_NEARESTLL2,
+  THR_COMP_NEAREST_NEARESTLL3,
+  THR_COMP_NEAREST_NEARESTLG,
+  THR_COMP_NEAREST_NEARESTBA,
+
+  THR_COMP_NEAR_NEARLA,
+  THR_COMP_NEW_NEARESTLA,
+  THR_COMP_NEAREST_NEWLA,
+  THR_COMP_NEW_NEARLA,
+  THR_COMP_NEAR_NEWLA,
+  THR_COMP_NEW_NEWLA,
+  THR_COMP_GLOBAL_GLOBALLA,
+
+  THR_COMP_NEAR_NEARL2A,
+  THR_COMP_NEW_NEARESTL2A,
+  THR_COMP_NEAREST_NEWL2A,
+  THR_COMP_NEW_NEARL2A,
+  THR_COMP_NEAR_NEWL2A,
+  THR_COMP_NEW_NEWL2A,
+  THR_COMP_GLOBAL_GLOBALL2A,
+
+  THR_COMP_NEAR_NEARL3A,
+  THR_COMP_NEW_NEARESTL3A,
+  THR_COMP_NEAREST_NEWL3A,
+  THR_COMP_NEW_NEARL3A,
+  THR_COMP_NEAR_NEWL3A,
+  THR_COMP_NEW_NEWL3A,
+  THR_COMP_GLOBAL_GLOBALL3A,
+
+  THR_COMP_NEAR_NEARGA,
+  THR_COMP_NEW_NEARESTGA,
+  THR_COMP_NEAREST_NEWGA,
+  THR_COMP_NEW_NEARGA,
+  THR_COMP_NEAR_NEWGA,
+  THR_COMP_NEW_NEWGA,
+  THR_COMP_GLOBAL_GLOBALGA,
+
+  THR_COMP_NEAR_NEARLB,
+  THR_COMP_NEW_NEARESTLB,
+  THR_COMP_NEAREST_NEWLB,
+  THR_COMP_NEW_NEARLB,
+  THR_COMP_NEAR_NEWLB,
+  THR_COMP_NEW_NEWLB,
+  THR_COMP_GLOBAL_GLOBALLB,
+
+  THR_COMP_NEAR_NEARL2B,
+  THR_COMP_NEW_NEARESTL2B,
+  THR_COMP_NEAREST_NEWL2B,
+  THR_COMP_NEW_NEARL2B,
+  THR_COMP_NEAR_NEWL2B,
+  THR_COMP_NEW_NEWL2B,
+  THR_COMP_GLOBAL_GLOBALL2B,
+
+  THR_COMP_NEAR_NEARL3B,
+  THR_COMP_NEW_NEARESTL3B,
+  THR_COMP_NEAREST_NEWL3B,
+  THR_COMP_NEW_NEARL3B,
+  THR_COMP_NEAR_NEWL3B,
+  THR_COMP_NEW_NEWL3B,
+  THR_COMP_GLOBAL_GLOBALL3B,
+
+  THR_COMP_NEAR_NEARGB,
+  THR_COMP_NEW_NEARESTGB,
+  THR_COMP_NEAREST_NEWGB,
+  THR_COMP_NEW_NEARGB,
+  THR_COMP_NEAR_NEWGB,
+  THR_COMP_NEW_NEWGB,
+  THR_COMP_GLOBAL_GLOBALGB,
+
+  THR_COMP_NEAR_NEARLA2,
+  THR_COMP_NEW_NEARESTLA2,
+  THR_COMP_NEAREST_NEWLA2,
+  THR_COMP_NEW_NEARLA2,
+  THR_COMP_NEAR_NEWLA2,
+  THR_COMP_NEW_NEWLA2,
+  THR_COMP_GLOBAL_GLOBALLA2,
+
+  THR_COMP_NEAR_NEARL2A2,
+  THR_COMP_NEW_NEARESTL2A2,
+  THR_COMP_NEAREST_NEWL2A2,
+  THR_COMP_NEW_NEARL2A2,
+  THR_COMP_NEAR_NEWL2A2,
+  THR_COMP_NEW_NEWL2A2,
+  THR_COMP_GLOBAL_GLOBALL2A2,
+
+  THR_COMP_NEAR_NEARL3A2,
+  THR_COMP_NEW_NEARESTL3A2,
+  THR_COMP_NEAREST_NEWL3A2,
+  THR_COMP_NEW_NEARL3A2,
+  THR_COMP_NEAR_NEWL3A2,
+  THR_COMP_NEW_NEWL3A2,
+  THR_COMP_GLOBAL_GLOBALL3A2,
+
+  THR_COMP_NEAR_NEARGA2,
+  THR_COMP_NEW_NEARESTGA2,
+  THR_COMP_NEAREST_NEWGA2,
+  THR_COMP_NEW_NEARGA2,
+  THR_COMP_NEAR_NEWGA2,
+  THR_COMP_NEW_NEWGA2,
+  THR_COMP_GLOBAL_GLOBALGA2,
+
+  THR_COMP_NEAR_NEARLL2,
+  THR_COMP_NEW_NEARESTLL2,
+  THR_COMP_NEAREST_NEWLL2,
+  THR_COMP_NEW_NEARLL2,
+  THR_COMP_NEAR_NEWLL2,
+  THR_COMP_NEW_NEWLL2,
+  THR_COMP_GLOBAL_GLOBALLL2,
+
+  THR_COMP_NEAR_NEARLL3,
+  THR_COMP_NEW_NEARESTLL3,
+  THR_COMP_NEAREST_NEWLL3,
+  THR_COMP_NEW_NEARLL3,
+  THR_COMP_NEAR_NEWLL3,
+  THR_COMP_NEW_NEWLL3,
+  THR_COMP_GLOBAL_GLOBALLL3,
+
+  THR_COMP_NEAR_NEARLG,
+  THR_COMP_NEW_NEARESTLG,
+  THR_COMP_NEAREST_NEWLG,
+  THR_COMP_NEW_NEARLG,
+  THR_COMP_NEAR_NEWLG,
+  THR_COMP_NEW_NEWLG,
+  THR_COMP_GLOBAL_GLOBALLG,
+
+  THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEARESTBA,
+  THR_COMP_NEAREST_NEWBA,
+  THR_COMP_NEW_NEARBA,
+  THR_COMP_NEAR_NEWBA,
+  THR_COMP_NEW_NEWBA,
+  THR_COMP_GLOBAL_GLOBALBA,
+
+  THR_DC,
+  THR_PAETH,
+  THR_SMOOTH,
+  THR_SMOOTH_V,
+  THR_SMOOTH_H,
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D203_PRED,
+  THR_D157_PRED,
+  THR_D67_PRED,
+  THR_D113_PRED,
+  THR_D45_PRED,
 };
 
-/* clang-format off */
-static const int16_t single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
-                                             [REF_FRAMES] = {
-  // NEARESTMV,
-  { -1, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3,
-    THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, },
-  // NEARMV,
-  { -1, THR_NEARMV, THR_NEARL2, THR_NEARL3,
-    THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, },
-  // GLOBALMV,
-  { -1, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3,
-    THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, },
-  // NEWMV,
-  { -1, THR_NEWMV, THR_NEWL2, THR_NEWL3,
-    THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, },
-};
-/* clang-format on */
+static int find_last_single_ref_mode_idx(const THR_MODES *mode_order) {
+  uint8_t mode_found[NUM_SINGLE_REF_MODES];
+  av1_zero(mode_found);
+  int num_single_ref_modes_left = NUM_SINGLE_REF_MODES;
 
-/* clang-format off */
-static const int16_t comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
-                                     [REF_FRAMES] = {
-  // NEAREST_NEARESTMV,
-  {
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1,
-      THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3,
-      THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB,
-      THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAREST_NEARESTL2B,
-      THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAREST_NEARESTL3B,
-      THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAREST_NEARESTGB,
-      THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, },
-    { -1, -1,
-      -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAREST_NEARESTBA, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-  },
-  // NEAR_NEARMV,
-  {
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1,
-      THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3,
-      THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB,
-      THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAR_NEARL2B,
-      THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAR_NEARL3B,
-      THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAR_NEARGB,
-      THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, },
-    { -1, -1,
-      -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAR_NEARBA, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-  },
-  // NEAREST_NEWMV,
-  {
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1,
-      THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3,
-      THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB,
-      THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAREST_NEWL2B,
-      THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAREST_NEWL3B,
-      THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAREST_NEWGB,
-      THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, },
-    { -1, -1,
-      -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAREST_NEWBA, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-  },
-  // NEW_NEARESTMV,
-  {
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1,
-      THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3,
-      THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB,
-      THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEARESTL2B,
-      THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEARESTL3B,
-      THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEARESTGB,
-      THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, },
-    { -1, -1,
-      -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEARESTBA, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-  },
-  // NEAR_NEWMV,
-  {
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1,
-      THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3,
-      THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB,
-      THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAR_NEWL2B,
-      THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAR_NEWL3B,
-      THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAR_NEWGB,
-      THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, },
-    { -1, -1,
-      -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEAR_NEWBA, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-  },
-  // NEW_NEARMV,
-  {
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1,
-      THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3,
-      THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB,
-      THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEARL2B,
-      THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEARL3B,
-      THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEARGB,
-      THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, },
-    { -1, -1,
-      -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEARBA, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-  },
-  // GLOBAL_GLOBALMV,
-  {
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1,
-      THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3,
-      THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB,
-      THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_GLOBAL_GLOBALL2B,
-      THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_GLOBAL_GLOBALL3B,
-      THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_GLOBAL_GLOBALGB,
-      THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, },
-    { -1, -1,
-      -1, -1,
-      -1, -1,
-      -1, THR_COMP_GLOBAL_GLOBALBA, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-  },
-  // NEW_NEWMV,
-  {
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1,
-      THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3,
-      THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB,
-      THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEWL2B,
-      THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEWL3B,
-      THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, },
-    { -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEWGB,
-      THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, },
-    { -1, -1,
-      -1, -1,
-      -1, -1,
-      -1, THR_COMP_NEW_NEWBA, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-    { -1, -1, -1, -1, -1, -1, -1, -1, },
-  },
-};
-/* clang-format on */
-
-static int get_prediction_mode_idx(PREDICTION_MODE this_mode,
-                                   MV_REFERENCE_FRAME ref_frame,
-                                   MV_REFERENCE_FRAME second_ref_frame) {
-  if (this_mode < INTRA_MODE_END) {
-    assert(ref_frame == INTRA_FRAME);
-    assert(second_ref_frame == NONE_FRAME);
-    return intra_to_mode_idx[this_mode - INTRA_MODE_START];
+  for (int idx = 0; idx < MAX_MODES; idx++) {
+    const THR_MODES curr_mode = mode_order[idx];
+    if (curr_mode < SINGLE_REF_MODE_END) {
+      num_single_ref_modes_left--;
+    }
+    if (!num_single_ref_modes_left) {
+      return idx;
+    }
   }
-  if (this_mode >= SINGLE_INTER_MODE_START &&
-      this_mode < SINGLE_INTER_MODE_END) {
-    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
-    return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
-                                   [ref_frame];
-  }
-  if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) {
-    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
-    assert((second_ref_frame > INTRA_FRAME) &&
-           (second_ref_frame <= ALTREF_FRAME));
-    return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame]
-                                 [second_ref_frame];
-  }
-  assert(0);
   return -1;
 }
 
-static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
-  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, PAETH_PRED,
-  SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED,   D157_PRED,
-  D67_PRED,      D113_PRED,     D45_PRED,
-};
-
-static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
-  UV_DC_PRED,     UV_CFL_PRED,   UV_H_PRED,        UV_V_PRED,
-  UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
-  UV_D135_PRED,   UV_D203_PRED,  UV_D157_PRED,     UV_D67_PRED,
-  UV_D113_PRED,   UV_D45_PRED,
-};
-
 typedef struct SingleInterModeState {
   int64_t rd;
   MV_REFERENCE_FRAME ref_frame;
@@ -690,38 +320,27 @@
 
 typedef struct InterModeSearchState {
   int64_t best_rd;
+  int64_t best_skip_rd[2];
   MB_MODE_INFO best_mbmode;
   int best_rate_y;
   int best_rate_uv;
   int best_mode_skippable;
   int best_skip2;
-  int best_mode_index;
-  int skip_intra_modes;
+  THR_MODES best_mode_index;
   int num_available_refs;
   int64_t dist_refs[REF_FRAMES];
   int dist_order_refs[REF_FRAMES];
   int64_t mode_threshold[MAX_MODES];
-  PREDICTION_MODE best_intra_mode;
   int64_t best_intra_rd;
-  int angle_stats_ready;
-  uint8_t directional_mode_skip_mask[INTRA_MODES];
   unsigned int best_pred_sse;
-  int rate_uv_intra[TX_SIZES_ALL];
-  int rate_uv_tokenonly[TX_SIZES_ALL];
-  int64_t dist_uvs[TX_SIZES_ALL];
-  int skip_uvs[TX_SIZES_ALL];
-  UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL];
-  PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
-  int8_t uv_angle_delta[TX_SIZES_ALL];
-  int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_pred_diff[REFERENCE_MODES];
   // Save a set of single_newmv for each checked ref_mv.
-  int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES];
-  int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES];
-  int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES];
-  int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+  int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES];
+  int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES];
+  int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES];
+  int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
   // The rd of simple translation in single inter modes
-  int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+  int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
 
   // Single search results by [directions][modes][reference frames]
   SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
@@ -729,18 +348,10 @@
   SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
                                             [FWD_REFS];
   int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
-
   MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+  IntraModeSearchState intra_search_state;
 } InterModeSearchState;
 
-static int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
-  if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
-      bsize == BLOCK_4X16 || bsize == BLOCK_16X4) {
-    return -1;
-  }
-  return 1;
-}
-
 void av1_inter_mode_data_init(TileDataEnc *tile_data) {
   for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
     InterModeRdModel *md = &tile_data->inter_mode_rd_models[i];
@@ -840,8 +451,9 @@
   }
 }
 
-static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize,
-                                 int64_t sse, int64_t dist, int residue_cost) {
+static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data,
+                                            BLOCK_SIZE bsize, int64_t sse,
+                                            int64_t dist, int residue_cost) {
   if (residue_cost == 0 || sse == dist) return;
   const int block_idx = inter_mode_data_block_idx(bsize);
   if (block_idx == -1) return;
@@ -858,23 +470,18 @@
   }
 }
 
-static void inter_modes_info_push(InterModesInfo *inter_modes_info,
-                                  int mode_rate, int64_t sse, int64_t rd,
-                                  bool true_rd, uint8_t *blk_skip,
-                                  RD_STATS *rd_cost, RD_STATS *rd_cost_y,
-                                  RD_STATS *rd_cost_uv,
-                                  const MB_MODE_INFO *mbmi) {
+static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info,
+                                             int mode_rate, int64_t sse,
+                                             int64_t rd, RD_STATS *rd_cost,
+                                             RD_STATS *rd_cost_y,
+                                             RD_STATS *rd_cost_uv,
+                                             const MB_MODE_INFO *mbmi) {
   const int num = inter_modes_info->num;
   assert(num < MAX_INTER_MODES);
   inter_modes_info->mbmi_arr[num] = *mbmi;
   inter_modes_info->mode_rate_arr[num] = mode_rate;
   inter_modes_info->sse_arr[num] = sse;
   inter_modes_info->est_rd_arr[num] = rd;
-  inter_modes_info->true_rd_arr[num] = true_rd;
-  if (blk_skip != NULL) {
-    memcpy(inter_modes_info->blk_skip_arr[num], blk_skip,
-           sizeof(blk_skip[0]) * MAX_MIB_SIZE * MAX_MIB_SIZE);
-  }
   inter_modes_info->rd_cost_arr[num] = *rd_cost;
   inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y;
   inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv;
@@ -891,8 +498,8 @@
   }
 }
 
-static void inter_modes_info_sort(const InterModesInfo *inter_modes_info,
-                                  RdIdxPair *rd_idx_pair_arr) {
+static AOM_INLINE void inter_modes_info_sort(
+    const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) {
   if (inter_modes_info->num == 0) {
     return;
   }
@@ -904,723 +511,6 @@
         compare_rd_idx_pair);
 }
 
-static INLINE int write_uniform_cost(int n, int v) {
-  const int l = get_unsigned_bits(n);
-  const int m = (1 << l) - n;
-  if (l == 0) return 0;
-  if (v < m)
-    return av1_cost_literal(l - 1);
-  else
-    return av1_cost_literal(l);
-}
-
-// Similar to store_cfl_required(), but for use during the RDO process,
-// where we haven't yet determined whether this block uses CfL.
-static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
-                                                      const MACROBLOCK *x) {
-  const MACROBLOCKD *xd = &x->e_mbd;
-
-  if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED;
-
-  if (!xd->cfl.is_chroma_reference) {
-    // For non-chroma-reference blocks, we should always store the luma pixels,
-    // in case the corresponding chroma-reference block uses CfL.
-    // Note that this can only happen for block sizes which are <8 on
-    // their shortest side, as otherwise they would be chroma reference
-    // blocks.
-    return CFL_ALLOWED;
-  }
-
-  // For chroma reference blocks, we should store data in the encoder iff we're
-  // allowed to try out CfL.
-  return is_cfl_allowed(xd);
-}
-
-// constants for prune 1 and prune 2 decision boundaries
-#define FAST_EXT_TX_CORR_MID 0.0
-#define FAST_EXT_TX_EDST_MID 0.1
-#define FAST_EXT_TX_CORR_MARGIN 0.5
-#define FAST_EXT_TX_EDST_MARGIN 0.3
-
-static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode);
-
-static unsigned pixel_dist_visible_only(
-    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
-    const int src_stride, const uint8_t *dst, const int dst_stride,
-    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
-    int visible_cols) {
-  unsigned sse;
-
-  if (txb_rows == visible_rows && txb_cols == visible_cols) {
-    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
-    return sse;
-  }
-  const MACROBLOCKD *xd = &x->e_mbd;
-
-  if (is_cur_buf_hbd(xd)) {
-    uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
-                                             visible_cols, visible_rows);
-    return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
-  }
-  sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
-                         visible_rows);
-  return sse;
-}
-
-#if CONFIG_DIST_8X8
-static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
-                                    int sstride, int coeff_shift) {
-  uint64_t svar = 0;
-  uint64_t dvar = 0;
-  uint64_t sum_s = 0;
-  uint64_t sum_d = 0;
-  uint64_t sum_s2 = 0;
-  uint64_t sum_d2 = 0;
-  uint64_t sum_sd = 0;
-  uint64_t dist = 0;
-
-  int i, j;
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 8; j++) {
-      sum_s += src[i * sstride + j];
-      sum_d += dst[i * dstride + j];
-      sum_s2 += src[i * sstride + j] * src[i * sstride + j];
-      sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
-      sum_sd += src[i * sstride + j] * dst[i * dstride + j];
-    }
-  }
-  /* Compute the variance -- the calculation cannot go negative. */
-  svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
-  dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
-
-  // Tuning of jm's original dering distortion metric used in CDEF tool,
-  // suggested by jm
-  const uint64_t a = 4;
-  const uint64_t b = 2;
-  const uint64_t c1 = (400 * a << 2 * coeff_shift);
-  const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift);
-
-  dist = (uint64_t)floor(.5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
-                                  (svar + dvar + c1) /
-                                  (sqrt(svar * (double)dvar + c2)));
-
-  // Calibrate dist to have similar rate for the same QP with MSE only
-  // distortion (as in master branch)
-  dist = (uint64_t)((float)dist * 0.75);
-
-  return dist;
-}
-
-static int od_compute_var_4x4(uint16_t *x, int stride) {
-  int sum;
-  int s2;
-  int i;
-  sum = 0;
-  s2 = 0;
-  for (i = 0; i < 4; i++) {
-    int j;
-    for (j = 0; j < 4; j++) {
-      int t;
-
-      t = x[i * stride + j];
-      sum += t;
-      s2 += t * t;
-    }
-  }
-
-  return (s2 - (sum * sum >> 4)) >> 4;
-}
-
-/* OD_DIST_LP_MID controls the frequency weighting filter used for computing
-   the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
-   is applied both horizontally and vertically. For X=5, the filter is
-   a good approximation for the OD_QM8_Q4_HVS quantization matrix. */
-#define OD_DIST_LP_MID (5)
-#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)
-
-static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x,
-                                  uint16_t *y, od_coeff *e_lp, int stride) {
-  double sum;
-  int min_var;
-  double mean_var;
-  double var_stat;
-  double activity;
-  double calibration;
-  int i;
-  int j;
-  double vardist;
-
-  vardist = 0;
-
-#if 1
-  min_var = INT_MAX;
-  mean_var = 0;
-  for (i = 0; i < 3; i++) {
-    for (j = 0; j < 3; j++) {
-      int varx;
-      int vary;
-      varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
-      vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
-      min_var = OD_MINI(min_var, varx);
-      mean_var += 1. / (1 + varx);
-      /* The cast to (double) is to avoid an overflow before the sqrt.*/
-      vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
-    }
-  }
-  /* We use a different variance statistic depending on whether activity
-     masking is used, since the harmonic mean appeared slightly worse with
-     masking off. The calibration constant just ensures that we preserve the
-     rate compared to activity=1. */
-  if (use_activity_masking) {
-    calibration = 1.95;
-    var_stat = 9. / mean_var;
-  } else {
-    calibration = 1.62;
-    var_stat = min_var;
-  }
-  /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
-     activity masking constant. */
-  activity = calibration * pow(.25 + var_stat, -1. / 6);
-#else
-  activity = 1;
-#endif  // 1
-  sum = 0;
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 8; j++)
-      sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
-  }
-  /* Normalize the filter to unit DC response. */
-  sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
-               OD_DIST_LP_NORM);
-  return activity * activity * (sum + vardist);
-}
-
-// Note : Inputs x and y are in a pixel domain
-static double od_compute_dist_common(int activity_masking, uint16_t *x,
-                                     uint16_t *y, int bsize_w, int bsize_h,
-                                     int qindex, od_coeff *tmp,
-                                     od_coeff *e_lp) {
-  int i, j;
-  double sum = 0;
-  const int mid = OD_DIST_LP_MID;
-
-  for (j = 0; j < bsize_w; j++) {
-    e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
-    e_lp[(bsize_h - 1) * bsize_w + j] = mid * tmp[(bsize_h - 1) * bsize_w + j] +
-                                        2 * tmp[(bsize_h - 2) * bsize_w + j];
-  }
-  for (i = 1; i < bsize_h - 1; i++) {
-    for (j = 0; j < bsize_w; j++) {
-      e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
-                              tmp[(i - 1) * bsize_w + j] +
-                              tmp[(i + 1) * bsize_w + j];
-    }
-  }
-  for (i = 0; i < bsize_h; i += 8) {
-    for (j = 0; j < bsize_w; j += 8) {
-      sum += od_compute_dist_8x8(activity_masking, &x[i * bsize_w + j],
-                                 &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
-                                 bsize_w);
-    }
-  }
-  /* Scale according to linear regression against SSE, for 8x8 blocks. */
-  if (activity_masking) {
-    sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) +
-           (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0);
-  } else {
-    sum *= qindex >= 128
-               ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128)
-               : qindex <= 43 ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43)
-                              : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43);
-  }
-
-  return sum;
-}
-
-static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
-                              int bsize_h, int qindex) {
-  assert(bsize_w >= 8 && bsize_h >= 8);
-
-  int activity_masking = 0;
-
-  int i, j;
-  DECLARE_ALIGNED(16, od_coeff, e[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
-  for (i = 0; i < bsize_h; i++) {
-    for (j = 0; j < bsize_w; j++) {
-      e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
-    }
-  }
-  int mid = OD_DIST_LP_MID;
-  for (i = 0; i < bsize_h; i++) {
-    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
-    tmp[i * bsize_w + bsize_w - 1] =
-        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
-    for (j = 1; j < bsize_w - 1; j++) {
-      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
-                             e[i * bsize_w + j + 1];
-    }
-  }
-  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
-                                qindex, tmp, e_lp);
-}
-
-static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
-                                   int bsize_h, int qindex) {
-  assert(bsize_w >= 8 && bsize_h >= 8);
-
-  int activity_masking = 0;
-
-  DECLARE_ALIGNED(16, uint16_t, y[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
-  int i, j;
-  for (i = 0; i < bsize_h; i++) {
-    for (j = 0; j < bsize_w; j++) {
-      y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j];
-    }
-  }
-  int mid = OD_DIST_LP_MID;
-  for (i = 0; i < bsize_h; i++) {
-    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
-    tmp[i * bsize_w + bsize_w - 1] =
-        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
-    for (j = 1; j < bsize_w - 1; j++) {
-      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
-                             e[i * bsize_w + j + 1];
-    }
-  }
-  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
-                                qindex, tmp, e_lp);
-}
-
-int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
-                     const uint8_t *src, int src_stride, const uint8_t *dst,
-                     int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
-                     int bsh, int visible_w, int visible_h, int qindex) {
-  int64_t d = 0;
-  int i, j;
-  const MACROBLOCKD *xd = &x->e_mbd;
-
-  DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint16_t, rec[MAX_SB_SQUARE]);
-
-  assert(bsw >= 8);
-  assert(bsh >= 8);
-  assert((bsw & 0x07) == 0);
-  assert((bsh & 0x07) == 0);
-
-  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
-      x->tune_metric == AOM_TUNE_DAALA_DIST) {
-    if (is_cur_buf_hbd(xd)) {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++)
-          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-
-      if ((bsw == visible_w) && (bsh == visible_h)) {
-        for (j = 0; j < bsh; j++)
-          for (i = 0; i < bsw; i++)
-            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
-      } else {
-        for (j = 0; j < visible_h; j++)
-          for (i = 0; i < visible_w; i++)
-            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
-
-        if (visible_w < bsw) {
-          for (j = 0; j < bsh; j++)
-            for (i = visible_w; i < bsw; i++)
-              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-        }
-
-        if (visible_h < bsh) {
-          for (j = visible_h; j < bsh; j++)
-            for (i = 0; i < bsw; i++)
-              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-        }
-      }
-    } else {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
-
-      if ((bsw == visible_w) && (bsh == visible_h)) {
-        for (j = 0; j < bsh; j++)
-          for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
-      } else {
-        for (j = 0; j < visible_h; j++)
-          for (i = 0; i < visible_w; i++)
-            rec[j * bsw + i] = dst[j * dst_stride + i];
-
-        if (visible_w < bsw) {
-          for (j = 0; j < bsh; j++)
-            for (i = visible_w; i < bsw; i++)
-              rec[j * bsw + i] = src[j * src_stride + i];
-        }
-
-        if (visible_h < bsh) {
-          for (j = visible_h; j < bsh; j++)
-            for (i = 0; i < bsw; i++)
-              rec[j * bsw + i] = src[j * src_stride + i];
-        }
-      }
-    }
-  }
-
-  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
-    d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
-  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
-    int coeff_shift = AOMMAX(xd->bd - 8, 0);
-
-    for (i = 0; i < bsh; i += 8) {
-      for (j = 0; j < bsw; j += 8) {
-        d += cdef_dist_8x8_16bit(&rec[i * bsw + j], bsw, &orig[i * bsw + j],
-                                 bsw, coeff_shift);
-      }
-    }
-    if (is_cur_buf_hbd(xd)) d = ((uint64_t)d) >> 2 * coeff_shift;
-  } else {
-    // Otherwise, MSE by default
-    d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
-                                tx_bsize, bsh, bsw, visible_h, visible_w);
-  }
-
-  return d;
-}
-
-static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
-                             int src_stride, const int16_t *diff,
-                             int diff_stride, int bsw, int bsh, int visible_w,
-                             int visible_h, int qindex) {
-  int64_t d = 0;
-  int i, j;
-  const MACROBLOCKD *xd = &x->e_mbd;
-
-  DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, int16_t, diff16[MAX_SB_SQUARE]);
-
-  assert(bsw >= 8);
-  assert(bsh >= 8);
-  assert((bsw & 0x07) == 0);
-  assert((bsh & 0x07) == 0);
-
-  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
-      x->tune_metric == AOM_TUNE_DAALA_DIST) {
-    if (is_cur_buf_hbd(xd)) {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++)
-          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-    } else {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
-    }
-
-    if ((bsw == visible_w) && (bsh == visible_h)) {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++)
-          diff16[j * bsw + i] = diff[j * diff_stride + i];
-    } else {
-      for (j = 0; j < visible_h; j++)
-        for (i = 0; i < visible_w; i++)
-          diff16[j * bsw + i] = diff[j * diff_stride + i];
-
-      if (visible_w < bsw) {
-        for (j = 0; j < bsh; j++)
-          for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0;
-      }
-
-      if (visible_h < bsh) {
-        for (j = visible_h; j < bsh; j++)
-          for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0;
-      }
-    }
-  }
-
-  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
-    d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
-  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
-    int coeff_shift = AOMMAX(xd->bd - 8, 0);
-    DECLARE_ALIGNED(16, uint16_t, dst16[MAX_SB_SQUARE]);
-
-    for (i = 0; i < bsh; i++) {
-      for (j = 0; j < bsw; j++) {
-        dst16[i * bsw + j] = orig[i * bsw + j] - diff16[i * bsw + j];
-      }
-    }
-
-    for (i = 0; i < bsh; i += 8) {
-      for (j = 0; j < bsw; j += 8) {
-        d += cdef_dist_8x8_16bit(&dst16[i * bsw + j], bsw, &orig[i * bsw + j],
-                                 bsw, coeff_shift);
-      }
-    }
-    // Don't scale 'd' for HBD since it will be done by caller side for diff
-    // input
-  } else {
-    // Otherwise, MSE by default
-    d = aom_sum_squares_2d_i16(diff, diff_stride, visible_w, visible_h);
-  }
-
-  return d;
-}
-#endif  // CONFIG_DIST_8X8
-
-static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
-                                         const uint8_t *src, int src_stride,
-                                         const uint8_t *dst, int dst_stride,
-                                         int need_4th, double *hordist,
-                                         double *verdist) {
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-
-  if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
-    // Special cases: calculate 'esq' values manually, as we don't have 'vf'
-    // functions for the 16 (very small) sub-blocks of this block.
-    const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;
-    const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;
-    assert(bw <= 32);
-    assert(bh <= 32);
-    assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
-    if (cpi->common.seq_params.use_highbitdepth) {
-      const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-      const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-      for (int i = 0; i < bh; ++i)
-        for (int j = 0; j < bw; ++j) {
-          const int index = (j >> w_shift) + ((i >> h_shift) << 2);
-          esq[index] +=
-              (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
-              (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
-        }
-    } else {
-      for (int i = 0; i < bh; ++i)
-        for (int j = 0; j < bw; ++j) {
-          const int index = (j >> w_shift) + ((i >> h_shift) << 2);
-          esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
-                        (src[j + i * src_stride] - dst[j + i * dst_stride]);
-        }
-    }
-  } else {  // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
-    const int f_index =
-        (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
-    assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
-    const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
-    assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
-    assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[1]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[2]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[3]);
-    src += bh / 4 * src_stride;
-    dst += bh / 4 * dst_stride;
-
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[5]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[6]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[7]);
-    src += bh / 4 * src_stride;
-    dst += bh / 4 * dst_stride;
-
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[9]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[10]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[11]);
-    src += bh / 4 * src_stride;
-    dst += bh / 4 * dst_stride;
-
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[13]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[14]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[15]);
-  }
-
-  double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
-                 esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
-                 esq[12] + esq[13] + esq[14] + esq[15];
-  if (total > 0) {
-    const double e_recip = 1.0 / total;
-    hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
-    hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
-    hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
-    if (need_4th) {
-      hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip;
-    }
-    verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
-    verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
-    verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
-    if (need_4th) {
-      verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip;
-    }
-  } else {
-    hordist[0] = verdist[0] = 0.25;
-    hordist[1] = verdist[1] = 0.25;
-    hordist[2] = verdist[2] = 0.25;
-    if (need_4th) {
-      hordist[3] = verdist[3] = 0.25;
-    }
-  }
-}
-
-static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
-                            const uint8_t *src, int src_stride,
-                            const uint8_t *dst, int dst_stride) {
-  int prune_bitmask = 0;
-  double svm_proj_h = 0, svm_proj_v = 0;
-  double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
-  get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, 0,
-                               hdist, vdist);
-
-  svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
-               vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
-  svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] +
-               hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
-  if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
-    prune_bitmask |= 1 << FLIPADST_1D;
-  else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
-    prune_bitmask |= 1 << ADST_1D;
-
-  if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
-    prune_bitmask |= 1 << (FLIPADST_1D + 8);
-  else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
-    prune_bitmask |= 1 << (ADST_1D + 8);
-
-  return prune_bitmask;
-}
-
-static int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
-  float hcorr, vcorr;
-  int prune_bitmask = 0;
-  av1_get_horver_correlation_full(diff, stride, w, h, &hcorr, &vcorr);
-
-  if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
-    prune_bitmask |= 1 << IDTX_1D;
-  else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
-    prune_bitmask |= 1 << DCT_1D;
-
-  if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
-    prune_bitmask |= 1 << (IDTX_1D + 8);
-  else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
-    prune_bitmask |= 1 << (DCT_1D + 8);
-  return prune_bitmask;
-}
-
-// Performance drop: 0.5%, Speed improvement: 24%
-static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
-                             MACROBLOCK *x, const MACROBLOCKD *xd,
-                             int adst_flipadst, int dct_idtx) {
-  int prune = 0;
-
-  if (adst_flipadst) {
-    const struct macroblock_plane *const p = &x->plane[0];
-    const struct macroblockd_plane *const pd = &xd->plane[0];
-    prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride,
-                              pd->dst.buf, pd->dst.stride);
-  }
-  if (dct_idtx) {
-    av1_subtract_plane(x, bsize, 0);
-    const struct macroblock_plane *const p = &x->plane[0];
-    const int bw = block_size_wide[bsize];
-    const int bh = block_size_high[bsize];
-    prune |= dct_vs_idtx(p->src_diff, bw, bw, bh);
-  }
-
-  return prune;
-}
-
-// Performance drop: 0.3%, Speed improvement: 5%
-static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
-                             const MACROBLOCK *x, const MACROBLOCKD *xd) {
-  const struct macroblock_plane *const p = &x->plane[0];
-  const struct macroblockd_plane *const pd = &xd->plane[0];
-  return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
-                          pd->dst.stride);
-}
-
-// 1D Transforms used in inter set, this needs to be changed if
-// ext_tx_used_inter is changed
-static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
-  { 1, 0, 0, 0 },
-  { 1, 1, 1, 1 },
-  { 1, 1, 1, 1 },
-  { 1, 0, 0, 1 },
-};
-
-static void get_energy_distribution_finer(const int16_t *diff, int stride,
-                                          int bw, int bh, float *hordist,
-                                          float *verdist) {
-  // First compute downscaled block energy values (esq); downscale factors
-  // are defined by w_shift and h_shift.
-  unsigned int esq[256];
-  const int w_shift = bw <= 8 ? 0 : 1;
-  const int h_shift = bh <= 8 ? 0 : 1;
-  const int esq_w = bw >> w_shift;
-  const int esq_h = bh >> h_shift;
-  const int esq_sz = esq_w * esq_h;
-  int i, j;
-  memset(esq, 0, esq_sz * sizeof(esq[0]));
-  if (w_shift) {
-    for (i = 0; i < bh; i++) {
-      unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
-      const int16_t *cur_diff_row = diff + i * stride;
-      for (j = 0; j < bw; j += 2) {
-        cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] +
-                                cur_diff_row[j + 1] * cur_diff_row[j + 1]);
-      }
-    }
-  } else {
-    for (i = 0; i < bh; i++) {
-      unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
-      const int16_t *cur_diff_row = diff + i * stride;
-      for (j = 0; j < bw; j++) {
-        cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j];
-      }
-    }
-  }
-
-  uint64_t total = 0;
-  for (i = 0; i < esq_sz; i++) total += esq[i];
-
-  // Output hordist and verdist arrays are normalized 1D projections of esq
-  if (total == 0) {
-    float hor_val = 1.0f / esq_w;
-    for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val;
-    float ver_val = 1.0f / esq_h;
-    for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val;
-    return;
-  }
-
-  const float e_recip = 1.0f / (float)total;
-  memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0]));
-  memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0]));
-  const unsigned int *cur_esq_row;
-  for (i = 0; i < esq_h - 1; i++) {
-    cur_esq_row = esq + i * esq_w;
-    for (j = 0; j < esq_w - 1; j++) {
-      hordist[j] += (float)cur_esq_row[j];
-      verdist[i] += (float)cur_esq_row[j];
-    }
-    verdist[i] += (float)cur_esq_row[j];
-  }
-  cur_esq_row = esq + i * esq_w;
-  for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j];
-
-  for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip;
-  for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
-}
-
 // Similar to get_horver_correlation, but also takes into account first
 // row/column, when computing horizontal/vertical correlation.
 void av1_get_horver_correlation_full_c(const int16_t *diff, int stride,
@@ -1720,346 +610,30 @@
   }
 }
 
-// Transforms raw scores into a probability distribution across 16 TX types
-static void score_2D_transform_pow8(float *scores_2D, float shift) {
-  float sum = 0.0f;
-  int i;
-  for (i = 0; i < 16; i++) {
-    const float v = AOMMIN(AOMMAX(scores_2D[i] + shift, 0.0f), 100.0f);
-    const float v2 = v * v;
-    const float v4 = v2 * v2;
-    scores_2D[i] = v4 * v4;
-    sum += scores_2D[i];
-  }
-  for (i = 0; i < 16; i++) {
-    if (scores_2D[i] < sum * 1e-4)
-      scores_2D[i] = 0.0f;
-    else
-      scores_2D[i] /= sum;
-  }
-}
-
-// These thresholds were calibrated to provide a certain number of TX types
-// pruned by the model on average, i.e. selecting a threshold with index i
-// will lead to pruning i+1 TX types on average
-static const float *prune_2D_adaptive_thresholds[] = {
-  // TX_4X4
-  (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
-             0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
-             0.09778f, 0.11780f },
-  // TX_8X8
-  (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
-             0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
-             0.10803f, 0.14124f },
-  // TX_16X16
-  (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
-             0.06897f, 0.07629f, 0.08875f, 0.11169f },
-  // TX_32X32
-  NULL,
-  // TX_64X64
-  NULL,
-  // TX_4X8
-  (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
-             0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
-             0.10168f, 0.12585f },
-  // TX_8X4
-  (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
-             0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
-             0.10583f, 0.13123f },
-  // TX_8X16
-  (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
-             0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
-             0.10730f, 0.14221f },
-  // TX_16X8
-  (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
-             0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
-             0.10339f, 0.13464f },
-  // TX_16X32
-  NULL,
-  // TX_32X16
-  NULL,
-  // TX_32X64
-  NULL,
-  // TX_64X32
-  NULL,
-  // TX_4X16
-  (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
-             0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
-             0.10242f, 0.12878f },
-  // TX_16X4
-  (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
-             0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
-             0.10217f, 0.12610f },
-  // TX_8X32
-  NULL,
-  // TX_32X8
-  NULL,
-  // TX_16X64
-  NULL,
-  // TX_64X16
-  NULL,
-};
-
-static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
-                            int blk_row, int blk_col, TxSetType tx_set_type,
-                            TX_TYPE_PRUNE_MODE prune_mode) {
-  static const int tx_type_table_2D[16] = {
-    DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
-    ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
-    FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
-    H_DCT,        H_ADST,        H_FLIPADST,        IDTX
-  };
-  if (tx_set_type != EXT_TX_SET_ALL16 &&
-      tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT)
-    return 0;
-  const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
-  const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
-  if (!nn_config_hor || !nn_config_ver) return 0;  // Model not established yet.
-
-  aom_clear_system_state();
-  float hfeatures[16], vfeatures[16];
-  float hscores[4], vscores[4];
-  float scores_2D[16];
-  const int bw = tx_size_wide[tx_size];
-  const int bh = tx_size_high[tx_size];
-  const int hfeatures_num = bw <= 8 ? bw : bw / 2;
-  const int vfeatures_num = bh <= 8 ? bh : bh / 2;
-  assert(hfeatures_num <= 16);
-  assert(vfeatures_num <= 16);
-
-  const struct macroblock_plane *const p = &x->plane[0];
-  const int diff_stride = block_size_wide[bsize];
-  const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
-  get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
-                                vfeatures);
-  av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
-                                  &hfeatures[hfeatures_num - 1],
-                                  &vfeatures[vfeatures_num - 1]);
-  av1_nn_predict(hfeatures, nn_config_hor, hscores);
-  av1_nn_predict(vfeatures, nn_config_ver, vscores);
-  aom_clear_system_state();
-
-  float score_2D_average = 0.0f;
-  for (int i = 0; i < 4; i++) {
-    float *cur_scores_2D = scores_2D + i * 4;
-    cur_scores_2D[0] = vscores[i] * hscores[0];
-    cur_scores_2D[1] = vscores[i] * hscores[1];
-    cur_scores_2D[2] = vscores[i] * hscores[2];
-    cur_scores_2D[3] = vscores[i] * hscores[3];
-    score_2D_average += cur_scores_2D[0] + cur_scores_2D[1] + cur_scores_2D[2] +
-                        cur_scores_2D[3];
-  }
-  score_2D_average /= 16;
-
-  const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } };
-  int pruning_aggressiveness = 1;
-  if (tx_set_type == EXT_TX_SET_ALL16) {
-    score_2D_transform_pow8(scores_2D, (10 - score_2D_average));
-    pruning_aggressiveness =
-        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
-  } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) {
-    score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
-    pruning_aggressiveness =
-        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
-  }
-
-  // Always keep the TX type with the highest score, prune all others with
-  // score below score_thresh.
-  int max_score_i = 0;
-  float max_score = 0.0f;
-  for (int i = 0; i < 16; i++) {
-    if (scores_2D[i] > max_score &&
-        av1_ext_tx_used[tx_set_type][tx_type_table_2D[i]]) {
-      max_score = scores_2D[i];
-      max_score_i = i;
-    }
-  }
-
-  const float score_thresh =
-      prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
-
-  uint16_t prune_bitmask = 0;
-  for (int i = 0; i < 16; i++) {
-    if (scores_2D[i] < score_thresh && i != max_score_i)
-      prune_bitmask |= (1 << tx_type_table_2D[i]);
-  }
-  return prune_bitmask;
-}
-
-// ((prune >> vtx_tab[tx_type]) & 1)
-static const uint16_t prune_v_mask[] = {
-  0x0000, 0x0425, 0x108a, 0x14af, 0x4150, 0x4575, 0x51da, 0x55ff,
-  0xaa00, 0xae25, 0xba8a, 0xbeaf, 0xeb50, 0xef75, 0xfbda, 0xffff,
-};
-
-// ((prune >> (htx_tab[tx_type] + 8)) & 1)
-static const uint16_t prune_h_mask[] = {
-  0x0000, 0x0813, 0x210c, 0x291f, 0x80e0, 0x88f3, 0xa1ec, 0xa9ff,
-  0x5600, 0x5e13, 0x770c, 0x7f1f, 0xd6e0, 0xdef3, 0xf7ec, 0xffff,
-};
-
-static INLINE uint16_t gen_tx_search_prune_mask(int tx_search_prune) {
-  uint8_t prune_v = tx_search_prune & 0x0F;
-  uint8_t prune_h = (tx_search_prune >> 8) & 0x0F;
-  return (prune_v_mask[prune_v] & prune_h_mask[prune_h]);
-}
-
-static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
-                     const MACROBLOCKD *const xd, int tx_set_type) {
-  x->tx_search_prune[tx_set_type] = 0;
-  x->tx_split_prune_flag = 0;
-  const MB_MODE_INFO *mbmi = xd->mi[0];
-  const int is_inter = is_inter_block(mbmi);
-  if ((is_inter && cpi->oxcf.use_inter_dct_only) ||
-      (!is_inter && cpi->oxcf.use_intra_dct_only)) {
-    x->tx_search_prune[tx_set_type] = ~(1 << DCT_DCT);
-    return;
-  }
-  if (!is_inter || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
-      x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] ||
-      x->cb_partition_scan)
-    return;
-  int tx_set = ext_tx_set_index[1][tx_set_type];
-  assert(tx_set >= 0);
-  const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
-  int prune = 0;
-  switch (cpi->sf.tx_type_search.prune_mode) {
-    case NO_PRUNE: return;
-    case PRUNE_ONE:
-      if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return;
-      prune = prune_one_for_sby(cpi, bsize, x, xd);
-      x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
-      break;
-    case PRUNE_TWO:
-      if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
-        if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return;
-        prune = prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
-      } else if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) {
-        prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
-      } else {
-        prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
-      }
-      x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
-      break;
-    case PRUNE_2D_ACCURATE:
-    case PRUNE_2D_FAST: break;
-    default: assert(0);
-  }
-}
-
-static void model_rd_from_sse(const AV1_COMP *const cpi,
-                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
-                              int plane, int64_t sse, int num_samples,
-                              int *rate, int64_t *dist) {
-  (void)num_samples;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-
-  // Fast approximate the modelling function.
-  if (cpi->sf.simple_model_rd_from_var) {
-    const int64_t square_error = sse;
-    int quantizer = pd->dequant_Q3[1] >> dequant_shift;
-    if (quantizer < 120)
-      *rate = (int)AOMMIN(
-          (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
-          INT_MAX);
-    else
-      *rate = 0;
-    assert(*rate >= 0);
-    *dist = (square_error * quantizer) >> 8;
-  } else {
-    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
-                                 pd->dequant_Q3[1] >> dequant_shift, rate,
-                                 dist);
-  }
-  *dist <<= 4;
-}
-
-static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x,
+                       int64_t *sse_y) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   const MACROBLOCKD *xd = &x->e_mbd;
   const MB_MODE_INFO *mbmi = xd->mi[0];
   int64_t total_sse = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
     const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
                                                pd->subsampling_y);
     unsigned int sse;
 
-    if (x->skip_chroma_rd && plane) continue;
-
     cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
                        &sse);
     total_sse += sse;
+    if (!plane && sse_y) *sse_y = sse;
   }
   total_sse <<= 4;
   return total_sse;
 }
 
-static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                            MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
-                            int plane_to, int mi_row, int mi_col,
-                            int *out_rate_sum, int64_t *out_dist_sum,
-                            int *skip_txfm_sb, int64_t *skip_sse_sb,
-                            int *plane_rate, int64_t *plane_sse,
-                            int64_t *plane_dist) {
-  // Note our transform coeffs are 8 times an orthogonal transform.
-  // Hence quantizer step is also 8 times. To get effective quantizer
-  // we need to divide by 8 before sending to modeling function.
-  int plane;
-  (void)mi_row;
-  (void)mi_col;
-  const int ref = xd->mi[0]->ref_frame[0];
-
-  int64_t rate_sum = 0;
-  int64_t dist_sum = 0;
-  int64_t total_sse = 0;
-
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    struct macroblock_plane *const p = &x->plane[plane];
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    const int bw = block_size_wide[plane_bsize];
-    const int bh = block_size_high[plane_bsize];
-    int64_t sse;
-    int rate;
-    int64_t dist;
-
-    if (x->skip_chroma_rd && plane) continue;
-
-    if (is_cur_buf_hbd(xd)) {
-      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
-                           pd->dst.stride, bw, bh);
-    } else {
-      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
-                    bh);
-    }
-    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
-
-    model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
-
-    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
-
-    total_sse += sse;
-    rate_sum += rate;
-    dist_sum += dist;
-    if (plane_rate) plane_rate[plane] = rate;
-    if (plane_sse) plane_sse[plane] = sse;
-    if (plane_dist) plane_dist[plane] = dist;
-    assert(rate_sum >= 0);
-  }
-
-  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
-  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
-  rate_sum = AOMMIN(rate_sum, INT_MAX);
-  *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum;
-}
-
 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
@@ -2075,6 +649,19 @@
   return error;
 }
 
+int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             intptr_t block_size) {
+  int64_t error = 0;
+
+  for (int i = 0; i < block_size; i++) {
+    const int diff = coeff[i] - dqcoeff[i];
+    error += diff * diff;
+  }
+
+  return error;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff, intptr_t block_size,
                                  int64_t *ssz, int bd) {
@@ -2095,2077 +682,7 @@
   *ssz = sqcoeff;
   return error;
 }
-
-// Get transform block visible dimensions cropped to the MI units.
-static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
-                               BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
-                               BLOCK_SIZE tx_bsize, int *width, int *height,
-                               int *visible_width, int *visible_height) {
-  assert(tx_bsize <= plane_bsize);
-  int txb_height = block_size_high[tx_bsize];
-  int txb_width = block_size_wide[tx_bsize];
-  const int block_height = block_size_high[plane_bsize];
-  const int block_width = block_size_wide[plane_bsize];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  // TODO(aconverse@google.com): Investigate using crop_width/height here rather
-  // than the MI size
-  const int block_rows =
-      (xd->mb_to_bottom_edge >= 0)
-          ? block_height
-          : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
-  const int block_cols =
-      (xd->mb_to_right_edge >= 0)
-          ? block_width
-          : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
-  const int tx_unit_size = tx_size_wide_log2[0];
-  if (width) *width = txb_width;
-  if (height) *height = txb_height;
-  *visible_width = clamp(block_cols - (blk_col << tx_unit_size), 0, txb_width);
-  *visible_height =
-      clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height);
-}
-
-// Compute the pixel domain distortion from src and dst on all visible 4x4s in
-// the
-// transform block.
-static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
-                           int plane, const uint8_t *src, const int src_stride,
-                           const uint8_t *dst, const int dst_stride,
-                           int blk_row, int blk_col,
-                           const BLOCK_SIZE plane_bsize,
-                           const BLOCK_SIZE tx_bsize) {
-  int txb_rows, txb_cols, visible_rows, visible_cols;
-  const MACROBLOCKD *xd = &x->e_mbd;
-
-  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
-                     &txb_cols, &txb_rows, &visible_cols, &visible_rows);
-  assert(visible_rows > 0);
-  assert(visible_cols > 0);
-
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && plane == 0)
-    return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
-                                  tx_bsize, txb_cols, txb_rows, visible_cols,
-                                  visible_rows, x->qindex);
-#endif  // CONFIG_DIST_8X8
-
-  unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
-                                         dst_stride, tx_bsize, txb_rows,
-                                         txb_cols, visible_rows, visible_cols);
-
-  return sse;
-}
-
-// Compute the pixel domain distortion from diff on all visible 4x4s in the
-// transform block.
-static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
-                                      int blk_row, int blk_col,
-                                      const BLOCK_SIZE plane_bsize,
-                                      const BLOCK_SIZE tx_bsize,
-                                      unsigned int *block_mse_q8) {
-  int visible_rows, visible_cols;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
-                     NULL, &visible_cols, &visible_rows);
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *diff = x->plane[plane].src_diff;
-#if CONFIG_DIST_8X8
-  int txb_height = block_size_high[tx_bsize];
-  int txb_width = block_size_wide[tx_bsize];
-  if (x->using_dist_8x8 && plane == 0) {
-    const int src_stride = x->plane[plane].src.stride;
-    const int src_idx = (blk_row * src_stride + blk_col)
-                        << tx_size_wide_log2[0];
-    const int diff_idx = (blk_row * diff_stride + blk_col)
-                         << tx_size_wide_log2[0];
-    const uint8_t *src = &x->plane[plane].src.buf[src_idx];
-    return dist_8x8_diff(x, src, src_stride, diff + diff_idx, diff_stride,
-                         txb_width, txb_height, visible_cols, visible_rows,
-                         x->qindex);
-  }
 #endif
-  diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
-  uint64_t sse =
-      aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
-  if (block_mse_q8 != NULL)
-    *block_mse_q8 = (unsigned int)((256 * sse) / (visible_cols * visible_rows));
-  return sse;
-}
-
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
-                     int *val_count) {
-  const int max_pix_val = 1 << 8;
-  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
-  for (int r = 0; r < rows; ++r) {
-    for (int c = 0; c < cols; ++c) {
-      const int this_val = src[r * stride + c];
-      assert(this_val < max_pix_val);
-      ++val_count[this_val];
-    }
-  }
-  int n = 0;
-  for (int i = 0; i < max_pix_val; ++i) {
-    if (val_count[i]) ++n;
-  }
-  return n;
-}
-
-int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth, int *val_count) {
-  assert(bit_depth <= 12);
-  const int max_pix_val = 1 << bit_depth;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
-  for (int r = 0; r < rows; ++r) {
-    for (int c = 0; c < cols; ++c) {
-      const int this_val = src[r * stride + c];
-      assert(this_val < max_pix_val);
-      if (this_val >= max_pix_val) return 0;
-      ++val_count[this_val];
-    }
-  }
-  int n = 0;
-  for (int i = 0; i < max_pix_val; ++i) {
-    if (val_count[i]) ++n;
-  }
-  return n;
-}
-
-static void inverse_transform_block_facade(MACROBLOCKD *xd, int plane,
-                                           int block, int blk_row, int blk_col,
-                                           int eob, int reduced_tx_set) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, reduced_tx_set);
-  const int dst_stride = pd->dst.stride;
-  uint8_t *dst =
-      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
-                              dst_stride, eob, reduced_tx_set);
-}
-
-static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash);
-
-static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
-                                   int blk_col, BLOCK_SIZE plane_bsize,
-                                   TX_SIZE tx_size) {
-  int16_t tmp_data[64 * 64];
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *diff = x->plane[plane].src_diff;
-  const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col;
-  const int txb_w = tx_size_wide[tx_size];
-  const int txb_h = tx_size_high[tx_size];
-  uint8_t *hash_data = (uint8_t *)cur_diff_row;
-  if (txb_w != diff_stride) {
-    int16_t *cur_hash_row = tmp_data;
-    for (int i = 0; i < txb_h; i++) {
-      memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w);
-      cur_hash_row += txb_w;
-      cur_diff_row += diff_stride;
-    }
-    hash_data = (uint8_t *)tmp_data;
-  }
-  CRC32C *crc = &x->mb_rd_record.crc_calculator;
-  const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
-  return (hash << 5) + tx_size;
-}
-
-static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
-                                        TX_SIZE tx_size, int64_t *out_dist,
-                                        int64_t *out_sse) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  // Transform domain distortion computation is more efficient as it does
-  // not involve an inverse transform, but it is less accurate.
-  const int buffer_length = av1_get_max_eob(tx_size);
-  int64_t this_sse;
-  // TX-domain results need to shift down to Q2/D10 to match pixel
-  // domain distortion values which are in Q2^2
-  int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-
-  if (is_cur_buf_hbd(xd))
-    *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
-                                       xd->bd);
-  else
-    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
-
-  *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
-  *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
-}
-
-static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
-                                           int plane, BLOCK_SIZE plane_bsize,
-                                           int block, int blk_row, int blk_col,
-                                           TX_SIZE tx_size) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const uint16_t eob = p->eobs[block];
-  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-  const int bsw = block_size_wide[tx_bsize];
-  const int bsh = block_size_high[tx_bsize];
-  const int src_stride = x->plane[plane].src.stride;
-  const int dst_stride = xd->plane[plane].dst.stride;
-  // Scale the transform block index to pixel unit.
-  const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
-  const int dst_idx = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0];
-  const uint8_t *src = &x->plane[plane].src.buf[src_idx];
-  const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
-  const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-
-  assert(cpi != NULL);
-  assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
-
-  uint8_t *recon;
-  DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
-
-  if (is_cur_buf_hbd(xd)) {
-    recon = CONVERT_TO_BYTEPTR(recon16);
-    av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
-                                   CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
-                                   bsh, NULL, NULL, 0, 0, NULL, xd->bd);
-  } else {
-    recon = (uint8_t *)recon16;
-    av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
-                            NULL, 0, 0, NULL);
-  }
-
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
-                                    cpi->common.reduced_tx_set_used);
-  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
-                              MAX_TX_SIZE, eob,
-                              cpi->common.reduced_tx_set_used);
-
-  return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
-                         blk_row, blk_col, plane_bsize, tx_bsize);
-}
-
-static double get_diff_mean(const uint8_t *src, int src_stride,
-                            const uint8_t *dst, int dst_stride, int w, int h) {
-  double sum = 0.0;
-  for (int j = 0; j < h; ++j) {
-    for (int i = 0; i < w; ++i) {
-      const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
-      sum += diff;
-    }
-  }
-  assert(w > 0 && h > 0);
-  return sum / (w * h);
-}
-
-static double get_highbd_diff_mean(const uint8_t *src8, int src_stride,
-                                   const uint8_t *dst8, int dst_stride, int w,
-                                   int h) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  double sum = 0.0;
-  for (int j = 0; j < h; ++j) {
-    for (int i = 0; i < w; ++i) {
-      const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
-      sum += diff;
-    }
-  }
-  assert(w > 0 && h > 0);
-  return sum / (w * h);
-}
-
-static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
-  double sum = 0.0;
-  for (int j = 0; j < h; ++j) {
-    for (int i = 0; i < w; ++i) {
-      const int err = diff[j * stride + i];
-      sum += err * err;
-    }
-  }
-  assert(w > 0 && h > 0);
-  return sum / (w * h);
-}
-
-static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
-  double sum = 0.0;
-  for (int j = 0; j < h; ++j) {
-    for (int i = 0; i < w; ++i) {
-      sum += abs(diff[j * stride + i]);
-    }
-  }
-  assert(w > 0 && h > 0);
-  return sum / (w * h);
-}
-
-static void get_2x2_normalized_sses_and_sads(
-    const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src,
-    int src_stride, const uint8_t *const dst, int dst_stride,
-    const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr,
-    double *const sad_norm_arr) {
-  const BLOCK_SIZE tx_bsize_half =
-      get_partition_subsize(tx_bsize, PARTITION_SPLIT);
-  if (tx_bsize_half == BLOCK_INVALID) {  // manually calculate stats
-    const int half_width = block_size_wide[tx_bsize] / 2;
-    const int half_height = block_size_high[tx_bsize] / 2;
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 2; ++col) {
-        const int16_t *const this_src_diff =
-            src_diff + row * half_height * diff_stride + col * half_width;
-        if (sse_norm_arr) {
-          sse_norm_arr[row * 2 + col] =
-              get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
-        }
-        if (sad_norm_arr) {
-          sad_norm_arr[row * 2 + col] =
-              get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
-        }
-      }
-    }
-  } else {  // use function pointers to calculate stats
-    const int half_width = block_size_wide[tx_bsize_half];
-    const int half_height = block_size_high[tx_bsize_half];
-    const int num_samples_half = half_width * half_height;
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 2; ++col) {
-        const uint8_t *const this_src =
-            src + row * half_height * src_stride + col * half_width;
-        const uint8_t *const this_dst =
-            dst + row * half_height * dst_stride + col * half_width;
-
-        if (sse_norm_arr) {
-          unsigned int this_sse;
-          cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
-                                        dst_stride, &this_sse);
-          sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
-        }
-
-        if (sad_norm_arr) {
-          const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
-              this_src, src_stride, this_dst, dst_stride);
-          sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
-        }
-      }
-    }
-  }
-}
-
-// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
-// 0: Do not collect any RD stats
-// 1: Collect RD stats for transform units
-// 2: Collect RD stats for partition units
-#if CONFIG_COLLECT_RD_STATS
-
-#if CONFIG_COLLECT_RD_STATS == 1
-static double get_mean(const int16_t *diff, int stride, int w, int h) {
-  double sum = 0.0;
-  for (int j = 0; j < h; ++j) {
-    for (int i = 0; i < w; ++i) {
-      sum += diff[j * stride + i];
-    }
-  }
-  assert(w > 0 && h > 0);
-  return sum / (w * h);
-}
-
-static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    const RD_STATS *const rd_stats, int blk_row,
-                                    int blk_col, BLOCK_SIZE plane_bsize,
-                                    TX_SIZE tx_size, TX_TYPE tx_type,
-                                    int64_t rd) {
-  if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
-
-  // Generate small sample to restrict output size.
-  static unsigned int seed = 21743;
-  if (lcg_rand16(&seed) % 256 > 0) return;
-
-  const char output_file[] = "tu_stats.txt";
-  FILE *fout = fopen(output_file, "a");
-  if (!fout) return;
-
-  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const int plane = 0;
-  struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int txw = tx_size_wide[tx_size];
-  const int txh = tx_size_high[tx_size];
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-  const int q_step = pd->dequant_Q3[1] >> dequant_shift;
-  const int num_samples = txw * txh;
-
-  const double rate_norm = (double)rd_stats->rate / num_samples;
-  const double dist_norm = (double)rd_stats->dist / num_samples;
-
-  fprintf(fout, "%g %g", rate_norm, dist_norm);
-
-  const int src_stride = p->src.stride;
-  const uint8_t *const src =
-      &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
-  const int dst_stride = pd->dst.stride;
-  const uint8_t *const dst =
-      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-  unsigned int sse;
-  cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
-  const double sse_norm = (double)sse / num_samples;
-
-  const unsigned int sad =
-      cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
-  const double sad_norm = (double)sad / num_samples;
-
-  fprintf(fout, " %g %g", sse_norm, sad_norm);
-
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *const src_diff =
-      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
-
-  double sse_norm_arr[4], sad_norm_arr[4];
-  get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst,
-                                   dst_stride, src_diff, diff_stride,
-                                   sse_norm_arr, sad_norm_arr);
-  for (int i = 0; i < 4; ++i) {
-    fprintf(fout, " %g", sse_norm_arr[i]);
-  }
-  for (int i = 0; i < 4; ++i) {
-    fprintf(fout, " %g", sad_norm_arr[i]);
-  }
-
-  const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
-  const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
-
-  fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size],
-          tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col);
-
-  int model_rate;
-  int64_t model_dist;
-  model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
-                                   &model_rate, &model_dist);
-  const double model_rate_norm = (double)model_rate / num_samples;
-  const double model_dist_norm = (double)model_dist / num_samples;
-  fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
-
-  const double mean = get_mean(src_diff, diff_stride, txw, txh);
-  float hor_corr, vert_corr;
-  av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr,
-                                  &vert_corr);
-  fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
-
-  double hdist[4] = { 0 }, vdist[4] = { 0 };
-  get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride,
-                               1, hdist, vdist);
-  fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
-          hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
-
-  fprintf(fout, " %d %" PRId64, x->rdmult, rd);
-
-  fprintf(fout, "\n");
-  fclose(fout);
-}
-#endif  // CONFIG_COLLECT_RD_STATS == 1
-
-#if CONFIG_COLLECT_RD_STATS >= 2
-static void PrintPredictionUnitStats(const AV1_COMP *const cpi,
-                                     const TileDataEnc *tile_data,
-                                     MACROBLOCK *x,
-                                     const RD_STATS *const rd_stats,
-                                     BLOCK_SIZE plane_bsize) {
-  if (rd_stats->invalid_rate) return;
-  if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
-
-  if (cpi->sf.inter_mode_rd_model_estimation == 1 &&
-      (tile_data == NULL ||
-       !tile_data->inter_mode_rd_models[plane_bsize].ready))
-    return;
-  (void)tile_data;
-  // Generate small sample to restrict output size.
-  static unsigned int seed = 95014;
-
-  if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) !=
-      1)
-    return;
-
-  const char output_file[] = "pu_stats.txt";
-  FILE *fout = fopen(output_file, "a");
-  if (!fout) return;
-
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const int plane = 0;
-  struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int diff_stride = block_size_wide[plane_bsize];
-  int bw, bh;
-  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
-                     &bh);
-  const int num_samples = bw * bh;
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-  const int q_step = pd->dequant_Q3[1] >> dequant_shift;
-
-  const double rate_norm = (double)rd_stats->rate / num_samples;
-  const double dist_norm = (double)rd_stats->dist / num_samples;
-  const double rdcost_norm =
-      (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
-
-  fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
-
-  const int src_stride = p->src.stride;
-  const uint8_t *const src = p->src.buf;
-  const int dst_stride = pd->dst.stride;
-  const uint8_t *const dst = pd->dst.buf;
-  const int16_t *const src_diff = p->src_diff;
-  const int shift = (xd->bd - 8);
-
-  int64_t sse;
-  if (is_cur_buf_hbd(xd)) {
-    sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                         bw, bh);
-  } else {
-    sse =
-        aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
-  }
-  sse = ROUND_POWER_OF_TWO(sse, shift * 2);
-  const double sse_norm = (double)sse / num_samples;
-
-  const unsigned int sad =
-      cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
-  const double sad_norm =
-      (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
-
-  fprintf(fout, " %g %g", sse_norm, sad_norm);
-
-  double sse_norm_arr[4], sad_norm_arr[4];
-  get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
-                                   dst_stride, src_diff, diff_stride,
-                                   sse_norm_arr, sad_norm_arr);
-  if (shift) {
-    for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
-    for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
-  }
-  for (int i = 0; i < 4; ++i) {
-    fprintf(fout, " %g", sse_norm_arr[i]);
-  }
-  for (int i = 0; i < 4; ++i) {
-    fprintf(fout, " %g", sad_norm_arr[i]);
-  }
-
-  fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
-
-  int model_rate;
-  int64_t model_dist;
-  model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
-                                   &model_rate, &model_dist);
-  const double model_rdcost_norm =
-      (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
-  const double model_rate_norm = (double)model_rate / num_samples;
-  const double model_dist_norm = (double)model_dist / num_samples;
-  fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
-          model_rdcost_norm);
-
-  double mean;
-  if (is_cur_buf_hbd(xd)) {
-    mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
-                                pd->dst.stride, bw, bh);
-  } else {
-    mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                         bw, bh);
-  }
-  mean /= (1 << shift);
-  float hor_corr, vert_corr;
-  av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
-                                  &vert_corr);
-  fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
-
-  double hdist[4] = { 0 }, vdist[4] = { 0 };
-  get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
-                               dst_stride, 1, hdist, vdist);
-  fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
-          hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
-
-  if (cpi->sf.inter_mode_rd_model_estimation == 1) {
-    assert(tile_data->inter_mode_rd_models[plane_bsize].ready);
-    const int64_t overall_sse = get_sse(cpi, x);
-    int est_residue_cost = 0;
-    int64_t est_dist = 0;
-    get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost,
-                      &est_dist);
-    const double est_residue_cost_norm = (double)est_residue_cost / num_samples;
-    const double est_dist_norm = (double)est_dist / num_samples;
-    const double est_rdcost_norm =
-        (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples;
-    fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm,
-            est_rdcost_norm);
-  }
-
-  fprintf(fout, "\n");
-  fclose(fout);
-}
-#endif  // CONFIG_COLLECT_RD_STATS >= 2
-#endif  // CONFIG_COLLECT_RD_STATS
-
-static void model_rd_with_dnn(const AV1_COMP *const cpi,
-                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
-                              int plane, int64_t sse, int num_samples,
-                              int *rate, int64_t *dist) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int log_numpels = num_pels_log2_lookup[plane_bsize];
-
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-  const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
-
-  const struct macroblock_plane *const p = &x->plane[plane];
-  int bw, bh;
-  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
-                     &bh);
-  const int src_stride = p->src.stride;
-  const uint8_t *const src = p->src.buf;
-  const int dst_stride = pd->dst.stride;
-  const uint8_t *const dst = pd->dst.buf;
-  const int16_t *const src_diff = p->src_diff;
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int shift = (xd->bd - 8);
-
-  if (sse == 0) {
-    if (rate) *rate = 0;
-    if (dist) *dist = 0;
-    return;
-  }
-  if (plane) {
-    int model_rate;
-    int64_t model_dist;
-    model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples,
-                          &model_rate, &model_dist);
-    if (rate) *rate = model_rate;
-    if (dist) *dist = model_dist;
-    return;
-  }
-
-  aom_clear_system_state();
-  const double sse_norm = (double)sse / num_samples;
-
-  double sse_norm_arr[4];
-  get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
-                                   dst_stride, src_diff, diff_stride,
-                                   sse_norm_arr, NULL);
-  double mean;
-  if (is_cur_buf_hbd(xd)) {
-    mean = get_highbd_diff_mean(src, src_stride, dst, dst_stride, bw, bh);
-  } else {
-    mean = get_diff_mean(src, src_stride, dst, dst_stride, bw, bh);
-  }
-  if (shift) {
-    for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
-    mean /= (1 << shift);
-  }
-  double sse_norm_sum = 0.0, sse_frac_arr[3];
-  for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k];
-  for (int k = 0; k < 3; ++k)
-    sse_frac_arr[k] =
-        sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25;
-  const double q_sqr = (double)(q_step * q_step);
-  const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0);
-  const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0);
-  float hor_corr, vert_corr;
-  av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
-                                  &vert_corr);
-
-  float features[NUM_FEATURES_PUSTATS];
-  features[0] = (float)hor_corr;
-  features[1] = (float)log_numpels;
-  features[2] = (float)mean_sqr_by_sse_norm;
-  features[3] = (float)q_sqr_by_sse_norm;
-  features[4] = (float)sse_frac_arr[0];
-  features[5] = (float)sse_frac_arr[1];
-  features[6] = (float)sse_frac_arr[2];
-  features[7] = (float)vert_corr;
-
-  float rate_f, dist_by_sse_norm_f;
-  av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
-  av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
-  aom_clear_system_state();
-  const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
-  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
-  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
-
-  // Check if skip is better
-  if (rate_i == 0) {
-    dist_i = sse << 4;
-  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
-             RDCOST(x->rdmult, 0, sse << 4)) {
-    rate_i = 0;
-    dist_i = sse << 4;
-  }
-
-  if (rate) *rate = rate_i;
-  if (dist) *dist = dist_i;
-  return;
-}
-
-static void model_rd_for_sb_with_dnn(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
-  (void)mi_row;
-  (void)mi_col;
-  // Note our transform coeffs are 8 times an orthogonal transform.
-  // Hence quantizer step is also 8 times. To get effective quantizer
-  // we need to divide by 8 before sending to modeling function.
-  const int ref = xd->mi[0]->ref_frame[0];
-
-  int64_t rate_sum = 0;
-  int64_t dist_sum = 0;
-  int64_t total_sse = 0;
-
-  for (int plane = plane_from; plane <= plane_to; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    int64_t dist, sse;
-    int rate;
-
-    if (x->skip_chroma_rd && plane) continue;
-
-    const struct macroblock_plane *const p = &x->plane[plane];
-    const int shift = (xd->bd - 8);
-    int bw, bh;
-    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
-                       &bw, &bh);
-    if (is_cur_buf_hbd(xd)) {
-      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
-                           pd->dst.stride, bw, bh);
-    } else {
-      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
-                    bh);
-    }
-    sse = ROUND_POWER_OF_TWO(sse, shift * 2);
-
-    model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
-
-    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
-
-    total_sse += sse;
-    rate_sum += rate;
-    dist_sum += dist;
-
-    if (plane_rate) plane_rate[plane] = rate;
-    if (plane_sse) plane_sse[plane] = sse;
-    if (plane_dist) plane_dist[plane] = dist;
-  }
-
-  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
-  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
-  *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum;
-}
-
-// Fits a surface for rate and distortion using as features:
-// log2(sse_norm + 1) and log2(sse_norm/qstep^2)
-static void model_rd_with_surffit(const AV1_COMP *const cpi,
-                                  const MACROBLOCK *const x,
-                                  BLOCK_SIZE plane_bsize, int plane,
-                                  int64_t sse, int num_samples, int *rate,
-                                  int64_t *dist) {
-  (void)cpi;
-  (void)plane_bsize;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-  const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
-  if (sse == 0) {
-    if (rate) *rate = 0;
-    if (dist) *dist = 0;
-    return;
-  }
-  aom_clear_system_state();
-  const double sse_norm = (double)sse / num_samples;
-  const double qstepsqr = (double)qstep * qstep;
-  const double xm = log(sse_norm + 1.0) / log(2.0);
-  const double yl = log(sse_norm / qstepsqr) / log(2.0);
-  double rate_f, dist_by_sse_norm_f;
-
-  av1_model_rd_surffit(plane_bsize, sse_norm, xm, yl, &rate_f,
-                       &dist_by_sse_norm_f);
-
-  const double dist_f = dist_by_sse_norm_f * sse_norm;
-  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
-  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
-  aom_clear_system_state();
-
-  // Check if skip is better
-  if (rate_i == 0) {
-    dist_i = sse << 4;
-  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
-             RDCOST(x->rdmult, 0, sse << 4)) {
-    rate_i = 0;
-    dist_i = sse << 4;
-  }
-
-  if (rate) *rate = rate_i;
-  if (dist) *dist = dist_i;
-}
-
-static void model_rd_for_sb_with_surffit(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
-  (void)mi_row;
-  (void)mi_col;
-  // Note our transform coeffs are 8 times an orthogonal transform.
-  // Hence quantizer step is also 8 times. To get effective quantizer
-  // we need to divide by 8 before sending to modeling function.
-  const int ref = xd->mi[0]->ref_frame[0];
-
-  int64_t rate_sum = 0;
-  int64_t dist_sum = 0;
-  int64_t total_sse = 0;
-
-  for (int plane = plane_from; plane <= plane_to; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    int64_t dist, sse;
-    int rate;
-
-    if (x->skip_chroma_rd && plane) continue;
-
-    int bw, bh;
-    const struct macroblock_plane *const p = &x->plane[plane];
-    const int shift = (xd->bd - 8);
-    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
-                       &bw, &bh);
-    if (is_cur_buf_hbd(xd)) {
-      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
-                           pd->dst.stride, bw, bh);
-    } else {
-      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
-                    bh);
-    }
-    sse = ROUND_POWER_OF_TWO(sse, shift * 2);
-
-    model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
-                          &dist);
-
-    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
-
-    total_sse += sse;
-    rate_sum += rate;
-    dist_sum += dist;
-
-    if (plane_rate) plane_rate[plane] = rate;
-    if (plane_sse) plane_sse[plane] = sse;
-    if (plane_dist) plane_dist[plane] = dist;
-  }
-
-  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
-  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
-  *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum;
-}
-
-// Fits a curve for rate and distortion using as feature:
-// log2(sse_norm/qstep^2)
-static void model_rd_with_curvfit(const AV1_COMP *const cpi,
-                                  const MACROBLOCK *const x,
-                                  BLOCK_SIZE plane_bsize, int plane,
-                                  int64_t sse, int num_samples, int *rate,
-                                  int64_t *dist) {
-  (void)cpi;
-  (void)plane_bsize;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-  const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
-
-  if (sse == 0) {
-    if (rate) *rate = 0;
-    if (dist) *dist = 0;
-    return;
-  }
-  aom_clear_system_state();
-  const double sse_norm = (double)sse / num_samples;
-  const double qstepsqr = (double)qstep * qstep;
-  const double xqr = log2(sse_norm / qstepsqr);
-
-  double rate_f, dist_by_sse_norm_f;
-  av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
-                       &dist_by_sse_norm_f);
-
-  const double dist_f = dist_by_sse_norm_f * sse_norm;
-  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
-  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
-  aom_clear_system_state();
-
-  // Check if skip is better
-  if (rate_i == 0) {
-    dist_i = sse << 4;
-  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
-             RDCOST(x->rdmult, 0, sse << 4)) {
-    rate_i = 0;
-    dist_i = sse << 4;
-  }
-
-  if (rate) *rate = rate_i;
-  if (dist) *dist = dist_i;
-}
-
-static void model_rd_for_sb_with_curvfit(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
-  (void)mi_row;
-  (void)mi_col;
-  // Note our transform coeffs are 8 times an orthogonal transform.
-  // Hence quantizer step is also 8 times. To get effective quantizer
-  // we need to divide by 8 before sending to modeling function.
-  const int ref = xd->mi[0]->ref_frame[0];
-
-  int64_t rate_sum = 0;
-  int64_t dist_sum = 0;
-  int64_t total_sse = 0;
-
-  for (int plane = plane_from; plane <= plane_to; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    int64_t dist, sse;
-    int rate;
-
-    if (x->skip_chroma_rd && plane) continue;
-
-    int bw, bh;
-    const struct macroblock_plane *const p = &x->plane[plane];
-    const int shift = (xd->bd - 8);
-    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
-                       &bw, &bh);
-
-    if (is_cur_buf_hbd(xd)) {
-      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
-                           pd->dst.stride, bw, bh);
-    } else {
-      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
-                    bh);
-    }
-
-    sse = ROUND_POWER_OF_TWO(sse, shift * 2);
-    model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
-                          &dist);
-
-    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
-
-    total_sse += sse;
-    rate_sum += rate;
-    dist_sum += dist;
-
-    if (plane_rate) plane_rate[plane] = rate;
-    if (plane_sse) plane_sse[plane] = sse;
-    if (plane_dist) plane_dist[plane] = dist;
-  }
-
-  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
-  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
-  *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum;
-}
-
-static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                               int block, int blk_row, int blk_col,
-                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                               const TXB_CTX *const txb_ctx,
-                               FAST_TX_SEARCH_MODE ftxs_mode,
-                               int use_fast_coef_costing, int skip_trellis,
-                               int64_t ref_best_rd, RD_STATS *best_rd_stats) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const int is_inter = is_inter_block(mbmi);
-  int64_t best_rd = INT64_MAX;
-  uint16_t best_eob = 0;
-  TX_TYPE best_tx_type = DCT_DCT;
-  TX_TYPE last_tx_type = TX_TYPES;
-  const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
-  // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
-  // of the best tx_type
-  DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
-  tran_low_t *orig_dqcoeff = pd->dqcoeff;
-  tran_low_t *best_dqcoeff = this_dqcoeff;
-  const int txk_type_idx =
-      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
-  int perform_block_coeff_opt;
-  av1_invalid_rd_stats(best_rd_stats);
-
-  TXB_RD_INFO *intra_txb_rd_info = NULL;
-  uint16_t cur_joint_ctx = 0;
-  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
-  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
-  const int within_border =
-      mi_row >= xd->tile.mi_row_start &&
-      (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
-      mi_col >= xd->tile.mi_col_start &&
-      (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
-  skip_trellis |=
-      cpi->optimize_seg_arr[mbmi->segment_id] == NO_TRELLIS_OPT ||
-      cpi->optimize_seg_arr[mbmi->segment_id] == FINAL_PASS_TRELLIS_OPT;
-  if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
-      !is_inter && plane == 0 &&
-      tx_size_wide[tx_size] == tx_size_high[tx_size]) {
-    const uint32_t intra_hash =
-        get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
-    const int intra_hash_idx =
-        find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
-    intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
-
-    cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-    if (intra_txb_rd_info->entropy_context == cur_joint_ctx &&
-        x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
-      mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type;
-      const TX_TYPE ref_tx_type =
-          av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
-                          tx_size, cpi->common.reduced_tx_set_used);
-      if (ref_tx_type == intra_txb_rd_info->tx_type) {
-        best_rd_stats->rate = intra_txb_rd_info->rate;
-        best_rd_stats->dist = intra_txb_rd_info->dist;
-        best_rd_stats->sse = intra_txb_rd_info->sse;
-        best_rd_stats->skip = intra_txb_rd_info->eob == 0;
-        x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
-        x->plane[plane].txb_entropy_ctx[block] =
-            intra_txb_rd_info->txb_entropy_ctx;
-        best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist);
-        best_eob = intra_txb_rd_info->eob;
-        best_tx_type = intra_txb_rd_info->tx_type;
-        update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                         best_tx_type);
-        goto RECON_INTRA;
-      }
-    }
-  }
-
-  int rate_cost = 0;
-  TX_TYPE txk_start = DCT_DCT;
-  TX_TYPE txk_end = TX_TYPES - 1;
-  if ((!is_inter && x->use_default_intra_tx_type) ||
-      (is_inter && x->use_default_inter_tx_type)) {
-    txk_start = txk_end =
-        get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type);
-  } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
-    if (plane == 0) txk_end = DCT_DCT;
-  }
-
-  uint8_t best_txb_ctx = 0;
-  const TxSetType tx_set_type =
-      av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
-
-  TX_TYPE uv_tx_type = DCT_DCT;
-  if (plane) {
-    // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
-    uv_tx_type = txk_start = txk_end =
-        av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
-                        cm->reduced_tx_set_used);
-  }
-  const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
-  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
-      ext_tx_used_flag == 0x0001 ||
-      (is_inter && cpi->oxcf.use_inter_dct_only) ||
-      (!is_inter && cpi->oxcf.use_intra_dct_only)) {
-    txk_start = txk_end = DCT_DCT;
-  }
-  uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
-  if (txk_start == txk_end) {
-    allowed_tx_mask = 1 << txk_start;
-    allowed_tx_mask &= ext_tx_used_flag;
-  } else if (fast_tx_search) {
-    allowed_tx_mask = 0x0c01;  // V_DCT, H_DCT, DCT_DCT
-    allowed_tx_mask &= ext_tx_used_flag;
-  } else {
-    assert(plane == 0);
-    allowed_tx_mask = ext_tx_used_flag;
-    // !fast_tx_search && txk_end != txk_start && plane == 0
-    const int do_prune = cpi->sf.tx_type_search.prune_mode > NO_PRUNE;
-    if (do_prune && is_inter) {
-      if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) {
-        const uint16_t prune =
-            prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
-                        cpi->sf.tx_type_search.prune_mode);
-        allowed_tx_mask &= (~prune);
-      } else {
-        allowed_tx_mask &= (~x->tx_search_prune[tx_set_type]);
-      }
-    }
-  }
-
-  if (cpi->oxcf.enable_flip_idtx == 0) {
-    for (TX_TYPE tx_type = FLIPADST_DCT; tx_type <= H_FLIPADST; ++tx_type) {
-      allowed_tx_mask &= ~(1 << tx_type);
-    }
-  }
-
-  // Need to have at least one transform type allowed.
-  if (allowed_tx_mask == 0) {
-    txk_start = txk_end = (plane ? uv_tx_type : DCT_DCT);
-    allowed_tx_mask = (1 << txk_start);
-  }
-
-  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-  int64_t block_sse = 0;
-  unsigned int block_mse_q8 = UINT_MAX;
-  block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize,
-                              &block_mse_q8);
-  assert(block_mse_q8 != UINT_MAX);
-  if (is_cur_buf_hbd(xd)) {
-    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
-    block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
-  }
-  block_sse *= 16;
-  // Tranform domain distortion is accurate for higher residuals.
-  // TODO(any): Experiment with variance and mean based thresholds
-  int use_transform_domain_distortion =
-      (cpi->sf.use_transform_domain_distortion > 0) &&
-      (block_mse_q8 >= cpi->tx_domain_dist_threshold) &&
-      // Any 64-pt transforms only preserves half the coefficients.
-      // Therefore transform domain distortion is not valid for these
-      // transform sizes.
-      txsize_sqr_up_map[tx_size] != TX_64X64;
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8) use_transform_domain_distortion = 0;
-#endif
-  int calc_pixel_domain_distortion_final =
-      cpi->sf.use_transform_domain_distortion == 1 &&
-      use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
-      !x->cb_partition_scan;
-  if (calc_pixel_domain_distortion_final &&
-      (txk_start == txk_end || allowed_tx_mask == 0x0001))
-    calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
-
-  const uint16_t *eobs_ptr = x->plane[plane].eobs;
-
-  // Used mse based threshold logic to take decision of R-D of optimization of
-  // coeffs. For smaller residuals, coeff optimization would be helpful. For
-  // larger residuals, R-D optimization may not be effective.
-  // TODO(any): Experiment with variance and mean based thresholds
-  perform_block_coeff_opt = (block_mse_q8 <= cpi->coeff_opt_dist_threshold);
-
-  for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
-    if (!(allowed_tx_mask & (1 << tx_type))) continue;
-    if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
-    RD_STATS this_rd_stats;
-    av1_invalid_rd_stats(&this_rd_stats);
-    if (skip_trellis || (!perform_block_coeff_opt)) {
-      av1_xform_quant(
-          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
-          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
-      rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
-                                  txb_ctx, use_fast_coef_costing);
-    } else {
-      av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
-                      tx_size, tx_type, AV1_XFORM_QUANT_FP);
-      if (cpi->sf.optimize_b_precheck && best_rd < INT64_MAX &&
-          eobs_ptr[block] >= 4) {
-        // Calculate distortion quickly in transform domain.
-        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
-                             &this_rd_stats.sse);
-
-        const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
-        const int64_t dist_cost_estimate =
-            RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
-        if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
-      }
-      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
-                     cpi->sf.trellis_eob_fast, &rate_cost);
-    }
-    if (eobs_ptr[block] == 0) {
-      // When eob is 0, pixel domain distortion is more efficient and accurate.
-      this_rd_stats.dist = this_rd_stats.sse = block_sse;
-    } else if (use_transform_domain_distortion) {
-      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
-                           &this_rd_stats.sse);
-    } else {
-      int64_t sse_diff = INT64_MAX;
-      // high_energy threshold assumes that every pixel within a txfm block
-      // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
-      // for 8 bit, then the threshold is scaled based on input bit depth.
-      const int64_t high_energy_thresh =
-          ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2);
-      const int is_high_energy = (block_sse >= high_energy_thresh);
-      if (tx_size == TX_64X64 || is_high_energy) {
-        // Because 3 out 4 quadrants of transform coefficients are forced to
-        // zero, the inverse transform has a tendency to overflow. sse_diff
-        // is effectively the energy of those 3 quadrants, here we use it
-        // to decide if we should do pixel domain distortion. If the energy
-        // is mostly in first quadrant, then it is unlikely that we have
-        // overflow issue in inverse transform.
-        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
-                             &this_rd_stats.sse);
-        sse_diff = block_sse - this_rd_stats.sse;
-      }
-      if (tx_size != TX_64X64 || !is_high_energy ||
-          (sse_diff * 2) < this_rd_stats.sse) {
-        const int64_t tx_domain_dist = this_rd_stats.dist;
-        this_rd_stats.dist = dist_block_px_domain(
-            cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
-        // For high energy blocks, occasionally, the pixel domain distortion
-        // can be artificially low due to clamping at reconstruction stage
-        // even when inverse transform output is hugely different from the
-        // actual residue.
-        if (is_high_energy && this_rd_stats.dist < tx_domain_dist)
-          this_rd_stats.dist = tx_domain_dist;
-      } else {
-        this_rd_stats.dist += sse_diff;
-      }
-      this_rd_stats.sse = block_sse;
-    }
-
-    this_rd_stats.rate = rate_cost;
-
-    const int64_t rd =
-        RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-
-    if (rd < best_rd) {
-      best_rd = rd;
-      *best_rd_stats = this_rd_stats;
-      best_tx_type = tx_type;
-      best_txb_ctx = x->plane[plane].txb_entropy_ctx[block];
-      best_eob = x->plane[plane].eobs[block];
-      last_tx_type = best_tx_type;
-
-      // Swap qcoeff and dqcoeff buffers
-      tran_low_t *const tmp_dqcoeff = best_dqcoeff;
-      best_dqcoeff = pd->dqcoeff;
-      pd->dqcoeff = tmp_dqcoeff;
-    }
-
-#if CONFIG_COLLECT_RD_STATS == 1
-    if (plane == 0) {
-      PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
-                              plane_bsize, tx_size, tx_type, rd);
-    }
-#endif  // CONFIG_COLLECT_RD_STATS == 1
-
-#if COLLECT_TX_SIZE_DATA
-    // Generate small sample to restrict output size.
-    static unsigned int seed = 21743;
-    if (lcg_rand16(&seed) % 200 == 0) {
-      FILE *fp = NULL;
-
-      if (within_border) {
-        fp = fopen(av1_tx_size_data_output_file, "a");
-      }
-
-      if (fp) {
-        // Transform info and RD
-        const int txb_w = tx_size_wide[tx_size];
-        const int txb_h = tx_size_high[tx_size];
-
-        // Residue signal.
-        const int diff_stride = block_size_wide[plane_bsize];
-        struct macroblock_plane *const p = &x->plane[plane];
-        const int16_t *src_diff =
-            &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
-
-        for (int r = 0; r < txb_h; ++r) {
-          for (int c = 0; c < txb_w; ++c) {
-            fprintf(fp, "%d,", src_diff[c]);
-          }
-          src_diff += diff_stride;
-        }
-
-        fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd);
-        fprintf(fp, "\n");
-        fclose(fp);
-      }
-    }
-#endif  // COLLECT_TX_SIZE_DATA
-
-    if (cpi->sf.adaptive_txb_search_level) {
-      if ((best_rd - (best_rd >> cpi->sf.adaptive_txb_search_level)) >
-          ref_best_rd) {
-        break;
-      }
-    }
-
-    // Skip transform type search when we found the block has been quantized to
-    // all zero and at the same time, it has better rdcost than doing transform.
-    if (cpi->sf.tx_type_search.skip_tx_search && !best_eob) break;
-  }
-
-  assert(best_rd != INT64_MAX);
-
-  best_rd_stats->skip = best_eob == 0;
-  if (plane == 0) {
-    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                     best_tx_type);
-  }
-  x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
-  x->plane[plane].eobs[block] = best_eob;
-
-  pd->dqcoeff = best_dqcoeff;
-
-  if (calc_pixel_domain_distortion_final && best_eob) {
-    best_rd_stats->dist = dist_block_px_domain(
-        cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
-    best_rd_stats->sse = block_sse;
-  }
-
-  if (intra_txb_rd_info != NULL) {
-    intra_txb_rd_info->valid = 1;
-    intra_txb_rd_info->entropy_context = cur_joint_ctx;
-    intra_txb_rd_info->rate = best_rd_stats->rate;
-    intra_txb_rd_info->dist = best_rd_stats->dist;
-    intra_txb_rd_info->sse = best_rd_stats->sse;
-    intra_txb_rd_info->eob = best_eob;
-    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
-    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
-  }
-
-RECON_INTRA:
-  if (!is_inter && best_eob &&
-      (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] ||
-       blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) {
-    // intra mode needs decoded result such that the next transform block
-    // can use it for prediction.
-    // if the last search tx_type is the best tx_type, we don't need to
-    // do this again
-    if (best_tx_type != last_tx_type) {
-      if (skip_trellis) {
-        av1_xform_quant(
-            cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-            best_tx_type,
-            USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
-      } else {
-        av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
-                        tx_size, best_tx_type, AV1_XFORM_QUANT_FP);
-        av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
-                       cpi->sf.trellis_eob_fast, &rate_cost);
-      }
-    }
-
-    inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
-                                   x->plane[plane].eobs[block],
-                                   cm->reduced_tx_set_used);
-
-    // This may happen because of hash collision. The eob stored in the hash
-    // table is non-zero, but the real eob is zero. We need to make sure tx_type
-    // is DCT_DCT in this case.
-    if (plane == 0 && x->plane[plane].eobs[block] == 0 &&
-        best_tx_type != DCT_DCT) {
-      update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                       DCT_DCT);
-    }
-  }
-  pd->dqcoeff = orig_dqcoeff;
-
-  return best_rd;
-}
-
-static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
-                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
-  struct rdcost_block_args *args = arg;
-  MACROBLOCK *const x = args->x;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int is_inter = is_inter_block(xd->mi[0]);
-  const AV1_COMP *cpi = args->cpi;
-  ENTROPY_CONTEXT *a = args->t_above + blk_col;
-  ENTROPY_CONTEXT *l = args->t_left + blk_row;
-  const AV1_COMMON *cm = &cpi->common;
-  RD_STATS this_rd_stats;
-
-  av1_init_rd_stats(&this_rd_stats);
-
-  if (args->exit_early) {
-    args->incomplete_exit = 1;
-    return;
-  }
-
-  if (!is_inter) {
-    av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
-    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
-  }
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-  search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
-                  args->skip_trellis, args->best_rd - args->this_rd,
-                  &this_rd_stats);
-
-  if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
-    assert(!is_inter || plane_bsize < BLOCK_8X8);
-    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
-  }
-
-#if CONFIG_RD_DEBUG
-  av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
-                            this_rd_stats.rate);
-#endif  // CONFIG_RD_DEBUG
-  av1_set_txb_context(x, plane, block, tx_size, a, l);
-
-  const int blk_idx =
-      blk_row * (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
-      blk_col;
-
-  if (plane == 0)
-    set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0);
-  else
-    set_blk_skip(x, plane, blk_idx, 0);
-
-  const int64_t rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-  const int64_t rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
-
-  // TODO(jingning): temporarily enabled only for luma component
-  const int64_t rd = AOMMIN(rd1, rd2);
-
-  this_rd_stats.skip &= !x->plane[plane].eobs[block];
-
-  av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
-
-  args->this_rd += rd;
-
-  if (args->this_rd > args->best_rd) args->exit_early = 1;
-}
-
-static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
-                             RD_STATS *rd_stats, int64_t ref_best_rd,
-                             int64_t this_rd, int plane, BLOCK_SIZE bsize,
-                             TX_SIZE tx_size, int use_fast_coef_casting,
-                             FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  struct rdcost_block_args args;
-  av1_zero(args);
-  args.x = x;
-  args.cpi = cpi;
-  args.best_rd = ref_best_rd;
-  args.use_fast_coef_costing = use_fast_coef_casting;
-  args.ftxs_mode = ftxs_mode;
-  args.this_rd = this_rd;
-  args.skip_trellis = skip_trellis;
-  av1_init_rd_stats(&args.rd_stats);
-
-  if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
-    av1_invalid_rd_stats(rd_stats);
-    return;
-  }
-
-  if (plane == 0) xd->mi[0]->tx_size = tx_size;
-
-  av1_get_entropy_contexts(bsize, pd, args.t_above, args.t_left);
-
-  if (args.this_rd > args.best_rd) {
-    args.exit_early = 1;
-  }
-
-  av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
-                                         &args);
-
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int is_inter = is_inter_block(mbmi);
-  const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early;
-
-  if (invalid_rd) {
-    av1_invalid_rd_stats(rd_stats);
-  } else {
-    *rd_stats = args.rd_stats;
-  }
-}
-
-static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x,
-                        BLOCK_SIZE bsize, TX_SIZE tx_size) {
-  assert(bsize == x->e_mbd.mi[0]->sb_type);
-  if (cm->tx_mode != TX_MODE_SELECT || !block_signals_txsize(bsize)) return 0;
-
-  const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
-  const int depth = tx_size_to_depth(tx_size, bsize);
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const int tx_size_ctx = get_tx_size_context(xd);
-  return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
-}
-
-static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                        RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
-                        TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
-                        int skip_trellis) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  int64_t rd = INT64_MAX;
-  const int skip_ctx = av1_get_skip_context(xd);
-  int s0, s1;
-  const int is_inter = is_inter_block(mbmi);
-  const int tx_select =
-      cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type);
-  int ctx = txfm_partition_context(
-      xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
-  const int r_tx_size = is_inter ? x->txfm_partition_cost[ctx][0]
-                                 : tx_size_cost(cm, x, bs, tx_size);
-
-  assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
-
-  s0 = x->skip_cost[skip_ctx][0];
-  s1 = x->skip_cost[skip_ctx][1];
-
-  int64_t skip_rd;
-  int64_t this_rd;
-
-  if (is_inter) {
-    skip_rd = RDCOST(x->rdmult, s1, 0);
-    this_rd = RDCOST(x->rdmult, s0 + r_tx_size * tx_select, 0);
-  } else {
-    skip_rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, 0);
-    this_rd = RDCOST(x->rdmult, s0 + r_tx_size * tx_select, 0);
-  }
-
-  mbmi->tx_size = tx_size;
-  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(this_rd, skip_rd),
-                   AOM_PLANE_Y, bs, tx_size, cpi->sf.use_fast_coef_costing,
-                   ftxs_mode, skip_trellis);
-  if (rd_stats->rate == INT_MAX) return INT64_MAX;
-
-  // rdstats->rate should include all the rate except skip/non-skip cost as the
-  // same is accounted in the caller functions after rd evaluation of all
-  // planes. However the decisions should be done after considering the
-  // skip/non-skip header cost
-  if (rd_stats->skip) {
-    if (is_inter) {
-      rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-    } else {
-      rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse);
-      rd_stats->rate += r_tx_size * tx_select;
-    }
-  } else {
-    rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select,
-                rd_stats->dist);
-    rd_stats->rate += r_tx_size * tx_select;
-  }
-  if (is_inter && !xd->lossless[xd->mi[0]->segment_id]) {
-    int64_t temp_skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-    if (temp_skip_rd <= rd) {
-      rd = temp_skip_rd;
-      rd_stats->rate = 0;
-      rd_stats->dist = rd_stats->sse;
-      rd_stats->skip = 1;
-    }
-  }
-
-  return rd;
-}
-
-static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
-                                   MACROBLOCK *x, int64_t ref_best_rd,
-                                   RD_STATS *rd_stats) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  av1_subtract_plane(x, bs, 0);
-  x->rd_model = LOW_TXFM_RD;
-  int skip_trellis = cpi->optimize_seg_arr[xd->mi[0]->segment_id] ==
-                     NO_ESTIMATE_YRD_TRELLIS_OPT;
-  const int64_t rd =
-      txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, max_txsize_rect_lookup[bs],
-               FTXS_NONE, skip_trellis);
-  x->rd_model = FULL_TXFM_RD;
-  if (rd != INT64_MAX) {
-    const int skip_ctx = av1_get_skip_context(xd);
-    if (rd_stats->skip) {
-      const int s1 = x->skip_cost[skip_ctx][1];
-      rd_stats->rate = s1;
-    } else {
-      const int s0 = x->skip_cost[skip_ctx][0];
-      rd_stats->rate += s0;
-    }
-  }
-  return rd;
-}
-
-static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                   RD_STATS *rd_stats, int64_t ref_best_rd,
-                                   BLOCK_SIZE bs) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int is_inter = is_inter_block(mbmi);
-  mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
-  const TxSetType tx_set_type =
-      av1_get_ext_tx_set_type(mbmi->tx_size, is_inter, cm->reduced_tx_set_used);
-  prune_tx(cpi, bs, x, xd, tx_set_type);
-  const int skip_ctx = av1_get_skip_context(xd);
-  int s0, s1;
-
-  s0 = x->skip_cost[skip_ctx][0];
-  s1 = x->skip_cost[skip_ctx][1];
-
-  int64_t skip_rd = RDCOST(x->rdmult, s1, 0);
-  int64_t this_rd = RDCOST(x->rdmult, s0, 0);
-
-  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(this_rd, skip_rd),
-                   AOM_PLANE_Y, bs, mbmi->tx_size,
-                   cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
-  // Reset the pruning flags.
-  av1_zero(x->tx_search_prune);
-  x->tx_split_prune_flag = 0;
-}
-
-static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    RD_STATS *rd_stats, int64_t ref_best_rd,
-                                    BLOCK_SIZE bs) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-
-  mbmi->tx_size = TX_4X4;
-  // TODO(any) : Pass this_rd based on skip/non-skip cost
-  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
-                   cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
-}
-
-static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
-  int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]);
-  return num_blk;
-}
-
-static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
-                                 const SPEED_FEATURES *sf) {
-  if (sf->tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
-
-  if (sf->tx_size_search_lgr_block) {
-    if (mi_width > mi_size_wide[BLOCK_64X64] ||
-        mi_height > mi_size_high[BLOCK_64X64])
-      return MAX_VARTX_DEPTH;
-  }
-
-  if (is_inter) {
-    return (mi_height != mi_width) ? sf->inter_tx_size_search_init_depth_rect
-                                   : sf->inter_tx_size_search_init_depth_sqr;
-  } else {
-    return (mi_height != mi_width) ? sf->intra_tx_size_search_init_depth_rect
-                                   : sf->intra_tx_size_search_init_depth_sqr;
-  }
-}
-
-static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
-                                        MACROBLOCK *x, RD_STATS *rd_stats,
-                                        int64_t ref_best_rd, BLOCK_SIZE bs) {
-  av1_invalid_rd_stats(rd_stats);
-
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
-  const int tx_select = cm->tx_mode == TX_MODE_SELECT;
-  int start_tx;
-  int depth, init_depth;
-
-  if (tx_select) {
-    start_tx = max_rect_tx_size;
-    init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
-                                       is_inter_block(mbmi), &cpi->sf);
-  } else {
-    const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
-    start_tx = chosen_tx_size;
-    init_depth = MAX_TX_DEPTH;
-  }
-
-  prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
-
-  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  TX_SIZE best_tx_size = max_rect_tx_size;
-  int64_t best_rd = INT64_MAX;
-  const int n4 = bsize_to_num_blk(bs);
-  x->rd_model = FULL_TXFM_RD;
-  depth = init_depth;
-  int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
-  for (int n = start_tx; depth <= MAX_TX_DEPTH;
-       depth++, n = sub_tx_size_map[n]) {
-#if CONFIG_DIST_8X8
-    if (x->using_dist_8x8) {
-      if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue;
-    }
-#endif
-    if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[n] == TX_64X64) continue;
-
-    RD_STATS this_rd_stats;
-    rd[depth] =
-        txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE, 0);
-
-    if (rd[depth] < best_rd) {
-      memcpy(best_txk_type, mbmi->txk_type,
-             sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
-      memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
-      best_tx_size = n;
-      best_rd = rd[depth];
-      *rd_stats = this_rd_stats;
-    }
-    if (n == TX_4X4) break;
-    // If we are searching three depths, prune the smallest size depending
-    // on rd results for the first two depths for low contrast blocks.
-    if (depth > init_depth && depth != MAX_TX_DEPTH &&
-        x->source_variance < 256) {
-      if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
-    }
-  }
-
-  if (rd_stats->rate != INT_MAX) {
-    mbmi->tx_size = best_tx_size;
-    memcpy(mbmi->txk_type, best_txk_type,
-           sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
-    memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
-  }
-
-  // Reset the pruning flags.
-  av1_zero(x->tx_search_prune);
-  x->tx_split_prune_flag = 0;
-}
-
-// origin_threshold * 128 / 100
-static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
-  {
-      64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
-      68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
-  },
-  {
-      88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
-      68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
-  },
-  {
-      90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
-      74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
-  },
-};
-
-// lookup table for predict_skip_flag
-// int max_tx_size = max_txsize_rect_lookup[bsize];
-// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
-//   max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
-static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
-  TX_4X4,   TX_4X8,   TX_8X4,   TX_8X8,   TX_8X16,  TX_16X8,
-  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
-  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16,  TX_16X4,
-  TX_8X8,   TX_8X8,   TX_16X16, TX_16X16,
-};
-
-// Uses simple features on top of DCT coefficients to quickly predict
-// whether optimal RD decision is to skip encoding the residual.
-// The sse value is stored in dist.
-static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
-                             int reduced_tx_set) {
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
-
-  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
-
-  const int64_t mse = *dist / bw / bh;
-  // Normalized quantizer takes the transform upscaling factor (8 for tx size
-  // smaller than 32) into account.
-  const int16_t normalized_dc_q = dc_q >> 3;
-  const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
-  // Predict not to skip when mse is larger than threshold.
-  if (mse > mse_thresh) return 0;
-
-  const int max_tx_size = max_predict_sf_tx_size[bsize];
-  const int tx_h = tx_size_high[max_tx_size];
-  const int tx_w = tx_size_wide[max_tx_size];
-  DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
-  TxfmParam param;
-  param.tx_type = DCT_DCT;
-  param.tx_size = max_tx_size;
-  param.bd = xd->bd;
-  param.is_hbd = is_cur_buf_hbd(xd);
-  param.lossless = 0;
-  param.tx_set_type = av1_get_ext_tx_set_type(
-      param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
-  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
-  const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
-  const int16_t *src_diff = x->plane[0].src_diff;
-  const int n_coeff = tx_w * tx_h;
-  const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
-  const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
-  const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
-  for (int row = 0; row < bh; row += tx_h) {
-    for (int col = 0; col < bw; col += tx_w) {
-      av1_fwd_txfm(src_diff + col, coefs, bw, &param);
-      // Operating on TX domain, not pixels; we want the QTX quantizers
-      const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
-      if (dc_coef >= dc_thresh) return 0;
-      for (int i = 1; i < n_coeff; ++i) {
-        const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
-        if (ac_coef >= ac_thresh) return 0;
-      }
-    }
-    src_diff += tx_h * bw;
-  }
-  return 1;
-}
-
-// Used to set proper context for early termination with skip = 1.
-static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
-                          int64_t dist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int n4 = bsize_to_num_blk(bsize);
-  const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
-  memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
-  memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
-  mbmi->tx_size = tx_size;
-  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
-  rd_stats->skip = 1;
-  if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
-  rd_stats->dist = rd_stats->sse = (dist << 4);
-  // Though decision is to make the block as skip based on luma stats,
-  // it is possible that block becomes non skip after chroma rd. In addition
-  // intermediate non skip costs calculated by caller function will be
-  // incorrect, if rate is set as  zero (i.e., if zero_blk_rate is not
-  // accounted). Hence intermediate rate is populated to code the luma tx blks
-  // as skip, the caller function based on final rd decision (i.e., skip vs
-  // non-skip) sets the final rate accordingly. Here the rate populated
-  // corresponds to coding all the tx blocks with zero_blk_rate (based on max tx
-  // size possible) in the current block. Eg: For 128*128 block, rate would be
-  // 4 * zero_blk_rate where zero_blk_rate corresponds to coding of one 64x64 tx
-  // block as 'all zeros'
-  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
-  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
-  av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
-  ENTROPY_CONTEXT *ta = ctxa;
-  ENTROPY_CONTEXT *tl = ctxl;
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  TXB_CTX txb_ctx;
-  get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
-  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
-                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
-  rd_stats->rate = zero_blk_rate *
-                   (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
-                   (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
-}
-
-static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  const int rows = block_size_high[bsize];
-  const int cols = block_size_wide[bsize];
-  const int16_t *diff = x->plane[0].src_diff;
-  const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
-                                             (uint8_t *)diff, 2 * rows * cols);
-  return (hash << 5) + bsize;
-}
-
-static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
-                            const RD_STATS *const rd_stats,
-                            MB_RD_RECORD *tx_rd_record) {
-  int index;
-  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
-    index =
-        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
-    ++tx_rd_record->num;
-  } else {
-    index = tx_rd_record->index_start;
-    tx_rd_record->index_start =
-        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
-  }
-  MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
-  tx_rd_info->hash_value = hash;
-  tx_rd_info->tx_size = mbmi->tx_size;
-  memcpy(tx_rd_info->blk_skip, x->blk_skip,
-         sizeof(tx_rd_info->blk_skip[0]) * n4);
-  av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
-  av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
-  tx_rd_info->rd_stats = *rd_stats;
-}
-
-static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
-                             RD_STATS *const rd_stats, MACROBLOCK *const x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  mbmi->tx_size = tx_rd_info->tx_size;
-  memcpy(x->blk_skip, tx_rd_info->blk_skip,
-         sizeof(tx_rd_info->blk_skip[0]) * n4);
-  av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
-  av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
-  *rd_stats = tx_rd_info->rd_stats;
-}
-
-static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
-                                      const int64_t ref_best_rd,
-                                      const uint32_t hash) {
-  int32_t match_index = -1;
-  if (ref_best_rd != INT64_MAX) {
-    for (int i = 0; i < mb_rd_record->num; ++i) {
-      const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
-      // If there is a match in the tx_rd_record, fetch the RD decision and
-      // terminate early.
-      if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
-        match_index = index;
-        break;
-      }
-    }
-  }
-  return match_index;
-}
-
-static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                            RD_STATS *rd_stats, BLOCK_SIZE bs,
-                            int64_t ref_best_rd) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  av1_init_rd_stats(rd_stats);
-  int is_inter = is_inter_block(xd->mi[0]);
-  assert(bs == xd->mi[0]->sb_type);
-
-  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
-  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
-
-  uint32_t hash = 0;
-  int32_t match_index = -1;
-  MB_RD_RECORD *mb_rd_record = NULL;
-  const int within_border = mi_row >= xd->tile.mi_row_start &&
-                            (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) &&
-                            mi_col >= xd->tile.mi_col_start &&
-                            (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
-  const int is_mb_rd_hash_enabled =
-      (within_border && cpi->sf.use_mb_rd_hash && is_inter);
-  const int n4 = bsize_to_num_blk(bs);
-  if (is_mb_rd_hash_enabled) {
-    hash = get_block_residue_hash(x, bs);
-    mb_rd_record = &x->mb_rd_record;
-    match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
-    if (match_index != -1) {
-      MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
-      fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
-      // Reset the pruning flags.
-      av1_zero(x->tx_search_prune);
-      x->tx_split_prune_flag = 0;
-      return;
-    }
-  }
-
-  // If we predict that skip is the optimal RD decision - set the respective
-  // context and terminate early.
-  int64_t dist;
-
-  if (cpi->sf.tx_type_search.use_skip_flag_prediction && is_inter &&
-      (!xd->lossless[xd->mi[0]->segment_id]) &&
-      predict_skip_flag(x, bs, &dist, cpi->common.reduced_tx_set_used)) {
-    // Populate rdstats as per skip decision
-    set_skip_flag(x, rd_stats, bs, dist);
-    // Save the RD search results into tx_rd_record.
-    if (is_mb_rd_hash_enabled)
-      save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
-    // Reset the pruning flags.
-    av1_zero(x->tx_search_prune);
-    x->tx_split_prune_flag = 0;
-    return;
-  }
-
-  if (xd->lossless[xd->mi[0]->segment_id]) {
-    choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
-  } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
-    choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
-  } else {
-    choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
-  }
-
-  // Save the RD search results into tx_rd_record.
-  if (is_mb_rd_hash_enabled) {
-    assert(mb_rd_record != NULL);
-    save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
-  }
-}
-
-// Return the rate cost for luma prediction mode info. of intra blocks.
-static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                  const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                                  int mode_cost) {
-  int total_rate = mode_cost;
-  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
-  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
-  const int use_intrabc = mbmi->use_intrabc;
-  // Can only activate one mode.
-  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
-          use_filter_intra) <= 1);
-  const int try_palette =
-      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette && mbmi->mode == DC_PRED) {
-    const MACROBLOCKD *xd = &x->e_mbd;
-    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-    const int mode_ctx = av1_get_palette_mode_ctx(xd);
-    total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
-    if (use_palette) {
-      const uint8_t *const color_map = xd->plane[0].color_index_map;
-      int block_width, block_height, rows, cols;
-      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                               &cols);
-      const int plt_size = mbmi->palette_mode_info.palette_size[0];
-      int palette_mode_cost =
-          x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
-          write_uniform_cost(plt_size, color_map[0]);
-      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-      palette_mode_cost +=
-          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
-                                   n_cache, cpi->common.seq_params.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
-      total_rate += palette_mode_cost;
-    }
-  }
-  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
-    total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra];
-    if (use_filter_intra) {
-      total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info
-                                                  .filter_intra_mode];
-    }
-  }
-  if (av1_is_directional_mode(mbmi->mode)) {
-    if (av1_use_angle_delta(bsize)) {
-      total_rate += x->angle_delta_cost[mbmi->mode - V_PRED]
-                                       [MAX_ANGLE_DELTA +
-                                        mbmi->angle_delta[PLANE_TYPE_Y]];
-    }
-  }
-  if (av1_allow_intrabc(&cpi->common))
-    total_rate += x->intrabc_cost[use_intrabc];
-  return total_rate;
-}
-
-// Return the rate cost for chroma prediction mode info. of intra blocks.
-static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                   const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                                   int mode_cost) {
-  int total_rate = mode_cost;
-  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
-  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
-  // Can only activate one mode.
-  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
-
-  const int try_palette =
-      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette && mode == UV_DC_PRED) {
-    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-    total_rate +=
-        x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
-    if (use_palette) {
-      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-      const int plt_size = pmi->palette_size[1];
-      const MACROBLOCKD *xd = &x->e_mbd;
-      const uint8_t *const color_map = xd->plane[1].color_index_map;
-      int palette_mode_cost =
-          x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
-          write_uniform_cost(plt_size, color_map[0]);
-      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-      palette_mode_cost += av1_palette_color_cost_uv(
-          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
-      total_rate += palette_mode_cost;
-    }
-  }
-  if (av1_is_directional_mode(get_uv_mode(mode))) {
-    if (av1_use_angle_delta(bsize)) {
-      total_rate +=
-          x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] +
-                                             MAX_ANGLE_DELTA];
-    }
-  }
-  return total_rate;
-}
 
 static int conditional_skipintra(PREDICTION_MODE mode,
                                  PREDICTION_MODE best_intra_mode) {
@@ -4184,2398 +701,6 @@
   return 0;
 }
 
-// Model based RD estimation for luma intra blocks.
-static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                               BLOCK_SIZE bsize, int mode_cost, int mi_row,
-                               int mi_col) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  RD_STATS this_rd_stats;
-  int row, col;
-  int64_t temp_sse, this_rd;
-  TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
-  const int stepr = tx_size_high_unit[tx_size];
-  const int stepc = tx_size_wide_unit[tx_size];
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  mbmi->tx_size = tx_size;
-  // Prediction.
-  for (row = 0; row < max_blocks_high; row += stepr) {
-    for (col = 0; col < max_blocks_wide; col += stepc) {
-      av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
-    }
-  }
-  // RD estimation.
-  model_rd_sb_fn[MODELRD_TYPE_INTRA](
-      cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &this_rd_stats.rate,
-      &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
-  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
-    mode_cost +=
-        x->angle_delta_cost[mbmi->mode - V_PRED]
-                           [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]];
-  }
-  if (mbmi->mode == DC_PRED &&
-      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
-    if (mbmi->filter_intra_mode_info.use_filter_intra) {
-      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
-      mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
-                   x->filter_intra_mode_cost[mode];
-    } else {
-      mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
-    }
-  }
-  this_rd =
-      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
-  return this_rd;
-}
-
-// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
-// new_height'. Extra rows and columns are filled in by copying last valid
-// row/column.
-static void extend_palette_color_map(uint8_t *const color_map, int orig_width,
-                                     int orig_height, int new_width,
-                                     int new_height) {
-  int j;
-  assert(new_width >= orig_width);
-  assert(new_height >= orig_height);
-  if (new_width == orig_width && new_height == orig_height) return;
-
-  for (j = orig_height - 1; j >= 0; --j) {
-    memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
-    // Copy last column to extra columns.
-    memset(color_map + j * new_width + orig_width,
-           color_map[j * new_width + orig_width - 1], new_width - orig_width);
-  }
-  // Copy last row to extra rows.
-  for (j = orig_height; j < new_height; ++j) {
-    memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
-           new_width);
-  }
-}
-
-// Bias toward using colors in the cache.
-// TODO(huisu): Try other schemes to improve compression.
-static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
-                                    int n_colors, int stride, int *centroids) {
-  if (n_cache <= 0) return;
-  for (int i = 0; i < n_colors * stride; i += stride) {
-    int min_diff = abs(centroids[i] - (int)color_cache[0]);
-    int idx = 0;
-    for (int j = 1; j < n_cache; ++j) {
-      const int this_diff = abs(centroids[i] - color_cache[j]);
-      if (this_diff < min_diff) {
-        min_diff = this_diff;
-        idx = j;
-      }
-    }
-    if (min_diff <= 1) centroids[i] = color_cache[idx];
-  }
-}
-
-// Given the base colors as specified in centroids[], calculate the RD cost
-// of palette mode.
-static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x,
-                         MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row,
-                         int mi_col, int dc_mode_cost, const int *data,
-                         int *centroids, int n, uint16_t *color_cache,
-                         int n_cache, MB_MODE_INFO *best_mbmi,
-                         uint8_t *best_palette_color_map, int64_t *best_rd,
-                         int64_t *best_model_rd, int *rate, int *rate_tokenonly,
-                         int *rate_overhead, int64_t *distortion,
-                         int *skippable, PICK_MODE_CONTEXT *ctx,
-                         uint8_t *blk_skip) {
-  optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
-  int k = av1_remove_duplicates(centroids, n);
-  if (k < PALETTE_MIN_SIZE) {
-    // Too few unique colors to create a palette. And DC_PRED will work
-    // well for that case anyway. So skip.
-    return;
-  }
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  if (cpi->common.seq_params.use_highbitdepth)
-    for (int i = 0; i < k; ++i)
-      pmi->palette_colors[i] = clip_pixel_highbd(
-          (int)centroids[i], cpi->common.seq_params.bit_depth);
-  else
-    for (int i = 0; i < k; ++i)
-      pmi->palette_colors[i] = clip_pixel(centroids[i]);
-  pmi->palette_size[0] = k;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  int block_width, block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                           &cols);
-  av1_calc_indices(data, centroids, color_map, rows * cols, k, 1);
-  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
-  const int palette_mode_cost =
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
-  int64_t this_model_rd =
-      intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col);
-  if (*best_model_rd != INT64_MAX &&
-      this_model_rd > *best_model_rd + (*best_model_rd >> 1))
-    return;
-  if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
-  RD_STATS tokenonly_rd_stats;
-  super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
-  if (tokenonly_rd_stats.rate == INT_MAX) return;
-  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
-  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
-    tokenonly_rd_stats.rate -=
-        tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
-  }
-  if (this_rd < *best_rd) {
-    *best_rd = this_rd;
-    memcpy(best_palette_color_map, color_map,
-           block_width * block_height * sizeof(color_map[0]));
-    *best_mbmi = *mbmi;
-    memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-    *rate_overhead = this_rate - tokenonly_rd_stats.rate;
-    if (rate) *rate = this_rate;
-    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
-    if (distortion) *distortion = tokenonly_rd_stats.dist;
-    if (skippable) *skippable = tokenonly_rd_stats.skip;
-  }
-}
-
-static int rd_pick_palette_intra_sby(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
-    int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
-    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
-    PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) {
-  int rate_overhead = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize));
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  int colors, n;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *const src = x->plane[0].src.buf;
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  int block_width, block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                           &cols);
-
-  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  if (seq_params->use_highbitdepth)
-    colors = av1_count_colors_highbd(src, src_stride, rows, cols,
-                                     seq_params->bit_depth, count_buf);
-  else
-    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
-  mbmi->filter_intra_mode_info.use_filter_intra = 0;
-
-  if (colors > 1 && colors <= 64) {
-    int r, c, i;
-    const int max_itr = 50;
-    int *const data = x->palette_buffer->kmeans_data_buf;
-    int centroids[PALETTE_MAX_SIZE];
-    int lb, ub, val;
-    uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-    if (seq_params->use_highbitdepth)
-      lb = ub = src16[0];
-    else
-      lb = ub = src[0];
-
-    if (seq_params->use_highbitdepth) {
-      for (r = 0; r < rows; ++r) {
-        for (c = 0; c < cols; ++c) {
-          val = src16[r * src_stride + c];
-          data[r * cols + c] = val;
-          if (val < lb)
-            lb = val;
-          else if (val > ub)
-            ub = val;
-        }
-      }
-    } else {
-      for (r = 0; r < rows; ++r) {
-        for (c = 0; c < cols; ++c) {
-          val = src[r * src_stride + c];
-          data[r * cols + c] = val;
-          if (val < lb)
-            lb = val;
-          else if (val > ub)
-            ub = val;
-        }
-      }
-    }
-
-    mbmi->mode = DC_PRED;
-    mbmi->filter_intra_mode_info.use_filter_intra = 0;
-
-    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-
-    // Find the dominant colors, stored in top_colors[].
-    int top_colors[PALETTE_MAX_SIZE] = { 0 };
-    for (i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
-      int max_count = 0;
-      for (int j = 0; j < (1 << seq_params->bit_depth); ++j) {
-        if (count_buf[j] > max_count) {
-          max_count = count_buf[j];
-          top_colors[i] = j;
-        }
-      }
-      assert(max_count > 0);
-      count_buf[top_colors[i]] = 0;
-    }
-
-    // Try the dominant colors directly.
-    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
-    // where the dominant colors and the k-means results are similar.
-    for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
-      for (i = 0; i < n; ++i) centroids[i] = top_colors[i];
-      palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
-                   centroids, n, color_cache, n_cache, best_mbmi,
-                   best_palette_color_map, best_rd, best_model_rd, rate,
-                   rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
-                   best_blk_skip);
-    }
-
-    // K-means clustering.
-    for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
-      if (colors == PALETTE_MIN_SIZE) {
-        // Special case: These colors automatically become the centroids.
-        assert(colors == n);
-        assert(colors == 2);
-        centroids[0] = lb;
-        centroids[1] = ub;
-      } else {
-        for (i = 0; i < n; ++i) {
-          centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
-        }
-        av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
-      }
-      palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
-                   centroids, n, color_cache, n_cache, best_mbmi,
-                   best_palette_color_map, best_rd, best_model_rd, rate,
-                   rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
-                   best_blk_skip);
-    }
-  }
-
-  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
-    memcpy(color_map, best_palette_color_map,
-           block_width * block_height * sizeof(best_palette_color_map[0]));
-  }
-  *mbmi = *best_mbmi;
-  return rate_overhead;
-}
-
-// Return 1 if an filter intra mode is selected; return 0 otherwise.
-static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    int mi_row, int mi_col, int *rate,
-                                    int *rate_tokenonly, int64_t *distortion,
-                                    int *skippable, BLOCK_SIZE bsize,
-                                    int mode_cost, int64_t *best_rd,
-                                    int64_t *best_model_rd,
-                                    PICK_MODE_CONTEXT *ctx) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  int filter_intra_selected_flag = 0;
-  FILTER_INTRA_MODE mode;
-  TX_SIZE best_tx_size = TX_8X8;
-  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
-  (void)ctx;
-  av1_zero(filter_intra_mode_info);
-  mbmi->filter_intra_mode_info.use_filter_intra = 1;
-  mbmi->mode = DC_PRED;
-  mbmi->palette_mode_info.palette_size[0] = 0;
-
-  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
-    int64_t this_rd, this_model_rd;
-    RD_STATS tokenonly_rd_stats;
-    mbmi->filter_intra_mode_info.filter_intra_mode = mode;
-    this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
-    if (*best_model_rd != INT64_MAX &&
-        this_model_rd > *best_model_rd + (*best_model_rd >> 1))
-      continue;
-    if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
-    super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
-    if (tokenonly_rd_stats.rate == INT_MAX) continue;
-    const int this_rate =
-        tokenonly_rd_stats.rate +
-        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
-    this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-
-    if (this_rd < *best_rd) {
-      *best_rd = this_rd;
-      best_tx_size = mbmi->tx_size;
-      filter_intra_mode_info = mbmi->filter_intra_mode_info;
-      memcpy(best_txk_type, mbmi->txk_type,
-             sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      *rate = this_rate;
-      *rate_tokenonly = tokenonly_rd_stats.rate;
-      *distortion = tokenonly_rd_stats.dist;
-      *skippable = tokenonly_rd_stats.skip;
-      filter_intra_selected_flag = 1;
-    }
-  }
-
-  if (filter_intra_selected_flag) {
-    mbmi->mode = DC_PRED;
-    mbmi->tx_size = best_tx_size;
-    mbmi->filter_intra_mode_info = filter_intra_mode_info;
-    memcpy(mbmi->txk_type, best_txk_type,
-           sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
-    return 1;
-  } else {
-    return 0;
-  }
-}
-
-// Run RD calculation with given luma intra prediction angle., and return
-// the RD cost. Update the best mode info. if the RD cost is the best so far.
-static int64_t calc_rd_given_intra_angle(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
-    int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta,
-    int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta,
-    TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd,
-    TX_TYPE *best_txk_type, uint8_t *best_blk_skip) {
-  RD_STATS tokenonly_rd_stats;
-  int64_t this_rd, this_model_rd;
-  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
-  const int n4 = bsize_to_num_blk(bsize);
-  assert(!is_inter_block(mbmi));
-  mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
-  this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
-  if (*best_model_rd != INT64_MAX &&
-      this_model_rd > *best_model_rd + (*best_model_rd >> 1))
-    return INT64_MAX;
-  if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
-  super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
-  if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
-
-  int this_rate =
-      mode_cost + tokenonly_rd_stats.rate +
-      x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
-  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-
-  if (this_rd < *best_rd) {
-    memcpy(best_txk_type, mbmi->txk_type,
-           sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
-    memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
-    *best_rd = this_rd;
-    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
-    *best_tx_size = mbmi->tx_size;
-    *rate = this_rate;
-    rd_stats->rate = tokenonly_rd_stats.rate;
-    rd_stats->dist = tokenonly_rd_stats.dist;
-    rd_stats->skip = tokenonly_rd_stats.skip;
-  }
-  return this_rd;
-}
-
-// With given luma directional intra prediction mode, pick the best angle delta
-// Return the RD cost corresponding to the best angle delta.
-static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                       int mi_row, int mi_col, int *rate,
-                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                       int mode_cost, int64_t best_rd,
-                                       int64_t *best_model_rd) {
-  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
-  assert(!is_inter_block(mbmi));
-
-  int best_angle_delta = 0;
-  int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
-  TX_SIZE best_tx_size = mbmi->tx_size;
-  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-
-  for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
-
-  int first_try = 1;
-  for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (int i = 0; i < 2; ++i) {
-      const int64_t best_rd_in =
-          (best_rd == INT64_MAX) ? INT64_MAX
-                                 : (best_rd + (best_rd >> (first_try ? 3 : 5)));
-      const int64_t this_rd = calc_rd_given_intra_angle(
-          cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in,
-          (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
-          &best_angle_delta, &best_tx_size, &best_rd, best_model_rd,
-          best_txk_type, best_blk_skip);
-      rd_cost[2 * angle_delta + i] = this_rd;
-      if (first_try && this_rd == INT64_MAX) return best_rd;
-      first_try = 0;
-      if (angle_delta == 0) {
-        rd_cost[1] = this_rd;
-        break;
-      }
-    }
-  }
-
-  assert(best_rd != INT64_MAX);
-  for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (int i = 0; i < 2; ++i) {
-      int skip_search = 0;
-      const int64_t rd_thresh = best_rd + (best_rd >> 5);
-      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
-          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
-        skip_search = 1;
-      if (!skip_search) {
-        calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost,
-                                  best_rd, (1 - 2 * i) * angle_delta,
-                                  MAX_ANGLE_DELTA, rate, rd_stats,
-                                  &best_angle_delta, &best_tx_size, &best_rd,
-                                  best_model_rd, best_txk_type, best_blk_skip);
-      }
-    }
-  }
-
-  if (rd_stats->rate != INT_MAX) {
-    mbmi->tx_size = best_tx_size;
-    mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
-    memcpy(mbmi->txk_type, best_txk_type,
-           sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
-    memcpy(x->blk_skip, best_blk_skip,
-           sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
-  }
-  return best_rd;
-}
-
-// Indices are sign, integer, and fractional part of the gradient value
-static const uint8_t gradient_to_angle_bin[2][7][16] = {
-  {
-      { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
-      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
-      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-      { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
-      { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
-  },
-  {
-      { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
-      { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
-      { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
-      { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
-      { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
-      { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
-      { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
-  },
-};
-
-/* clang-format off */
-static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
-  0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
-  0,
-};
-/* clang-format on */
-
-static void get_gradient_hist(const uint8_t *src, int src_stride, int rows,
-                              int cols, uint64_t *hist) {
-  src += src_stride;
-  for (int r = 1; r < rows; ++r) {
-    for (int c = 1; c < cols; ++c) {
-      int dx = src[c] - src[c - 1];
-      int dy = src[c] - src[c - src_stride];
-      int index;
-      const int temp = dx * dx + dy * dy;
-      if (dy == 0) {
-        index = 2;
-      } else {
-        const int sn = (dx > 0) ^ (dy > 0);
-        dx = abs(dx);
-        dy = abs(dy);
-        const int remd = (dx % dy) * 16 / dy;
-        const int quot = dx / dy;
-        index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
-      }
-      hist[index] += temp;
-    }
-    src += src_stride;
-  }
-}
-
-static void get_highbd_gradient_hist(const uint8_t *src8, int src_stride,
-                                     int rows, int cols, uint64_t *hist) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  src += src_stride;
-  for (int r = 1; r < rows; ++r) {
-    for (int c = 1; c < cols; ++c) {
-      int dx = src[c] - src[c - 1];
-      int dy = src[c] - src[c - src_stride];
-      int index;
-      const int temp = dx * dx + dy * dy;
-      if (dy == 0) {
-        index = 2;
-      } else {
-        const int sn = (dx > 0) ^ (dy > 0);
-        dx = abs(dx);
-        dy = abs(dy);
-        const int remd = (dx % dy) * 16 / dy;
-        const int quot = dx / dy;
-        index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
-      }
-      hist[index] += temp;
-    }
-    src += src_stride;
-  }
-}
-
-static void angle_estimation(const uint8_t *src, int src_stride, int rows,
-                             int cols, BLOCK_SIZE bsize, int is_hbd,
-                             uint8_t *directional_mode_skip_mask) {
-  // Check if angle_delta is used
-  if (!av1_use_angle_delta(bsize)) return;
-
-  uint64_t hist[DIRECTIONAL_MODES] = { 0 };
-  if (is_hbd)
-    get_highbd_gradient_hist(src, src_stride, rows, cols, hist);
-  else
-    get_gradient_hist(src, src_stride, rows, cols, hist);
-
-  int i;
-  uint64_t hist_sum = 0;
-  for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
-  for (i = 0; i < INTRA_MODES; ++i) {
-    if (av1_is_directional_mode(i)) {
-      const uint8_t angle_bin = mode_to_angle_bin[i];
-      uint64_t score = 2 * hist[angle_bin];
-      int weight = 2;
-      if (angle_bin > 0) {
-        score += hist[angle_bin - 1];
-        ++weight;
-      }
-      if (angle_bin < DIRECTIONAL_MODES - 1) {
-        score += hist[angle_bin + 1];
-        ++weight;
-      }
-      const int thresh = 10;
-      if (score * thresh < hist_sum * weight) directional_mode_skip_mask[i] = 1;
-    }
-  }
-}
-
-// Given selected prediction mode, search for the best tx type and size.
-static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                            BLOCK_SIZE bsize, const int *bmode_costs,
-                            int64_t *best_rd, int *rate, int *rate_tokenonly,
-                            int64_t *distortion, int *skippable,
-                            MB_MODE_INFO *best_mbmi, PICK_MODE_CONTEXT *ctx) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  RD_STATS rd_stats;
-  super_block_yrd(cpi, x, &rd_stats, bsize, *best_rd);
-  if (rd_stats.rate == INT_MAX) return;
-  int this_rate_tokenonly = rd_stats.rate;
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
-    // super_block_yrd above includes the cost of the tx_size in the
-    // tokenonly rate, but for intra blocks, tx_size is always coded
-    // (prediction granularity), so we account for it in the full rate,
-    // not the tokenonly rate.
-    this_rate_tokenonly -= tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
-  }
-  const int this_rate =
-      rd_stats.rate +
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
-  const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
-  if (this_rd < *best_rd) {
-    *best_mbmi = *mbmi;
-    *best_rd = this_rd;
-    *rate = this_rate;
-    *rate_tokenonly = this_rate_tokenonly;
-    *distortion = rd_stats.dist;
-    *skippable = rd_stats.skip;
-    memcpy(ctx->blk_skip, x->blk_skip,
-           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-  }
-}
-
-// This function is used only for intra_only frames
-static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                      int mi_row, int mi_col, int *rate,
-                                      int *rate_tokenonly, int64_t *distortion,
-                                      int *skippable, BLOCK_SIZE bsize,
-                                      int64_t best_rd, PICK_MODE_CONTEXT *ctx) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  int64_t best_model_rd = INT64_MAX;
-  const int rows = block_size_high[bsize];
-  const int cols = block_size_wide[bsize];
-  int is_directional_mode;
-  uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 };
-  int beat_best_rd = 0;
-  const int *bmode_costs;
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const int try_palette =
-      cpi->oxcf.enable_palette &&
-      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
-  uint8_t *best_palette_color_map =
-      try_palette ? x->palette_buffer->best_palette_color_map : NULL;
-  const MB_MODE_INFO *above_mi = xd->above_mbmi;
-  const MB_MODE_INFO *left_mi = xd->left_mbmi;
-  const PREDICTION_MODE A = av1_above_block_mode(above_mi);
-  const PREDICTION_MODE L = av1_left_block_mode(left_mi);
-  const int above_ctx = intra_mode_context[A];
-  const int left_ctx = intra_mode_context[L];
-  bmode_costs = x->y_mode_costs[above_ctx][left_ctx];
-
-  mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-  if (cpi->sf.intra_angle_estimation) {
-    const int src_stride = x->plane[0].src.stride;
-    const uint8_t *src = x->plane[0].src.buf;
-    angle_estimation(src, src_stride, rows, cols, bsize, is_cur_buf_hbd(xd),
-                     directional_mode_skip_mask);
-  }
-  mbmi->filter_intra_mode_info.use_filter_intra = 0;
-  pmi->palette_size[0] = 0;
-
-  if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
-    x->use_default_intra_tx_type = 1;
-  else
-    x->use_default_intra_tx_type = 0;
-
-  MB_MODE_INFO best_mbmi = *mbmi;
-  /* Y Search for intra prediction mode */
-  for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
-    RD_STATS this_rd_stats;
-    int this_rate, this_rate_tokenonly, s;
-    int64_t this_distortion, this_rd, this_model_rd;
-    mbmi->mode = intra_rd_search_mode_order[mode_idx];
-    if ((!cpi->oxcf.enable_smooth_intra || cpi->sf.disable_smooth_intra) &&
-        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
-         mbmi->mode == SMOOTH_V_PRED))
-      continue;
-    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
-    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-    this_model_rd =
-        intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col);
-    if (best_model_rd != INT64_MAX &&
-        this_model_rd > best_model_rd + (best_model_rd >> 1))
-      continue;
-    if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
-    is_directional_mode = av1_is_directional_mode(mbmi->mode);
-    if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
-    if (is_directional_mode && av1_use_angle_delta(bsize) &&
-        cpi->oxcf.enable_angle_delta) {
-      this_rd_stats.rate = INT_MAX;
-      rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate,
-                              &this_rd_stats, bsize, bmode_costs[mbmi->mode],
-                              best_rd, &best_model_rd);
-    } else {
-      super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
-    }
-    this_rate_tokenonly = this_rd_stats.rate;
-    this_distortion = this_rd_stats.dist;
-    s = this_rd_stats.skip;
-
-    if (this_rate_tokenonly == INT_MAX) continue;
-
-    if (!xd->lossless[mbmi->segment_id] &&
-        block_signals_txsize(mbmi->sb_type)) {
-      // super_block_yrd above includes the cost of the tx_size in the
-      // tokenonly rate, but for intra blocks, tx_size is always coded
-      // (prediction granularity), so we account for it in the full rate,
-      // not the tokenonly rate.
-      this_rate_tokenonly -=
-          tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
-    }
-    this_rate =
-        this_rd_stats.rate +
-        intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
-    this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
-    if (this_rd < best_rd) {
-      best_mbmi = *mbmi;
-      best_rd = this_rd;
-      beat_best_rd = 1;
-      *rate = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion = this_distortion;
-      *skippable = s;
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-    }
-  }
-
-  if (try_palette) {
-    rd_pick_palette_intra_sby(
-        cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi,
-        best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly,
-        distortion, skippable, ctx, ctx->blk_skip);
-  }
-
-  if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
-    if (rd_pick_filter_intra_sby(
-            cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable,
-            bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) {
-      best_mbmi = *mbmi;
-    }
-  }
-
-  // If previous searches use only the default tx type, do an extra search for
-  // the best tx type.
-  if (x->use_default_intra_tx_type) {
-    *mbmi = best_mbmi;
-    x->use_default_intra_tx_type = 0;
-    intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly,
-                    distortion, skippable, &best_mbmi, ctx);
-  }
-
-  *mbmi = best_mbmi;
-  return best_rd;
-}
-
-// Return value 0: early termination triggered, no valid rd cost available;
-//              1: rd cost values are valid.
-static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                            int64_t ref_best_rd) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
-  const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
-  int plane;
-  int is_cost_valid = 1;
-  const int is_inter = is_inter_block(mbmi);
-  int64_t this_rd = 0, skip_rd = 0;
-  av1_init_rd_stats(rd_stats);
-
-  if (ref_best_rd < 0) is_cost_valid = 0;
-
-  if (x->skip_chroma_rd) return is_cost_valid;
-
-  bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
-
-  if (is_inter && is_cost_valid) {
-    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
-      av1_subtract_plane(x, bsize, plane);
-  }
-
-  if (is_cost_valid) {
-    for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-      RD_STATS pn_rd_stats;
-      int64_t chroma_ref_best_rd = ref_best_rd;
-      // For inter blocks, refined ref_best_rd is used for early exit
-      // For intra blocks, even though current rd crosses ref_best_rd, early
-      // exit is not recommended as current rd is used for gating subsequent
-      // modes as well (say, for angular modes)
-      // TODO(any): Extend the early exit mechanism for intra modes as well
-      if (cpi->sf.perform_best_rd_based_gating_for_chroma && is_inter &&
-          chroma_ref_best_rd != INT64_MAX)
-        chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd);
-      txfm_rd_in_plane(x, cpi, &pn_rd_stats, chroma_ref_best_rd, 0, plane,
-                       bsize, uv_tx_size, cpi->sf.use_fast_coef_costing,
-                       FTXS_NONE, 0);
-      if (pn_rd_stats.rate == INT_MAX) {
-        is_cost_valid = 0;
-        break;
-      }
-      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-      this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-      skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
-      if (AOMMIN(this_rd, skip_rd) > ref_best_rd) {
-        is_cost_valid = 0;
-        break;
-      }
-    }
-  }
-
-  if (!is_cost_valid) {
-    // reset cost value
-    av1_invalid_rd_stats(rd_stats);
-  }
-
-  return is_cost_valid;
-}
-
-// Pick transform type for a transform block of tx_size.
-static void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
-                       int blk_row, int blk_col, int plane, int block,
-                       int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
-                       FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
-                       TXB_RD_INFO *rd_info_array) {
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const uint16_t cur_joint_ctx =
-      (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-  const int txk_type_idx =
-      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
-  // Look up RD and terminate early in case when we've already processed exactly
-  // the same residual with exactly the same entropy context.
-  if (rd_info_array != NULL && rd_info_array->valid &&
-      rd_info_array->entropy_context == cur_joint_ctx) {
-    if (plane == 0)
-      x->e_mbd.mi[0]->txk_type[txk_type_idx] = rd_info_array->tx_type;
-    const TX_TYPE ref_tx_type =
-        av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
-                        tx_size, cpi->common.reduced_tx_set_used);
-    if (ref_tx_type == rd_info_array->tx_type) {
-      rd_stats->rate += rd_info_array->rate;
-      rd_stats->dist += rd_info_array->dist;
-      rd_stats->sse += rd_info_array->sse;
-      rd_stats->skip &= rd_info_array->eob == 0;
-      p->eobs[block] = rd_info_array->eob;
-      p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
-      return;
-    }
-  }
-
-  RD_STATS this_rd_stats;
-  search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  txb_ctx, ftxs_mode, 0, 0, ref_rdcost, &this_rd_stats);
-
-  av1_merge_rd_stats(rd_stats, &this_rd_stats);
-
-  // Save RD results for possible reuse in future.
-  if (rd_info_array != NULL) {
-    rd_info_array->valid = 1;
-    rd_info_array->entropy_context = cur_joint_ctx;
-    rd_info_array->rate = this_rd_stats.rate;
-    rd_info_array->dist = this_rd_stats.dist;
-    rd_info_array->sse = this_rd_stats.sse;
-    rd_info_array->eob = p->eobs[block];
-    rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
-    if (plane == 0) {
-      rd_info_array->tx_type = x->e_mbd.mi[0]->txk_type[txk_type_idx];
-    }
-  }
-}
-
-static void get_mean_and_dev(const int16_t *data, int stride, int bw, int bh,
-                             float *mean, float *dev) {
-  int x_sum = 0;
-  uint64_t x2_sum = 0;
-  for (int i = 0; i < bh; ++i) {
-    for (int j = 0; j < bw; ++j) {
-      const int val = data[j];
-      x_sum += val;
-      x2_sum += val * val;
-    }
-    data += stride;
-  }
-
-  const int num = bw * bh;
-  const float e_x = (float)x_sum / num;
-  const float e_x2 = (float)((double)x2_sum / num);
-  const float diff = e_x2 - e_x * e_x;
-  *dev = (diff > 0) ? sqrtf(diff) : 0;
-  *mean = e_x;
-}
-
-static void get_mean_and_dev_float(const float *data, int stride, int bw,
-                                   int bh, float *mean, float *dev) {
-  float x_sum = 0;
-  float x2_sum = 0;
-  for (int i = 0; i < bh; ++i) {
-    for (int j = 0; j < bw; ++j) {
-      const float val = data[j];
-      x_sum += val;
-      x2_sum += val * val;
-    }
-    data += stride;
-  }
-
-  const int num = bw * bh;
-  const float e_x = x_sum / num;
-  const float e_x2 = x2_sum / num;
-  const float diff = e_x2 - e_x * e_x;
-  *dev = (diff > 0) ? sqrtf(diff) : 0;
-  *mean = e_x;
-}
-
-// Feature used by the model to predict tx split: the mean and standard
-// deviation values of the block and sub-blocks.
-static void get_mean_dev_features(const int16_t *data, int stride, int bw,
-                                  int bh, int levels, float *feature) {
-  int feature_idx = 0;
-  int width = bw;
-  int height = bh;
-  const int16_t *const data_ptr = &data[0];
-  for (int lv = 0; lv < levels; ++lv) {
-    if (width < 2 || height < 2) break;
-    float mean_buf[16];
-    float dev_buf[16];
-    int blk_idx = 0;
-    for (int row = 0; row < bh; row += height) {
-      for (int col = 0; col < bw; col += width) {
-        float mean, dev;
-        get_mean_and_dev(data_ptr + row * stride + col, stride, width, height,
-                         &mean, &dev);
-        feature[feature_idx++] = mean;
-        feature[feature_idx++] = dev;
-        mean_buf[blk_idx] = mean;
-        dev_buf[blk_idx++] = dev;
-      }
-    }
-    if (blk_idx > 1) {
-      float mean, dev;
-      // Deviation of means.
-      get_mean_and_dev_float(mean_buf, 1, 1, blk_idx, &mean, &dev);
-      feature[feature_idx++] = dev;
-      // Mean of deviations.
-      get_mean_and_dev_float(dev_buf, 1, 1, blk_idx, &mean, &dev);
-      feature[feature_idx++] = mean;
-    }
-    // Reduce the block size when proceeding to the next level.
-    if (height == width) {
-      height = height >> 1;
-      width = width >> 1;
-    } else if (height > width) {
-      height = height >> 1;
-    } else {
-      width = width >> 1;
-    }
-  }
-}
-
-static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
-                               int blk_col, TX_SIZE tx_size) {
-  const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
-  if (!nn_config) return -1;
-
-  const int diff_stride = block_size_wide[bsize];
-  const int16_t *diff =
-      x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
-  const int bw = tx_size_wide[tx_size];
-  const int bh = tx_size_high[tx_size];
-  aom_clear_system_state();
-
-  float features[64] = { 0.0f };
-  get_mean_dev_features(diff, diff_stride, bw, bh, 2, features);
-
-  float score = 0.0f;
-  av1_nn_predict(features, nn_config, &score);
-  aom_clear_system_state();
-  if (score > 8.0f) return 100;
-  if (score < -8.0f) return 0;
-  score = 1.0f / (1.0f + (float)exp(-score));
-  return (int)(score * 100);
-}
-
-typedef struct {
-  int64_t rd;
-  int txb_entropy_ctx;
-  TX_TYPE tx_type;
-} TxCandidateInfo;
-
-static void try_tx_block_no_split(
-    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
-    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
-    const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
-    int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
-    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
-    TxCandidateInfo *no_split) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  struct macroblock_plane *const p = &x->plane[0];
-  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-
-  no_split->rd = INT64_MAX;
-  no_split->txb_entropy_ctx = 0;
-  no_split->tx_type = TX_TYPES;
-
-  const ENTROPY_CONTEXT *const pta = ta + blk_col;
-  const ENTROPY_CONTEXT *const ptl = tl + blk_row;
-
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
-  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
-                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
-
-  rd_stats->ref_rdcost = ref_best_rd;
-  rd_stats->zero_rate = zero_blk_rate;
-  const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
-  mbmi->inter_tx_size[index] = tx_size;
-  tx_type_rd(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, &txb_ctx,
-             rd_stats, ftxs_mode, ref_best_rd,
-             rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
-  assert(rd_stats->rate < INT_MAX);
-
-  if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
-           RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
-       rd_stats->skip == 1) &&
-      !xd->lossless[mbmi->segment_id]) {
-#if CONFIG_RD_DEBUG
-    av1_update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
-                              zero_blk_rate - rd_stats->rate);
-#endif  // CONFIG_RD_DEBUG
-    rd_stats->rate = zero_blk_rate;
-    rd_stats->dist = rd_stats->sse;
-    rd_stats->skip = 1;
-    set_blk_skip(x, 0, blk_row * bw + blk_col, 1);
-    p->eobs[block] = 0;
-    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                     DCT_DCT);
-  } else {
-    set_blk_skip(x, 0, blk_row * bw + blk_col, 0);
-    rd_stats->skip = 0;
-  }
-
-  if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-    rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0];
-
-  no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-  no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
-  const int txk_type_idx =
-      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
-  no_split->tx_type = mbmi->txk_type[txk_type_idx];
-}
-
-static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
-                            int blk_col, int block, TX_SIZE tx_size, int depth,
-                            BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
-                            ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
-                            TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
-                            int64_t prev_level_rd, int64_t ref_best_rd,
-                            int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
-                            TXB_RD_INFO_NODE *rd_info_node);
-
-static void try_tx_block_split(
-    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
-    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
-    ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
-    int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
-    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
-    RD_STATS *split_rd_stats, int64_t *split_rd) {
-  assert(tx_size < TX_SIZES_ALL);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
-  const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-  const int bsw = tx_size_wide_unit[sub_txs];
-  const int bsh = tx_size_high_unit[sub_txs];
-  const int sub_step = bsw * bsh;
-  const int nblks =
-      (tx_size_high_unit[tx_size] / bsh) * (tx_size_wide_unit[tx_size] / bsw);
-  assert(nblks > 0);
-  int blk_idx = 0;
-  int64_t tmp_rd = 0;
-  *split_rd = INT64_MAX;
-  split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
-
-  for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
-    for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
-      assert(blk_idx < 4);
-      const int offsetr = blk_row + r;
-      const int offsetc = blk_col + c;
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
-      RD_STATS this_rd_stats;
-      int this_cost_valid = 1;
-      select_tx_block(
-          cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
-          tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks,
-          ref_best_rd - tmp_rd, &this_cost_valid, ftxs_mode,
-          (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
-      if (!this_cost_valid) return;
-      av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
-      tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-      if (no_split_rd < tmp_rd) return;
-      block += sub_step;
-    }
-  }
-
-  *split_rd = tmp_rd;
-}
-
-// Search for the best tx partition/type for a given luma block.
-static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
-                            int blk_col, int block, TX_SIZE tx_size, int depth,
-                            BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
-                            ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
-                            TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
-                            int64_t prev_level_rd, int64_t ref_best_rd,
-                            int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
-                            TXB_RD_INFO_NODE *rd_info_node) {
-  assert(tx_size < TX_SIZES_ALL);
-  av1_init_rd_stats(rd_stats);
-  if (ref_best_rd < 0) {
-    *is_cost_valid = 0;
-    return;
-  }
-
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-
-  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
-                                         mbmi->sb_type, tx_size);
-  struct macroblock_plane *const p = &x->plane[0];
-
-  const int try_no_split =
-      cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64;
-  int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8)
-    try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16;
-#endif
-  TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
-
-  // TX no split
-  if (try_no_split) {
-    try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
-                          plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
-                          ftxs_mode, rd_info_node, &no_split);
-
-    if (cpi->sf.adaptive_txb_search_level &&
-        (no_split.rd -
-         (no_split.rd >> (1 + cpi->sf.adaptive_txb_search_level))) >
-            ref_best_rd) {
-      *is_cost_valid = 0;
-      return;
-    }
-
-    if (cpi->sf.txb_split_cap) {
-      if (p->eobs[block] == 0) try_split = 0;
-    }
-
-    if (cpi->sf.adaptive_txb_search_level &&
-        (no_split.rd -
-         (no_split.rd >> (2 + cpi->sf.adaptive_txb_search_level))) >
-            prev_level_rd) {
-      try_split = 0;
-    }
-  }
-
-  if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) {
-    const int threshold = cpi->sf.tx_type_search.ml_tx_split_thresh;
-    if (threshold >= 0) {
-      const int split_score =
-          ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size);
-      if (split_score >= 0 && split_score < threshold) try_split = 0;
-    }
-  }
-
-  // TX split
-  int64_t split_rd = INT64_MAX;
-  RD_STATS split_rd_stats;
-  av1_init_rd_stats(&split_rd_stats);
-  if (try_split) {
-    try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
-                       plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
-                       AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
-                       rd_info_node, &split_rd_stats, &split_rd);
-  }
-
-  if (no_split.rd < split_rd) {
-    ENTROPY_CONTEXT *pta = ta + blk_col;
-    ENTROPY_CONTEXT *ptl = tl + blk_row;
-    const TX_SIZE tx_size_selected = tx_size;
-    p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
-    av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl);
-    txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
-                          tx_size);
-    for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
-      for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
-        const int index =
-            av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx);
-        mbmi->inter_tx_size[index] = tx_size_selected;
-      }
-    }
-    mbmi->tx_size = tx_size_selected;
-    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                     no_split.tx_type);
-    set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
-  } else {
-    *rd_stats = split_rd_stats;
-    if (split_rd == INT64_MAX) *is_cost_valid = 0;
-  }
-}
-
-static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
-                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                       int64_t ref_best_rd,
-                                       TXB_RD_INFO_NODE *rd_info_tree) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  assert(is_inter_block(xd->mi[0]));
-
-  // TODO(debargha): enable this as a speed feature where the
-  // select_inter_block_yrd() function above will use a simplified search
-  // such as not using full optimize, but the inter_block_yrd() function
-  // will use more complex search given that the transform partitions have
-  // already been decided.
-
-  const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD;
-  int64_t rd_thresh = ref_best_rd;
-  if (fast_tx_search && rd_thresh < INT64_MAX) {
-    if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
-  }
-  assert(rd_thresh > 0);
-
-  const FAST_TX_SEARCH_MODE ftxs_mode =
-      fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
-  const struct macroblockd_plane *const pd = &xd->plane[0];
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-  const int mi_width = mi_size_wide[plane_bsize];
-  const int mi_height = mi_size_high[plane_bsize];
-  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
-  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
-  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
-  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
-  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
-  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
-  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
-
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int s0 = x->skip_cost[skip_ctx][0];
-  const int s1 = x->skip_cost[skip_ctx][1];
-  const int init_depth =
-      get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
-  const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
-  const int bh = tx_size_high_unit[max_tx_size];
-  const int bw = tx_size_wide_unit[max_tx_size];
-  const int step = bw * bh;
-  int64_t skip_rd = RDCOST(x->rdmult, s1, 0);
-  int64_t this_rd = RDCOST(x->rdmult, s0, 0);
-  int block = 0;
-
-  av1_init_rd_stats(rd_stats);
-  for (int idy = 0; idy < mi_height; idy += bh) {
-    for (int idx = 0; idx < mi_width; idx += bw) {
-      const int64_t best_rd_sofar =
-          (rd_thresh == INT64_MAX) ? INT64_MAX
-                                   : (rd_thresh - (AOMMIN(skip_rd, this_rd)));
-      int is_cost_valid = 1;
-      RD_STATS pn_rd_stats;
-      select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
-                      plane_bsize, ctxa, ctxl, tx_above, tx_left, &pn_rd_stats,
-                      INT64_MAX, best_rd_sofar, &is_cost_valid, ftxs_mode,
-                      rd_info_tree);
-      if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
-        av1_invalid_rd_stats(rd_stats);
-        return INT64_MAX;
-      }
-      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-      skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-      this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
-      block += step;
-      if (rd_info_tree != NULL) rd_info_tree += 1;
-    }
-  }
-
-  if (skip_rd <= this_rd) {
-    rd_stats->skip = 1;
-  } else {
-    rd_stats->skip = 0;
-  }
-
-  if (rd_stats->rate == INT_MAX) return INT64_MAX;
-
-  // If fast_tx_search is true, only DCT and 1D DCT were tested in
-  // select_inter_block_yrd() above. Do a better search for tx type with
-  // tx sizes already decided.
-  if (fast_tx_search) {
-    if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE))
-      return INT64_MAX;
-  }
-
-  int64_t rd;
-  if (rd_stats->skip) {
-    rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-  } else {
-    rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
-    if (!xd->lossless[xd->mi[0]->segment_id])
-      rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
-  }
-
-  return rd;
-}
-
-// Finds rd cost for a y block, given the transform size partitions
-static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
-                         int blk_col, int block, TX_SIZE tx_size,
-                         BLOCK_SIZE plane_bsize, int depth,
-                         ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
-                         TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
-                         int64_t ref_best_rd, RD_STATS *rd_stats,
-                         FAST_TX_SEARCH_MODE ftxs_mode) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
-
-  assert(tx_size < TX_SIZES_ALL);
-
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-
-  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
-      plane_bsize, blk_row, blk_col)];
-
-  int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
-                                   mbmi->sb_type, tx_size);
-
-  av1_init_rd_stats(rd_stats);
-  if (tx_size == plane_tx_size) {
-    ENTROPY_CONTEXT *ta = above_ctx + blk_col;
-    ENTROPY_CONTEXT *tl = left_ctx + blk_row;
-    const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-    TXB_CTX txb_ctx;
-    get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
-
-    const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)]
-                                  .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
-    rd_stats->zero_rate = zero_blk_rate;
-    rd_stats->ref_rdcost = ref_best_rd;
-    tx_type_rd(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
-               &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
-    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
-            RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
-        rd_stats->skip == 1) {
-      rd_stats->rate = zero_blk_rate;
-      rd_stats->dist = rd_stats->sse;
-      rd_stats->skip = 1;
-      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
-      x->plane[0].eobs[block] = 0;
-      x->plane[0].txb_entropy_ctx[block] = 0;
-      update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                       DCT_DCT);
-    } else {
-      rd_stats->skip = 0;
-      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
-    }
-    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      rd_stats->rate += x->txfm_partition_cost[ctx][0];
-    av1_set_txb_context(x, 0, block, tx_size, ta, tl);
-    txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
-                          tx_size);
-  } else {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsw = tx_size_wide_unit[sub_txs];
-    const int bsh = tx_size_high_unit[sub_txs];
-    const int step = bsh * bsw;
-    RD_STATS pn_rd_stats;
-    int64_t this_rd = 0;
-    assert(bsw > 0 && bsh > 0);
-
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        const int offsetr = blk_row + row;
-        const int offsetc = blk_col + col;
-
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
-        av1_init_rd_stats(&pn_rd_stats);
-        tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
-                     depth + 1, above_ctx, left_ctx, tx_above, tx_left,
-                     ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
-        if (pn_rd_stats.rate == INT_MAX) {
-          av1_invalid_rd_stats(rd_stats);
-          return;
-        }
-        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-        this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
-        block += step;
-      }
-    }
-
-    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      rd_stats->rate += x->txfm_partition_cost[ctx][1];
-  }
-}
-
-// Return value 0: early termination triggered, no valid rd cost available;
-//              1: rd cost values are valid.
-static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int is_cost_valid = 1;
-  int64_t this_rd = 0;
-
-  if (ref_best_rd < 0) is_cost_valid = 0;
-
-  av1_init_rd_stats(rd_stats);
-
-  if (is_cost_valid) {
-    const struct macroblockd_plane *const pd = &xd->plane[0];
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    const int mi_width = mi_size_wide[plane_bsize];
-    const int mi_height = mi_size_high[plane_bsize];
-    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
-    const int bh = tx_size_high_unit[max_tx_size];
-    const int bw = tx_size_wide_unit[max_tx_size];
-    const int init_depth =
-        get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
-    int idx, idy;
-    int block = 0;
-    int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
-    ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
-    ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
-    TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
-    TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
-    RD_STATS pn_rd_stats;
-
-    av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
-    memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
-    memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
-
-    for (idy = 0; idy < mi_height; idy += bh) {
-      for (idx = 0; idx < mi_width; idx += bw) {
-        av1_init_rd_stats(&pn_rd_stats);
-        tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, plane_bsize,
-                     init_depth, ctxa, ctxl, tx_above, tx_left,
-                     ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
-        if (pn_rd_stats.rate == INT_MAX) {
-          av1_invalid_rd_stats(rd_stats);
-          return 0;
-        }
-        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-        this_rd +=
-            AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
-                   RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
-        block += step;
-      }
-    }
-  }
-
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int s0 = x->skip_cost[skip_ctx][0];
-  const int s1 = x->skip_cost[skip_ctx][1];
-  int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-  this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
-  if (skip_rd < this_rd) {
-    this_rd = skip_rd;
-    rd_stats->rate = 0;
-    rd_stats->dist = rd_stats->sse;
-    rd_stats->skip = 1;
-  }
-  if (this_rd > ref_best_rd) is_cost_valid = 0;
-
-  if (!is_cost_valid) {
-    // reset cost value
-    av1_invalid_rd_stats(rd_stats);
-  }
-  return is_cost_valid;
-}
-
-static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
-                                const uint32_t hash) {
-  // Linear search through the circular buffer to find matching hash.
-  for (int i = cur_record->index_start - 1; i >= 0; i--) {
-    if (cur_record->hash_vals[i] == hash) return i;
-  }
-  for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
-    if (cur_record->hash_vals[i] == hash) return i;
-  }
-  int index;
-  // If not found - add new RD info into the buffer and return its index
-  if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
-    index = (cur_record->index_start + cur_record->num) %
-            TX_SIZE_RD_RECORD_BUFFER_LEN;
-    cur_record->num++;
-  } else {
-    index = cur_record->index_start;
-    cur_record->index_start =
-        (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
-  }
-
-  cur_record->hash_vals[index] = hash;
-  av1_zero(cur_record->tx_rd_info[index]);
-  return index;
-}
-
-typedef struct {
-  int leaf;
-  int8_t children[4];
-} RD_RECORD_IDX_NODE;
-
-static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
-  { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 1, { 0, 0, 0, 0 } },
-  { 1, { 0, 0, 0, 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 1, { 0 } },
-  { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
-  { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, 5, 6 } },
-  { 0, { 7, 8, 9, 10 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, 7, 8 } },
-  { 0, { 5, 6, 9, 10 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
-  { 0, { 1, 2, 3, 4 } },     { 0, { 5, 6, 9, 10 } },    { 0, { 7, 8, 11, 12 } },
-  { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
-  { 0, { 2, 3, 4, 5 } },     { 0, { 6, 7, 8, 9 } },
-  { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
-  { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
-  { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
-  { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
-  { 0, { 2, 3, 6, 7 } },     { 0, { 4, 5, 8, 9 } },
-  { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
-  { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
-  { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
-  { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
-  { 0, { 4, 5, 8, 9 } },     { 0, { 6, 7, 10, 11 } },
-  { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
-  { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
-  { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
-  { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
-  { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
-  { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
-  { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
-  { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
-  { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
-  { 0, { 1, -1, 2, -1 } },
-  { 0, { 3, 4, -1, -1 } },
-  { 0, { 5, 6, -1, -1 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, -1, -1 } },
-  { 0, { 5, 6, -1, -1 } },
-};
-
-static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
-  NULL,                    // BLOCK_4X4
-  NULL,                    // BLOCK_4X8
-  NULL,                    // BLOCK_8X4
-  rd_record_tree_8x8,      // BLOCK_8X8
-  rd_record_tree_8x16,     // BLOCK_8X16
-  rd_record_tree_16x8,     // BLOCK_16X8
-  rd_record_tree_16x16,    // BLOCK_16X16
-  rd_record_tree_1_2,      // BLOCK_16X32
-  rd_record_tree_2_1,      // BLOCK_32X16
-  rd_record_tree_sqr,      // BLOCK_32X32
-  rd_record_tree_1_2,      // BLOCK_32X64
-  rd_record_tree_2_1,      // BLOCK_64X32
-  rd_record_tree_sqr,      // BLOCK_64X64
-  rd_record_tree_64x128,   // BLOCK_64X128
-  rd_record_tree_128x64,   // BLOCK_128X64
-  rd_record_tree_128x128,  // BLOCK_128X128
-  NULL,                    // BLOCK_4X16
-  NULL,                    // BLOCK_16X4
-  rd_record_tree_1_4,      // BLOCK_8X32
-  rd_record_tree_4_1,      // BLOCK_32X8
-  rd_record_tree_1_4,      // BLOCK_16X64
-  rd_record_tree_4_1,      // BLOCK_64X16
-};
-
-static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
-  0,                                                            // BLOCK_4X4
-  0,                                                            // BLOCK_4X8
-  0,                                                            // BLOCK_8X4
-  sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X8
-  sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_8X16
-  sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_16X8
-  sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE),    // BLOCK_16X16
-  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X32
-  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X16
-  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X32
-  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X64
-  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X32
-  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X64
-  sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_64X128
-  sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_128X64
-  sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE),  // BLOCK_128X128
-  0,                                                            // BLOCK_4X16
-  0,                                                            // BLOCK_16X4
-  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X32
-  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X8
-  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X64
-  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X16
-};
-
-static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
-                                       BLOCK_SIZE bsize) {
-  const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
-  const int size = rd_record_tree_size[bsize];
-  for (int i = 0; i < size; ++i) {
-    if (rd_record[i].leaf) {
-      av1_zero(tree[i].children);
-    } else {
-      for (int j = 0; j < 4; ++j) {
-        const int8_t idx = rd_record[i].children[j];
-        tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
-      }
-    }
-  }
-}
-
-// Go through all TX blocks that could be used in TX size search, compute
-// residual hash values for them and find matching RD info that stores previous
-// RD search results for these TX blocks. The idea is to prevent repeated
-// rate/distortion computations that happen because of the combination of
-// partition and TX size search. The resulting RD info records are returned in
-// the form of a quadtree for easier access in actual TX size search.
-static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
-                                   int mi_col, TXB_RD_INFO_NODE *dst_rd_info) {
-  TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8,
-                                         x->txb_rd_record_16X16,
-                                         x->txb_rd_record_32X32,
-                                         x->txb_rd_record_64X64 };
-  const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-
-  // Hashing is performed only for square TX sizes larger than TX_4X4
-  if (max_square_tx_size < TX_8X8) return 0;
-  const int diff_stride = bw;
-  const struct macroblock_plane *const p = &x->plane[0];
-  const int16_t *diff = &p->src_diff[0];
-  init_rd_record_tree(dst_rd_info, bsize);
-  // Coordinates of the top-left corner of current block within the superblock
-  // measured in pixels:
-  const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
-  const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
-  int cur_rd_info_idx = 0;
-  int cur_tx_depth = 0;
-  TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
-  while (cur_tx_depth <= MAX_VARTX_DEPTH) {
-    const int cur_tx_bw = tx_size_wide[cur_tx_size];
-    const int cur_tx_bh = tx_size_high[cur_tx_size];
-    if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
-    const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
-    const int tx_size_idx = cur_tx_size - TX_8X8;
-    for (int row = 0; row < bh; row += cur_tx_bh) {
-      for (int col = 0; col < bw; col += cur_tx_bw) {
-        if (cur_tx_bw != cur_tx_bh) {
-          // Use dummy nodes for all rectangular transforms within the
-          // TX size search tree.
-          dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
-        } else {
-          // Get spatial location of this TX block within the superblock
-          // (measured in cur_tx_bsize units).
-          const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
-          const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
-
-          int16_t hash_data[MAX_SB_SQUARE];
-          int16_t *cur_hash_row = hash_data;
-          const int16_t *cur_diff_row = diff + row * diff_stride + col;
-          for (int i = 0; i < cur_tx_bh; i++) {
-            memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw);
-            cur_hash_row += cur_tx_bw;
-            cur_diff_row += diff_stride;
-          }
-          const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
-                                                (uint8_t *)hash_data,
-                                                2 * cur_tx_bw * cur_tx_bh);
-          // Find corresponding RD info based on the hash value.
-          const int record_idx =
-              row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
-          TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
-          int idx = find_tx_size_rd_info(records, hash);
-          dst_rd_info[cur_rd_info_idx].rd_info_array =
-              &records->tx_rd_info[idx];
-        }
-        ++cur_rd_info_idx;
-      }
-    }
-    cur_tx_size = next_tx_size;
-    ++cur_tx_depth;
-  }
-  return 1;
-}
-
-// Search for best transform size and type for luma inter blocks.
-static void pick_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                                  RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                  int mi_row, int mi_col, int64_t ref_best_rd) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  assert(is_inter_block(xd->mi[0]));
-
-  av1_invalid_rd_stats(rd_stats);
-
-  if (cpi->sf.model_based_prune_tx_search_level && ref_best_rd != INT64_MAX) {
-    int model_rate;
-    int64_t model_dist;
-    int model_skip;
-    model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
-        cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist,
-        &model_skip, NULL, NULL, NULL, NULL);
-    const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
-    // If the modeled rd is a lot worse than the best so far, breakout.
-    // TODO(debargha, urvang): Improve the model and make the check below
-    // tighter.
-    assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
-           cpi->sf.model_based_prune_tx_search_level <= 2);
-    static const int prune_factor_by8[] = { 3, 5 };
-    if (!model_skip &&
-        ((model_rd *
-          prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >>
-         3) > ref_best_rd)
-      return;
-  }
-
-  uint32_t hash = 0;
-  int32_t match_index = -1;
-  MB_RD_RECORD *mb_rd_record = NULL;
-  const int within_border =
-      mi_row >= xd->tile.mi_row_start &&
-      (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
-      mi_col >= xd->tile.mi_col_start &&
-      (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
-  const int is_mb_rd_hash_enabled = (within_border && cpi->sf.use_mb_rd_hash);
-  const int n4 = bsize_to_num_blk(bsize);
-  if (is_mb_rd_hash_enabled) {
-    hash = get_block_residue_hash(x, bsize);
-    mb_rd_record = &x->mb_rd_record;
-    match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
-    if (match_index != -1) {
-      MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
-      fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
-      return;
-    }
-  }
-
-  // If we predict that skip is the optimal RD decision - set the respective
-  // context and terminate early.
-  int64_t dist;
-  if (cpi->sf.tx_type_search.use_skip_flag_prediction &&
-      predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) {
-    set_skip_flag(x, rd_stats, bsize, dist);
-    // Save the RD search results into tx_rd_record.
-    if (is_mb_rd_hash_enabled)
-      save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
-    return;
-  }
-#if CONFIG_SPEED_STATS
-  ++x->tx_search_count;
-#endif  // CONFIG_SPEED_STATS
-
-  // Precompute residual hashes and find existing or add new RD records to
-  // store and reuse rate and distortion values to speed up TX size search.
-  TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
-  int found_rd_info = 0;
-  if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) {
-    found_rd_info =
-        find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info);
-  }
-
-  // Get the tx_size 1 level down
-  const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
-  const TxSetType tx_set_type =
-      av1_get_ext_tx_set_type(min_tx_size, 1, cm->reduced_tx_set_used);
-  prune_tx(cpi, bsize, x, xd, tx_set_type);
-
-  int found = 0;
-  RD_STATS this_rd_stats;
-  av1_init_rd_stats(&this_rd_stats);
-  const int64_t rd =
-      select_tx_size_and_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
-                              found_rd_info ? matched_rd_info : NULL);
-
-  if (rd < INT64_MAX) {
-    *rd_stats = this_rd_stats;
-    found = 1;
-  }
-
-  // Reset the pruning flags.
-  av1_zero(x->tx_search_prune);
-  x->tx_split_prune_flag = 0;
-
-  // We should always find at least one candidate unless ref_best_rd is less
-  // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type
-  // might have failed to find something better)
-  assert(IMPLIES(!found, ref_best_rd != INT64_MAX));
-  if (!found) return;
-
-  // Save the RD search results into tx_rd_record.
-  if (is_mb_rd_hash_enabled) {
-    assert(mb_rd_record != NULL);
-    save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
-  }
-}
-
-static void model_rd_for_sb_with_fullrdy(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
-  const int ref = xd->mi[0]->ref_frame[0];
-
-  int64_t rate_sum = 0;
-  int64_t dist_sum = 0;
-  int64_t total_sse = 0;
-
-  for (int plane = plane_from; plane <= plane_to; ++plane) {
-    struct macroblock_plane *const p = &x->plane[plane];
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    const int bw = block_size_wide[plane_bsize];
-    const int bh = block_size_high[plane_bsize];
-    int64_t sse;
-    int rate;
-    int64_t dist;
-
-    if (x->skip_chroma_rd && plane) continue;
-
-    if (is_cur_buf_hbd(xd)) {
-      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
-                           pd->dst.stride, bw, bh);
-    } else {
-      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
-                    bh);
-    }
-    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
-
-    RD_STATS rd_stats;
-    if (plane == 0) {
-      pick_tx_size_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col,
-                            INT64_MAX);
-      if (rd_stats.invalid_rate) {
-        rate = 0;
-        dist = sse << 4;
-      } else {
-        rate = rd_stats.rate;
-        dist = rd_stats.dist;
-      }
-    } else {
-      model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
-                            &dist);
-    }
-
-    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
-
-    total_sse += sse;
-    rate_sum += rate;
-    dist_sum += dist;
-
-    if (plane_rate) plane_rate[plane] = rate;
-    if (plane_sse) plane_sse[plane] = sse;
-    if (plane_dist) plane_dist[plane] = dist;
-  }
-
-  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
-  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
-  *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum;
-}
-
-static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                       int dc_mode_cost,
-                                       uint8_t *best_palette_color_map,
-                                       MB_MODE_INFO *const best_mbmi,
-                                       int64_t *best_rd, int *rate,
-                                       int *rate_tokenonly, int64_t *distortion,
-                                       int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  assert(
-      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type));
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  int this_rate;
-  int64_t this_rd;
-  int colors_u, colors_v, colors;
-  const int src_stride = x->plane[1].src.stride;
-  const uint8_t *const src_u = x->plane[1].src.buf;
-  const uint8_t *const src_v = x->plane[2].src.buf;
-  uint8_t *const color_map = xd->plane[1].color_index_map;
-  RD_STATS tokenonly_rd_stats;
-  int plane_block_width, plane_block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
-                           &plane_block_height, &rows, &cols);
-
-  mbmi->uv_mode = UV_DC_PRED;
-
-  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  if (seq_params->use_highbitdepth) {
-    colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
-                                       seq_params->bit_depth, count_buf);
-    colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
-                                       seq_params->bit_depth, count_buf);
-  } else {
-    colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
-    colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
-  }
-
-  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-
-  colors = colors_u > colors_v ? colors_u : colors_v;
-  if (colors > 1 && colors <= 64) {
-    int r, c, n, i, j;
-    const int max_itr = 50;
-    int lb_u, ub_u, val_u;
-    int lb_v, ub_v, val_v;
-    int *const data = x->palette_buffer->kmeans_data_buf;
-    int centroids[2 * PALETTE_MAX_SIZE];
-
-    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
-    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
-    if (seq_params->use_highbitdepth) {
-      lb_u = src_u16[0];
-      ub_u = src_u16[0];
-      lb_v = src_v16[0];
-      ub_v = src_v16[0];
-    } else {
-      lb_u = src_u[0];
-      ub_u = src_u[0];
-      lb_v = src_v[0];
-      ub_v = src_v[0];
-    }
-
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; ++c) {
-        if (seq_params->use_highbitdepth) {
-          val_u = src_u16[r * src_stride + c];
-          val_v = src_v16[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        } else {
-          val_u = src_u[r * src_stride + c];
-          val_v = src_v[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        }
-        if (val_u < lb_u)
-          lb_u = val_u;
-        else if (val_u > ub_u)
-          ub_u = val_u;
-        if (val_v < lb_v)
-          lb_v = val_v;
-        else if (val_v > ub_v)
-          ub_v = val_v;
-      }
-    }
-
-    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
-         --n) {
-      for (i = 0; i < n; ++i) {
-        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
-        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
-      }
-      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
-      optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
-      // Sort the U channel colors in ascending order.
-      for (i = 0; i < 2 * (n - 1); i += 2) {
-        int min_idx = i;
-        int min_val = centroids[i];
-        for (j = i + 2; j < 2 * n; j += 2)
-          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
-        if (min_idx != i) {
-          int temp_u = centroids[i], temp_v = centroids[i + 1];
-          centroids[i] = centroids[min_idx];
-          centroids[i + 1] = centroids[min_idx + 1];
-          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
-        }
-      }
-      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
-      extend_palette_color_map(color_map, cols, rows, plane_block_width,
-                               plane_block_height);
-      pmi->palette_size[1] = n;
-      for (i = 1; i < 3; ++i) {
-        for (j = 0; j < n; ++j) {
-          if (seq_params->use_highbitdepth)
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
-                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
-          else
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel((int)centroids[j * 2 + i - 1]);
-        }
-      }
-
-      super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
-      if (tokenonly_rd_stats.rate == INT_MAX) continue;
-      this_rate = tokenonly_rd_stats.rate +
-                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
-      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-      if (this_rd < *best_rd) {
-        *best_rd = this_rd;
-        *best_mbmi = *mbmi;
-        memcpy(best_palette_color_map, color_map,
-               plane_block_width * plane_block_height *
-                   sizeof(best_palette_color_map[0]));
-        *rate = this_rate;
-        *distortion = tokenonly_rd_stats.dist;
-        *rate_tokenonly = tokenonly_rd_stats.rate;
-        *skippable = tokenonly_rd_stats.skip;
-      }
-    }
-  }
-  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
-    memcpy(color_map, best_palette_color_map,
-           plane_block_width * plane_block_height *
-               sizeof(best_palette_color_map[0]));
-  }
-}
-
-// Run RD calculation with given chroma intra prediction angle., and return
-// the RD cost. Update the best mode info. if the RD cost is the best so far.
-static int64_t pick_intra_angle_routine_sbuv(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-    int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
-    int *best_angle_delta, int64_t *best_rd) {
-  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
-  assert(!is_inter_block(mbmi));
-  int this_rate;
-  int64_t this_rd;
-  RD_STATS tokenonly_rd_stats;
-
-  if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
-    return INT64_MAX;
-  this_rate = tokenonly_rd_stats.rate +
-              intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
-  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-  if (this_rd < *best_rd) {
-    *best_rd = this_rd;
-    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
-    *rate = this_rate;
-    rd_stats->rate = tokenonly_rd_stats.rate;
-    rd_stats->dist = tokenonly_rd_stats.dist;
-    rd_stats->skip = tokenonly_rd_stats.skip;
-  }
-  return this_rd;
-}
-
-// With given chroma directional intra prediction mode, pick the best angle
-// delta. Return true if a RD cost that is smaller than the input one is found.
-static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize, int rate_overhead,
-                                    int64_t best_rd, int *rate,
-                                    RD_STATS *rd_stats) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  int i, angle_delta, best_angle_delta = 0;
-  int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
-
-  rd_stats->rate = INT_MAX;
-  rd_stats->skip = 0;
-  rd_stats->dist = INT64_MAX;
-  for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
-
-  for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (i = 0; i < 2; ++i) {
-      best_rd_in = (best_rd == INT64_MAX)
-                       ? INT64_MAX
-                       : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
-      mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
-      this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
-                                              best_rd_in, rate, rd_stats,
-                                              &best_angle_delta, &best_rd);
-      rd_cost[2 * angle_delta + i] = this_rd;
-      if (angle_delta == 0) {
-        if (this_rd == INT64_MAX) return 0;
-        rd_cost[1] = this_rd;
-        break;
-      }
-    }
-  }
-
-  assert(best_rd != INT64_MAX);
-  for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    int64_t rd_thresh;
-    for (i = 0; i < 2; ++i) {
-      int skip_search = 0;
-      rd_thresh = best_rd + (best_rd >> 5);
-      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
-          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
-        skip_search = 1;
-      if (!skip_search) {
-        mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
-        pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
-                                      rate, rd_stats, &best_angle_delta,
-                                      &best_rd);
-      }
-    }
-  }
-
-  mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
-  return rd_stats->rate != INT_MAX;
-}
-
-#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
-  (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
-static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
-                             TX_SIZE tx_size, int64_t best_rd) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_DEBUG
-  assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
-  const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
-  const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy);
-  (void)plane_bsize;
-  assert(plane_bsize < BLOCK_SIZES_ALL);
-  if (!xd->lossless[mbmi->segment_id]) {
-    assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
-    assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
-  }
-#endif  // CONFIG_DEBUG
-
-  xd->cfl.use_dc_pred_cache = 1;
-  const int64_t mode_rd =
-      RDCOST(x->rdmult,
-             x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
-  int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
-  int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
-#if CONFIG_DEBUG
-  int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
-#endif  // CONFIG_DEBUG
-
-  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
-    RD_STATS rd_stats;
-    av1_init_rd_stats(&rd_stats);
-    for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
-      best_rd_uv[joint_sign][plane] = INT64_MAX;
-      best_c[joint_sign][plane] = 0;
-    }
-    // Collect RD stats for an alpha value of zero in this plane.
-    // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid.
-    for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) {
-      const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i);
-      if (i == CFL_SIGN_NEG) {
-        mbmi->cfl_alpha_idx = 0;
-        mbmi->cfl_alpha_signs = joint_sign;
-        txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, bsize,
-                         tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
-        if (rd_stats.rate == INT_MAX) break;
-      }
-      const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
-      best_rd_uv[joint_sign][plane] =
-          RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
-#if CONFIG_DEBUG
-      best_rate_uv[joint_sign][plane] = rd_stats.rate;
-#endif  // CONFIG_DEBUG
-    }
-  }
-
-  int best_joint_sign = -1;
-
-  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
-    for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) {
-      int progress = 0;
-      for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
-        int flag = 0;
-        RD_STATS rd_stats;
-        if (c > 2 && progress < c) break;
-        av1_init_rd_stats(&rd_stats);
-        for (int i = 0; i < CFL_SIGNS; i++) {
-          const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i);
-          if (i == 0) {
-            mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
-            mbmi->cfl_alpha_signs = joint_sign;
-            txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, bsize,
-                             tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE,
-                             0);
-            if (rd_stats.rate == INT_MAX) break;
-          }
-          const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
-          int64_t this_rd =
-              RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
-          if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
-          best_rd_uv[joint_sign][plane] = this_rd;
-          best_c[joint_sign][plane] = c;
-#if CONFIG_DEBUG
-          best_rate_uv[joint_sign][plane] = rd_stats.rate;
-#endif  // CONFIG_DEBUG
-          flag = 2;
-          if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue;
-          this_rd += mode_rd + best_rd_uv[joint_sign][!plane];
-          if (this_rd >= best_rd) continue;
-          best_rd = this_rd;
-          best_joint_sign = joint_sign;
-        }
-        progress += flag;
-      }
-    }
-  }
-
-  int best_rate_overhead = INT_MAX;
-  int ind = 0;
-  if (best_joint_sign >= 0) {
-    const int u = best_c[best_joint_sign][CFL_PRED_U];
-    const int v = best_c[best_joint_sign][CFL_PRED_V];
-    ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
-    best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
-                         x->cfl_cost[best_joint_sign][CFL_PRED_V][v];
-#if CONFIG_DEBUG
-    xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
-                   best_rate_overhead +
-                   best_rate_uv[best_joint_sign][CFL_PRED_U] +
-                   best_rate_uv[best_joint_sign][CFL_PRED_V];
-#endif  // CONFIG_DEBUG
-  } else {
-    best_joint_sign = 0;
-  }
-
-  mbmi->cfl_alpha_idx = ind;
-  mbmi->cfl_alpha_signs = best_joint_sign;
-  xd->cfl.use_dc_pred_cache = 0;
-  xd->cfl.dc_pred_is_cached[0] = 0;
-  xd->cfl.dc_pred_is_cached[1] = 0;
-  return best_rate_overhead;
-}
-
-static void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
-  mbmi->uv_mode = UV_DC_PRED;
-  mbmi->palette_mode_info.palette_size[1] = 0;
-}
-
-static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                       int *rate, int *rate_tokenonly,
-                                       int64_t *distortion, int *skippable,
-                                       BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  MB_MODE_INFO best_mbmi = *mbmi;
-  int64_t best_rd = INT64_MAX, this_rd;
-
-  for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
-    int this_rate;
-    RD_STATS tokenonly_rd_stats;
-    UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
-    const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
-    if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
-          (1 << mode)))
-      continue;
-    if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
-        mode <= UV_SMOOTH_H_PRED)
-      continue;
-
-    if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue;
-
-    mbmi->uv_mode = mode;
-    int cfl_alpha_rate = 0;
-    if (mode == UV_CFL_PRED) {
-      if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue;
-      assert(!is_directional_mode);
-      const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
-      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
-      if (cfl_alpha_rate == INT_MAX) continue;
-    }
-    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
-    if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) &&
-        cpi->oxcf.enable_angle_delta) {
-      const int rate_overhead =
-          x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
-      if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
-                                    &this_rate, &tokenonly_rd_stats))
-        continue;
-    } else {
-      if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
-        continue;
-      }
-    }
-    const int mode_cost =
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
-        cfl_alpha_rate;
-    this_rate = tokenonly_rd_stats.rate +
-                intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
-    if (mode == UV_CFL_PRED) {
-      assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
-#if CONFIG_DEBUG
-      if (!xd->lossless[mbmi->segment_id])
-        assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
-#endif  // CONFIG_DEBUG
-    }
-    this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-
-    if (this_rd < best_rd) {
-      best_mbmi = *mbmi;
-      best_rd = this_rd;
-      *rate = this_rate;
-      *rate_tokenonly = tokenonly_rd_stats.rate;
-      *distortion = tokenonly_rd_stats.dist;
-      *skippable = tokenonly_rd_stats.skip;
-    }
-  }
-
-  const int try_palette =
-      cpi->oxcf.enable_palette &&
-      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette) {
-    uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
-    rd_pick_palette_intra_sbuv(
-        cpi, x,
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
-        best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
-        distortion, skippable);
-  }
-
-  *mbmi = best_mbmi;
-  // Make sure we actually chose a mode
-  assert(best_rd < INT64_MAX);
-  return best_rd;
-}
-
-static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                                 BLOCK_SIZE bsize, TX_SIZE max_tx_size,
-                                 int *rate_uv, int *rate_uv_tokenonly,
-                                 int64_t *dist_uv, int *skip_uv,
-                                 UV_PREDICTION_MODE *mode_uv) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
-  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
-  // Use an estimated rd for uv_intra based on DC_PRED if the
-  // appropriate speed flag is set.
-  init_sbuv_mode(mbmi);
-  if (x->skip_chroma_rd) {
-    *rate_uv = 0;
-    *rate_uv_tokenonly = 0;
-    *dist_uv = 0;
-    *skip_uv = 1;
-    *mode_uv = UV_DC_PRED;
-    return;
-  }
-  xd->cfl.is_chroma_reference =
-      is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
-                          cm->seq_params.subsampling_y);
-  bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
-                             xd->plane[AOM_PLANE_U].subsampling_y);
-  // Only store reconstructed luma when there's chroma RDO. When there's no
-  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
-  if (xd->cfl.store_y) {
-    // Restore reconstructed luma values.
-    av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
-                                 cpi->optimize_seg_arr[mbmi->segment_id],
-                                 mi_row, mi_col);
-    xd->cfl.store_y = 0;
-  }
-  rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
-                          bsize, max_tx_size);
-  *mode_uv = mbmi->uv_mode;
-}
-
 static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
                        int16_t mode_context) {
   if (is_inter_compound_mode(mode)) {
@@ -6607,236 +732,13 @@
   }
 }
 
-static int get_interinter_compound_mask_rate(const MACROBLOCK *const x,
-                                             const MB_MODE_INFO *const mbmi) {
-  switch (mbmi->interinter_comp.type) {
-    case COMPOUND_AVERAGE: return 0;
-    case COMPOUND_WEDGE:
-      return get_interinter_wedge_bits(mbmi->sb_type) > 0
-                 ? av1_cost_literal(1) +
-                       x->wedge_idx_cost[mbmi->sb_type]
-                                        [mbmi->interinter_comp.wedge_index]
-                 : 0;
-    case COMPOUND_DIFFWTD: return av1_cost_literal(1);
-    default: assert(0); return 0;
-  }
-}
-
-static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
-  return (mv->row >> 3) < mv_limits->row_min ||
-         (mv->row >> 3) > mv_limits->row_max ||
-         (mv->col >> 3) < mv_limits->col_min ||
-         (mv->col >> 3) > mv_limits->col_max;
-}
-
 static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
-                                              int ref_idx, int is_comp_pred) {
-  PREDICTION_MODE single_mode;
-  if (is_comp_pred) {
-    single_mode =
-        ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode);
-  } else {
-    single_mode = this_mode;
-  }
-  return single_mode;
+                                              int ref_idx) {
+  return ref_idx ? compound_ref1_mode(this_mode)
+                 : compound_ref0_mode(this_mode);
 }
 
-static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                                BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row,
-                                int mi_col, int_mv *ref_mv_sub8x8[2],
-                                const uint8_t *mask, int mask_stride,
-                                int *rate_mv, const int block) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  const int pw = block_size_wide[bsize];
-  const int ph = block_size_high[bsize];
-  const int plane = 0;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  // This function should only ever be called for compound modes
-  assert(has_second_ref(mbmi));
-  const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
-  const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
-  int_mv ref_mv[2];
-  int ite, ref;
-  // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
-  const int ic = block & 1;
-  const int ir = (block - ic) >> 1;
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
-  const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
-
-  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-  conv_params.use_dist_wtd_comp_avg = 0;
-  WarpTypesAllowed warp_types[2];
-  for (ref = 0; ref < 2; ++ref) {
-    const WarpedMotionParams *const wm =
-        &xd->global_motion[xd->mi[0]->ref_frame[ref]];
-    const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
-    warp_types[ref].global_warp_allowed = is_global;
-    warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-  }
-
-  // Do joint motion search in compound mode to get more accurate mv.
-  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
-  int last_besterr[2] = { INT_MAX, INT_MAX };
-  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
-    av1_get_scaled_ref_frame(cpi, refs[0]),
-    av1_get_scaled_ref_frame(cpi, refs[1])
-  };
-
-  // Prediction buffer from second frame.
-  DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
-  uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
-  (void)ref_mv_sub8x8;
-
-  MV *const best_mv = &x->best_mv.as_mv;
-  const int search_range = SEARCH_RANGE_8P;
-  const int sadpb = x->sadperbit16;
-  // Allow joint search multiple times iteratively for each reference frame
-  // and break out of the search loop if it couldn't find a better mv.
-  for (ite = 0; ite < 4; ite++) {
-    struct buf_2d ref_yv12[2];
-    int bestsme = INT_MAX;
-    MvLimits tmp_mv_limits = x->mv_limits;
-    int id = ite % 2;  // Even iterations search in the first reference frame,
-                       // odd iterations search in the second. The predictor
-                       // found for the 'other' reference frame is factored in.
-    if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
-      if (cur_mv[id].as_int == init_mv[id].as_int) {
-        break;
-      } else {
-        int_mv cur_int_mv, init_int_mv;
-        cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
-        cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
-        init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
-        init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
-        if (cur_int_mv.as_int == init_int_mv.as_int) {
-          break;
-        }
-      }
-    }
-    for (ref = 0; ref < 2; ++ref) {
-      ref_mv[ref] = av1_get_ref_mv(x, ref);
-      // Swap out the reference frame for a version that's been scaled to
-      // match the resolution of the current frame, allowing the existing
-      // motion search code to be used without additional modifications.
-      if (scaled_ref_frame[ref]) {
-        int i;
-        for (i = 0; i < num_planes; i++)
-          backup_yv12[ref][i] = xd->plane[i].pre[ref];
-        av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
-                             NULL, num_planes);
-      }
-    }
-
-    assert(IMPLIES(scaled_ref_frame[0] != NULL,
-                   cm->width == scaled_ref_frame[0]->y_crop_width &&
-                       cm->height == scaled_ref_frame[0]->y_crop_height));
-    assert(IMPLIES(scaled_ref_frame[1] != NULL,
-                   cm->width == scaled_ref_frame[1]->y_crop_width &&
-                       cm->height == scaled_ref_frame[1]->y_crop_height));
-
-    // Initialize based on (possibly scaled) prediction buffers.
-    ref_yv12[0] = xd->plane[plane].pre[0];
-    ref_yv12[1] = xd->plane[plane].pre[1];
-
-    // Get the prediction block from the 'other' reference frame.
-    const InterpFilters interp_filters = EIGHTTAP_REGULAR;
-
-    // Since we have scaled the reference frames to match the size of the
-    // current frame we must use a unit scaling factor during mode selection.
-    av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
-                              second_pred, pw, &cur_mv[!id].as_mv,
-                              &cm->sf_identity, pw, ph, &conv_params,
-                              interp_filters, &warp_types[!id], p_col, p_row,
-                              plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
-                              mi_row * MI_SIZE, xd, cm->allow_warped_motion);
-
-    const int order_idx = id != 0;
-    av1_dist_wtd_comp_weight_assign(
-        cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
-        &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1);
-
-    // Do full-pixel compound motion search on the current reference frame.
-    if (id) xd->plane[plane].pre[0] = ref_yv12[id];
-    av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
-
-    // Use the mv result from the single mode as mv predictor.
-    *best_mv = cur_mv[id].as_mv;
-
-    best_mv->col >>= 3;
-    best_mv->row >>= 3;
-
-    // Small-range full-pixel motion search.
-    bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
-                                       &cpi->fn_ptr[bsize], mask, mask_stride,
-                                       id, &ref_mv[id].as_mv, second_pred);
-    if (bestsme < INT_MAX) {
-      if (mask)
-        bestsme = av1_get_mvpred_mask_var(x, best_mv, &ref_mv[id].as_mv,
-                                          second_pred, mask, mask_stride, id,
-                                          &cpi->fn_ptr[bsize], 1);
-      else
-        bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
-                                        second_pred, &cpi->fn_ptr[bsize], 1);
-    }
-
-    x->mv_limits = tmp_mv_limits;
-
-    // Restore the pointer to the first (possibly scaled) prediction buffer.
-    if (id) xd->plane[plane].pre[0] = ref_yv12[0];
-
-    for (ref = 0; ref < 2; ++ref) {
-      if (scaled_ref_frame[ref]) {
-        // Swap back the original buffers for subpel motion search.
-        for (int i = 0; i < num_planes; i++) {
-          xd->plane[i].pre[ref] = backup_yv12[ref][i];
-        }
-        // Re-initialize based on unscaled prediction buffers.
-        ref_yv12[ref] = xd->plane[plane].pre[ref];
-      }
-    }
-
-    // Do sub-pixel compound motion search on the current reference frame.
-    if (id) xd->plane[plane].pre[0] = ref_yv12[id];
-
-    if (cpi->common.cur_frame_force_integer_mv) {
-      x->best_mv.as_mv.row *= 8;
-      x->best_mv.as_mv.col *= 8;
-    }
-    if (bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0) {
-      int dis; /* TODO: use dis in distortion calculation later. */
-      unsigned int sse;
-      bestsme = cpi->find_fractional_mv_step(
-          x, cm, mi_row, mi_col, &ref_mv[id].as_mv,
-          cpi->common.allow_high_precision_mv, x->errorperbit,
-          &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
-          x->nmv_vec_cost, x->mv_cost_stack, &dis, &sse, second_pred, mask,
-          mask_stride, id, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
-    }
-
-    // Restore the pointer to the first prediction buffer.
-    if (id) xd->plane[plane].pre[0] = ref_yv12[0];
-    if (bestsme < last_besterr[id]) {
-      cur_mv[id].as_mv = *best_mv;
-      last_besterr[id] = bestsme;
-    } else {
-      break;
-    }
-  }
-
-  *rate_mv = 0;
-
-  for (ref = 0; ref < 2; ++ref) {
-    const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
-    *rate_mv +=
-        av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmv_vec_cost,
-                        x->mv_cost_stack, MV_COST_WEIGHT);
-  }
-}
-
-static void estimate_ref_frame_costs(
+static AOM_INLINE void estimate_ref_frame_costs(
     const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
     int segment_id, unsigned int *ref_costs_single,
     unsigned int (*ref_costs_comp)[REF_FRAMES]) {
@@ -6986,28 +888,33 @@
   }
 }
 
-static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
-                                 int mode_index,
-                                 int64_t comp_pred_diff[REFERENCE_MODES],
-                                 int skippable) {
+static AOM_INLINE void store_coding_context(
+#if CONFIG_INTERNAL_STATS
+    MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index,
+#else
+    MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+#endif  // CONFIG_INTERNAL_STATS
+    int64_t comp_pred_diff[REFERENCE_MODES], int skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
-  ctx->skip = x->skip;
+  ctx->rd_stats.skip = x->force_skip;
   ctx->skippable = skippable;
+#if CONFIG_INTERNAL_STATS
   ctx->best_mode_index = mode_index;
+#endif  // CONFIG_INTERNAL_STATS
   ctx->mic = *xd->mi[0];
-  ctx->mbmi_ext = *x->mbmi_ext;
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
   ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
 }
 
-static void setup_buffer_ref_mvs_inter(
+static AOM_INLINE void setup_buffer_ref_mvs_inter(
     const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
-    BLOCK_SIZE block_size, int mi_row, int mi_col,
-    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+    BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
@@ -7023,18 +930,19 @@
   if (scaled_ref_frame) {
     // Setup pred block based on scaled reference, because av1_mv_pred() doesn't
     // support scaling.
-    av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, mi_row,
-                         mi_col, NULL, NULL, num_planes);
-  } else {
-    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+    av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, NULL, NULL,
                          num_planes);
+  } else {
+    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
   }
 
   // Gets an initial list of candidate vectors from neighbours and orders them
   av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
-                   mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
-                   mi_col, mbmi_ext->mode_context);
-
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                   mbmi_ext->mode_context);
+  // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+  // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+  av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
   // Further refinement that is encode side only to test the top few candidates
   // in full and choose the best as the center point for subsequent searches.
   // The current implementation doesn't support scaling.
@@ -7045,996 +953,23 @@
   if (scaled_ref_frame) {
     // We had temporarily setup pred block based on scaled reference above. Go
     // back to unscaled reference now, for subsequent use.
-    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
-                         num_planes);
+    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
   }
 }
 
-static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                 int ref_idx, int *rate_mv) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
-  int bestsme = INT_MAX;
-  int step_param;
-  int sadpb = x->sadperbit16;
-  MV mvp_full;
-  int ref = mbmi->ref_frame[ref_idx];
-  MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
-
-  MvLimits tmp_mv_limits = x->mv_limits;
-  int cost_list[5];
-
-  const YV12_BUFFER_CONFIG *scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, ref);
-
-  if (scaled_ref_frame) {
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // full-pixel motion search code to be used without additional
-    // modifications.
-    for (int i = 0; i < num_planes; i++) {
-      backup_yv12[i] = xd->plane[i].pre[ref_idx];
-    }
-    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
-  }
-
-  // Work out the size of the first step in the mv step search.
-  // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc.
-  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
-    // Take the weighted average of the step_params based on the last frame's
-    // max mv magnitude and that based on the best ref mvs of the current
-    // block for the given reference.
-    step_param =
-        (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
-        2;
-  } else {
-    step_param = cpi->mv_step_param;
-  }
-
-  if (cpi->sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) {
-    int boffset =
-        2 * (mi_size_wide_log2[cm->seq_params.sb_size] -
-             AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize]));
-    step_param = AOMMAX(step_param, boffset);
-  }
-
-  if (cpi->sf.adaptive_motion_search) {
-    int bwl = mi_size_wide_log2[bsize];
-    int bhl = mi_size_high_log2[bsize];
-    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
-
-    if (tlevel < 5) {
-      step_param += 2;
-      step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1);
-    }
-
-    // prev_mv_sad is not setup for dynamically scaled frames.
-    if (cpi->oxcf.resize_mode != RESIZE_RANDOM) {
-      int i;
-      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
-        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
-          x->pred_mv[ref].row = 0;
-          x->pred_mv[ref].col = 0;
-          x->best_mv.as_int = INVALID_MV;
-
-          if (scaled_ref_frame) {
-            // Swap back the original buffers before returning.
-            for (int j = 0; j < num_planes; ++j)
-              xd->plane[j].pre[ref_idx] = backup_yv12[j];
-          }
-          return;
-        }
-      }
-    }
-  }
-
-  // Note: MV limits are modified here. Always restore the original values
-  // after full-pixel motion search.
-  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
-
-  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
-    mvp_full = mbmi->mv[0].as_mv;
-  else
-    mvp_full = ref_mv;
-
-  mvp_full.col >>= 3;
-  mvp_full.row >>= 3;
-
-  x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
-
-  switch (mbmi->motion_mode) {
-    case SIMPLE_TRANSLATION:
-      bestsme = av1_full_pixel_search(
-          cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
-          sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
-          (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0, &cpi->ss_cfg[SS_CFG_SRC]);
-      break;
-    case OBMC_CAUSAL:
-      bestsme = av1_obmc_full_pixel_search(
-          cpi, x, &mvp_full, step_param, sadpb,
-          MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
-          &(x->best_mv.as_mv), 0, &cpi->ss_cfg[SS_CFG_SRC]);
-      break;
-    default: assert(0 && "Invalid motion mode!\n");
-  }
-
-  if (scaled_ref_frame) {
-    // Swap back the original buffers for subpel motion search.
-    for (int i = 0; i < num_planes; i++) {
-      xd->plane[i].pre[ref_idx] = backup_yv12[i];
-    }
-  }
-
-  x->mv_limits = tmp_mv_limits;
-
-  if (cpi->common.cur_frame_force_integer_mv) {
-    x->best_mv.as_mv.row *= 8;
-    x->best_mv.as_mv.col *= 8;
-  }
-  const int use_fractional_mv =
-      bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
-  if (use_fractional_mv) {
-    int dis; /* TODO: use dis in distortion calculation later. */
-    switch (mbmi->motion_mode) {
-      case SIMPLE_TRANSLATION:
-        if (cpi->sf.use_accurate_subpel_search) {
-          int best_mv_var;
-          const int try_second = x->second_best_mv.as_int != INVALID_MV &&
-                                 x->second_best_mv.as_int != x->best_mv.as_int;
-          const int pw = block_size_wide[bsize];
-          const int ph = block_size_high[bsize];
-          best_mv_var = cpi->find_fractional_mv_step(
-              x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
-              x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-              cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-              x->nmv_vec_cost, x->mv_cost_stack, &dis, &x->pred_sse[ref], NULL,
-              NULL, 0, 0, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
-
-          if (try_second) {
-            const int minc =
-                AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
-            const int maxc =
-                AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
-            const int minr =
-                AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
-            const int maxr =
-                AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
-            int this_var;
-            MV best_mv = x->best_mv.as_mv;
-
-            x->best_mv = x->second_best_mv;
-            if (x->best_mv.as_mv.row * 8 <= maxr &&
-                x->best_mv.as_mv.row * 8 >= minr &&
-                x->best_mv.as_mv.col * 8 <= maxc &&
-                x->best_mv.as_mv.col * 8 >= minc) {
-              this_var = cpi->find_fractional_mv_step(
-                  x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
-                  x->errorperbit, &cpi->fn_ptr[bsize],
-                  cpi->sf.mv.subpel_force_stop,
-                  cpi->sf.mv.subpel_iters_per_step,
-                  cond_cost_list(cpi, cost_list), x->nmv_vec_cost,
-                  x->mv_cost_stack, &dis, &x->pred_sse[ref], NULL, NULL, 0, 0,
-                  pw, ph, cpi->sf.use_accurate_subpel_search, 0);
-              if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
-              x->best_mv.as_mv = best_mv;
-            }
-          }
-        } else {
-          cpi->find_fractional_mv_step(
-              x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
-              x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-              cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-              x->nmv_vec_cost, x->mv_cost_stack, &dis, &x->pred_sse[ref], NULL,
-              NULL, 0, 0, 0, 0, 0, 1);
-        }
-        break;
-      case OBMC_CAUSAL:
-        av1_find_best_obmc_sub_pixel_tree_up(
-            x, cm, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
-            cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
-            cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
-            x->nmv_vec_cost, x->mv_cost_stack, &dis, &x->pred_sse[ref], 0,
-            cpi->sf.use_accurate_subpel_search);
-        break;
-      default: assert(0 && "Invalid motion mode!\n");
-    }
-  }
-  *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmv_vec_cost,
-                             x->mv_cost_stack, MV_COST_WEIGHT);
-
-  if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
-    x->pred_mv[ref] = x->best_mv.as_mv;
-}
-
-static INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
-                                   const int num_planes) {
-  for (int i = 0; i < num_planes; i++) {
-    xd->plane[i].dst.buf = dst.plane[i];
-    xd->plane[i].dst.stride = dst.stride[i];
-  }
-}
-
-static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize, const MV *other_mv,
-                                    int mi_row, int mi_col, const int block,
-                                    int ref_idx, uint8_t *second_pred) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int pw = block_size_wide[bsize];
-  const int ph = block_size_high[bsize];
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const int other_ref = mbmi->ref_frame[!ref_idx];
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
-  const int ic = block & 1;
-  const int ir = (block - ic) >> 1;
-  const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
-  const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
-  const WarpedMotionParams *const wm = &xd->global_motion[other_ref];
-  int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
-
-  // This function should only ever be called for compound modes
-  assert(has_second_ref(mbmi));
-
-  const int plane = 0;
-  struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx];
-
-  struct scale_factors sf;
-  av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
-                                    cm->width, cm->height);
-
-  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-  WarpTypesAllowed warp_types;
-  warp_types.global_warp_allowed = is_global;
-  warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-
-  // Get the prediction block from the 'other' reference frame.
-  av1_build_inter_predictor(ref_yv12.buf, ref_yv12.stride, second_pred, pw,
-                            other_mv, &sf, pw, ph, &conv_params,
-                            mbmi->interp_filters, &warp_types, p_col, p_row,
-                            plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE,
-                            mi_row * MI_SIZE, xd, cm->allow_warped_motion);
-
-  av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
-                                  &xd->jcp_param.bck_offset,
-                                  &xd->jcp_param.use_dist_wtd_comp_avg, 1);
-}
-
-// Search for the best mv for one component of a compound,
-// given that the other component is fixed.
-static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                                          BLOCK_SIZE bsize, MV *this_mv,
-                                          int mi_row, int mi_col,
-                                          const uint8_t *second_pred,
-                                          const uint8_t *mask, int mask_stride,
-                                          int *rate_mv, int ref_idx) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  const int pw = block_size_wide[bsize];
-  const int ph = block_size_high[bsize];
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const int ref = mbmi->ref_frame[ref_idx];
-  const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
-  struct macroblockd_plane *const pd = &xd->plane[0];
-
-  struct buf_2d backup_yv12[MAX_MB_PLANE];
-  const YV12_BUFFER_CONFIG *const scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, ref);
-
-  // Check that this is either an interinter or an interintra block
-  assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
-
-  // Store the first prediction buffer.
-  struct buf_2d orig_yv12;
-  if (ref_idx) {
-    orig_yv12 = pd->pre[0];
-    pd->pre[0] = pd->pre[ref_idx];
-  }
-
-  if (scaled_ref_frame) {
-    int i;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // full-pixel motion search code to be used without additional
-    // modifications.
-    for (i = 0; i < num_planes; i++) backup_yv12[i] = xd->plane[i].pre[ref_idx];
-    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
-  }
-
-  int bestsme = INT_MAX;
-  int sadpb = x->sadperbit16;
-  MV *const best_mv = &x->best_mv.as_mv;
-  int search_range = SEARCH_RANGE_8P;
-
-  MvLimits tmp_mv_limits = x->mv_limits;
-
-  // Do compound motion search on the current reference frame.
-  av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv);
-
-  // Use the mv result from the single mode as mv predictor.
-  *best_mv = *this_mv;
-
-  best_mv->col >>= 3;
-  best_mv->row >>= 3;
-
-  // Small-range full-pixel motion search.
-  bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
-                                     &cpi->fn_ptr[bsize], mask, mask_stride,
-                                     ref_idx, &ref_mv.as_mv, second_pred);
-  if (bestsme < INT_MAX) {
-    if (mask)
-      bestsme =
-          av1_get_mvpred_mask_var(x, best_mv, &ref_mv.as_mv, second_pred, mask,
-                                  mask_stride, ref_idx, &cpi->fn_ptr[bsize], 1);
-    else
-      bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv.as_mv, second_pred,
-                                      &cpi->fn_ptr[bsize], 1);
-  }
-
-  x->mv_limits = tmp_mv_limits;
-
-  if (scaled_ref_frame) {
-    // Swap back the original buffers for subpel motion search.
-    for (int i = 0; i < num_planes; i++) {
-      xd->plane[i].pre[ref_idx] = backup_yv12[i];
-    }
-  }
-
-  if (cpi->common.cur_frame_force_integer_mv) {
-    x->best_mv.as_mv.row *= 8;
-    x->best_mv.as_mv.col *= 8;
-  }
-  const int use_fractional_mv =
-      bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
-  if (use_fractional_mv) {
-    int dis; /* TODO: use dis in distortion calculation later. */
-    unsigned int sse;
-    bestsme = cpi->find_fractional_mv_step(
-        x, cm, mi_row, mi_col, &ref_mv.as_mv,
-        cpi->common.allow_high_precision_mv, x->errorperbit,
-        &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
-        x->nmv_vec_cost, x->mv_cost_stack, &dis, &sse, second_pred, mask,
-        mask_stride, ref_idx, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
-  }
-
-  // Restore the pointer to the first unscaled prediction buffer.
-  if (ref_idx) pd->pre[0] = orig_yv12;
-
-  if (bestsme < INT_MAX) *this_mv = *best_mv;
-
-  *rate_mv = 0;
-
-  *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                              x->mv_cost_stack, MV_COST_WEIGHT);
-}
-
-// Wrapper for compound_single_motion_search, for the common case
-// where the second prediction is also an inter mode.
-static void compound_single_motion_search_interinter(
-    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
-    int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv,
-    const int block, int ref_idx) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  // This function should only ever be called for compound modes
-  assert(has_second_ref(xd->mi[0]));
-
-  // Prediction buffer from second frame.
-  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
-  uint8_t *second_pred;
-  if (is_cur_buf_hbd(xd))
-    second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
-  else
-    second_pred = (uint8_t *)second_pred_alloc_16;
-
-  MV *this_mv = &cur_mv[ref_idx].as_mv;
-  const MV *other_mv = &cur_mv[!ref_idx].as_mv;
-
-  build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block,
-                          ref_idx, second_pred);
-
-  compound_single_motion_search(cpi, x, bsize, this_mv, mi_row, mi_col,
-                                second_pred, mask, mask_stride, rate_mv,
-                                ref_idx);
-}
-
-static void do_masked_motion_search_indexed(
-    const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
-    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
-    int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
-  // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  BLOCK_SIZE sb_type = mbmi->sb_type;
-  const uint8_t *mask;
-  const int mask_stride = block_size_wide[bsize];
-
-  mask = av1_get_compound_type_mask(comp_data, sb_type);
-
-  tmp_mv[0].as_int = cur_mv[0].as_int;
-  tmp_mv[1].as_int = cur_mv[1].as_int;
-  if (which == 0 || which == 1) {
-    compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mi_row,
-                                             mi_col, mask, mask_stride, rate_mv,
-                                             0, which);
-  } else if (which == 2) {
-    joint_motion_search(cpi, x, bsize, tmp_mv, mi_row, mi_col, NULL, mask,
-                        mask_stride, rate_mv, 0);
-  }
-}
-
-#define USE_DISCOUNT_NEWMV_TEST 0
-#if USE_DISCOUNT_NEWMV_TEST
-// In some situations we want to discount the apparent cost of a new motion
-// vector. Where there is a subtle motion field and especially where there is
-// low spatial complexity then it can be hard to cover the cost of a new motion
-// vector in a single block, even if that motion vector reduces distortion.
-// However, once established that vector may be usable through the nearest and
-// near mv modes to reduce distortion in subsequent blocks and also improve
-// visual quality.
-#define NEW_MV_DISCOUNT_FACTOR 8
-static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
-                               int ref_idx, int ref_mv_idx,
-                               const MV_REFERENCE_FRAME *ref_frame,
-                               const MB_MODE_INFO_EXT *mbmi_ext);
-static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x,
-                               PREDICTION_MODE this_mode, int_mv this_mv) {
-  if (this_mode == NEWMV && this_mv.as_int != 0 &&
-      !cpi->rc.is_src_frame_alt_ref) {
-    // Only discount new_mv when nearst_mv and all near_mv are zero, and the
-    // new_mv is not equal to global_mv
-    const AV1_COMMON *const cm = &cpi->common;
-    const MACROBLOCKD *const xd = &x->e_mbd;
-    const MB_MODE_INFO *const mbmi = xd->mi[0];
-    const MV_REFERENCE_FRAME tmp_ref_frames[2] = { mbmi->ref_frame[0],
-                                                   NONE_FRAME };
-    const uint8_t ref_frame_type = av1_ref_frame_type(tmp_ref_frames);
-    int_mv nearest_mv;
-    get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
-    int ret = nearest_mv.as_int == 0;
-    for (int ref_mv_idx = 0;
-         ref_mv_idx < x->mbmi_ext->ref_mv_count[ref_frame_type]; ++ref_mv_idx) {
-      int_mv near_mv;
-      get_this_mv(&near_mv, NEARMV, 0, ref_mv_idx, tmp_ref_frames, x->mbmi_ext);
-      ret &= near_mv.as_int == 0;
-    }
-    if (cm->global_motion[tmp_ref_frames[0]].wmtype <= TRANSLATION) {
-      int_mv global_mv;
-      get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      ret &= global_mv.as_int != this_mv.as_int;
-    }
-    return ret;
-  }
-  return 0;
-}
-#endif
-
 #define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
 #define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
 
 // TODO(jingning): this mv clamping function should be block size dependent.
 static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
-  clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
-           xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-           xd->mb_to_top_edge - LEFT_TOP_MARGIN,
-           xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+  const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+                                     xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+                                     xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+                                     xd->mb_to_bottom_edge +
+                                         RIGHT_BOTTOM_MARGIN };
+  clamp_mv(mv, &mv_limits);
 }
 
-static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
-                               const BLOCK_SIZE bsize, const uint8_t *pred0,
-                               int stride0, const uint8_t *pred1, int stride1) {
-  static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
-    //                            4X4
-    BLOCK_INVALID,
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
-    // 8X16,       16X8,          16X16
-    BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
-    // 16X32,      32X16,         32X32
-    BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
-    // 32X64,      64X32,         64X64
-    BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
-    // 64x128,     128x64,        128x128
-    BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
-    // 32X8,       16X64,         64X16
-    BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
-  };
-  const struct macroblock_plane *const p = &x->plane[0];
-  const uint8_t *src = p->src.buf;
-  int src_stride = p->src.stride;
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  uint32_t esq[2][4];
-  int64_t tl, br;
-
-  const BLOCK_SIZE f_index = split_qtr[bsize];
-  assert(f_index != BLOCK_INVALID);
-
-  if (is_cur_buf_hbd(&x->e_mbd)) {
-    pred0 = CONVERT_TO_BYTEPTR(pred0);
-    pred1 = CONVERT_TO_BYTEPTR(pred1);
-  }
-
-  cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
-  cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0,
-                          &esq[0][1]);
-  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
-                          pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
-  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
-                          pred0 + bh / 2 * stride0 + bw / 2, stride0,
-                          &esq[0][3]);
-  cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
-  cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2, stride1,
-                          &esq[1][1]);
-  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
-                          pred1 + bh / 2 * stride1, stride0, &esq[1][2]);
-  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
-                          pred1 + bh / 2 * stride1 + bw / 2, stride0,
-                          &esq[1][3]);
-
-  tl = ((int64_t)esq[0][0] + esq[0][1] + esq[0][2]) -
-       ((int64_t)esq[1][0] + esq[1][1] + esq[1][2]);
-  br = ((int64_t)esq[1][3] + esq[1][1] + esq[1][2]) -
-       ((int64_t)esq[0][3] + esq[0][1] + esq[0][2]);
-  return (tl + br > 0);
-}
-
-// Choose the best wedge index and sign
-static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
-                          const BLOCK_SIZE bsize, const uint8_t *const p0,
-                          const int16_t *const residual1,
-                          const int16_t *const diff10,
-                          int *const best_wedge_sign,
-                          int *const best_wedge_index) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const src = &x->plane[0].src;
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  const int N = bw * bh;
-  assert(N >= 64);
-  int rate;
-  int64_t dist;
-  int64_t rd, best_rd = INT64_MAX;
-  int wedge_index;
-  int wedge_sign;
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
-  const uint8_t *mask;
-  uint64_t sse;
-  const int hbd = is_cur_buf_hbd(xd);
-  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-
-  DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
-  if (hbd) {
-    aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else {
-    aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
-  }
-
-  int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
-                        (int64_t)aom_sum_squares_i16(residual1, N)) *
-                       (1 << WEDGE_WEIGHT_BITS) / 2;
-  int16_t *ds = residual0;
-
-  av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
-
-  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
-    mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
-
-    wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
-
-    mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
-    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
-    sse = ROUND_POWER_OF_TWO(sse, bd_round);
-
-    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
-                                                  &rate, &dist);
-    // int rate2;
-    // int64_t dist2;
-    // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
-    // printf("sse %"PRId64": leagacy: %d %"PRId64", curvfit %d %"PRId64"\n",
-    // sse, rate, dist, rate2, dist2); dist = dist2;
-    // rate = rate2;
-
-    rate += x->wedge_idx_cost[bsize][wedge_index];
-    rd = RDCOST(x->rdmult, rate, dist);
-
-    if (rd < best_rd) {
-      *best_wedge_index = wedge_index;
-      *best_wedge_sign = wedge_sign;
-      best_rd = rd;
-    }
-  }
-
-  return best_rd -
-         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
-}
-
-// Choose the best wedge index the specified sign
-static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
-                                     const MACROBLOCK *const x,
-                                     const BLOCK_SIZE bsize,
-                                     const int16_t *const residual1,
-                                     const int16_t *const diff10,
-                                     const int wedge_sign,
-                                     int *const best_wedge_index) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  const int N = bw * bh;
-  assert(N >= 64);
-  int rate;
-  int64_t dist;
-  int64_t rd, best_rd = INT64_MAX;
-  int wedge_index;
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
-  const uint8_t *mask;
-  uint64_t sse;
-  const int hbd = is_cur_buf_hbd(xd);
-  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
-    mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
-    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
-    sse = ROUND_POWER_OF_TWO(sse, bd_round);
-
-    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
-                                                  &rate, &dist);
-    rate += x->wedge_idx_cost[bsize][wedge_index];
-    rd = RDCOST(x->rdmult, rate, dist);
-
-    if (rd < best_rd) {
-      *best_wedge_index = wedge_index;
-      best_rd = rd;
-    }
-  }
-  return best_rd -
-         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
-}
-
-static int64_t pick_interinter_wedge(
-    const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
-    const uint8_t *const p0, const uint8_t *const p1,
-    const int16_t *const residual1, const int16_t *const diff10) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int bw = block_size_wide[bsize];
-
-  int64_t rd;
-  int wedge_index = -1;
-  int wedge_sign = 0;
-
-  assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
-  assert(cpi->common.seq_params.enable_masked_compound);
-
-  if (cpi->sf.fast_wedge_sign_estimate) {
-    wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
-    rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign,
-                               &wedge_index);
-  } else {
-    rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign,
-                    &wedge_index);
-  }
-
-  mbmi->interinter_comp.wedge_sign = wedge_sign;
-  mbmi->interinter_comp.wedge_index = wedge_index;
-  return rd;
-}
-
-static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
-                                   MACROBLOCK *const x, const BLOCK_SIZE bsize,
-                                   const uint8_t *const p0,
-                                   const uint8_t *const p1,
-                                   const int16_t *const residual1,
-                                   const int16_t *const diff10) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  const int N = 1 << num_pels_log2_lookup[bsize];
-  int rate;
-  int64_t dist;
-  DIFFWTD_MASK_TYPE cur_mask_type;
-  int64_t best_rd = INT64_MAX;
-  DIFFWTD_MASK_TYPE best_mask_type = 0;
-  const int hbd = is_cur_buf_hbd(xd);
-  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-  DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
-  uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
-  // try each mask type and its inverse
-  for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
-    // build mask and inverse
-    if (hbd)
-      av1_build_compound_diffwtd_mask_highbd(
-          tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
-          CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
-    else
-      av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
-                                      p0, bw, p1, bw, bh, bw);
-
-    // compute rd for mask
-    uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
-                                                tmp_mask[cur_mask_type], N);
-    sse = ROUND_POWER_OF_TWO(sse, bd_round);
-
-    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
-                                                  &rate, &dist);
-    const int64_t rd0 = RDCOST(x->rdmult, rate, dist);
-
-    if (rd0 < best_rd) {
-      best_mask_type = cur_mask_type;
-      best_rd = rd0;
-    }
-  }
-  mbmi->interinter_comp.mask_type = best_mask_type;
-  if (best_mask_type == DIFFWTD_38_INV) {
-    memcpy(xd->seg_mask, seg_mask, N * 2);
-  }
-  return best_rd;
-}
-
-static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
-                                     const MACROBLOCK *const x,
-                                     const BLOCK_SIZE bsize,
-                                     const uint8_t *const p0,
-                                     const uint8_t *const p1) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(is_interintra_wedge_used(bsize));
-  assert(cpi->common.seq_params.enable_interintra_compound);
-
-  const struct buf_2d *const src = &x->plane[0].src;
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
-  DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
-  if (is_cur_buf_hbd(xd)) {
-    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else {
-    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
-    aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
-  }
-  int wedge_index = -1;
-  int64_t rd =
-      pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, &wedge_index);
-
-  mbmi->interintra_wedge_sign = 0;
-  mbmi->interintra_wedge_index = wedge_index;
-  return rd;
-}
-
-static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    const BLOCK_SIZE bsize,
-                                    const uint8_t *const p0,
-                                    const uint8_t *const p1,
-                                    const int16_t *const residual1,
-                                    const int16_t *const diff10) {
-  const COMPOUND_TYPE compound_type = x->e_mbd.mi[0]->interinter_comp.type;
-  switch (compound_type) {
-    case COMPOUND_WEDGE:
-      return pick_interinter_wedge(cpi, x, bsize, p0, p1, residual1, diff10);
-    case COMPOUND_DIFFWTD:
-      return pick_interinter_seg(cpi, x, bsize, p0, p1, residual1, diff10);
-    default: assert(0); return 0;
-  }
-}
-
-static int interinter_compound_motion_search(const AV1_COMP *const cpi,
-                                             MACROBLOCK *x,
-                                             const int_mv *const cur_mv,
-                                             const BLOCK_SIZE bsize,
-                                             const PREDICTION_MODE this_mode,
-                                             int mi_row, int mi_col) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  int_mv tmp_mv[2];
-  int tmp_rate_mv = 0;
-  mbmi->interinter_comp.seg_mask = xd->seg_mask;
-  const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
-
-  if (this_mode == NEW_NEWMV) {
-    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
-                                    mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2);
-    mbmi->mv[0].as_int = tmp_mv[0].as_int;
-    mbmi->mv[1].as_int = tmp_mv[1].as_int;
-  } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
-    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
-                                    mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0);
-    mbmi->mv[0].as_int = tmp_mv[0].as_int;
-  } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
-    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
-                                    mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1);
-    mbmi->mv[1].as_int = tmp_mv[1].as_int;
-  }
-  return tmp_rate_mv;
-}
-
-static void get_inter_predictors_masked_compound(
-    const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
-    int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1,
-    int16_t *residual1, int16_t *diff10, int *strides) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  int can_use_previous = cm->allow_warped_motion;
-  // get inter predictors to use for masked compound modes
-  av1_build_inter_predictors_for_planes_single_buf(
-      xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
-  av1_build_inter_predictors_for_planes_single_buf(
-      xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
-  const struct buf_2d *const src = &x->plane[0].src;
-  if (is_cur_buf_hbd(xd)) {
-    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
-                              bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd);
-  } else {
-    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
-                       bw);
-    aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
-  }
-}
-
-static int64_t build_and_cost_compound_type(
-    const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
-    const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
-    int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
-    uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
-    int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd,
-    int *calc_pred_masked_compound, int32_t *comp_rate, int64_t *comp_dist,
-    int64_t *const comp_model_rd, const int64_t comp_best_model_rd,
-    int64_t *const comp_model_rd_cur) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  int64_t best_rd_cur = INT64_MAX;
-  int64_t rd = INT64_MAX;
-  const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
-  int rate_sum, tmp_skip_txfm_sb;
-  int64_t dist_sum, tmp_skip_sse_sb;
-
-  // TODO(any): Save pred and mask calculation as well into records. However
-  // this may increase memory requirements as compound segment mask needs to be
-  // stored in each record.
-  if (*calc_pred_masked_compound) {
-    get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0,
-                                         preds1, residual1, diff10, strides);
-    *calc_pred_masked_compound = 0;
-  }
-  if (cpi->sf.prune_wedge_pred_diff_based && compound_type == COMPOUND_WEDGE) {
-    unsigned int sse;
-    if (is_cur_buf_hbd(xd))
-      (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
-                                  CONVERT_TO_BYTEPTR(*preds1), *strides, &sse);
-    else
-      (void)cpi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse);
-    const unsigned int mse =
-        ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
-    // If two predictors are very similar, skip wedge compound mode search
-    if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
-      *comp_model_rd_cur = INT64_MAX;
-      return INT64_MAX;
-    }
-  }
-
-  best_rd_cur =
-      pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10);
-  *rs2 += get_interinter_compound_mask_rate(x, mbmi);
-  best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
-
-  // Although the true rate_mv might be different after motion search, but it
-  // is unlikely to be the best mode considering the transform rd cost and other
-  // mode overhead cost
-  int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
-  if (mode_rd > ref_best_rd) {
-    *comp_model_rd_cur = INT64_MAX;
-    return INT64_MAX;
-  }
-
-  // Reuse data if matching record is found
-  if (comp_rate[compound_type] == INT_MAX) {
-    if (have_newmv_in_inter_mode(this_mode) &&
-        compound_type == COMPOUND_WEDGE &&
-        !cpi->sf.disable_interinter_wedge_newmv_search) {
-      *out_rate_mv = interinter_compound_motion_search(
-          cpi, x, cur_mv, bsize, this_mode, mi_row, mi_col);
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
-                                    AOM_PLANE_Y, AOM_PLANE_Y);
-
-      model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
-          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
-          &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-      rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
-      *comp_model_rd_cur = rd;
-      if (rd >= best_rd_cur) {
-        mbmi->mv[0].as_int = cur_mv[0].as_int;
-        mbmi->mv[1].as_int = cur_mv[1].as_int;
-        *out_rate_mv = rate_mv;
-        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
-                                                 strides, preds1, strides);
-        *comp_model_rd_cur = best_rd_cur;
-      }
-    } else {
-      *out_rate_mv = rate_mv;
-      av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
-                                               preds1, strides);
-      model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
-          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
-          &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-      *comp_model_rd_cur =
-          RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
-    }
-
-    RD_STATS rd_stats;
-
-    if (cpi->sf.prune_comp_type_by_model_rd &&
-        (*comp_model_rd_cur > comp_best_model_rd) &&
-        comp_best_model_rd != INT64_MAX) {
-      *comp_model_rd_cur = INT64_MAX;
-      return INT64_MAX;
-    }
-    rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
-    if (rd != INT64_MAX) {
-      rd =
-          RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
-      // Backup rate and distortion for future reuse
-      comp_rate[compound_type] = rd_stats.rate;
-      comp_dist[compound_type] = rd_stats.dist;
-      comp_model_rd[compound_type] = *comp_model_rd_cur;
-    }
-  } else {
-    assert(comp_dist[compound_type] != INT64_MAX);
-    // When disable_interinter_wedge_newmv_search is set, motion refinement is
-    // disabled. Hence rate and distortion can be reused in this case as well
-    assert(IMPLIES(have_newmv_in_inter_mode(this_mode),
-                   cpi->sf.disable_interinter_wedge_newmv_search));
-    assert(mbmi->mv[0].as_int == cur_mv[0].as_int);
-    assert(mbmi->mv[1].as_int == cur_mv[1].as_int);
-    *out_rate_mv = rate_mv;
-    // Calculate RD cost based on stored stats
-    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type],
-                comp_dist[compound_type]);
-    *comp_model_rd_cur = comp_model_rd[compound_type];
-  }
-  return rd;
-}
-
-typedef struct {
-  // OBMC secondary prediction buffers and respective strides
-  uint8_t *above_pred_buf[MAX_MB_PLANE];
-  int above_pred_stride[MAX_MB_PLANE];
-  uint8_t *left_pred_buf[MAX_MB_PLANE];
-  int left_pred_stride[MAX_MB_PLANE];
-  int_mv (*single_newmv)[REF_FRAMES];
-  // Pointer to array of motion vectors to use for each ref and their rates
-  // Should point to first of 2 arrays in 2D array
-  int (*single_newmv_rate)[REF_FRAMES];
-  int (*single_newmv_valid)[REF_FRAMES];
-  // Pointer to array of predicted rate-distortion
-  // Should point to first of 2 arrays in 2D array
-  int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
-  InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES];
-  int ref_frame_cost;
-  int single_comp_cost;
-  int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
-  int skip_motion_mode;
-  INTERINTRA_MODE *inter_intra_mode;
-  int single_ref_first_pass;
-  SimpleRDState *simple_rd_state;
-} HandleInterModeArgs;
-
 /* If the current mode shares the same mv with other modes with higher cost,
  * skip this mode. */
 static int skip_repeated_mv(const AV1_COMMON *const cm,
@@ -8097,17 +1032,29 @@
                                      const MACROBLOCK *x) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   *out_mv = in_mv;
-  lower_mv_precision(&out_mv->as_mv, cm->allow_high_precision_mv,
-                     cm->cur_frame_force_integer_mv);
+  lower_mv_precision(&out_mv->as_mv, cm->features.allow_high_precision_mv,
+                     cm->features.cur_frame_force_integer_mv);
   clamp_mv2(&out_mv->as_mv, xd);
-  return !mv_check_bounds(&x->mv_limits, &out_mv->as_mv);
+  return av1_is_fullmv_in_range(&x->mv_limits,
+                                get_fullmv_from_mv(&out_mv->as_mv));
+}
+
+// To use single newmv directly for compound modes, need to clamp the mv to the
+// valid mv range. Without this, encoder would generate out of range mv, and
+// this is seen in 8k encoding.
+static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv,
+                                     int ref_idx) {
+  const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+  SubpelMvLimits mv_limits;
+
+  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv);
+  clamp_mv(&mv->as_mv, &mv_limits);
 }
 
 static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
                             const BLOCK_SIZE bsize, int_mv *cur_mv,
-                            const int mi_row, const int mi_col,
-                            int *const rate_mv,
-                            HandleInterModeArgs *const args) {
+                            int *const rate_mv, HandleInterModeArgs *const args,
+                            inter_mode_info *mode_info) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
@@ -8115,21 +1062,28 @@
   const int refs[2] = { mbmi->ref_frame[0],
                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   const int ref_mv_idx = mbmi->ref_mv_idx;
-  int i;
-
-  (void)args;
 
   if (is_comp_pred) {
-    if (this_mode == NEW_NEWMV) {
-      cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
-      cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+    const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]];
+    const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]];
 
-      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
-        joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL,
-                            0, rate_mv, 0);
+    if (this_mode == NEW_NEWMV) {
+      if (valid_mv0) {
+        cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+        clamp_mv_in_range(x, &cur_mv[0], 0);
+      }
+      if (valid_mv1) {
+        cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+        clamp_mv_in_range(x, &cur_mv[1], 1);
+      }
+
+      // aomenc1
+      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
+          !valid_mv0 || !valid_mv1) {
+        av1_joint_motion_search(cpi, x, bsize, cur_mv, NULL, 0, rate_mv);
       } else {
         *rate_mv = 0;
-        for (i = 0; i < 2; ++i) {
+        for (int i = 0; i < 2; ++i) {
           const int_mv ref_mv = av1_get_ref_mv(x, i);
           *rate_mv +=
               av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
@@ -8137,10 +1091,16 @@
         }
       }
     } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
-      cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
-      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
-        compound_single_motion_search_interinter(
-            cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1);
+      if (valid_mv1) {
+        cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+        clamp_mv_in_range(x, &cur_mv[1], 1);
+      }
+
+      // aomenc2
+      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
+          !valid_mv1) {
+        av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv,
+                                                     NULL, 0, rate_mv, 1);
       } else {
         const int_mv ref_mv = av1_get_ref_mv(x, 1);
         *rate_mv =
@@ -8149,10 +1109,16 @@
       }
     } else {
       assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
-      cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
-      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
-        compound_single_motion_search_interinter(
-            cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0);
+      if (valid_mv0) {
+        cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+        clamp_mv_in_range(x, &cur_mv[0], 0);
+      }
+
+      // aomenc3
+      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
+          !valid_mv0) {
+        av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv,
+                                                     NULL, 0, rate_mv, 0);
       } else {
         const int_mv ref_mv = av1_get_ref_mv(x, 0);
         *rate_mv =
@@ -8161,1081 +1127,56 @@
       }
     }
   } else {
-    single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
-    if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
+    // Single ref case.
+    const int ref_idx = 0;
+    int search_range = INT_MAX;
 
-    args->single_newmv[ref_mv_idx][refs[0]] = x->best_mv;
+    if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) {
+      const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+      int min_mv_diff = INT_MAX;
+      int best_match = -1;
+      MV prev_ref_mv[2] = { { 0 } };
+      for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) {
+        prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame,
+                                                     idx, x->mbmi_ext)
+                               .as_mv;
+        const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row),
+                                       abs(ref_mv.col - prev_ref_mv[idx].col));
+
+        if (min_mv_diff > ref_mv_diff) {
+          min_mv_diff = ref_mv_diff;
+          best_match = idx;
+        }
+      }
+
+      if (min_mv_diff < (16 << 3)) {
+        if (args->single_newmv_valid[best_match][refs[0]]) {
+          search_range = min_mv_diff;
+          search_range +=
+              AOMMAX(abs(args->single_newmv[best_match][refs[0]].as_mv.row -
+                         prev_ref_mv[best_match].row),
+                     abs(args->single_newmv[best_match][refs[0]].as_mv.col -
+                         prev_ref_mv[best_match].col));
+          // Get full pixel search range.
+          search_range = (search_range + 4) >> 3;
+        }
+      }
+    }
+
+    int_mv best_mv;
+    av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range,
+                             mode_info, &best_mv);
+    if (best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+    args->single_newmv[ref_mv_idx][refs[0]] = best_mv;
     args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
     args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
-
-    cur_mv[0].as_int = x->best_mv.as_int;
-
-#if USE_DISCOUNT_NEWMV_TEST
-    // Estimate the rate implications of a new mv but discount this
-    // under certain circumstances where we want to help initiate a weak
-    // motion field, where the distortion gain for a single block may not
-    // be enough to overcome the cost of a new mv.
-    if (discount_newmv_test(cpi, x, this_mode, x->best_mv)) {
-      *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
-    }
-#endif
+    cur_mv[0].as_int = best_mv.as_int;
   }
 
   return 0;
 }
 
-static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2],
-                                int num_planes) {
-  const BUFFER_SET *buf0 = dst_bufs[0];
-  dst_bufs[0] = dst_bufs[1];
-  dst_bufs[1] = buf0;
-  restore_dst_buf(xd, *dst_bufs[0], num_planes);
-}
-
-static INLINE int get_switchable_rate(MACROBLOCK *const x,
-                                      const InterpFilters filters,
-                                      const int ctx[2]) {
-  int inter_filter_cost;
-  const InterpFilter filter0 = av1_extract_interp_filter(filters, 0);
-  const InterpFilter filter1 = av1_extract_interp_filter(filters, 1);
-  inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0];
-  inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1];
-  return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
-}
-
-// calculate the rdcost of given interpolation_filter
-static INLINE int64_t interpolation_filter_rd(
-    MACROBLOCK *const x, const AV1_COMP *const cpi,
-    const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
-    const BUFFER_SET *const orig_dst, int64_t *const rd,
-    int *const switchable_rate, int *const skip_txfm_sb,
-    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx,
-    const int switchable_ctx[2], const int skip_pred, int *rate,
-    int64_t *dist) {
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 };
-  int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 };
-
-  const InterpFilters last_best = mbmi->interp_filters;
-  mbmi->interp_filters = filter_sets[filter_idx];
-  const int tmp_rs =
-      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
-
-  int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0);
-  if (min_rd > *rd) {
-    mbmi->interp_filters = last_best;
-    return 0;
-  }
-
-  (void)tile_data;
-
-  assert(skip_pred != 2);
-  assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags));
-  assert(rate[0] >= 0);
-  assert(dist[0] >= 0);
-  assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1));
-  assert(skip_sse_sb[0] >= 0);
-  assert(rate[1] >= 0);
-  assert(dist[1] >= 0);
-  assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1));
-  assert(skip_sse_sb[1] >= 0);
-
-  if (skip_pred != cpi->default_interp_skip_flags) {
-    if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) {
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                    AOM_PLANE_Y, AOM_PLANE_Y);
-#if CONFIG_COLLECT_RD_STATS == 3
-      RD_STATS rd_stats_y;
-      pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
-                            INT64_MAX);
-      PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
-#endif  // CONFIG_COLLECT_RD_STATS == 3
-      model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
-          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
-          &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL);
-      tmp_rate[1] = tmp_rate[0];
-      tmp_dist[1] = tmp_dist[0];
-    } else {
-      // only luma MC is skipped
-      tmp_rate[1] = rate[0];
-      tmp_dist[1] = dist[0];
-    }
-    if (num_planes > 1) {
-      for (int plane = 1; plane < num_planes; ++plane) {
-        int tmp_rate_uv, tmp_skip_sb_uv;
-        int64_t tmp_dist_uv, tmp_skip_sse_uv;
-        int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
-        if (tmp_rd >= *rd) {
-          mbmi->interp_filters = last_best;
-          return 0;
-        }
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                      plane, plane);
-        model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
-            cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv,
-            &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL);
-        tmp_rate[1] =
-            (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX);
-        tmp_dist[1] += tmp_dist_uv;
-        tmp_skip_sb[1] &= tmp_skip_sb_uv;
-        tmp_skip_sse[1] += tmp_skip_sse_uv;
-      }
-    }
-  } else {
-    // both luma and chroma MC is skipped
-    tmp_rate[1] = rate[1];
-    tmp_dist[1] = dist[1];
-  }
-  int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
-
-  if (tmp_rd < *rd) {
-    *rd = tmp_rd;
-    *switchable_rate = tmp_rs;
-    if (skip_pred != cpi->default_interp_skip_flags) {
-      if (skip_pred == 0) {
-        // Overwrite the data as current filter is the best one
-        tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1];
-        tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1];
-        memcpy(rate, tmp_rate, sizeof(*rate) * 2);
-        memcpy(dist, tmp_dist, sizeof(*dist) * 2);
-        memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2);
-        memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2);
-        // As luma MC data is computed, no need to recompute after the search
-        x->recalc_luma_mc_data = 0;
-      } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) {
-        // As luma MC data is not computed, update of luma data can be skipped
-        rate[1] = tmp_rate[1];
-        dist[1] = tmp_dist[1];
-        skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1];
-        skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1];
-        // As luma MC data is not recomputed and current filter is the best,
-        // indicate the possibility of recomputing MC data
-        // If current buffer contains valid MC data, toggle to indicate that
-        // luma MC data needs to be recomputed
-        x->recalc_luma_mc_data ^= 1;
-      }
-      swap_dst_buf(xd, dst_bufs, num_planes);
-    }
-    return 1;
-  }
-  mbmi->interp_filters = last_best;
-  return 0;
-}
-
-static INLINE void pred_dual_interp_filter_rd(
-    MACROBLOCK *const x, const AV1_COMP *const cpi,
-    const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
-    const BUFFER_SET *const orig_dst, int64_t *const rd,
-    int *const switchable_rate, int *const skip_txfm_sb,
-    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
-    InterpFilters filter_idx, const int switchable_ctx[2], const int skip_pred,
-    int *rate, int64_t *dist, InterpFilters af_horiz, InterpFilters af_vert,
-    InterpFilters lf_horiz, InterpFilters lf_vert) {
-  if ((af_horiz == lf_horiz) && (af_horiz != SWITCHABLE)) {
-    if (((af_vert == lf_vert) && (af_vert != SWITCHABLE))) {
-      filter_idx = af_horiz + (af_vert * SWITCHABLE_FILTERS);
-      if (filter_idx) {
-        interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
-                                orig_dst, rd, switchable_rate, skip_txfm_sb,
-                                skip_sse_sb, dst_bufs, filter_idx,
-                                switchable_ctx, skip_pred, rate, dist);
-      }
-    } else {
-      for (filter_idx = af_horiz; filter_idx < (DUAL_FILTER_SET_SIZE);
-           filter_idx += SWITCHABLE_FILTERS) {
-        if (filter_idx) {
-          interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
-                                  orig_dst, rd, switchable_rate, skip_txfm_sb,
-                                  skip_sse_sb, dst_bufs, filter_idx,
-                                  switchable_ctx, skip_pred, rate, dist);
-        }
-      }
-    }
-  } else if ((af_vert == lf_vert) && (af_vert != SWITCHABLE)) {
-    for (filter_idx = (af_vert * SWITCHABLE_FILTERS);
-         filter_idx <= ((af_vert * SWITCHABLE_FILTERS) + 2); filter_idx += 1) {
-      if (filter_idx) {
-        interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
-                                orig_dst, rd, switchable_rate, skip_txfm_sb,
-                                skip_sse_sb, dst_bufs, filter_idx,
-                                switchable_ctx, skip_pred, rate, dist);
-      }
-    }
-  }
-}
-
-// Find the best interp filter if dual_interp_filter = 0
-static INLINE void find_best_non_dual_interp_filter(
-    MACROBLOCK *const x, const AV1_COMP *const cpi,
-    const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
-    const BUFFER_SET *const orig_dst, int64_t *const rd,
-    int *const switchable_rate, int *const skip_txfm_sb,
-    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
-    const int switchable_ctx[2], const int skip_ver, const int skip_hor,
-    int *rate, int64_t *dist, int filter_set_size) {
-  int16_t i;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-
-  // Regular filter evaluation should have been done and hence the same should
-  // be the winner
-  assert(x->e_mbd.mi[0]->interp_filters == filter_sets[0]);
-  assert(filter_set_size == DUAL_FILTER_SET_SIZE);
-  if ((skip_hor & skip_ver) != cpi->default_interp_skip_flags) {
-    const AV1_COMMON *cm = &cpi->common;
-    int bsl, pred_filter_search;
-    InterpFilters af = SWITCHABLE, lf = SWITCHABLE, filter_idx = 0;
-    const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-    const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-    bsl = mi_size_wide_log2[bsize];
-    pred_filter_search =
-        cpi->sf.cb_pred_filter_search
-            ? (((mi_row + mi_col) >> bsl) +
-               get_chessboard_index(cm->current_frame.frame_number)) &
-                  0x1
-            : 0;
-    if (above_mbmi && is_inter_block(above_mbmi)) {
-      af = above_mbmi->interp_filters;
-    }
-    if (left_mbmi && is_inter_block(left_mbmi)) {
-      lf = left_mbmi->interp_filters;
-    }
-    pred_filter_search &= ((af == lf) && (af != SWITCHABLE));
-    if (pred_filter_search) {
-      filter_idx = SWITCHABLE * (af & 0xf);
-      // This assert tells that (filter_x == filter_y) for non-dual filter case
-      assert((filter_sets[filter_idx] & 0xffff) ==
-             (filter_sets[filter_idx] >> 16));
-      if (cpi->sf.adaptive_interp_filter_search &&
-          (cpi->sf.interp_filter_search_mask & (1 << (filter_idx >> 2)))) {
-        return;
-      }
-      if (filter_idx) {
-        interpolation_filter_rd(
-            x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
-            switchable_rate, skip_txfm_sb, skip_sse_sb, dst_bufs, filter_idx,
-            switchable_ctx, (skip_hor & skip_ver), rate, dist);
-      }
-      return;
-    }
-  }
-  // Reuse regular filter's modeled rd data for sharp filter for following
-  // cases
-  // 1) When bsize is 4x4
-  // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and MV in vertical
-  // direction is full-pel
-  // 3) When block height is 4 (i.e. 8x4/16x4 blocks) and MV in horizontal
-  // direction is full-pel
-  // TODO(any): Optimize cases 2 and 3 further if luma MV in relavant direction
-  // alone is full-pel
-
-  if ((bsize == BLOCK_4X4) ||
-      (block_size_wide[bsize] == 4 &&
-       skip_ver == cpi->default_interp_skip_flags) ||
-      (block_size_high[bsize] == 4 &&
-       skip_hor == cpi->default_interp_skip_flags)) {
-    int skip_pred = cpi->default_interp_skip_flags;
-    for (i = filter_set_size - 1; i > 0; i -= (SWITCHABLE_FILTERS + 1)) {
-      // This assert tells that (filter_x == filter_y) for non-dual filter case
-      assert((filter_sets[i] & 0xffff) == (filter_sets[i] >> 16));
-      if (cpi->sf.adaptive_interp_filter_search &&
-          (cpi->sf.interp_filter_search_mask & (1 << (i >> 2)))) {
-        continue;
-      }
-      interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
-                              orig_dst, rd, switchable_rate, skip_txfm_sb,
-                              skip_sse_sb, dst_bufs, i, switchable_ctx,
-                              skip_pred, rate, dist);
-      skip_pred = (skip_hor & skip_ver);
-    }
-  } else {
-    int skip_pred = (skip_hor & skip_ver);
-    for (i = (SWITCHABLE_FILTERS + 1); i < filter_set_size;
-         i += (SWITCHABLE_FILTERS + 1)) {
-      // This assert tells that (filter_x == filter_y) for non-dual filter case
-      assert((filter_sets[i] & 0xffff) == (filter_sets[i] >> 16));
-      if (cpi->sf.adaptive_interp_filter_search &&
-          (cpi->sf.interp_filter_search_mask & (1 << (i >> 2)))) {
-        continue;
-      }
-      interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
-                              orig_dst, rd, switchable_rate, skip_txfm_sb,
-                              skip_sse_sb, dst_bufs, i, switchable_ctx,
-                              skip_pred, rate, dist);
-      // In first iteration, smooth filter is evaluated. If smooth filter
-      // (which is less sharper) is the winner among regular and smooth filters,
-      // sharp filter evaluation is skipped
-      // TODO(any): Refine this gating based on modelled rd only (i.e., by not
-      // accounting switchable filter rate)
-      if (cpi->sf.skip_sharp_interp_filter_search &&
-          skip_pred != cpi->default_interp_skip_flags) {
-        if (mbmi->interp_filters == filter_sets[(SWITCHABLE_FILTERS + 1)])
-          break;
-      }
-    }
-  }
-}
-
-// check if there is saved result match with this search
-static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
-                                         MB_MODE_INFO *const mi) {
-  for (int i = 0; i < 2; ++i) {
-    if ((st->ref_frames[i] != mi->ref_frame[i]) ||
-        (st->mv[i].as_int != mi->mv[i].as_int)) {
-      return 0;
-    }
-  }
-  if (has_second_ref(mi) && st->comp_type != mi->interinter_comp.type) return 0;
-  return 1;
-}
-
-// Checks if characteristics of search match
-static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
-                                   const MACROBLOCK *const x,
-                                   const COMP_RD_STATS *st,
-                                   const MB_MODE_INFO *const mi,
-                                   int32_t *comp_rate, int64_t *comp_dist,
-                                   int64_t *comp_model_rd) {
-  // TODO(ranjit): Ensure that compound type search use regular filter always
-  // and check if following check can be removed
-  // Check if interp filter matches with previous case
-  if (st->filter != mi->interp_filters) return 0;
-
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  // Match MV and reference indices
-  for (int i = 0; i < 2; ++i) {
-    if ((st->ref_frames[i] != mi->ref_frame[i]) ||
-        (st->mv[i].as_int != mi->mv[i].as_int)) {
-      return 0;
-    }
-    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
-    if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
-  }
-
-  // Store the stats for compound average
-  comp_rate[COMPOUND_AVERAGE] = st->rate[COMPOUND_AVERAGE];
-  comp_dist[COMPOUND_AVERAGE] = st->dist[COMPOUND_AVERAGE];
-  comp_model_rd[COMPOUND_AVERAGE] = st->comp_model_rd[COMPOUND_AVERAGE];
-  comp_rate[COMPOUND_DISTWTD] = st->rate[COMPOUND_DISTWTD];
-  comp_dist[COMPOUND_DISTWTD] = st->dist[COMPOUND_DISTWTD];
-  comp_model_rd[COMPOUND_DISTWTD] = st->comp_model_rd[COMPOUND_DISTWTD];
-
-  // For compound wedge/segment, reuse data only if NEWMV is not present in
-  // either of the directions
-  if ((!have_newmv_in_inter_mode(mi->mode) &&
-       !have_newmv_in_inter_mode(st->mode)) ||
-      (cpi->sf.disable_interinter_wedge_newmv_search)) {
-    memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE],
-           sizeof(comp_rate[COMPOUND_WEDGE]) * 2);
-    memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE],
-           sizeof(comp_dist[COMPOUND_WEDGE]) * 2);
-    memcpy(&comp_model_rd[COMPOUND_WEDGE], &st->comp_model_rd[COMPOUND_WEDGE],
-           sizeof(comp_model_rd[COMPOUND_WEDGE]) * 2);
-  }
-  return 1;
-}
-
-static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
-                                              MB_MODE_INFO *const mbmi) {
-  const int comp_idx = mbmi->compound_idx;
-  const int offset = x->interp_filter_stats_idx[comp_idx];
-  for (int j = 0; j < offset; ++j) {
-    const INTERPOLATION_FILTER_STATS *st = &x->interp_filter_stats[comp_idx][j];
-    if (is_interp_filter_match(st, mbmi)) {
-      mbmi->interp_filters = st->filters;
-      return j;
-    }
-  }
-  return -1;  // no match result found
-}
-// Checks if similar compound type search case is accounted earlier
-// If found, returns relevant rd data
-static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
-                                        const MACROBLOCK *x,
-                                        const MB_MODE_INFO *const mbmi,
-                                        int32_t *comp_rate, int64_t *comp_dist,
-                                        int64_t *comp_model_rd) {
-  for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
-    if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
-                         comp_dist, comp_model_rd)) {
-      return 1;
-    }
-  }
-  return 0;  // no match result found
-}
-
-static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
-                                                  MB_MODE_INFO *const mbmi,
-                                                  int64_t rd, int skip_txfm_sb,
-                                                  int64_t skip_sse_sb,
-                                                  unsigned int pred_sse) {
-  const int comp_idx = mbmi->compound_idx;
-  const int offset = x->interp_filter_stats_idx[comp_idx];
-  if (offset < MAX_INTERP_FILTER_STATS) {
-    INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
-                                        { mbmi->mv[0], mbmi->mv[1] },
-                                        { mbmi->ref_frame[0],
-                                          mbmi->ref_frame[1] },
-                                        mbmi->interinter_comp.type,
-                                        rd,
-                                        skip_txfm_sb,
-                                        skip_sse_sb,
-                                        pred_sse };
-    x->interp_filter_stats[comp_idx][offset] = stat;
-    x->interp_filter_stats_idx[comp_idx]++;
-  }
-}
-
-static INLINE void save_comp_rd_search_stat(MACROBLOCK *x,
-                                            const MB_MODE_INFO *const mbmi,
-                                            const int32_t *comp_rate,
-                                            const int64_t *comp_dist,
-                                            const int64_t *comp_model_rd,
-                                            const int_mv *cur_mv) {
-  const int offset = x->comp_rd_stats_idx;
-  if (offset < MAX_COMP_RD_STATS) {
-    COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset;
-    memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate));
-    memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist));
-    memcpy(rd_stats->comp_model_rd, comp_model_rd,
-           sizeof(rd_stats->comp_model_rd));
-    memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv));
-    memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames));
-    rd_stats->mode = mbmi->mode;
-    rd_stats->filter = mbmi->interp_filters;
-    rd_stats->ref_mv_idx = mbmi->ref_mv_idx;
-    const MACROBLOCKD *const xd = &x->e_mbd;
-    for (int i = 0; i < 2; ++i) {
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mbmi->ref_frame[i]];
-      rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype);
-    }
-    ++x->comp_rd_stats_idx;
-  }
-}
-
-static int64_t interpolation_filter_search(
-    MACROBLOCK *const x, const AV1_COMP *const cpi,
-    const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
-    const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
-    InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd,
-    int *const switchable_rate, int *const skip_txfm_sb,
-    int64_t *const skip_sse_sb, int *skip_build_pred, HandleInterModeArgs *args,
-    int64_t ref_best_rd) {
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int need_search =
-      av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
-  int i;
-  // Index 0 corresponds to luma rd data and index 1 corresponds to cummulative
-  // data of all planes
-  int tmp_rate[2] = { 0, 0 };
-  int64_t tmp_dist[2] = { 0, 0 };
-  int best_skip_txfm_sb[2] = { 1, 1 };
-  int64_t best_skip_sse_sb[2] = { 0, 0 };
-  const int ref_frame = xd->mi[0]->ref_frame[0];
-
-  (void)single_filter;
-  int match_found_idx = -1;
-  const InterpFilter assign_filter = cm->interp_filter;
-  if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
-    match_found_idx = find_interp_filter_in_stats(x, mbmi);
-  }
-  if (match_found_idx != -1) {
-    const int comp_idx = mbmi->compound_idx;
-    *rd = x->interp_filter_stats[comp_idx][match_found_idx].rd;
-    *skip_txfm_sb =
-        x->interp_filter_stats[comp_idx][match_found_idx].skip_txfm_sb;
-    *skip_sse_sb =
-        x->interp_filter_stats[comp_idx][match_found_idx].skip_sse_sb;
-    x->pred_sse[ref_frame] =
-        x->interp_filter_stats[comp_idx][match_found_idx].pred_sse;
-    return 0;
-  }
-  if (!need_search || match_found_idx == -1) {
-    set_default_interp_filters(mbmi, assign_filter);
-  }
-  int switchable_ctx[2];
-  switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
-  switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
-  *switchable_rate =
-      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
-  if (!(*skip_build_pred)) {
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0,
-                                  av1_num_planes(cm) - 1);
-    *skip_build_pred = 1;
-  }
-
-#if CONFIG_COLLECT_RD_STATS == 3
-  RD_STATS rd_stats_y;
-  pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
-  PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
-#endif  // CONFIG_COLLECT_RD_STATS == 3
-  model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
-      cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
-      &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL);
-  if (num_planes > 1)
-    model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
-        cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1],
-        &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL,
-        NULL);
-  tmp_rate[1] =
-      (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX);
-  assert(tmp_rate[1] >= 0);
-  tmp_dist[1] = tmp_dist[0] + tmp_dist[1];
-  best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1];
-  best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1];
-  *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]);
-  *skip_txfm_sb = best_skip_txfm_sb[1];
-  *skip_sse_sb = best_skip_sse_sb[1];
-  x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
-
-  if (assign_filter != SWITCHABLE || match_found_idx != -1) {
-    return 0;
-  }
-  if (!need_search) {
-    assert(mbmi->interp_filters ==
-           av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
-    return 0;
-  }
-  if (args->modelled_rd != NULL) {
-    if (has_second_ref(mbmi)) {
-      const int ref_mv_idx = mbmi->ref_mv_idx;
-      int refs[2] = { mbmi->ref_frame[0],
-                      (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
-      const int mode0 = compound_ref0_mode(mbmi->mode);
-      const int mode1 = compound_ref1_mode(mbmi->mode);
-      const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
-                                 args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
-      if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
-        return INT64_MAX;
-      }
-    }
-  }
-
-  x->recalc_luma_mc_data = 0;
-  // skip_flag=xx (in binary form)
-  // Setting 0th flag corresonds to skipping luma MC and setting 1st bt
-  // corresponds to skipping chroma MC  skip_flag=0 corresponds to "Don't skip
-  // luma and chroma MC"  Skip flag=1 corresponds to "Skip Luma MC only"
-  // Skip_flag=2 is not a valid case
-  // skip_flag=3 corresponds to "Skip both luma and chroma MC"
-  int skip_hor = cpi->default_interp_skip_flags;
-  int skip_ver = cpi->default_interp_skip_flags;
-  const int is_compound = has_second_ref(mbmi);
-  assert(is_intrabc_block(mbmi) == 0);
-  for (int j = 0; j < 1 + is_compound; ++j) {
-    const struct scale_factors *const sf =
-        get_ref_scale_factors_const(cm, mbmi->ref_frame[j]);
-    // TODO(any): Refine skip flag calculation considering scaling
-    if (av1_is_scaled(sf)) {
-      skip_hor = 0;
-      skip_ver = 0;
-      break;
-    }
-    const MV mv = mbmi->mv[j].as_mv;
-    int skip_hor_plane = 0;
-    int skip_ver_plane = 0;
-    for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) {
-      struct macroblockd_plane *const pd = &xd->plane[k];
-      const int bw = pd->width;
-      const int bh = pd->height;
-      const MV mv_q4 = clamp_mv_to_umv_border_sb(
-          xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
-      const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-      const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-      skip_hor_plane |= ((sub_x == 0) << k);
-      skip_ver_plane |= ((sub_y == 0) << k);
-    }
-    skip_hor = skip_hor & skip_hor_plane;
-    skip_ver = skip_ver & skip_ver_plane;
-    // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
-    assert(skip_hor != 2);
-    assert(skip_ver != 2);
-  }
-  // When compond prediction type is compound segment wedge, luma MC and chroma
-  // MC need to go hand in hand as mask generated during luma MC is reuired for
-  // chroma MC. If skip_hor = 0 and skip_ver = 1, mask used for chroma MC during
-  // vertical filter decision may be incorrect as temporary MC evaluation
-  // overwrites the mask. Make skip_ver as 0 for this case so that mask is
-  // populated during luma MC
-  if (is_compound && mbmi->compound_idx == 1 &&
-      mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
-    assert(mbmi->comp_group_idx == 1);
-    if (skip_hor == 0 && skip_ver == 1) skip_ver = 0;
-  }
-  // do interp_filter search
-  const int filter_set_size = DUAL_FILTER_SET_SIZE;
-  restore_dst_buf(xd, *tmp_dst, num_planes);
-  const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
-  if (cpi->sf.use_fast_interpolation_filter_search &&
-      cm->seq_params.enable_dual_filter) {
-    // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR
-    int best_dual_mode = 0;
-    // Find best of {R}x{R,Sm,Sh}
-    const int bw = block_size_wide[bsize];
-    const int bh = block_size_high[bsize];
-    int skip_pred;
-    int bsl, pred_filter_search;
-    InterpFilters af_horiz = SWITCHABLE, af_vert = SWITCHABLE,
-                  lf_horiz = SWITCHABLE, lf_vert = SWITCHABLE, filter_idx = 0;
-    const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-    const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-    bsl = mi_size_wide_log2[bsize];
-    pred_filter_search =
-        cpi->sf.cb_pred_filter_search
-            ? (((mi_row + mi_col) >> bsl) +
-               get_chessboard_index(cm->current_frame.frame_number)) &
-                  0x1
-            : 0;
-    if (above_mbmi && is_inter_block(above_mbmi)) {
-      af_horiz = av1_extract_interp_filter(above_mbmi->interp_filters, 1);
-      af_vert = av1_extract_interp_filter(above_mbmi->interp_filters, 0);
-    }
-    if (left_mbmi && is_inter_block(left_mbmi)) {
-      lf_horiz = av1_extract_interp_filter(left_mbmi->interp_filters, 1);
-      lf_vert = av1_extract_interp_filter(left_mbmi->interp_filters, 0);
-    }
-    pred_filter_search &= !have_newmv_in_inter_mode(mbmi->mode);
-    pred_filter_search &=
-        ((af_horiz == lf_horiz) && (af_horiz != SWITCHABLE)) ||
-        ((af_vert == lf_vert) && (af_vert != SWITCHABLE));
-    if (pred_filter_search) {
-      pred_dual_interp_filter_rd(
-          x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
-          switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs,
-          filter_idx, switchable_ctx, (skip_hor & skip_ver), tmp_rate, tmp_dist,
-          af_horiz, af_vert, lf_horiz, lf_vert);
-    } else {
-      skip_pred = bw <= 4 ? cpi->default_interp_skip_flags : skip_hor;
-      for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
-        if (interpolation_filter_rd(
-                x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
-                switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs,
-                i, switchable_ctx, skip_pred, tmp_rate, tmp_dist)) {
-          best_dual_mode = i;
-        }
-        skip_pred = skip_hor;
-      }
-      // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
-      skip_pred = bh <= 4 ? cpi->default_interp_skip_flags : skip_ver;
-      assert(filter_set_size == DUAL_FILTER_SET_SIZE);
-      for (i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
-           i >= (best_dual_mode + SWITCHABLE_FILTERS);
-           i -= SWITCHABLE_FILTERS) {
-        interpolation_filter_rd(
-            x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
-            switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, i,
-            switchable_ctx, skip_pred, tmp_rate, tmp_dist);
-        skip_pred = skip_ver;
-      }
-    }
-  } else if (cm->seq_params.enable_dual_filter == 0) {
-    find_best_non_dual_interp_filter(
-        x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
-        best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
-        skip_hor, tmp_rate, tmp_dist, filter_set_size);
-  } else {
-    // EIGHTTAP_REGULAR mode is calculated beforehand
-    for (i = 1; i < filter_set_size; ++i) {
-      interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
-                              orig_dst, rd, switchable_rate, best_skip_txfm_sb,
-                              best_skip_sse_sb, dst_bufs, i, switchable_ctx,
-                              (skip_hor & skip_ver), tmp_rate, tmp_dist);
-    }
-  }
-  swap_dst_buf(xd, dst_bufs, num_planes);
-  // Recompute final MC data if required
-  if (x->recalc_luma_mc_data == 1) {
-    // Recomputing final luma MC data is required only if the same was skipped
-    // in either of the directions  Condition below is necessary, but not
-    // sufficient
-    assert((skip_hor == 1) || (skip_ver == 1));
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                  AOM_PLANE_Y, AOM_PLANE_Y);
-  }
-  *skip_txfm_sb = best_skip_txfm_sb[1];
-  *skip_sse_sb = best_skip_sse_sb[1];
-  x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
-
-  // save search results
-  if (cpi->sf.skip_repeat_interpolation_filter_search) {
-    assert(match_found_idx == -1);
-    save_interp_filter_search_stat(x, mbmi, *rd, *skip_txfm_sb, *skip_sse_sb,
-                                   x->pred_sse[ref_frame]);
-  }
-  return 0;
-}
-
-static int txfm_search(const AV1_COMP *cpi, const TileDataEnc *tile_data,
-                       MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
-                       RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-                       RD_STATS *rd_stats_uv, int mode_rate,
-                       int64_t ref_best_rd) {
-  /*
-   * This function combines y and uv planes' transform search processes
-   * together, when the prediction is generated. It first does subtraction to
-   * obtain the prediction error. Then it calls
-   * pick_tx_size_type_yrd/super_block_yrd and super_block_uvrd sequentially and
-   * handles the early terminations happening in those functions. At the end, it
-   * computes the rd_stats/_y/_uv accordingly.
-   */
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int ref_frame_1 = mbmi->ref_frame[1];
-  const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
-  const int64_t rd_thresh =
-      ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0],
-                                  x->skip_cost[skip_ctx][1] };
-  const int64_t min_header_rate =
-      mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]);
-  // Account for minimum skip and non_skip rd.
-  // Eventually either one of them will be added to mode_rate
-  const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
-  (void)tile_data;
-
-  if (min_header_rd_possible > ref_best_rd) {
-    av1_invalid_rd_stats(rd_stats_y);
-    return 0;
-  }
-
-  av1_init_rd_stats(rd_stats);
-  av1_init_rd_stats(rd_stats_y);
-  rd_stats->rate = mode_rate;
-
-  // cost and distortion
-  av1_subtract_plane(x, bsize, 0);
-  if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-    pick_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
-#if CONFIG_COLLECT_RD_STATS == 2
-    PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize);
-#endif  // CONFIG_COLLECT_RD_STATS == 2
-  } else {
-    super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
-    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-    for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
-      set_blk_skip(x, 0, i, rd_stats_y->skip);
-  }
-
-  if (rd_stats_y->rate == INT_MAX) {
-    // TODO(angiebird): check if we need this
-    // restore_dst_buf(xd, *orig_dst, num_planes);
-    mbmi->ref_frame[1] = ref_frame_1;
-    return 0;
-  }
-
-  av1_merge_rd_stats(rd_stats, rd_stats_y);
-
-  const int64_t non_skip_rdcosty =
-      RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist);
-  const int64_t skip_rdcosty =
-      RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse);
-  const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
-  if (min_rdcosty > ref_best_rd) {
-    const int64_t tokenonly_rdy =
-        AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
-               RDCOST(x->rdmult, 0, rd_stats_y->sse));
-    // Invalidate rd_stats_y to skip the rest of the motion modes search
-    if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.prune_motion_mode_level) >
-        rd_thresh)
-      av1_invalid_rd_stats(rd_stats_y);
-    mbmi->ref_frame[1] = ref_frame_1;
-    return 0;
-  }
-
-  av1_init_rd_stats(rd_stats_uv);
-  const int num_planes = av1_num_planes(cm);
-  if (num_planes > 1) {
-    int64_t ref_best_chroma_rd = ref_best_rd;
-    // Calculate best rd cost possible for chroma
-    if (cpi->sf.perform_best_rd_based_gating_for_chroma &&
-        (ref_best_chroma_rd != INT64_MAX)) {
-      ref_best_chroma_rd =
-          (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty));
-    }
-    const int is_cost_valid_uv =
-        super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
-    if (!is_cost_valid_uv) {
-      mbmi->ref_frame[1] = ref_frame_1;
-      return 0;
-    }
-    av1_merge_rd_stats(rd_stats, rd_stats_uv);
-  }
-
-  if (rd_stats->skip) {
-    rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
-    rd_stats_y->rate = 0;
-    rd_stats_uv->rate = 0;
-    rd_stats->dist = rd_stats->sse;
-    rd_stats_y->dist = rd_stats_y->sse;
-    rd_stats_uv->dist = rd_stats_uv->sse;
-    rd_stats->rate += skip_flag_cost[1];
-    mbmi->skip = 1;
-    // here mbmi->skip temporarily plays a role as what this_skip2 does
-
-    const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-    if (tmprd > ref_best_rd) {
-      mbmi->ref_frame[1] = ref_frame_1;
-      return 0;
-    }
-  } else if (!xd->lossless[mbmi->segment_id] &&
-             (RDCOST(x->rdmult,
-                     rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0],
-                     rd_stats->dist) >=
-              RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse))) {
-    rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
-    rd_stats->rate += skip_flag_cost[1];
-    rd_stats->dist = rd_stats->sse;
-    rd_stats_y->dist = rd_stats_y->sse;
-    rd_stats_uv->dist = rd_stats_uv->sse;
-    rd_stats_y->rate = 0;
-    rd_stats_uv->rate = 0;
-    mbmi->skip = 1;
-  } else {
-    rd_stats->rate += skip_flag_cost[0];
-    mbmi->skip = 0;
-  }
-
-  return 1;
-}
-
-static INLINE bool enable_wedge_search(MACROBLOCK *const x,
-                                       const AV1_COMP *const cpi) {
-  // Enable wedge search if source variance and edge strength are above
-  // the thresholds.
-  return x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
-         x->edge_strength > cpi->sf.disable_wedge_search_edge_thresh;
-}
-
-static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
-                                                  const AV1_COMP *const cpi) {
-  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge;
-}
-
-static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
-                                                  const AV1_COMP *const cpi) {
-  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge &&
-         !cpi->sf.disable_wedge_interintra_search;
-}
-
-static int handle_inter_intra_mode(const AV1_COMP *const cpi,
-                                   MACROBLOCK *const x, BLOCK_SIZE bsize,
-                                   int mi_row, int mi_col, MB_MODE_INFO *mbmi,
-                                   HandleInterModeArgs *args,
-                                   int64_t ref_best_rd, int *rate_mv,
-                                   int *tmp_rate2, const BUFFER_SET *orig_dst) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
-  int64_t rd = INT64_MAX;
-  int64_t best_interintra_rd = INT64_MAX;
-  int rmode, rate_sum;
-  int64_t dist_sum;
-  int tmp_rate_mv = 0;
-  int tmp_skip_txfm_sb;
-  int bw = block_size_wide[bsize];
-  int64_t tmp_skip_sse_sb;
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
-  uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
-  uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
-  const int *const interintra_mode_cost =
-      x->interintra_mode_cost[size_group_lookup[bsize]];
-  const int_mv mv0 = mbmi->mv[0];
-  const int is_wedge_used = is_interintra_wedge_used(bsize);
-  int rwedge = is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0;
-  mbmi->ref_frame[1] = NONE_FRAME;
-  xd->plane[0].dst.buf = tmp_buf;
-  xd->plane[0].dst.stride = bw;
-  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                AOM_PLANE_Y, AOM_PLANE_Y);
-
-  restore_dst_buf(xd, *orig_dst, num_planes);
-  mbmi->ref_frame[1] = INTRA_FRAME;
-  best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]];
-
-  if (cpi->oxcf.enable_smooth_interintra &&
-      !cpi->sf.disable_smooth_interintra) {
-    mbmi->use_wedge_interintra = 0;
-    int j = 0;
-    if (cpi->sf.reuse_inter_intra_mode == 0 ||
-        best_interintra_mode == INTERINTRA_MODES) {
-      for (j = 0; j < INTERINTRA_MODES; ++j) {
-        if ((!cpi->oxcf.enable_smooth_intra || cpi->sf.disable_smooth_intra) &&
-            (INTERINTRA_MODE)j == II_SMOOTH_PRED)
-          continue;
-        mbmi->interintra_mode = (INTERINTRA_MODE)j;
-        rmode = interintra_mode_cost[mbmi->interintra_mode];
-        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                  intrapred, bw);
-        av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-        model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
-            cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
-            &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-        rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
-        if (rd < best_interintra_rd) {
-          best_interintra_rd = rd;
-          best_interintra_mode = mbmi->interintra_mode;
-        }
-      }
-      args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
-    }
-    assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra ||
-                       cpi->sf.disable_smooth_interintra,
-                   best_interintra_mode != II_SMOOTH_PRED));
-    rmode = interintra_mode_cost[best_interintra_mode];
-    if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
-      mbmi->interintra_mode = best_interintra_mode;
-      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                intrapred, bw);
-      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-    }
-
-    RD_STATS rd_stats;
-    rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
-    if (rd != INT64_MAX) {
-      rd = RDCOST(x->rdmult, *rate_mv + rmode + rd_stats.rate + rwedge,
-                  rd_stats.dist);
-    }
-    best_interintra_rd = rd;
-    if (ref_best_rd < INT64_MAX &&
-        ((best_interintra_rd >> 4) * 9) > ref_best_rd) {
-      return -1;
-    }
-  }
-  if (is_wedge_used) {
-    int64_t best_interintra_rd_nowedge = rd;
-    int64_t best_interintra_rd_wedge = INT64_MAX;
-    int_mv tmp_mv;
-    if (enable_wedge_interintra_search(x, cpi)) {
-      mbmi->use_wedge_interintra = 1;
-
-      rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
-               x->wedge_interintra_cost[bsize][1];
-
-      if (!cpi->oxcf.enable_smooth_interintra ||
-          cpi->sf.disable_smooth_interintra) {
-        if (best_interintra_mode == INTERINTRA_MODES) {
-          mbmi->interintra_mode = II_SMOOTH_PRED;
-          best_interintra_mode = II_SMOOTH_PRED;
-          av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                    intrapred, bw);
-          best_interintra_rd_wedge =
-              pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-
-          int j = 0;
-          for (j = 0; j < INTERINTRA_MODES; ++j) {
-            mbmi->interintra_mode = (INTERINTRA_MODE)j;
-            rmode = interintra_mode_cost[mbmi->interintra_mode];
-            av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0,
-                                                      orig_dst, intrapred, bw);
-            av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-            model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
-                cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
-                &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-            rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
-            if (rd < best_interintra_rd) {
-              best_interintra_rd_wedge = rd;
-              best_interintra_mode = mbmi->interintra_mode;
-            }
-          }
-          args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
-          mbmi->interintra_mode = best_interintra_mode;
-
-          if (best_interintra_mode != II_SMOOTH_PRED) {
-            av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0,
-                                                      orig_dst, intrapred, bw);
-          }
-        } else {
-          mbmi->interintra_mode = best_interintra_mode;
-          av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                    intrapred, bw);
-          best_interintra_rd_wedge =
-              pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-        }
-      } else {
-        best_interintra_rd_wedge =
-            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-      }
-
-      rmode = interintra_mode_cost[mbmi->interintra_mode];
-      best_interintra_rd_wedge +=
-          RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0);
-      rd = INT64_MAX;
-      // Refine motion vector.
-      if (have_newmv_in_inter_mode(mbmi->mode)) {
-        // get negative of mask
-        const uint8_t *mask = av1_get_contiguous_soft_mask(
-            mbmi->interintra_wedge_index, 1, bsize);
-        tmp_mv = mbmi->mv[0];
-        compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
-                                      mi_col, intrapred, mask, bw, &tmp_rate_mv,
-                                      0);
-        if (mbmi->mv[0].as_int != tmp_mv.as_int) {
-          mbmi->mv[0].as_int = tmp_mv.as_int;
-          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                        AOM_PLANE_Y, AOM_PLANE_Y);
-          model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
-              cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
-              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-          rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
-                      dist_sum);
-        }
-      }
-      if (rd >= best_interintra_rd_wedge) {
-        tmp_mv.as_int = mv0.as_int;
-        tmp_rate_mv = *rate_mv;
-        av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-      }
-      // Evaluate closer to true rd
-      RD_STATS rd_stats;
-      rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
-      if (rd != INT64_MAX) {
-        rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rd_stats.rate,
-                    rd_stats.dist);
-      }
-      best_interintra_rd_wedge = rd;
-      if ((!cpi->oxcf.enable_smooth_interintra ||
-           cpi->sf.disable_smooth_interintra) &&
-          best_interintra_rd_wedge == INT64_MAX)
-        return -1;
-      if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
-        mbmi->use_wedge_interintra = 1;
-        mbmi->mv[0].as_int = tmp_mv.as_int;
-        *tmp_rate2 += tmp_rate_mv - *rate_mv;
-        *rate_mv = tmp_rate_mv;
-      } else {
-        mbmi->use_wedge_interintra = 0;
-        mbmi->mv[0].as_int = mv0.as_int;
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                      AOM_PLANE_Y, AOM_PLANE_Y);
-      }
-    } else {
-      if (!cpi->oxcf.enable_smooth_interintra ||
-          cpi->sf.disable_smooth_interintra)
-        return -1;
-      mbmi->use_wedge_interintra = 0;
-    }
-  } else {
-    if (best_interintra_rd == INT64_MAX) return -1;
-  }
-  if (num_planes > 1) {
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                  AOM_PLANE_U, num_planes - 1);
-  }
-  return 0;
-}
-
 // If number of valid neighbours is 1,
 // 1) ROTZOOM parameters can be obtained reliably (2 parameters from
 // one neighbouring MV)
@@ -9248,7 +1189,7 @@
                                  WarpedMotionParams *wm_params,
                                  int num_proj_ref) {
   int is_valid_warp = 1;
-  if (cpi->sf.prune_warp_using_wmtype) {
+  if (cpi->sf.inter_sf.prune_warp_using_wmtype) {
     TransformationType wmtype = get_wmtype(wm_params);
     if (num_proj_ref == 1) {
       if (wmtype != ROTZOOM) is_valid_warp = 0;
@@ -9259,80 +1200,23 @@
   return is_valid_warp;
 }
 
-struct obmc_check_mv_field_ctxt {
-  MB_MODE_INFO *current_mi;
-  int mv_field_check_result;
-};
-
-static INLINE void obmc_check_identical_mv(MACROBLOCKD *xd, int rel_mi_col,
-                                           uint8_t nb_mi_width,
-                                           MB_MODE_INFO *nb_mi, void *fun_ctxt,
-                                           const int num_planes) {
-  (void)xd;
-  (void)rel_mi_col;
-  (void)nb_mi_width;
-  (void)num_planes;
-  struct obmc_check_mv_field_ctxt *ctxt =
-      (struct obmc_check_mv_field_ctxt *)fun_ctxt;
-  const MB_MODE_INFO *current_mi = ctxt->current_mi;
-
-  if (ctxt->mv_field_check_result == 0) return;
-
-  if (nb_mi->ref_frame[0] != current_mi->ref_frame[0] ||
-      nb_mi->mv[0].as_int != current_mi->mv[0].as_int ||
-      nb_mi->interp_filters != current_mi->interp_filters) {
-    ctxt->mv_field_check_result = 0;
-  }
-}
-
-// Check if the neighbors' motions used by obmc have same parameters as for
-// the current block. If all the parameters are identical, obmc will produce
-// the same prediction as from regular bmc, therefore we can skip the
-// overlapping operations for less complexity. The parameters checked include
-// reference frame, motion vector, and interpolation filter.
-int check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                  int mi_row, int mi_col) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  struct obmc_check_mv_field_ctxt mv_field_check_ctxt = { xd->mi[0], 1 };
-
-  foreach_overlappable_nb_above(cm, xd, mi_col,
-                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
-                                obmc_check_identical_mv, &mv_field_check_ctxt);
-  foreach_overlappable_nb_left(cm, xd, mi_row,
-                               max_neighbor_obmc[mi_size_high_log2[bsize]],
-                               obmc_check_identical_mv, &mv_field_check_ctxt);
-
-  return mv_field_check_ctxt.mv_field_check_result;
-}
-
-static int skip_interintra_based_on_first_pass_stats(const AV1_COMP *const cpi,
-                                                     MACROBLOCK *const x,
-                                                     BLOCK_SIZE bsize,
-                                                     int mi_row, int mi_col) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  if (cpi->two_pass_partition_search &&
-      cpi->sf.use_first_partition_pass_interintra_stats &&
-      !x->cb_partition_scan) {
-    const int mi_width = mi_size_wide[bsize];
-    const int mi_height = mi_size_high[bsize];
-    // Search in the stats table to see if obmc motion mode was used in the
-    // first pass of partition search.
-    for (int row = mi_row; row < mi_row + mi_width;
-         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
-      for (int col = mi_col; col < mi_col + mi_height;
-           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
-        const int index = av1_first_partition_pass_stats_index(row, col);
-        const FIRST_PARTITION_PASS_STATS *const stats =
-            &x->first_partition_pass_stats[index];
-        if (stats->interintra_motion_mode_count[mbmi->ref_frame[0]]) {
-          return 0;
-        }
-      }
+static INLINE void update_mode_start_end_index(const AV1_COMP *const cpi,
+                                               int *mode_index_start,
+                                               int *mode_index_end,
+                                               int last_motion_mode_allowed,
+                                               int interintra_allowed,
+                                               int eval_motion_mode) {
+  *mode_index_start = (int)SIMPLE_TRANSLATION;
+  *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed;
+  if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+    if (!eval_motion_mode) {
+      *mode_index_end = (int)SIMPLE_TRANSLATION;
+    } else {
+      // Set the start index appropriately to process motion modes other than
+      // simple translation
+      *mode_index_start = 1;
     }
-    return 1;
   }
-  return 0;
 }
 
 // TODO(afergs): Refactor the MBMI references in here - there's four
@@ -9340,11 +1224,12 @@
 static int64_t motion_mode_rd(
     const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
     BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-    RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col,
-    HandleInterModeArgs *const args, int64_t ref_best_rd, const int *refs,
-    int *rate_mv, const BUFFER_SET *orig_dst, int64_t *best_est_rd,
-    int do_tx_search, InterModesInfo *inter_modes_info) {
+    RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *const args,
+    int64_t ref_best_rd, int64_t *ref_skip_rd, int *rate_mv,
+    const BUFFER_SET *orig_dst, int64_t *best_est_rd, int do_tx_search,
+    InterModesInfo *inter_modes_info, int eval_motion_mode) {
   const AV1_COMMON *const cm = &cpi->common;
+  const FeatureFlags *const features = &cm->features;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
@@ -9354,8 +1239,8 @@
   int best_xskip = 0, best_disable_skip = 0;
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
   const int rate_mv0 = *rate_mv;
-  int skip_interintra_mode = 0;
   const int interintra_allowed = cm->seq_params.enable_interintra_compound &&
                                  is_interintra_allowed(mbmi) &&
                                  mbmi->compound_idx;
@@ -9368,12 +1253,13 @@
   aom_clear_system_state();
   mbmi->num_proj_ref = 1;  // assume num_proj_ref >=1
   MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
-  if (cm->switchable_motion_mode) {
-    last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi,
-                                                   cm->allow_warped_motion);
+  if (features->switchable_motion_mode) {
+    last_motion_mode_allowed = motion_mode_allowed(
+        xd->global_motion, xd, mbmi, features->allow_warped_motion);
   }
+
   if (last_motion_mode_allowed == WARPED_CAUSAL) {
-    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
+    mbmi->num_proj_ref = av1_findSamples(cm, xd, pts0, pts_inref0);
   }
   const int total_samples = mbmi->num_proj_ref;
   if (total_samples == 0) {
@@ -9383,20 +1269,22 @@
   const MB_MODE_INFO base_mbmi = *mbmi;
   MB_MODE_INFO best_mbmi;
   SimpleRDState *const simple_states = &args->simple_rd_state[mbmi->ref_mv_idx];
+  const int interp_filter = features->interp_filter;
   const int switchable_rate =
-      av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0;
+      av1_is_interp_needed(xd) ? av1_get_switchable_rate(x, xd, interp_filter)
+                               : 0;
   int64_t best_rd = INT64_MAX;
   int best_rate_mv = rate_mv0;
-  const int identical_obmc_mv_field_detected =
-      (cpi->sf.skip_obmc_in_uniform_mv_field ||
-       cpi->sf.skip_wm_in_uniform_mv_field)
-          ? check_identical_obmc_mv_field(cm, xd, mi_row, mi_col)
-          : 0;
-  for (int mode_index = (int)SIMPLE_TRANSLATION;
-       mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  int mode_index_start, mode_index_end;
+  update_mode_start_end_index(cpi, &mode_index_start, &mode_index_end,
+                              last_motion_mode_allowed, interintra_allowed,
+                              eval_motion_mode);
+  for (int mode_index = mode_index_start; mode_index <= mode_index_end;
        mode_index++) {
     if (args->skip_motion_mode && mode_index) continue;
-    if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
+    if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
         args->single_ref_first_pass && mode_index)
       break;
     int tmp_rate2 = rate2_nocoeff;
@@ -9411,23 +1299,20 @@
       assert(mbmi->ref_frame[1] != INTRA_FRAME);
     }
 
-    if (cpi->oxcf.enable_obmc == 0 && mbmi->motion_mode == OBMC_CAUSAL)
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] <
+                           cpi->sf.inter_sf.prune_obmc_prob_thresh;
+    if ((cpi->oxcf.enable_obmc == 0 || cpi->sf.inter_sf.disable_obmc ||
+         cpi->sf.rt_sf.use_nonrd_pick_mode || prune_obmc) &&
+        mbmi->motion_mode == OBMC_CAUSAL)
       continue;
 
-    if (identical_obmc_mv_field_detected) {
-      if (cpi->sf.skip_obmc_in_uniform_mv_field &&
-          mbmi->motion_mode == OBMC_CAUSAL)
-        continue;
-      if (cpi->sf.skip_wm_in_uniform_mv_field &&
-          mbmi->motion_mode == WARPED_CAUSAL)
-        continue;
-    }
-
     if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) {
       // SIMPLE_TRANSLATION mode: no need to recalculate.
       // The prediction is calculated before motion_mode_rd() is called in
       // handle_inter_mode()
-      if (cpi->sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred) {
+      if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
+          !is_comp_pred) {
         if (args->single_ref_first_pass == 0) {
           if (simple_states->early_skipped) {
             assert(simple_states->rd_stats.rdcost == INT64_MAX);
@@ -9439,7 +1324,9 @@
             best_rd_stats_y = simple_states->rd_stats_y;
             best_rd_stats_uv = simple_states->rd_stats_uv;
             memcpy(best_blk_skip, simple_states->blk_skip,
-                   sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+                   sizeof(x->blk_skip[0]) * xd->height * xd->width);
+            av1_copy_array(best_tx_type_map, simple_states->tx_type_map,
+                           xd->height * xd->width);
             best_xskip = simple_states->skip;
             best_disable_skip = simple_states->disable_skip;
             best_mbmi = *mbmi;
@@ -9452,40 +1339,35 @@
       const uint32_t cur_mv = mbmi->mv[0].as_int;
       assert(!is_comp_pred);
       if (have_newmv_in_inter_mode(this_mode)) {
-        single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
-        mbmi->mv[0].as_int = x->best_mv.as_int;
-#if USE_DISCOUNT_NEWMV_TEST
-        if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
-          tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
-        }
-#endif
+        av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL,
+                                 &mbmi->mv[0]);
         tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
       }
-      if (mbmi->mv[0].as_int != cur_mv) {
+      if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) {
         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
                                       0, av1_num_planes(cm) - 1);
       }
       av1_build_obmc_inter_prediction(
-          cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
+          cm, xd, args->above_pred_buf, args->above_pred_stride,
           args->left_pred_buf, args->left_pred_stride);
     } else if (mbmi->motion_mode == WARPED_CAUSAL) {
       int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
       mbmi->motion_mode = WARPED_CAUSAL;
       mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
-      mbmi->interp_filters = av1_broadcast_interp_filter(
-          av1_unswitchable_filter(cm->interp_filter));
+      mbmi->interp_filters =
+          av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
 
       memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
       memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
       // Select the samples according to motion vector difference
       if (mbmi->num_proj_ref > 1) {
-        mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
-                                           mbmi->num_proj_ref, bsize);
+        mbmi->num_proj_ref = av1_selectSamples(
+            &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize);
       }
 
-      if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
-                           mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-                           &mbmi->wm_params, mi_row, mi_col)) {
+      if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+                               mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+                               &mbmi->wm_params, mi_row, mi_col)) {
         // Refine MV for NEWMV mode
         assert(!is_comp_pred);
         if (have_newmv_in_inter_mode(this_mode)) {
@@ -9493,31 +1375,28 @@
           const WarpedMotionParams wm_params0 = mbmi->wm_params;
           const int num_proj_ref0 = mbmi->num_proj_ref;
 
-          if (cpi->sf.prune_warp_using_wmtype) {
+          if (cpi->sf.inter_sf.prune_warp_using_wmtype) {
             TransformationType wmtype = get_wmtype(&mbmi->wm_params);
             if (wmtype < ROTZOOM) continue;
           }
 
+          const int_mv ref_mv = av1_get_ref_mv(x, 0);
+          SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+          av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+                                            &ref_mv.as_mv, NULL);
+
           // Refine MV in a small range.
-          av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
+          av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
                                total_samples);
 
           // Keep the refined MV and WM parameters.
           if (mv0.as_int != mbmi->mv[0].as_int) {
-            const int ref = refs[0];
-            const int_mv ref_mv = av1_get_ref_mv(x, 0);
             tmp_rate_mv = av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv,
                                           x->nmv_vec_cost, x->mv_cost_stack,
                                           MV_COST_WEIGHT);
-
-            if (cpi->sf.adaptive_motion_search)
-              x->pred_mv[ref] = mbmi->mv[0].as_mv;
-
-#if USE_DISCOUNT_NEWMV_TEST
-            if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
-              tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+            if (cpi->sf.mv_sf.adaptive_motion_search) {
+              x->pred_mv[mbmi->ref_frame[0]] = mbmi->mv[0].as_mv;
             }
-#endif
             tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
           } else {
             // Restore the old MV and WM parameters.
@@ -9536,16 +1415,39 @@
         continue;
       }
     } else if (is_interintra_mode) {
-      skip_interintra_mode = skip_interintra_based_on_first_pass_stats(
-          cpi, x, bsize, mi_row, mi_col);
-      if (skip_interintra_mode) continue;
-      const int ret = handle_inter_intra_mode(
-          cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv,
-          &tmp_rate2, orig_dst);
+      const int ret =
+          av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd,
+                                      &tmp_rate_mv, &tmp_rate2, orig_dst);
       if (ret < 0) continue;
     }
 
-    x->skip = 0;
+    // If we are searching newmv and the mv is the same as refmv, skip the
+    // current mode
+    if (this_mode == NEW_NEWMV) {
+      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+      const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+      if (mbmi->mv[0].as_int == ref_mv_0.as_int ||
+          mbmi->mv[1].as_int == ref_mv_1.as_int) {
+        continue;
+      }
+    } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+      const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+      if (mbmi->mv[1].as_int == ref_mv_1.as_int) {
+        continue;
+      }
+    } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+      if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+        continue;
+      }
+    } else if (this_mode == NEWMV) {
+      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+      if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+        continue;
+      }
+    }
+
+    x->force_skip = 0;
     rd_stats->dist = 0;
     rd_stats->sse = 0;
     rd_stats->skip = 1;
@@ -9554,18 +1456,6 @@
     if (interintra_allowed) {
       rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
                                           [mbmi->ref_frame[1] == INTRA_FRAME];
-      if (mbmi->ref_frame[1] == INTRA_FRAME) {
-        rd_stats->rate += x->interintra_mode_cost[size_group_lookup[bsize]]
-                                                 [mbmi->interintra_mode];
-        if (is_interintra_wedge_used(bsize)) {
-          rd_stats->rate +=
-              x->wedge_interintra_cost[bsize][mbmi->use_wedge_interintra];
-          if (mbmi->use_wedge_interintra) {
-            rd_stats->rate +=
-                av1_cost_literal(get_interintra_wedge_bits(bsize));
-          }
-        }
-      }
     }
     if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
         (mbmi->ref_frame[1] != INTRA_FRAME)) {
@@ -9576,39 +1466,30 @@
       }
     }
 
-    if (cpi->sf.model_based_motion_mode_rd_breakout && do_tx_search) {
-      int model_rate;
-      int64_t model_dist;
-      model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
-          cpi, mbmi->sb_type, x, xd, 0, num_planes - 1, mi_row, mi_col,
-          &model_rate, &model_dist, NULL, NULL, NULL, NULL, NULL);
-      const int64_t est_rd =
-          RDCOST(x->rdmult, rd_stats->rate + model_rate, model_dist);
-      if ((est_rd >> 3) * 6 > ref_best_rd) {
-        mbmi->ref_frame[1] = ref_frame_1;
-        continue;
-      }
-    }
-
     if (!do_tx_search) {
       int64_t curr_sse = -1;
+      int64_t sse_y = -1;
       int est_residue_cost = 0;
       int64_t est_dist = 0;
       int64_t est_rd = 0;
-      if (cpi->sf.inter_mode_rd_model_estimation == 1) {
-        curr_sse = get_sse(cpi, x);
+      if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+        curr_sse = get_sse(cpi, x, &sse_y);
+        // Scale luma SSE as per bit depth so as to be consistent with
+        // model_rd_sb_fn and compound type rd
+        sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2);
         const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
                                                  &est_residue_cost, &est_dist);
         (void)has_est_rd;
         assert(has_est_rd);
-      } else if (cpi->sf.inter_mode_rd_model_estimation == 2 ||
-                 cpi->sf.use_nonrd_pick_mode) {
+      } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 ||
+                 cpi->sf.rt_sf.use_nonrd_pick_mode) {
         model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
-            cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col,
-            &est_residue_cost, &est_dist, NULL, &curr_sse, NULL, NULL, NULL);
+            cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist,
+            NULL, &curr_sse, NULL, NULL, NULL);
+        sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]];
       }
       est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist);
-      if (est_rd * 0.8 > *best_est_rd) {
+      if (est_rd * 0.80 > *best_est_rd) {
         mbmi->ref_frame[1] = ref_frame_1;
         continue;
       }
@@ -9616,25 +1497,48 @@
       rd_stats->rate += est_residue_cost;
       rd_stats->dist = est_dist;
       rd_stats->rdcost = est_rd;
-      *best_est_rd = AOMMIN(*best_est_rd, rd_stats->rdcost);
+      if (rd_stats->rdcost < *best_est_rd) {
+        *best_est_rd = rd_stats->rdcost;
+        assert(sse_y >= 0);
+        ref_skip_rd[1] = cpi->sf.inter_sf.txfm_rd_gate_level
+                             ? RDCOST(x->rdmult, mode_rate, (sse_y << 4))
+                             : INT64_MAX;
+      }
       if (cm->current_frame.reference_mode == SINGLE_REFERENCE) {
         if (!is_comp_pred) {
           assert(curr_sse >= 0);
           inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
-                                rd_stats->rdcost, false, NULL, rd_stats,
-                                rd_stats_y, rd_stats_uv, mbmi);
+                                rd_stats->rdcost, rd_stats, rd_stats_y,
+                                rd_stats_uv, mbmi);
         }
       } else {
         assert(curr_sse >= 0);
         inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
-                              rd_stats->rdcost, false, NULL, rd_stats,
-                              rd_stats_y, rd_stats_uv, mbmi);
+                              rd_stats->rdcost, rd_stats, rd_stats_y,
+                              rd_stats_uv, mbmi);
       }
+      mbmi->skip = 0;
     } else {
-      if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, rd_stats,
-                       rd_stats_y, rd_stats_uv, rd_stats->rate, ref_best_rd)) {
+      int64_t skip_rd = INT64_MAX;
+      int64_t skip_rdy = INT64_MAX;
+      if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+        // Check if the mode is good enough based on skip RD
+        int64_t sse_y = INT64_MAX;
+        int64_t curr_sse = get_sse(cpi, x, &sse_y);
+        // Scale luma SSE as per bit depth so as to be consistent with
+        // model_rd_sb_fn and compound type rd
+        sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2);
+        skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse);
+        skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4));
+        int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd,
+                                        cpi->sf.inter_sf.txfm_rd_gate_level, 0);
+        if (!eval_txfm) continue;
+      }
+
+      if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+                           rd_stats->rate, ref_best_rd)) {
         if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
-          if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
+          if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
               !is_comp_pred) {
             simple_states->early_skipped = 1;
           }
@@ -9644,30 +1548,25 @@
       }
 
       const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-      ref_best_rd = AOMMIN(ref_best_rd, curr_rd);
+      if (curr_rd < ref_best_rd) {
+        ref_best_rd = curr_rd;
+        ref_skip_rd[0] = skip_rd;
+        ref_skip_rd[1] = skip_rdy;
+      }
       *disable_skip = 0;
-      if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+      if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
         const int skip_ctx = av1_get_skip_context(xd);
         inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
                              rd_stats->dist,
                              rd_stats_y->rate + rd_stats_uv->rate +
                                  x->skip_cost[skip_ctx][mbmi->skip]);
       }
-
-      // 2 means to both do the tx search and also update the inter_modes_info
-      // structure, since some modes will be conditionally TX searched.
-      if (do_tx_search == 2) {
-        rd_stats->rdcost = curr_rd;
-        inter_modes_info_push(inter_modes_info, rd_stats->rate, rd_stats->sse,
-                              curr_rd, true, x->blk_skip, rd_stats, rd_stats_y,
-                              rd_stats_uv, mbmi);
-      }
     }
 
     if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
       if (is_nontrans_global_motion(xd, xd->mi[0])) {
-        mbmi->interp_filters = av1_broadcast_interp_filter(
-            av1_unswitchable_filter(cm->interp_filter));
+        mbmi->interp_filters =
+            av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
       }
     }
 
@@ -9680,8 +1579,10 @@
         simple_states->rd_stats_y = *rd_stats_y;
         simple_states->rd_stats_uv = *rd_stats_uv;
         memcpy(simple_states->blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
-        simple_states->skip = x->skip;
+               sizeof(x->blk_skip[0]) * xd->height * xd->width);
+        av1_copy_array(simple_states->tx_type_map, xd->tx_type_map,
+                       xd->height * xd->width);
+        simple_states->skip = mbmi->skip;
         simple_states->disable_skip = *disable_skip;
       }
     }
@@ -9693,10 +1594,13 @@
       best_rate_mv = tmp_rate_mv;
       if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
       memcpy(best_blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
-      best_xskip = x->skip;
+             sizeof(x->blk_skip[0]) * xd->height * xd->width);
+      av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
+      best_xskip = mbmi->skip;
       best_disable_skip = *disable_skip;
-      if (best_xskip) break;
+      // TODO(anyone): evaluate the quality and speed trade-off of the early
+      // termination logic below.
+      // if (best_xskip) break;
     }
   }
   mbmi->ref_frame[1] = ref_frame_1;
@@ -9711,8 +1615,9 @@
   *rd_stats_y = best_rd_stats_y;
   if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
   memcpy(x->blk_skip, best_blk_skip,
-         sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
-  x->skip = best_xskip;
+         sizeof(x->blk_skip[0]) * xd->height * xd->width);
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
+  x->force_skip = best_xskip;
   *disable_skip = best_disable_skip;
 
   restore_dst_buf(xd, *orig_dst, num_planes);
@@ -9720,11 +1625,14 @@
 }
 
 static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
-                            MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row,
-                            int mi_col, const BUFFER_SET *const orig_dst) {
+                            MACROBLOCK *const x, BLOCK_SIZE bsize,
+                            const BUFFER_SET *const orig_dst) {
+  assert(bsize < BLOCK_SIZES_ALL);
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
   av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0,
                                 av1_num_planes(cm) - 1);
 
@@ -9737,7 +1645,7 @@
     const int bw = block_size_wide[plane_bsize];
     const int bh = block_size_high[plane_bsize];
 
-    av1_subtract_plane(x, bsize, plane);
+    av1_subtract_plane(x, plane_bsize, plane);
     int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4;
     total_sse += sse;
   }
@@ -9750,36 +1658,63 @@
   return 0;
 }
 
-static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode,
-                                    uint8_t ref_mv_idx) {
-  assert(is_inter_singleref_mode(single_mode));
-  int ref_mv_offset;
+// Check NEARESTMV, NEARMV, GLOBALMV ref mvs for duplicate and skip the relevant
+// mode
+static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext,
+                                      int ref_idx,
+                                      const MV_REFERENCE_FRAME *ref_frame,
+                                      PREDICTION_MODE single_mode) {
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+  assert(single_mode != NEWMV);
   if (single_mode == NEARESTMV) {
-    ref_mv_offset = 0;
+    return 0;
   } else if (single_mode == NEARMV) {
-    ref_mv_offset = ref_mv_idx + 1;
-  } else {
-    ref_mv_offset = -1;
+    // when ref_mv_count = 0, NEARESTMV and NEARMV are same as GLOBALMV
+    // when ref_mv_count = 1, NEARMV is same as GLOBALMV
+    if (ref_mv_count < 2) return 1;
+  } else if (single_mode == GLOBALMV) {
+    // when ref_mv_count == 0, GLOBALMV is same as NEARESTMV
+    if (ref_mv_count == 0) return 1;
+    // when ref_mv_count == 1, NEARMV is same as GLOBALMV
+    else if (ref_mv_count == 1)
+      return 0;
+
+    int stack_size = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count);
+    // Check GLOBALMV is matching with any mv in ref_mv_stack
+    for (int ref_mv_idx = 0; ref_mv_idx < stack_size; ref_mv_idx++) {
+      int_mv this_mv;
+
+      if (ref_idx == 0)
+        this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+      else
+        this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+      if (this_mv.as_int == mbmi_ext->global_mvs[ref_frame[ref_idx]].as_int)
+        return 1;
+    }
   }
-  return ref_mv_offset;
+  return 0;
 }
 
-static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
-                               int ref_idx, int ref_mv_idx,
-                               const MV_REFERENCE_FRAME *ref_frame,
-                               const MB_MODE_INFO_EXT *mbmi_ext) {
-  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
-  const int is_comp_pred = ref_frame[1] > INTRA_FRAME;
-  const PREDICTION_MODE single_mode =
-      get_single_mode(this_mode, ref_idx, is_comp_pred);
+static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+                              int ref_idx, int ref_mv_idx,
+                              int skip_repeated_ref_mv,
+                              const MV_REFERENCE_FRAME *ref_frame,
+                              const MB_MODE_INFO_EXT *mbmi_ext) {
+  const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
   assert(is_inter_singleref_mode(single_mode));
   if (single_mode == NEWMV) {
     this_mv->as_int = INVALID_MV;
   } else if (single_mode == GLOBALMV) {
+    if (skip_repeated_ref_mv &&
+        check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+      return 0;
     *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
   } else {
     assert(single_mode == NEARMV || single_mode == NEARESTMV);
-    const int ref_mv_offset = get_ref_mv_offset(single_mode, ref_mv_idx);
+    const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+    const int ref_mv_offset = single_mode == NEARESTMV ? 0 : ref_mv_idx + 1;
     if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
       assert(ref_mv_offset >= 0);
       if (ref_idx == 0) {
@@ -9790,26 +1725,38 @@
             mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
       }
     } else {
+      if (skip_repeated_ref_mv &&
+          check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+        return 0;
       *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
     }
   }
+  return 1;
 }
 
 // This function update the non-new mv for the current prediction mode
 static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
-                               const AV1_COMMON *cm, const MACROBLOCK *x) {
+                               const AV1_COMMON *cm, const MACROBLOCK *x,
+                               int skip_repeated_ref_mv) {
   const MACROBLOCKD *xd = &x->e_mbd;
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
+
   int ret = 1;
   for (int i = 0; i < is_comp_pred + 1; ++i) {
     int_mv this_mv;
-    get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame,
-                x->mbmi_ext);
-    const PREDICTION_MODE single_mode =
-        get_single_mode(this_mode, i, is_comp_pred);
+    this_mv.as_int = INVALID_MV;
+    ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
+                      skip_repeated_ref_mv, mbmi->ref_frame, x->mbmi_ext);
+    if (!ret) return 0;
+    const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
     if (single_mode == NEWMV) {
-      cur_mv[i] = this_mv;
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      cur_mv[i] =
+          (i == 0) ? x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+                         .this_mv
+                   : x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+                         .comp_mv;
     } else {
       ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x);
     }
@@ -9819,14 +1766,13 @@
 
 static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
                                const MB_MODE_INFO_EXT *mbmi_ext,
-                               int (*drl_mode_cost0)[2],
+                               const int (*const drl_mode_cost0)[2],
                                int8_t ref_frame_type) {
   int cost = 0;
   if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
     for (int idx = 0; idx < 2; ++idx) {
       if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-        uint8_t drl_ctx =
-            av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+        uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
         cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx];
         if (mbmi->ref_mv_idx == idx) return cost;
       }
@@ -9837,8 +1783,7 @@
   if (have_nearmv_in_inter_mode(mbmi->mode)) {
     for (int idx = 1; idx < 3; ++idx) {
       if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-        uint8_t drl_ctx =
-            av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+        uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
         cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)];
         if (mbmi->ref_mv_idx == (idx - 1)) return cost;
       }
@@ -9848,294 +1793,11 @@
   return cost;
 }
 
-// Struct for buffers used by compound_type_rd() function.
-// For sizes and alignment of these arrays, refer to
-// alloc_compound_type_rd_buffers() function.
-typedef struct {
-  uint8_t *pred0;
-  uint8_t *pred1;
-  int16_t *residual1;          // src - pred1
-  int16_t *diff10;             // pred1 - pred0
-  uint8_t *tmp_best_mask_buf;  // backup of the best segmentation mask
-} CompoundTypeRdBuffers;
-
-static int compound_type_rd(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_col,
-    int mi_row, int_mv *cur_mv, int mode_search_mask, int masked_compound_used,
-    const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
-    CompoundTypeRdBuffers *buffers, int *rate_mv, int64_t *rd,
-    RD_STATS *rd_stats, int64_t ref_best_rd, int *is_luma_interp_done) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const PREDICTION_MODE this_mode = mbmi->mode;
-  const int bw = block_size_wide[bsize];
-  int rs2;
-  int_mv best_mv[2];
-  int best_tmp_rate_mv = *rate_mv;
-  INTERINTER_COMPOUND_DATA best_compound_data;
-  best_compound_data.type = COMPOUND_AVERAGE;
-  uint8_t *preds0[1] = { buffers->pred0 };
-  uint8_t *preds1[1] = { buffers->pred1 };
-  int strides[1] = { bw };
-  int tmp_rate_mv;
-  const int num_pix = 1 << num_pels_log2_lookup[bsize];
-  const int mask_len = 2 * num_pix * sizeof(uint8_t);
-  COMPOUND_TYPE cur_type;
-  int best_compmode_interinter_cost = 0;
-  int calc_pred_masked_compound = 1;
-  int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
-                                        INT64_MAX };
-  int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
-  int64_t comp_model_rd[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
-                                            INT64_MAX };
-  const int match_found =
-      find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rd);
-
-  best_mv[0].as_int = cur_mv[0].as_int;
-  best_mv[1].as_int = cur_mv[1].as_int;
-  *rd = INT64_MAX;
-  int rate_sum, tmp_skip_txfm_sb;
-  int64_t dist_sum, tmp_skip_sse_sb;
-  int64_t comp_best_model_rd = INT64_MAX;
-  // Special handling if both compound_average and compound_distwtd
-  // are to be searched. In this case, first estimate between the two
-  // modes and then call estimate_yrd_for_sb() only for the better of
-  // the two.
-  const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE));
-  const int try_distwtd_comp =
-      ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
-       cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 &&
-       cpi->sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
-  const int try_average_and_distwtd_comp =
-      try_average_comp && try_distwtd_comp &&
-      comp_rate[COMPOUND_AVERAGE] == INT_MAX &&
-      comp_rate[COMPOUND_DISTWTD] == INT_MAX;
-  for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
-    if (((1 << cur_type) & mode_search_mask) == 0) {
-      if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
-      continue;
-    }
-    if (!is_interinter_compound_used(cur_type, bsize)) continue;
-    if (cur_type >= COMPOUND_WEDGE && !masked_compound_used) break;
-    if (cur_type == COMPOUND_DISTWTD && !try_distwtd_comp) continue;
-    if (cur_type == COMPOUND_AVERAGE && try_average_and_distwtd_comp) continue;
-
-    int64_t comp_model_rd_cur = INT64_MAX;
-    tmp_rate_mv = *rate_mv;
-    int64_t best_rd_cur = INT64_MAX;
-    const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
-    const int comp_index_ctx = get_comp_index_context(cm, xd);
-
-    if (cur_type == COMPOUND_DISTWTD && try_average_and_distwtd_comp) {
-      int est_rate[2];
-      int64_t est_dist[2], est_rd[2];
-
-      int masked_type_cost[2] = { 0, 0 };
-      mbmi->comp_group_idx = 0;
-
-      // First find the modeled rd cost for COMPOUND_AVERAGE
-      mbmi->interinter_comp.type = COMPOUND_AVERAGE;
-      mbmi->compound_idx = 1;
-      if (masked_compound_used) {
-        masked_type_cost[COMPOUND_AVERAGE] +=
-            x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
-      }
-      masked_type_cost[COMPOUND_AVERAGE] +=
-          x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                    AOM_PLANE_Y, AOM_PLANE_Y);
-      *is_luma_interp_done = 1;
-      model_rd_sb_fn[MODELRD_CURVFIT](
-          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &est_rate[COMPOUND_AVERAGE],
-          &est_dist[COMPOUND_AVERAGE], NULL, NULL, NULL, NULL, NULL);
-      est_rate[COMPOUND_AVERAGE] += masked_type_cost[COMPOUND_AVERAGE];
-      est_rd[COMPOUND_AVERAGE] =
-          RDCOST(x->rdmult, est_rate[COMPOUND_AVERAGE] + *rate_mv,
-                 est_dist[COMPOUND_AVERAGE]);
-      restore_dst_buf(xd, *tmp_dst, 1);
-
-      // Next find the modeled rd cost for COMPOUND_DISTWTD
-      mbmi->interinter_comp.type = COMPOUND_DISTWTD;
-      mbmi->compound_idx = 0;
-      if (masked_compound_used) {
-        masked_type_cost[COMPOUND_DISTWTD] +=
-            x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
-      }
-      masked_type_cost[COMPOUND_DISTWTD] +=
-          x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                    AOM_PLANE_Y, AOM_PLANE_Y);
-      model_rd_sb_fn[MODELRD_CURVFIT](
-          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &est_rate[COMPOUND_DISTWTD],
-          &est_dist[COMPOUND_DISTWTD], NULL, NULL, NULL, NULL, NULL);
-      est_rate[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_DISTWTD];
-      est_rd[COMPOUND_DISTWTD] =
-          RDCOST(x->rdmult, est_rate[COMPOUND_DISTWTD] + *rate_mv,
-                 est_dist[COMPOUND_DISTWTD]);
-
-      // Choose the better of the two based on modeled cost and call
-      // estimate_yrd_for_sb() for that one.
-      if (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD]) {
-        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
-        mbmi->compound_idx = 1;
-        restore_dst_buf(xd, *orig_dst, 1);
-        RD_STATS est_rd_stats;
-        const int64_t est_rd_ =
-            estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
-        rs2 = masked_type_cost[COMPOUND_AVERAGE];
-        if (est_rd_ != INT64_MAX) {
-          best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
-                               est_rd_stats.dist);
-          restore_dst_buf(xd, *tmp_dst, 1);
-          comp_rate[COMPOUND_AVERAGE] = est_rd_stats.rate;
-          comp_dist[COMPOUND_AVERAGE] = est_rd_stats.dist;
-          comp_model_rd[COMPOUND_AVERAGE] = est_rd[COMPOUND_AVERAGE];
-          comp_model_rd_cur = est_rd[COMPOUND_AVERAGE];
-        }
-        restore_dst_buf(xd, *tmp_dst, 1);
-      } else {
-        RD_STATS est_rd_stats;
-        const int64_t est_rd_ =
-            estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
-        rs2 = masked_type_cost[COMPOUND_DISTWTD];
-        if (est_rd_ != INT64_MAX) {
-          best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
-                               est_rd_stats.dist);
-          comp_rate[COMPOUND_DISTWTD] = est_rd_stats.rate;
-          comp_dist[COMPOUND_DISTWTD] = est_rd_stats.dist;
-          comp_model_rd[COMPOUND_DISTWTD] = est_rd[COMPOUND_DISTWTD];
-          comp_model_rd_cur = est_rd[COMPOUND_DISTWTD];
-        }
-      }
-    } else {
-      mbmi->interinter_comp.type = cur_type;
-      int masked_type_cost = 0;
-      if (cur_type == COMPOUND_AVERAGE || cur_type == COMPOUND_DISTWTD) {
-        mbmi->comp_group_idx = 0;
-        mbmi->compound_idx = (cur_type == COMPOUND_AVERAGE);
-        if (masked_compound_used) {
-          masked_type_cost +=
-              x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
-        }
-        masked_type_cost +=
-            x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
-        rs2 = masked_type_cost;
-        const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
-        if (mode_rd < ref_best_rd) {
-          // Reuse data if matching record is found
-          if (comp_rate[cur_type] == INT_MAX) {
-            av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst,
-                                          bsize, AOM_PLANE_Y, AOM_PLANE_Y);
-            if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
-            RD_STATS est_rd_stats;
-            const int64_t est_rd =
-                estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
-            if (comp_rate[cur_type] != INT_MAX) {
-              assert(comp_rate[cur_type] == est_rd_stats.rate);
-              assert(comp_dist[cur_type] == est_rd_stats.dist);
-            }
-            if (est_rd != INT64_MAX) {
-              best_rd_cur =
-                  RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
-                         est_rd_stats.dist);
-              model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
-                  cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
-                  &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-              comp_model_rd_cur =
-                  RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
-
-              // Backup rate and distortion for future reuse
-              comp_rate[cur_type] = est_rd_stats.rate;
-              comp_dist[cur_type] = est_rd_stats.dist;
-              comp_model_rd[cur_type] = comp_model_rd_cur;
-            }
-          } else {
-            // Calculate RD cost based on stored stats
-            assert(comp_dist[cur_type] != INT64_MAX);
-            best_rd_cur =
-                RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type],
-                       comp_dist[cur_type]);
-            comp_model_rd_cur = comp_model_rd[cur_type];
-          }
-        }
-        // use spare buffer for following compound type try
-        if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
-      } else {
-        mbmi->comp_group_idx = 1;
-        mbmi->compound_idx = 1;
-        masked_type_cost +=
-            x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
-        masked_type_cost +=
-            x->compound_type_cost[bsize][cur_type - COMPOUND_WEDGE];
-        rs2 = masked_type_cost;
-
-        if (((*rd / cpi->max_comp_type_rd_threshold_div) *
-             cpi->max_comp_type_rd_threshold_mul) < ref_best_rd) {
-          const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
-
-          if (!((compound_type == COMPOUND_WEDGE &&
-                 !enable_wedge_interinter_search(x, cpi)) ||
-                (compound_type == COMPOUND_DIFFWTD &&
-                 !cpi->oxcf.enable_diff_wtd_comp)))
-            best_rd_cur = build_and_cost_compound_type(
-                cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
-                &tmp_rate_mv, preds0, preds1, buffers->residual1,
-                buffers->diff10, strides, mi_row, mi_col, rd_stats->rate,
-                ref_best_rd, &calc_pred_masked_compound, comp_rate, comp_dist,
-                comp_model_rd, comp_best_model_rd, &comp_model_rd_cur);
-        }
-      }
-    }
-    if (best_rd_cur < *rd) {
-      *rd = best_rd_cur;
-      comp_best_model_rd = comp_model_rd_cur;
-      best_compound_data = mbmi->interinter_comp;
-      if (masked_compound_used && cur_type >= COMPOUND_WEDGE) {
-        memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
-      }
-      best_compmode_interinter_cost = rs2;
-      if (have_newmv_in_inter_mode(this_mode)) {
-        if (cur_type == COMPOUND_WEDGE) {
-          best_tmp_rate_mv = tmp_rate_mv;
-          best_mv[0].as_int = mbmi->mv[0].as_int;
-          best_mv[1].as_int = mbmi->mv[1].as_int;
-        } else {
-          best_mv[0].as_int = cur_mv[0].as_int;
-          best_mv[1].as_int = cur_mv[1].as_int;
-        }
-      }
-    }
-    // reset to original mvs for next iteration
-    mbmi->mv[0].as_int = cur_mv[0].as_int;
-    mbmi->mv[1].as_int = cur_mv[1].as_int;
-  }
-  if (mbmi->interinter_comp.type != best_compound_data.type) {
-    mbmi->comp_group_idx = (best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
-    mbmi->compound_idx = !(best_compound_data.type == COMPOUND_DISTWTD);
-    mbmi->interinter_comp = best_compound_data;
-    memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
-  }
-  if (have_newmv_in_inter_mode(this_mode)) {
-    mbmi->mv[0].as_int = best_mv[0].as_int;
-    mbmi->mv[1].as_int = best_mv[1].as_int;
-    if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
-      rd_stats->rate += best_tmp_rate_mv - *rate_mv;
-      *rate_mv = best_tmp_rate_mv;
-    }
-  }
-  restore_dst_buf(xd, *orig_dst, 1);
-  if (!match_found)
-    save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rd,
-                             cur_mv);
-  return best_compmode_interinter_cost;
-}
-
-static INLINE int is_single_newmv_valid(HandleInterModeArgs *args,
-                                        MB_MODE_INFO *mbmi,
+static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args,
+                                        const MB_MODE_INFO *const mbmi,
                                         PREDICTION_MODE this_mode) {
   for (int ref_idx = 0; ref_idx < 2; ++ref_idx) {
-    const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx, 1);
+    const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
     const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx];
     if (single_mode == NEWMV &&
         args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) {
@@ -10156,25 +1818,415 @@
   const int has_drl =
       (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1);
   const int ref_set =
-      has_drl ? AOMMIN(MAX_REF_MV_SERCH, ref_mv_count - has_nearmv) : 1;
+      has_drl ? AOMMIN(MAX_REF_MV_SEARCH, ref_mv_count - has_nearmv) : 1;
 
   return ref_set;
 }
 
-typedef struct {
-  int64_t rd;
-  int drl_cost;
+// Whether this reference motion vector can be skipped, based on initial
+// heuristics.
+static bool ref_mv_idx_early_breakout(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                      const HandleInterModeArgs *const args,
+                                      int64_t ref_best_rd, int ref_mv_idx) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  const int is_comp_pred = has_second_ref(mbmi);
+  if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) {
+    if (mbmi->ref_frame[0] == LAST2_FRAME ||
+        mbmi->ref_frame[0] == LAST3_FRAME ||
+        mbmi->ref_frame[1] == LAST2_FRAME ||
+        mbmi->ref_frame[1] == LAST3_FRAME) {
+      const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+      if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+          REF_CAT_LEVEL) {
+        return true;
+      }
+    }
+    // TODO(any): Experiment with reduce_inter_modes for compound prediction
+    if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred &&
+        have_newmv_in_inter_mode(mbmi->mode)) {
+      if (mbmi->ref_frame[0] != cpi->nearest_past_ref &&
+          mbmi->ref_frame[0] != cpi->nearest_future_ref) {
+        const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+        if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+            REF_CAT_LEVEL) {
+          return true;
+        }
+      }
+    }
+  }
+  if (sf->inter_sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred &&
+      args->single_ref_first_pass == 0) {
+    if (args->simple_rd_state[ref_mv_idx].early_skipped) {
+      return true;
+    }
+  }
+  mbmi->ref_mv_idx = ref_mv_idx;
+  if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) {
+    return true;
+  }
+  size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost;
+  const int drl_cost =
+      get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+  est_rd_rate += drl_cost;
+  if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd &&
+      mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+    return true;
+  }
+  return false;
+}
+
+// Compute the estimated RD cost for the motion vector with simple translation.
+static int64_t simple_translation_pred_rd(
+    AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+    HandleInterModeArgs *args, int ref_mv_idx, inter_mode_info *mode_info,
+    int64_t ref_best_rd, BLOCK_SIZE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  const AV1_COMMON *cm = &cpi->common;
+  const int is_comp_pred = has_second_ref(mbmi);
+
+  struct macroblockd_plane *p = xd->plane;
+  const BUFFER_SET orig_dst = {
+    { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+    { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+  };
+  av1_init_rd_stats(rd_stats);
+
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = 1;
+  if (mbmi->ref_frame[1] == INTRA_FRAME) {
+    mbmi->ref_frame[1] = NONE_FRAME;
+  }
+  int16_t mode_ctx =
+      av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+  mbmi->num_proj_ref = 0;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->ref_mv_idx = ref_mv_idx;
+
+  rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+  const int drl_cost =
+      get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+  rd_stats->rate += drl_cost;
+  mode_info[ref_mv_idx].drl_cost = drl_cost;
+
+  int_mv cur_mv[2];
+  if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) {
+    return INT64_MAX;
+  }
+  assert(have_nearmv_in_inter_mode(mbmi->mode));
+  for (int i = 0; i < is_comp_pred + 1; ++i) {
+    mbmi->mv[i].as_int = cur_mv[i].as_int;
+  }
+  const int ref_mv_cost = cost_mv_ref(x, mbmi->mode, mode_ctx);
+  rd_stats->rate += ref_mv_cost;
+
+  if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) {
+    return INT64_MAX;
+  }
+
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->num_proj_ref = 0;
+  if (is_comp_pred) {
+    // Only compound_average
+    mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+    mbmi->comp_group_idx = 0;
+    mbmi->compound_idx = 1;
+  }
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+  int est_rate;
+  int64_t est_dist;
+  model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist,
+                                  NULL, NULL, NULL, NULL, NULL);
+  return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist);
+}
+
+// Represents a set of integers, from 0 to sizeof(int) * 8 - 1, as bits in
+// an integer. 0 for the i-th bit means that integer is excluded, 1 means
+// it is included.
+static INLINE void mask_set_bit(int *mask, int index) { *mask |= (1 << index); }
+
+static INLINE bool mask_check_bit(int mask, int index) {
+  return (mask >> index) & 0x1;
+}
+
+// Before performing the full MV search in handle_inter_mode, do a simple
+// translation search and see if we can eliminate any motion vectors.
+// Returns an integer where, if the i-th bit is set, it means that the i-th
+// motion vector should be searched. Pruning is applied only in NEARMV modes.
+static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
+                                RD_STATS *rd_stats,
+                                HandleInterModeArgs *const args,
+                                int64_t ref_best_rd, inter_mode_info *mode_info,
+                                BLOCK_SIZE bsize, const int ref_set) {
+  AV1_COMMON *const cm = &cpi->common;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const PREDICTION_MODE this_mode = mbmi->mode;
+
+  // Only search indices if they have some chance of being good.
+  int good_indices = 0;
+  for (int i = 0; i < ref_set; ++i) {
+    if (ref_mv_idx_early_breakout(cpi, x, args, ref_best_rd, i)) {
+      continue;
+    }
+    mask_set_bit(&good_indices, i);
+  }
+
+  // Only prune in NEARMV mode, if the speed feature is set, and the block size
+  // is large enough. If these conditions are not met, return all good indices
+  // found so far.
+  if (!cpi->sf.inter_sf.prune_mode_search_simple_translation)
+    return good_indices;
+  if (!have_nearmv_in_inter_mode(this_mode)) return good_indices;
+  if (num_pels_log2_lookup[bsize] <= 6) return good_indices;
+  // Do not prune when there is internal resizing. TODO(elliottk) fix this
+  // so b/2384 can be resolved.
+  if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) ||
+      (mbmi->ref_frame[1] > 0 &&
+       av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) {
+    return good_indices;
+  }
+
+  // Calculate the RD cost for the motion vectors using simple translation.
+  int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX };
+  for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    // If this index is bad, ignore it.
+    if (!mask_check_bit(good_indices, ref_mv_idx)) {
+      continue;
+    }
+    idx_rdcost[ref_mv_idx] = simple_translation_pred_rd(
+        cpi, x, rd_stats, args, ref_mv_idx, mode_info, ref_best_rd, bsize);
+  }
+  // Find the index with the best RD cost.
+  int best_idx = 0;
+  for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) {
+    if (idx_rdcost[i] < idx_rdcost[best_idx]) {
+      best_idx = i;
+    }
+  }
+  // Only include indices that are good and within a % of the best.
+  const double dth = has_second_ref(mbmi) ? 1.05 : 1.001;
+  // If the simple translation cost is not within this multiple of the
+  // best RD, skip it. Note that the cutoff is derived experimentally.
+  const double ref_dth = 5;
+  int result = 0;
+  for (int i = 0; i < ref_set; ++i) {
+    if (mask_check_bit(good_indices, i) &&
+        (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth &&
+        (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) {
+      mask_set_bit(&result, i);
+    }
+  }
+  return result;
+}
+
+typedef struct motion_mode_candidate {
+  MB_MODE_INFO mbmi;
   int rate_mv;
-  int_mv mv;
-} inter_mode_info;
+  int rate2_nocoeff;
+  int skip_motion_mode;
+  int64_t rd_cost;
+} motion_mode_candidate;
+
+typedef struct motion_mode_best_st_candidate {
+  motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES];
+  int num_motion_mode_cand;
+} motion_mode_best_st_candidate;
+
+// Checks if the current reference frame matches with neighbouring block's
+// (top/left) reference frames
+static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi,
+                                                   MB_MODE_INFO *nb_mbmi) {
+  MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0],
+                                          nb_mbmi->ref_frame[1] };
+  MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0],
+                                           cur_mbmi->ref_frame[1] };
+  const int is_cur_comp_pred = has_second_ref(cur_mbmi);
+  int match_found = 0;
+
+  for (int i = 0; i < (is_cur_comp_pred + 1); i++) {
+    if ((cur_ref_frames[i] == nb_ref_frames[0]) ||
+        (cur_ref_frames[i] == nb_ref_frames[1]))
+      match_found = 1;
+  }
+  return match_found;
+}
+
+static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols,
+                                                  MACROBLOCKD *xd) {
+  if (!xd->up_available) return 0;
+  const int mi_col = xd->mi_col;
+  MB_MODE_INFO **cur_mbmi = xd->mi;
+  // prev_row_mi points into the mi array, starting at the beginning of the
+  // previous row.
+  MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
+  const int end_col = AOMMIN(mi_col + xd->width, total_mi_cols);
+  uint8_t mi_step;
+  for (int above_mi_col = mi_col; above_mi_col < end_col;
+       above_mi_col += mi_step) {
+    MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+    mi_step = mi_size_wide[above_mi[0]->sb_type];
+    int match_found = 0;
+    if (is_inter_block(*above_mi))
+      match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi);
+    if (match_found) return 1;
+  }
+  return 0;
+}
+
+static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows,
+                                                 MACROBLOCKD *xd) {
+  if (!xd->left_available) return 0;
+  const int mi_row = xd->mi_row;
+  MB_MODE_INFO **cur_mbmi = xd->mi;
+  // prev_col_mi points into the mi array, starting at the top of the
+  // previous column
+  MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+  const int end_row = AOMMIN(mi_row + xd->height, total_mi_rows);
+  uint8_t mi_step;
+  for (int left_mi_row = mi_row; left_mi_row < end_row;
+       left_mi_row += mi_step) {
+    MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+    mi_step = mi_size_high[left_mi[0]->sb_type];
+    int match_found = 0;
+    if (is_inter_block(*left_mi))
+      match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi);
+    if (match_found) return 1;
+  }
+  return 0;
+}
+
+typedef struct {
+  int64_t best_inter_cost;
+  int64_t ref_inter_cost[INTER_REFS_PER_FRAME];
+} PruneInfoFromTpl;
+
+#if !CONFIG_REALTIME_ONLY
+// TODO(Remya): Check if get_tpl_stats_b() can be reused
+static AOM_INLINE void get_block_level_tpl_stats(
+    AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs,
+    PruneInfoFromTpl *inter_cost_info_from_tpl) {
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+
+  assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size));
+  const int tpl_idx = gf_group->index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  if (tpl_idx >= MAX_LAG_BUFFERS || !tpl_frame->is_valid) {
+    return;
+  }
+
+  const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+  for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows);
+       row += step) {
+    for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr);
+         col += step) {
+      const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+
+      // Sums up the inter cost of corresponding ref frames
+      for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) {
+        inter_cost_info_from_tpl->ref_inter_cost[ref_idx] +=
+            this_stats->pred_error[ref_idx];
+      }
+    }
+  }
+
+  // Computes the best inter cost (minimum inter_cost)
+  int64_t best_inter_cost = INT64_MAX;
+  for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) {
+    const int64_t cur_inter_cost =
+        inter_cost_info_from_tpl->ref_inter_cost[ref_idx];
+    // For invalid ref frames, cur_inter_cost = 0 and has to be handled while
+    // calculating the minimum inter_cost
+    if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) &&
+        valid_refs[ref_idx])
+      best_inter_cost = cur_inter_cost;
+  }
+  inter_cost_info_from_tpl->best_inter_cost = best_inter_cost;
+}
+#endif
+
+static AOM_INLINE int prune_modes_based_on_tpl_stats(
+    PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx,
+    const PREDICTION_MODE this_mode, int prune_mode_level) {
+  const int have_newmv = have_newmv_in_inter_mode(this_mode);
+  if ((prune_mode_level < 3) && have_newmv) return 0;
+
+  static const int prune_level_idx[3] = { 0, 1, 1 };
+  const int prune_level = prune_level_idx[prune_mode_level - 1];
+  int64_t cur_inter_cost;
+
+  const int is_globalmv =
+      (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV);
+  const int prune_index = is_globalmv ? MAX_REF_MV_SEARCH : ref_mv_idx;
+
+  // Thresholds used for pruning:
+  // Lower value indicates aggressive pruning and higher value indicates
+  // conservative pruning which is set based on ref_mv_idx and speed feature.
+  // 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1 and 2. prune_index
+  // 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV
+  static const int tpl_inter_mode_prune_mul_factor[2][MAX_REF_MV_SEARCH + 1] = {
+    { 3, 3, 3, 2 }, { 3, 2, 2, 2 }
+  };
+
+  const int is_comp_pred = (refs[1] > INTRA_FRAME);
+  if (!is_comp_pred) {
+    cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1];
+  } else {
+    const int64_t inter_cost_ref0 =
+        inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1];
+    const int64_t inter_cost_ref1 =
+        inter_cost_info_from_tpl->ref_inter_cost[refs[1] - 1];
+    // Choose maximum inter_cost among inter_cost_ref0 and inter_cost_ref1 for
+    // more aggressive pruning
+    cur_inter_cost = AOMMAX(inter_cost_ref0, inter_cost_ref1);
+  }
+
+  // Prune the mode if cur_inter_cost is greater than threshold times
+  // best_inter_cost
+  const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost;
+  if (cur_inter_cost >
+      ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] *
+        best_inter_cost) >>
+       1))
+    return 1;
+  return 0;
+}
 
 static int64_t handle_inter_mode(
     AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
     BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-    RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col,
-    HandleInterModeArgs *args, int64_t ref_best_rd, uint8_t *const tmp_buf,
-    CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd,
-    const int do_tx_search, InterModesInfo *inter_modes_info) {
+    RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *args,
+    int64_t ref_best_rd, uint8_t *const tmp_buf,
+    const CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd,
+    const int do_tx_search, InterModesInfo *inter_modes_info,
+    motion_mode_candidate *motion_mode_cand, int64_t *skip_rd,
+    PruneInfoFromTpl *inter_cost_info_from_tpl) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
@@ -10182,12 +2234,18 @@
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
+
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const int tpl_idx = gf_group->index;
+  TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
+  const int prune_modes_based_on_tpl =
+      cpi->sf.inter_sf.prune_inter_modes_based_on_tpl &&
+      tpl_idx >= MAX_LAG_BUFFERS && tpl_frame->is_valid;
   int i;
-  int refs[2] = { mbmi->ref_frame[0],
-                  (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  const int refs[2] = { mbmi->ref_frame[0],
+                        (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int rate_mv = 0;
   int64_t rd = INT64_MAX;
-
   // do first prediction into the destination buffer. Do the next
   // prediction into a temporary buffer. Then keep track of which one
   // of these currently holds the best predictor, and use the other
@@ -10202,9 +2260,6 @@
                                  tmp_buf + 2 * MAX_SB_SQUARE },
                                { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
 
-  int skip_txfm_sb = 0;
-  int64_t skip_sse_sb = INT64_MAX;
-  int16_t mode_ctx;
   const int masked_compound_used = is_any_masked_compound_used(bsize) &&
                                    cm->seq_params.enable_masked_compound;
   int64_t ret_val = INT64_MAX;
@@ -10212,53 +2267,55 @@
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   int64_t best_rd = INT64_MAX;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
   MB_MODE_INFO best_mbmi = *mbmi;
-  int best_disable_skip;
-  int best_xskip;
+  int best_disable_skip = 0;
+  int best_xskip = 0;
   int64_t newmv_ret_val = INT64_MAX;
-  int_mv backup_mv[2] = { { 0 } };
-  int backup_rate_mv = 0;
-  inter_mode_info mode_info[MAX_REF_MV_SERCH];
+  inter_mode_info mode_info[MAX_REF_MV_SEARCH];
 
-  int mode_search_mask[2];
-  const int do_two_loop_comp_search =
-      is_comp_pred && cpi->sf.two_loop_comp_search;
-  if (do_two_loop_comp_search) {
-    // TODO(debargha): Change this to try alternate ways of splitting
-    // modes while doing two pass compound_mode search.
-    mode_search_mask[0] = (1 << COMPOUND_AVERAGE);
-  } else {
-    mode_search_mask[0] = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
-                          (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+  int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+                         (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+
+  // Do not prune the mode based on inter cost from tpl if the current ref frame
+  // is the winner ref in neighbouring blocks.
+  int ref_match_found_in_above_nb = 0;
+  int ref_match_found_in_left_nb = 0;
+  if (prune_modes_based_on_tpl) {
+    ref_match_found_in_above_nb =
+        find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd);
+    ref_match_found_in_left_nb =
+        find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd);
   }
-  mode_search_mask[1] = ((1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
-                         (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD)) -
-                        mode_search_mask[0];
 
-  // TODO(jingning): This should be deprecated shortly.
-  const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+  // First, perform a simple translation search for each of the indices. If
+  // an index performs well, it will be fully searched here.
   const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
-
+  // Save MV results from first 2 ref_mv_idx.
+  int_mv save_mv[MAX_REF_MV_SEARCH - 1][2] = { { { 0 } } };
+  int best_ref_mv_idx = -1;
+  const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd,
+                                            mode_info, bsize, ref_set);
+  const int16_t mode_ctx =
+      av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+  const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
+  const int base_rate =
+      args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
   for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV;
     mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
     mode_info[ref_mv_idx].rd = INT64_MAX;
 
-    if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) {
-      if (mbmi->ref_frame[0] == LAST2_FRAME ||
-          mbmi->ref_frame[0] == LAST3_FRAME ||
-          mbmi->ref_frame[1] == LAST2_FRAME ||
-          mbmi->ref_frame[1] == LAST3_FRAME) {
-        if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv]
-                .weight < REF_CAT_LEVEL) {
-          continue;
-        }
-      }
+    if (!mask_check_bit(idx_mask, ref_mv_idx)) {
+      // MV did not perform well in simple translation search. Skip it.
+      continue;
     }
-    if (cpi->sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred &&
-        args->single_ref_first_pass == 0) {
-      if (args->simple_rd_state[ref_mv_idx].early_skipped) {
+    if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb &&
+        !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) {
+      if (prune_modes_based_on_tpl_stats(
+              inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode,
+              cpi->sf.inter_sf.prune_inter_modes_based_on_tpl))
         continue;
-      }
     }
     av1_init_rd_stats(rd_stats);
 
@@ -10267,342 +2324,293 @@
     mbmi->compound_idx = 1;
     if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
 
-    mode_ctx =
-        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
-
     mbmi->num_proj_ref = 0;
     mbmi->motion_mode = SIMPLE_TRANSLATION;
     mbmi->ref_mv_idx = ref_mv_idx;
 
-    if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) {
-      continue;
-    }
-
-    rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+    rd_stats->rate = base_rate;
     const int drl_cost =
         get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
     rd_stats->rate += drl_cost;
     mode_info[ref_mv_idx].drl_cost = drl_cost;
 
+    int rs = 0;
+    int compmode_interinter_cost = 0;
+
+    int_mv cur_mv[2];
+
+    // TODO(Cherma): Extend this speed feature to support compound mode
+    int skip_repeated_ref_mv =
+        is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv;
+    if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) {
+      continue;
+    }
+
+    if (have_newmv_in_inter_mode(this_mode)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, handle_newmv_time);
+#endif
+      if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
+          args->single_ref_first_pass == 0 && !is_comp_pred) {
+        const int ref0 = mbmi->ref_frame[0];
+        newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 0 : 1;
+        cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
+        rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
+      } else {
+        newmv_ret_val =
+            handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
+      }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, handle_newmv_time);
+#endif
+
+      if (newmv_ret_val != 0) continue;
+
+      rd_stats->rate += rate_mv;
+
+      if (cpi->sf.inter_sf.skip_repeated_newmv) {
+        if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
+          int skip = 0;
+          int this_rate_mv = 0;
+          for (i = 0; i < ref_mv_idx; ++i) {
+            // Check if the motion search result same as previous results
+            if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int &&
+                args->single_newmv_valid[i][refs[0]]) {
+              // If the compared mode has no valid rd, it is unlikely this
+              // mode will be the best mode
+              if (mode_info[i].rd == INT64_MAX) {
+                skip = 1;
+                break;
+              }
+              // Compare the cost difference including drl cost and mv cost
+              if (mode_info[i].mv.as_int != INVALID_MV) {
+                const int compare_cost =
+                    mode_info[i].rate_mv + mode_info[i].drl_cost;
+                const int_mv ref_mv = av1_get_ref_mv(x, 0);
+                this_rate_mv = av1_mv_bit_cost(
+                    &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
+                    x->mv_cost_stack, MV_COST_WEIGHT);
+                const int this_cost = this_rate_mv + drl_cost;
+
+                if (compare_cost <= this_cost) {
+                  skip = 1;
+                  break;
+                } else {
+                  // If the cost is less than current best result, make this
+                  // the best and update corresponding variables unless the
+                  // best_mv is the same as ref_mv. In this case we skip and
+                  // rely on NEAR(EST)MV instead
+                  if (best_mbmi.ref_mv_idx == i &&
+                      mode_info[i].mv.as_int != ref_mv.as_int) {
+                    assert(best_rd != INT64_MAX);
+                    best_mbmi.ref_mv_idx = ref_mv_idx;
+                    motion_mode_cand->rate_mv = this_rate_mv;
+                    best_rd_stats.rate += this_cost - compare_cost;
+                    best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
+                                     best_rd_stats.dist);
+                    if (best_rd < ref_best_rd) ref_best_rd = best_rd;
+                    break;
+                  }
+                }
+              }
+            }
+          }
+          if (skip) {
+            const THR_MODES mode_enum = get_prediction_mode_idx(
+                best_mbmi.mode, best_mbmi.ref_frame[0], best_mbmi.ref_frame[1]);
+            // Collect mode stats for multiwinner mode processing
+            store_winner_mode_stats(
+                &cpi->common, x, &best_mbmi, &best_rd_stats, &best_rd_stats_y,
+                &best_rd_stats_uv, mode_enum, NULL, bsize, best_rd,
+                cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+                do_tx_search);
+            args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
+                args->modelled_rd[this_mode][i][refs[0]];
+            args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
+                args->simple_rd[this_mode][i][refs[0]];
+            mode_info[ref_mv_idx].rd = mode_info[i].rd;
+            mode_info[ref_mv_idx].rate_mv = this_rate_mv;
+            mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
+
+            restore_dst_buf(xd, orig_dst, num_planes);
+            continue;
+          }
+        }
+      }
+    }
+    for (i = 0; i < is_comp_pred + 1; ++i) {
+      mbmi->mv[i].as_int = cur_mv[i].as_int;
+    }
+
     if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
         mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
       continue;
     }
 
-    const RD_STATS backup_rd_stats = *rd_stats;
-
-    for (int comp_loop_idx = 0; comp_loop_idx <= do_two_loop_comp_search;
-         ++comp_loop_idx) {
-      int rs = 0;
-      int compmode_interinter_cost = 0;
-
-      if (is_comp_pred && comp_loop_idx == 1) *rd_stats = backup_rd_stats;
-
-      int_mv cur_mv[2];
-      if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
-        continue;
+    if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred) {
+      // TODO(yunqing): Move this part to a separate function when it is done.
+      // Store MV result.
+      if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
+        for (i = 0; i < is_comp_pred + 1; ++i)
+          save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int;
       }
-      if (have_newmv_in_inter_mode(this_mode)) {
-        if (comp_loop_idx == 1) {
-          cur_mv[0] = backup_mv[0];
-          cur_mv[1] = backup_mv[1];
-          rate_mv = backup_rate_mv;
-        }
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-        start_timing(cpi, handle_newmv_time);
-#endif
-        if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
-            args->single_ref_first_pass == 0 && !is_comp_pred) {
-          const int ref0 = mbmi->ref_frame[0];
-          newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 0 : 1;
-          cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
-          rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
-        } else if (comp_loop_idx == 0) {
-          newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col,
-                                       &rate_mv, args);
-
-          // Store cur_mv and rate_mv so that they can be restored in the next
-          // iteration of the loop
-          backup_mv[0] = cur_mv[0];
-          backup_mv[1] = cur_mv[1];
-          backup_rate_mv = rate_mv;
-        }
-#if CONFIG_COLLECT_COMPONENT_TIMING
-        end_timing(cpi, handle_newmv_time);
-#endif
-
-        if (newmv_ret_val != 0) {
-          continue;
-        } else {
-          rd_stats->rate += rate_mv;
-        }
-
-        if (cpi->sf.skip_repeated_newmv) {
-          if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
-            int skip = 0;
-            int this_rate_mv = 0;
-            for (i = 0; i < ref_mv_idx; ++i) {
-              // Check if the motion search result same as previous results
-              if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) {
-                // If the compared mode has no valid rd, it is unlikely this
-                // mode will be the best mode
-                if (mode_info[i].rd == INT64_MAX) {
-                  skip = 1;
-                  break;
-                }
-                // Compare the cost difference including drl cost and mv cost
-                if (mode_info[i].mv.as_int != INVALID_MV) {
-                  const int compare_cost =
-                      mode_info[i].rate_mv + mode_info[i].drl_cost;
-                  const int_mv ref_mv = av1_get_ref_mv(x, 0);
-                  this_rate_mv = av1_mv_bit_cost(
-                      &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                      x->mv_cost_stack, MV_COST_WEIGHT);
-                  const int this_cost = this_rate_mv + drl_cost;
-
-                  if (compare_cost < this_cost) {
-                    skip = 1;
-                    break;
-                  } else {
-                    // If the cost is less than current best result, make this
-                    // the best and update corresponding variables
-                    if (best_mbmi.ref_mv_idx == i) {
-                      assert(best_rd != INT64_MAX);
-                      best_mbmi.ref_mv_idx = ref_mv_idx;
-                      best_rd_stats.rate += this_cost - compare_cost;
-                      best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
-                                       best_rd_stats.dist);
-                      if (best_rd < ref_best_rd) ref_best_rd = best_rd;
-                      skip = 1;
-                      break;
-                    }
-                  }
-                }
-              }
-            }
-            if (skip) {
-              args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
-                  args->modelled_rd[this_mode][i][refs[0]];
-              args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
-                  args->simple_rd[this_mode][i][refs[0]];
-              mode_info[ref_mv_idx].rd = mode_info[i].rd;
-              mode_info[ref_mv_idx].rate_mv = this_rate_mv;
-              mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
-
-              restore_dst_buf(xd, orig_dst, num_planes);
-              continue;
-            }
-          }
-        }
-      }
-      for (i = 0; i < is_comp_pred + 1; ++i) {
-        mbmi->mv[i].as_int = cur_mv[i].as_int;
-      }
-      const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
-#if USE_DISCOUNT_NEWMV_TEST
-      // We don't include the cost of the second reference here, because there
-      // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in
-      // other words if you present them in that order, the second one is always
-      // known if the first is known.
-      //
-      // Under some circumstances we discount the cost of new mv mode to
-      // encourage initiation of a motion field.
-      if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
-        // discount_newmv_test only applies discount on NEWMV mode.
-        assert(this_mode == NEWMV);
-        rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx),
-                                 cost_mv_ref(x, NEARESTMV, mode_ctx));
-      } else {
-        rd_stats->rate += ref_mv_cost;
-      }
-#else
-      rd_stats->rate += ref_mv_cost;
-#endif
-
-      if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
-          mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
-        continue;
-      }
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-      start_timing(cpi, compound_type_rd_time);
-#endif
-      int skip_build_pred = 0;
-      if (is_comp_pred) {
-        if (mode_search_mask[comp_loop_idx] == (1 << COMPOUND_AVERAGE)) {
-          // Only compound_average
-          mbmi->interinter_comp.type = COMPOUND_AVERAGE;
-          mbmi->num_proj_ref = 0;
-          mbmi->motion_mode = SIMPLE_TRANSLATION;
-          mbmi->comp_group_idx = 0;
-          mbmi->compound_idx = 1;
-          const int comp_index_ctx = get_comp_index_context(cm, xd);
-          compmode_interinter_cost +=
-              x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
-        } else if (mode_search_mask[comp_loop_idx] == (1 << COMPOUND_DISTWTD)) {
-          // Only compound_distwtd
-          if (!cm->seq_params.order_hint_info.enable_dist_wtd_comp ||
-              cpi->sf.use_dist_wtd_comp_flag == DIST_WTD_COMP_DISABLED ||
-              (do_two_loop_comp_search && mbmi->mode == GLOBAL_GLOBALMV))
-            continue;
-          mbmi->interinter_comp.type = COMPOUND_DISTWTD;
-          mbmi->num_proj_ref = 0;
-          mbmi->motion_mode = SIMPLE_TRANSLATION;
-          mbmi->comp_group_idx = 0;
-          mbmi->compound_idx = 0;
-          const int comp_index_ctx = get_comp_index_context(cm, xd);
-          compmode_interinter_cost +=
-              x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
-        } else {
-          // Find matching interp filter or set to default interp filter
-          const int need_search =
-              av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
-          int match_found = -1;
-          const InterpFilter assign_filter = cm->interp_filter;
-          int is_luma_interp_done = 0;
-          if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
-            match_found = find_interp_filter_in_stats(x, mbmi);
-          }
-          if (!need_search || match_found == -1) {
-            set_default_interp_filters(mbmi, assign_filter);
+      // Skip the evaluation if an MV match is found.
+      if (ref_mv_idx > 0) {
+        int match = 0;
+        for (int idx = 0; idx < ref_mv_idx; ++idx) {
+          int mv_diff = 0;
+          for (i = 0; i < 1 + is_comp_pred; ++i) {
+            mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) +
+                       abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col);
           }
 
-          int64_t best_rd_compound;
-          compmode_interinter_cost = compound_type_rd(
-              cpi, x, bsize, mi_col, mi_row, cur_mv,
-              mode_search_mask[comp_loop_idx], masked_compound_used, &orig_dst,
-              &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound, rd_stats,
-              ref_best_rd, &is_luma_interp_done);
-          if (ref_best_rd < INT64_MAX &&
-              (best_rd_compound >> 4) * (11 + 2 * do_two_loop_comp_search) >
-                  ref_best_rd) {
-            restore_dst_buf(xd, orig_dst, num_planes);
-            continue;
-          }
-          // No need to call av1_enc_build_inter_predictor for luma if
-          // COMPOUND_AVERAGE is selected because it is the first
-          // candidate in compound_type_rd, and the following
-          // compound types searching uses tmp_dst buffer
-
-          if (mbmi->interinter_comp.type == COMPOUND_AVERAGE &&
-              is_luma_interp_done) {
-            if (num_planes > 1) {
-              av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
-                                            bsize, AOM_PLANE_U, num_planes - 1);
-            }
-            skip_build_pred = 1;
+          // If this mode is not the best one, and current MV is similar to
+          // previous stored MV, terminate this ref_mv_idx evaluation.
+          if (best_ref_mv_idx == -1 && mv_diff < 1) {
+            match = 1;
+            break;
           }
         }
+        if (match == 1) continue;
       }
-#if CONFIG_COLLECT_COMPONENT_TIMING
-      end_timing(cpi, compound_type_rd_time);
-#endif
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-      start_timing(cpi, interpolation_filter_search_time);
-#endif
-      ret_val = interpolation_filter_search(
-          x, cpi, tile_data, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
-          args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb,
-          &skip_build_pred, args, ref_best_rd);
-#if CONFIG_COLLECT_COMPONENT_TIMING
-      end_timing(cpi, interpolation_filter_search_time);
-#endif
-      if (args->modelled_rd != NULL && !is_comp_pred) {
-        args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
-      }
-      if (ret_val != 0) {
-        restore_dst_buf(xd, orig_dst, num_planes);
-        continue;
-      } else if (cpi->sf.model_based_post_interp_filter_breakout &&
-                 ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
-        restore_dst_buf(xd, orig_dst, num_planes);
-        break;
-      }
-
-      if (!is_comp_pred)
-        args->single_filter[this_mode][refs[0]] =
-            av1_extract_interp_filter(mbmi->interp_filters, 0);
-
-      if (args->modelled_rd != NULL) {
-        if (is_comp_pred) {
-          const int mode0 = compound_ref0_mode(this_mode);
-          const int mode1 = compound_ref1_mode(this_mode);
-          const int64_t mrd =
-              AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
-                     args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
-          if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
-            restore_dst_buf(xd, orig_dst, num_planes);
-            continue;
-          }
-        }
-      }
-      rd_stats->rate += compmode_interinter_cost;
-      if (skip_build_pred != 1) {
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
-                                      0, av1_num_planes(cm) - 1);
-      }
-
-      if (cpi->sf.second_loop_comp_fast_tx_search && comp_loop_idx == 1) {
-        // TODO(chengchen): this speed feature introduces big loss.
-        // Need better estimation of rate distortion.
-        int dummy_rate;
-        int64_t dummy_dist;
-        int plane_rate[MAX_MB_PLANE] = { 0 };
-        int64_t plane_sse[MAX_MB_PLANE] = { 0 };
-        int64_t plane_dist[MAX_MB_PLANE] = { 0 };
-
-        model_rd_sb_fn[MODELRD_TYPE_DIST_WTD_COMPOUND](
-            cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate,
-            &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
-            plane_dist);
-
-        rd_stats->rate += rs;
-        rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
-        rd_stats_y->rate = plane_rate[0];
-        rd_stats_uv->rate = plane_rate[1] + plane_rate[2];
-        rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2];
-        rd_stats_y->sse = plane_sse[0];
-        rd_stats_uv->sse = plane_sse[1] + plane_sse[2];
-        rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2];
-        rd_stats_y->dist = plane_dist[0];
-        rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
-      } else {
-#if CONFIG_COLLECT_COMPONENT_TIMING
-        start_timing(cpi, motion_mode_rd_time);
-#endif
-        ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
-                                 rd_stats_uv, disable_skip, mi_row, mi_col,
-                                 args, ref_best_rd, refs, &rate_mv, &orig_dst,
-                                 best_est_rd, do_tx_search, inter_modes_info);
-#if CONFIG_COLLECT_COMPONENT_TIMING
-        end_timing(cpi, motion_mode_rd_time);
-#endif
-      }
-      mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
-      mode_info[ref_mv_idx].rate_mv = rate_mv;
-      if (ret_val != INT64_MAX) {
-        int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-        mode_info[ref_mv_idx].rd = tmp_rd;
-        if (tmp_rd < best_rd) {
-          best_rd_stats = *rd_stats;
-          best_rd_stats_y = *rd_stats_y;
-          best_rd_stats_uv = *rd_stats_uv;
-          best_rd = tmp_rd;
-          best_mbmi = *mbmi;
-          best_disable_skip = *disable_skip;
-          best_xskip = x->skip;
-          memcpy(best_blk_skip, x->blk_skip,
-                 sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
-        }
-
-        if (tmp_rd < ref_best_rd) {
-          ref_best_rd = tmp_rd;
-        }
-      }
-      restore_dst_buf(xd, orig_dst, num_planes);
     }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, compound_type_rd_time);
+#endif
+    int skip_build_pred = 0;
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    if (is_comp_pred) {
+      // Find matching interp filter or set to default interp filter
+      const int need_search = av1_is_interp_needed(xd);
+      const InterpFilter assign_filter = cm->features.interp_filter;
+      int is_luma_interp_done = 0;
+      av1_find_interp_filter_match(mbmi, cpi, assign_filter, need_search,
+                                   args->interp_filter_stats,
+                                   args->interp_filter_stats_idx);
+
+      int64_t best_rd_compound;
+      int64_t rd_thresh;
+      const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
+      const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
+      rd_thresh = get_rd_thresh_from_best_rd(
+          ref_best_rd, (1 << comp_type_rd_shift), comp_type_rd_scale);
+      compmode_interinter_cost = av1_compound_type_rd(
+          cpi, x, bsize, cur_mv, mode_search_mask, masked_compound_used,
+          &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
+          rd_stats, ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
+      if (ref_best_rd < INT64_MAX &&
+          (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
+              ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      }
+      // No need to call av1_enc_build_inter_predictor for luma if
+      // COMPOUND_AVERAGE is selected because it is the first
+      // candidate in av1_compound_type_rd, and the following
+      // compound types searching uses tmp_dst buffer
+
+      if (mbmi->interinter_comp.type == COMPOUND_AVERAGE &&
+          is_luma_interp_done) {
+        if (num_planes > 1) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                        bsize, AOM_PLANE_U, num_planes - 1);
+        }
+        skip_build_pred = 1;
+      }
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, compound_type_rd_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, interpolation_filter_search_time);
+#endif
+    ret_val = av1_interpolation_filter_search(
+        x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+        &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, interpolation_filter_search_time);
+#endif
+    if (args->modelled_rd != NULL && !is_comp_pred) {
+      args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+    }
+    if (ret_val != 0) {
+      restore_dst_buf(xd, orig_dst, num_planes);
+      continue;
+    } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+               ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+      restore_dst_buf(xd, orig_dst, num_planes);
+      continue;
+    }
+
+    if (args->modelled_rd != NULL) {
+      if (is_comp_pred) {
+        const int mode0 = compound_ref0_mode(this_mode);
+        const int mode1 = compound_ref1_mode(this_mode);
+        const int64_t mrd =
+            AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                   args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+        if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+          restore_dst_buf(xd, orig_dst, num_planes);
+          continue;
+        }
+      }
+    }
+    rd_stats->rate += compmode_interinter_cost;
+    if (skip_build_pred != 1) {
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0,
+                                    av1_num_planes(cm) - 1);
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, motion_mode_rd_time);
+#endif
+    int rate2_nocoeff = rd_stats->rate;
+    ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
+                             rd_stats_uv, disable_skip, args, ref_best_rd,
+                             skip_rd, &rate_mv, &orig_dst, best_est_rd,
+                             do_tx_search, inter_modes_info, 0);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, motion_mode_rd_time);
+#endif
+
+    mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
+    mode_info[ref_mv_idx].rate_mv = rate_mv;
+    if (ret_val != INT64_MAX) {
+      int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      mode_info[ref_mv_idx].rd = tmp_rd;
+      const THR_MODES mode_enum = get_prediction_mode_idx(
+          mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+      // Collect mode stats for multiwinner mode processing
+      store_winner_mode_stats(
+          &cpi->common, x, mbmi, rd_stats, rd_stats_y, rd_stats_uv, mode_enum,
+          NULL, bsize, tmp_rd,
+          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, do_tx_search);
+      if (tmp_rd < best_rd) {
+        best_rd_stats = *rd_stats;
+        best_rd_stats_y = *rd_stats_y;
+        best_rd_stats_uv = *rd_stats_uv;
+        best_rd = tmp_rd;
+        best_mbmi = *mbmi;
+        best_disable_skip = *disable_skip;
+        best_xskip = x->force_skip;
+        memcpy(best_blk_skip, x->blk_skip,
+               sizeof(best_blk_skip[0]) * xd->height * xd->width);
+        av1_copy_array(best_tx_type_map, xd->tx_type_map,
+                       xd->height * xd->width);
+        motion_mode_cand->rate_mv = rate_mv;
+        motion_mode_cand->rate2_nocoeff = rate2_nocoeff;
+      }
+
+      if (tmp_rd < ref_best_rd) {
+        ref_best_rd = tmp_rd;
+        best_ref_mv_idx = ref_mv_idx;
+      }
+    }
+    restore_dst_buf(xd, orig_dst, num_planes);
   }
 
   if (best_rd == INT64_MAX) return INT64_MAX;
@@ -10613,16 +2621,20 @@
   *rd_stats_uv = best_rd_stats_uv;
   *mbmi = best_mbmi;
   *disable_skip = best_disable_skip;
-  x->skip = best_xskip;
+  x->force_skip = best_xskip;
   assert(IMPLIES(mbmi->comp_group_idx == 1,
                  mbmi->interinter_comp.type != COMPOUND_AVERAGE));
   memcpy(x->blk_skip, best_blk_skip,
-         sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
+         sizeof(best_blk_skip[0]) * xd->height * xd->width);
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
 
-  return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+  rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+
+  return rd_stats->rdcost;
 }
 
 static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       PICK_MODE_CONTEXT *ctx,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                        int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -10632,8 +2644,8 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   const TileInfo *tile = &xd->tile;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);
-  const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
   const int sb_row = mi_row >> cm->seq_params.mib_size_log2;
@@ -10642,9 +2654,11 @@
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
   av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
-                   mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
-                   mi_col, mbmi_ext->mode_context);
-
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                   mbmi_ext->mode_context);
+  // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+  // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+  av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
   int_mv nearestmv, nearmv;
   av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
                                    0);
@@ -10657,16 +2671,16 @@
   }
 
   int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
-  if (dv_ref.as_int == 0)
-    av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col);
+  if (dv_ref.as_int == 0) {
+    av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row);
+  }
   // Ref DV should not have sub-pel.
   assert((dv_ref.as_mv.col & 7) == 0);
   assert((dv_ref.as_mv.row & 7) == 0);
   mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref;
 
   struct buf_2d yv12_mb[MAX_MB_PLANE];
-  av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL,
-                       num_planes);
+  av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes);
   for (int i = 0; i < num_planes; ++i) {
     xd->plane[i].pre[0] = yv12_mb[i];
   }
@@ -10679,62 +2693,77 @@
 
   MB_MODE_INFO best_mbmi = *mbmi;
   RD_STATS best_rdstats = *rd_stats;
-  int best_skip = x->skip;
-
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+
+  FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
+  const search_site_config *lookahead_search_sites =
+      &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
+  av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
+                                     &dv_ref.as_mv, lookahead_search_sites);
+  fullms_params.is_intra_mode = 1;
+
   for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
        dir < IBC_MOTION_DIRECTIONS; ++dir) {
-    const MvLimits tmp_mv_limits = x->mv_limits;
     switch (dir) {
       case IBC_MOTION_ABOVE:
-        x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
-        x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w;
-        x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
-        x->mv_limits.row_max =
+        fullms_params.mv_limits.col_min =
+            (tile->mi_col_start - mi_col) * MI_SIZE;
+        fullms_params.mv_limits.col_max =
+            (tile->mi_col_end - mi_col) * MI_SIZE - w;
+        fullms_params.mv_limits.row_min =
+            (tile->mi_row_start - mi_row) * MI_SIZE;
+        fullms_params.mv_limits.row_max =
             (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h;
         break;
       case IBC_MOTION_LEFT:
-        x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
-        x->mv_limits.col_max =
+        fullms_params.mv_limits.col_min =
+            (tile->mi_col_start - mi_col) * MI_SIZE;
+        fullms_params.mv_limits.col_max =
             (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w;
         // TODO(aconverse@google.com): Minimize the overlap between above and
         // left areas.
-        x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
+        fullms_params.mv_limits.row_min =
+            (tile->mi_row_start - mi_row) * MI_SIZE;
         int bottom_coded_mi_edge =
             AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end);
-        x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+        fullms_params.mv_limits.row_max =
+            (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
         break;
       default: assert(0);
     }
-    assert(x->mv_limits.col_min >= tmp_mv_limits.col_min);
-    assert(x->mv_limits.col_max <= tmp_mv_limits.col_max);
-    assert(x->mv_limits.row_min >= tmp_mv_limits.row_min);
-    assert(x->mv_limits.row_max <= tmp_mv_limits.row_max);
-    av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv);
+    assert(fullms_params.mv_limits.col_min >= fullms_params.mv_limits.col_min);
+    assert(fullms_params.mv_limits.col_max <= fullms_params.mv_limits.col_max);
+    assert(fullms_params.mv_limits.row_min >= fullms_params.mv_limits.row_min);
+    assert(fullms_params.mv_limits.row_max <= fullms_params.mv_limits.row_max);
 
-    if (x->mv_limits.col_max < x->mv_limits.col_min ||
-        x->mv_limits.row_max < x->mv_limits.row_min) {
-      x->mv_limits = tmp_mv_limits;
+    av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv);
+
+    if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min ||
+        fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) {
       continue;
     }
 
-    int step_param = cpi->mv_step_param;
-    MV mvp_full = dv_ref.as_mv;
-    mvp_full.col >>= 3;
-    mvp_full.row >>= 3;
-    const int sadpb = x->sadperbit16;
-    int cost_list[5];
-    const int bestsme = av1_full_pixel_search(
-        cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
-        sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
-        (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1,
-        &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
+    const int step_param = cpi->mv_search_params.mv_step_param;
+    const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
+    IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
+    int_mv best_mv, best_hash_mv;
 
-    x->mv_limits = tmp_mv_limits;
+    int bestsme = av1_full_pixel_search(start_mv, &fullms_params, step_param,
+                                        NULL, &best_mv.as_fullmv, NULL);
+    const int hashsme = av1_intrabc_hash_search(
+        cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
+    if (hashsme < bestsme) {
+      best_mv = best_hash_mv;
+      bestsme = hashsme;
+    }
+
     if (bestsme == INT_MAX) continue;
-    mvp_full = x->best_mv.as_mv;
-    const MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
-    if (mv_check_bounds(&x->mv_limits, &dv)) continue;
+    const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv);
+    if (!av1_is_fullmv_in_range(&fullms_params.mv_limits,
+                                get_fullmv_from_mv(&dv)))
+      continue;
     if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
                          cm->seq_params.mib_size_log2))
       continue;
@@ -10751,45 +2780,45 @@
     mbmi->mv[0].as_mv = dv;
     mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
     mbmi->skip = 0;
-    x->skip = 0;
     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
                                   av1_num_planes(cm) - 1);
 
-    int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX],
-                       (int *)&cpi->dv_cost[1][MV_MAX] };
+    const IntraBCMVCosts *const dv_costs = &cpi->dv_costs;
+    int *dvcost[2] = { (int *)&dv_costs->mv_component[0][MV_MAX],
+                       (int *)&dv_costs->mv_component[1][MV_MAX] };
     // TODO(aconverse@google.com): The full motion field defining discount
     // in MV_COST_WEIGHT is too large. Explore other values.
-    const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost,
+    const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv,
                                         dvcost, MV_COST_WEIGHT_SUB);
     const int rate_mode = x->intrabc_cost[1];
     RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
-    if (!txfm_search(cpi, NULL, x, bsize, mi_row, mi_col, &rd_stats_yuv,
-                     &rd_stats_y, &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
+    if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y,
+                         &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
       continue;
     rd_stats_yuv.rdcost =
         RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist);
     if (rd_stats_yuv.rdcost < best_rd) {
       best_rd = rd_stats_yuv.rdcost;
       best_mbmi = *mbmi;
-      best_skip = mbmi->skip;
       best_rdstats = rd_stats_yuv;
       memcpy(best_blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+             sizeof(x->blk_skip[0]) * xd->height * xd->width);
+      av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
     }
   }
   *mbmi = best_mbmi;
   *rd_stats = best_rdstats;
-  x->skip = best_skip;
   memcpy(x->blk_skip, best_blk_skip,
-         sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+         sizeof(x->blk_skip[0]) * xd->height * xd->width);
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
 #if CONFIG_RD_DEBUG
   mbmi->rd_stats = *rd_stats;
 #endif
   return best_rd;
 }
 
-void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                               int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize,
+void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+                               RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -10798,60 +2827,58 @@
   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
   int y_skip = 0, uv_skip = 0;
   int64_t dist_y = 0, dist_uv = 0;
-  TX_SIZE max_uv_tx_size;
 
-  ctx->skip = 0;
+  ctx->rd_stats.skip = 0;
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
   mbmi->use_intrabc = 0;
   mbmi->mv[0].as_int = 0;
+  mbmi->skip_mode = 0;
 
   const int64_t intra_yrd =
-      rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly,
-                             &dist_y, &y_skip, bsize, best_rd, ctx);
+      av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
+                                 &y_skip, bsize, best_rd, ctx);
+
+  // Initialize default mode evaluation params
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
 
   if (intra_yrd < best_rd) {
     // Only store reconstructed luma when there's chroma RDO. When there's no
     // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-    xd->cfl.is_chroma_reference =
-        is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
-                            cm->seq_params.subsampling_y);
     xd->cfl.store_y = store_cfl_required_rdo(cm, x);
     if (xd->cfl.store_y) {
       // Restore reconstructed luma values.
       memcpy(x->blk_skip, ctx->blk_skip,
              sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y,
-                                   cpi->optimize_seg_arr[mbmi->segment_id],
-                                   mi_row, mi_col);
+      av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+      av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y, DRY_RUN_NORMAL,
+                                   cpi->optimize_seg_arr[mbmi->segment_id]);
+      av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
       xd->cfl.store_y = 0;
     }
     if (num_planes > 1) {
-      max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
       init_sbuv_mode(mbmi);
-      if (!x->skip_chroma_rd)
-        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
-                                &uv_skip, bsize, max_uv_tx_size);
+      if (xd->is_chroma_ref) {
+        const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+        av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                    &dist_uv, &uv_skip, bsize, max_uv_tx_size);
+      }
     }
 
-    if (y_skip && (uv_skip || x->skip_chroma_rd)) {
-      rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-                      x->skip_cost[av1_get_skip_context(xd)][1];
-      rd_cost->dist = dist_y + dist_uv;
-    } else {
-      rd_cost->rate =
-          rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0];
-      rd_cost->dist = dist_y + dist_uv;
-    }
+    // Intra block is always coded as non-skip
+    rd_cost->rate =
+        rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0];
+    rd_cost->dist = dist_y + dist_uv;
     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+    rd_cost->skip = 0;
   } else {
     rd_cost->rate = INT_MAX;
   }
 
   if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
     best_rd = rd_cost->rdcost;
-  if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) {
-    ctx->skip = x->skip;
+  if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) {
+    ctx->rd_stats.skip = mbmi->skip;
     memcpy(ctx->blk_skip, x->blk_skip,
            sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
     assert(rd_cost->rate != INT_MAX);
@@ -10859,62 +2886,20 @@
   if (rd_cost->rate == INT_MAX) return;
 
   ctx->mic = *xd->mi[0];
-  ctx->mbmi_ext = *x->mbmi_ext;
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
 }
 
-static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int src_stride = x->plane[1].src.stride;
-  const uint8_t *const src_u = x->plane[1].src.buf;
-  const uint8_t *const src_v = x->plane[2].src.buf;
-  int *const data = x->palette_buffer->kmeans_data_buf;
-  int centroids[2 * PALETTE_MAX_SIZE];
-  uint8_t *const color_map = xd->plane[1].color_index_map;
-  int r, c;
-  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
-  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
-  int plane_block_width, plane_block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
-                           &plane_block_height, &rows, &cols);
+static AOM_INLINE void calc_target_weighted_pred(
+    const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+    const uint8_t *above, int above_stride, const uint8_t *left,
+    int left_stride);
 
-  for (r = 0; r < rows; ++r) {
-    for (c = 0; c < cols; ++c) {
-      if (cpi->common.seq_params.use_highbitdepth) {
-        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
-      } else {
-        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
-      }
-    }
-  }
-
-  for (r = 1; r < 3; ++r) {
-    for (c = 0; c < pmi->palette_size[1]; ++c) {
-      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
-    }
-  }
-
-  av1_calc_indices(data, centroids, color_map, rows * cols,
-                   pmi->palette_size[1], 2);
-  extend_palette_color_map(color_map, cols, rows, plane_block_width,
-                           plane_block_height);
-}
-
-static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
-                                      const MACROBLOCKD *xd, int mi_row,
-                                      int mi_col, const uint8_t *above,
-                                      int above_stride, const uint8_t *left,
-                                      int left_stride);
-
-static void rd_pick_skip_mode(RD_STATS *rd_cost,
-                              InterModeSearchState *search_state,
-                              const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              BLOCK_SIZE bsize, int mi_row, int mi_col,
-                              struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+static AOM_INLINE void rd_pick_skip_mode(
+    RD_STATS *rd_cost, InterModeSearchState *search_state,
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
   const AV1_COMMON *const cm = &cpi->common;
   const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
   const int num_planes = av1_num_planes(cm);
@@ -10935,14 +2920,16 @@
   const MV_REFERENCE_FRAME second_ref_frame =
       LAST_FRAME + skip_mode_info->ref_frame_idx_1;
   const PREDICTION_MODE this_mode = NEAREST_NEARESTMV;
-  const int mode_index =
+  const THR_MODES mode_index =
       get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame);
 
-  if (mode_index == -1) {
+  if (mode_index == THR_INVALID) {
     return;
   }
 
-  if (!cpi->oxcf.enable_onesided_comp && cpi->all_one_sided_refs) {
+  if ((!cpi->oxcf.enable_onesided_comp ||
+       cpi->sf.inter_sf.disable_onesided_comp) &&
+      cpi->all_one_sided_refs) {
     return;
   }
 
@@ -10958,12 +2945,15 @@
     }
     MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
     av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
-                     mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
-                     mi_col, mbmi_ext->mode_context);
+                     xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                     mbmi_ext->mode_context);
+    // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+    // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+    av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type);
   }
 
   assert(this_mode == NEAREST_NEARESTMV);
-  if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) {
+  if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) {
     return;
   }
 
@@ -10976,7 +2966,7 @@
   mbmi->ref_mv_idx = 0;
   mbmi->skip_mode = mbmi->skip = 1;
 
-  set_default_interp_filters(mbmi, cm->interp_filter);
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
 
   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   for (int i = 0; i < num_planes; i++) {
@@ -10991,20 +2981,23 @@
   }
 
   // Obtain the rdcost for skip_mode.
-  skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, mi_row, mi_col, &orig_dst);
+  skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst);
 
   // Compare the use of skip_mode with the best intra/inter mode obtained.
   const int skip_mode_ctx = av1_get_skip_mode_context(xd);
-  const int64_t best_intra_inter_mode_cost =
-      (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX)
-          ? RDCOST(x->rdmult,
-                   rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
-                   rd_cost->dist)
-          : INT64_MAX;
+  int64_t best_intra_inter_mode_cost = INT64_MAX;
+  if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) {
+    best_intra_inter_mode_cost =
+        RDCOST(x->rdmult, rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
+               rd_cost->dist);
+    // Account for non-skip mode rate in total rd stats
+    rd_cost->rate += x->skip_mode_cost[skip_mode_ctx][0];
+    av1_rd_cost_update(x->rdmult, rd_cost);
+  }
 
   if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost &&
       (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) {
-    assert(mode_index != -1);
+    assert(mode_index != THR_INVALID);
     search_state->best_mbmode.skip_mode = 1;
     search_state->best_mbmode = *mbmi;
 
@@ -11018,12 +3011,13 @@
 
     // Set up tx_size related variables for skip-specific loop filtering.
     search_state->best_mbmode.tx_size =
-        block_signals_txsize(bsize) ? tx_size_from_tx_mode(bsize, cm->tx_mode)
-                                    : max_txsize_rect_lookup[bsize];
+        block_signals_txsize(bsize)
+            ? tx_size_from_tx_mode(bsize, x->tx_mode_search_type)
+            : max_txsize_rect_lookup[bsize];
     memset(search_state->best_mbmode.inter_tx_size,
            search_state->best_mbmode.tx_size,
            sizeof(search_state->best_mbmode.inter_tx_size));
-    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h,
+    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height,
                   search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
 
     // Set up color-related variables for skip mode.
@@ -11040,7 +3034,8 @@
         (INTERINTRA_MODE)(II_DC_PRED - 1);
     search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0;
 
-    set_default_interp_filters(&search_state->best_mbmode, cm->interp_filter);
+    set_default_interp_filters(&search_state->best_mbmode,
+                               cm->features.interp_filter);
 
     search_state->best_mode_index = mode_index;
 
@@ -11053,105 +3048,159 @@
     search_state->best_skip2 = 1;
     search_state->best_mode_skippable = 1;
 
-    x->skip = 1;
+    x->force_skip = 1;
   }
 }
 
+// Get winner mode stats of given mode index
+static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats(
+    MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost,
+    int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index,
+    RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv,
+    THR_MODES *winner_mode_index, int enable_multiwinner_mode_process,
+    int mode_idx) {
+  MB_MODE_INFO *winner_mbmi;
+  if (enable_multiwinner_mode_process) {
+    assert(mode_idx >= 0 && mode_idx < x->winner_mode_count);
+    WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx];
+    winner_mbmi = &winner_mode_stat->mbmi;
+
+    *winner_rd_cost = &winner_mode_stat->rd_cost;
+    *winner_rate_y = winner_mode_stat->rate_y;
+    *winner_rate_uv = winner_mode_stat->rate_uv;
+    *winner_mode_index = winner_mode_stat->mode_index;
+  } else {
+    winner_mbmi = best_mbmode;
+    *winner_rd_cost = best_rd_cost;
+    *winner_rate_y = best_rate_y;
+    *winner_rate_uv = best_rate_uv;
+    *winner_mode_index = *best_mode_index;
+  }
+  return winner_mbmi;
+}
+
 // speed feature: fast intra/inter transform type search
 // Used for speed >= 2
 // When this speed feature is on, in rd mode search, only DCT is used.
 // After the mode is determined, this function is called, to select
 // transform types and get accurate rdcost.
-static void sf_refine_fast_tx_type_search(
-    const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
-    RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-    int best_mode_index, MB_MODE_INFO *best_mbmode,
-    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int best_rate_y,
-    int best_rate_uv, int *best_skip2) {
+static AOM_INLINE void refine_winner_mode_tx(
+    const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize,
+    PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index,
+    MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+    int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) {
   const AV1_COMMON *const cm = &cpi->common;
-  const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  int64_t best_rd;
   const int num_planes = av1_num_planes(cm);
 
-  if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
-      ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
-        is_inter_mode(best_mbmode->mode)) ||
-       (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
-        !is_inter_mode(best_mbmode->mode)))) {
-    int skip_blk = 0;
-    RD_STATS rd_stats_y, rd_stats_uv;
-    const int skip_ctx = av1_get_skip_context(xd);
+  if (!is_winner_mode_processing_enabled(cpi, best_mbmode, best_mbmode->mode))
+    return;
 
-    x->use_default_inter_tx_type = 0;
-    x->use_default_intra_tx_type = 0;
+  // Set params for winner mode evaluation
+  set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
 
-    *mbmi = *best_mbmode;
+  // No best mode identified so far
+  if (*best_mode_index == THR_INVALID) return;
 
-    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+  for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) {
+    RD_STATS *winner_rd_stats = NULL;
+    int winner_rate_y = 0, winner_rate_uv = 0;
+    THR_MODES winner_mode_index = 0;
 
-    // Select prediction reference frames.
-    for (int i = 0; i < num_planes; i++) {
-      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
-      if (has_second_ref(mbmi))
-        xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
-    }
+    // TODO(any): Combine best mode and multi-winner mode processing paths
+    // Get winner mode stats for current mode index
+    MB_MODE_INFO *winner_mbmi = get_winner_mode_stats(
+        x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index,
+        &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index,
+        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, mode_idx);
 
-    if (is_inter_mode(mbmi->mode)) {
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
-                                    av1_num_planes(cm) - 1);
-      if (mbmi->motion_mode == OBMC_CAUSAL)
-        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+    if (xd->lossless[winner_mbmi->segment_id] == 0 &&
+        winner_mode_index != THR_INVALID &&
+        is_winner_mode_processing_enabled(cpi, winner_mbmi,
+                                          winner_mbmi->mode)) {
+      RD_STATS rd_stats = *winner_rd_stats;
+      int skip_blk = 0;
+      RD_STATS rd_stats_y, rd_stats_uv;
+      const int skip_ctx = av1_get_skip_context(xd);
 
-      av1_subtract_plane(x, bsize, 0);
-      if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-        pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
-                              INT64_MAX);
-        assert(rd_stats_y.rate != INT_MAX);
-      } else {
-        super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-        memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-        for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
-          set_blk_skip(x, 0, i, rd_stats_y.skip);
+      *mbmi = *winner_mbmi;
+
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+      // Select prediction reference frames.
+      for (int i = 0; i < num_planes; i++) {
+        xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+        if (has_second_ref(mbmi))
+          xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
       }
-    } else {
-      super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-    }
 
-    if (num_planes > 1) {
-      super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-    } else {
-      av1_init_rd_stats(&rd_stats_uv);
-    }
+      if (is_inter_mode(mbmi->mode)) {
+        const int mi_row = xd->mi_row;
+        const int mi_col = xd->mi_col;
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      av1_num_planes(cm) - 1);
+        if (mbmi->motion_mode == OBMC_CAUSAL)
+          av1_build_obmc_inter_predictors_sb(cm, xd);
 
-    if (RDCOST(x->rdmult,
-               x->skip_cost[skip_ctx][0] + rd_stats_y.rate + rd_stats_uv.rate,
-               (rd_stats_y.dist + rd_stats_uv.dist)) >
-        RDCOST(x->rdmult, x->skip_cost[skip_ctx][1],
-               (rd_stats_y.sse + rd_stats_uv.sse))) {
-      skip_blk = 1;
-      rd_stats_y.rate = x->skip_cost[skip_ctx][1];
-      rd_stats_uv.rate = 0;
-      rd_stats_y.dist = rd_stats_y.sse;
-      rd_stats_uv.dist = rd_stats_uv.sse;
-    } else {
-      skip_blk = 0;
-      rd_stats_y.rate += x->skip_cost[skip_ctx][0];
-    }
+        av1_subtract_plane(x, bsize, 0);
+        if (x->tx_mode_search_type == TX_MODE_SELECT &&
+            !xd->lossless[mbmi->segment_id]) {
+          av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+                                              INT64_MAX);
+          assert(rd_stats_y.rate != INT_MAX);
+        } else {
+          av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+                                            INT64_MAX);
+          memset(mbmi->inter_tx_size, mbmi->tx_size,
+                 sizeof(mbmi->inter_tx_size));
+          for (int i = 0; i < xd->height * xd->width; ++i)
+            set_blk_skip(x, 0, i, rd_stats_y.skip);
+        }
+      } else {
+        av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+                                          INT64_MAX);
+      }
 
-    if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) >
-        RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
-               (rd_stats_y.dist + rd_stats_uv.dist))) {
-      best_mbmode->tx_size = mbmi->tx_size;
-      av1_copy(best_mbmode->inter_tx_size, mbmi->inter_tx_size);
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      av1_copy(best_mbmode->txk_type, mbmi->txk_type);
-      rd_cost->rate +=
-          (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
-      rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
-      rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
-      *best_skip2 = skip_blk;
+      if (num_planes > 1) {
+        av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+      } else {
+        av1_init_rd_stats(&rd_stats_uv);
+      }
+
+      if (is_inter_mode(mbmi->mode) &&
+          RDCOST(x->rdmult,
+                 x->skip_cost[skip_ctx][0] + rd_stats_y.rate + rd_stats_uv.rate,
+                 (rd_stats_y.dist + rd_stats_uv.dist)) >
+              RDCOST(x->rdmult, x->skip_cost[skip_ctx][1],
+                     (rd_stats_y.sse + rd_stats_uv.sse))) {
+        skip_blk = 1;
+        rd_stats_y.rate = x->skip_cost[skip_ctx][1];
+        rd_stats_uv.rate = 0;
+        rd_stats_y.dist = rd_stats_y.sse;
+        rd_stats_uv.dist = rd_stats_uv.sse;
+      } else {
+        skip_blk = 0;
+        rd_stats_y.rate += x->skip_cost[skip_ctx][0];
+      }
+      int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate -
+                      winner_rate_y - winner_rate_uv;
+      int64_t this_rd =
+          RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist));
+      if (best_rd > this_rd) {
+        *best_mbmode = *mbmi;
+        *best_mode_index = winner_mode_index;
+        av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk);
+        av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+        rd_cost->rate = this_rate;
+        rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+        rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse;
+        rd_cost->rdcost = this_rd;
+        best_rd = this_rd;
+        *best_skip2 = skip_blk;
+      }
     }
   }
 }
@@ -11168,15 +3217,15 @@
 } mode_skip_mask_t;
 
 // Update 'ref_combo' mask to disable given 'ref' in single and compound modes.
-static void disable_reference(MV_REFERENCE_FRAME ref,
-                              bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+static AOM_INLINE void disable_reference(
+    MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
   for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
     ref_combo[ref][ref2 + 1] = true;
   }
 }
 
 // Update 'ref_combo' mask to disable all inter references except ALTREF.
-static void disable_inter_references_except_altref(
+static AOM_INLINE void disable_inter_references_except_altref(
     bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
   disable_reference(LAST_FRAME, ref_combo);
   disable_reference(LAST2_FRAME, ref_combo);
@@ -11206,7 +3255,8 @@
 
 typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET;
 
-static void default_skip_mask(mode_skip_mask_t *mask, REF_SET ref_set) {
+static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask,
+                                         REF_SET ref_set) {
   if (ref_set == REF_SET_FULL) {
     // Everything available by default.
     memset(mask, 0, sizeof(*mask));
@@ -11244,8 +3294,9 @@
   }
 }
 
-static void init_mode_skip_mask(mode_skip_mask_t *mask, const AV1_COMP *cpi,
-                                MACROBLOCK *x, BLOCK_SIZE bsize) {
+static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
+                                           const AV1_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
   const struct segmentation *const seg = &cm->seg;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -11254,7 +3305,7 @@
   const SPEED_FEATURES *const sf = &cpi->sf;
   REF_SET ref_set = REF_SET_FULL;
 
-  if (sf->use_real_time_ref_set)
+  if (sf->rt_sf.use_real_time_ref_set)
     ref_set = REF_SET_REALTIME;
   else if (cpi->oxcf.enable_reduced_reference_set)
     ref_set = REF_SET_REDUCED;
@@ -11263,8 +3314,21 @@
 
   int min_pred_mv_sad = INT_MAX;
   MV_REFERENCE_FRAME ref_frame;
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
-    min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+  if (ref_set == REF_SET_REALTIME) {
+    // For real-time encoding, we only look at a subset of ref frames. So the
+    // threshold for pruning should be computed from this subset as well.
+    const int num_rt_refs =
+        sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos);
+    for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) {
+      const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0];
+      if (ref != INTRA_FRAME) {
+        min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]);
+      }
+    }
+  } else {
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+      min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+  }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) {
@@ -11297,9 +3361,9 @@
       mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
       const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
       int_mv near_mv, nearest_mv, global_mv;
-      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
 
       if (near_mv.as_int != global_mv.as_int)
         mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV);
@@ -11309,7 +3373,7 @@
   }
 
   if (cpi->rc.is_src_frame_alt_ref) {
-    if (sf->alt_ref_search_fp) {
+    if (sf->inter_sf.alt_ref_search_fp) {
       assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
       mask->pred_modes[ALTREF_FRAME] = 0;
       disable_inter_references_except_altref(mask->ref_combo);
@@ -11317,77 +3381,81 @@
     }
   }
 
-  if (sf->alt_ref_search_fp)
-    if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
-      if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
-        mask->pred_modes[ALTREF_FRAME] |= INTER_ALL;
+  if (sf->inter_sf.alt_ref_search_fp) {
+    if (!cm->show_frame && x->best_pred_mv_sad < INT_MAX) {
+      int sad_thresh = x->best_pred_mv_sad + (x->best_pred_mv_sad >> 3);
+      // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if
+      // those are past frames
+      for (ref_frame = BWDREF_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+        if (cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+          if (x->pred_mv_sad[ref_frame] > sad_thresh)
+            mask->pred_modes[ref_frame] |= INTER_ALL;
+      }
+    }
+  }
 
-  if (sf->adaptive_mode_search) {
+  if (sf->inter_sf.adaptive_mode_search) {
     if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
         cpi->rc.frames_since_golden >= 3)
       if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
         mask->pred_modes[GOLDEN_FRAME] |= INTER_ALL;
   }
 
-  if (bsize > sf->max_intra_bsize) {
+  if (bsize > sf->part_sf.max_intra_bsize) {
     disable_reference(INTRA_FRAME, mask->ref_combo);
   }
 
   mask->pred_modes[INTRA_FRAME] |=
-      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+      ~(sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
+}
+
+static AOM_INLINE void init_pred_buf(const MACROBLOCK *const x,
+                                     HandleInterModeArgs *const args) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  if (is_cur_buf_hbd(xd)) {
+    const int len = sizeof(uint16_t);
+    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+    args->above_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->above_pred_buf[2] =
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
+    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+    args->left_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->left_pred_buf[2] =
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
+  } else {
+    args->above_pred_buf[0] = x->above_pred_buf;
+    args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
+    args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
+    args->left_pred_buf[0] = x->left_pred_buf;
+    args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
+    args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
+  }
 }
 
 // Please add/modify parameter setting in this function, making it consistent
 // and easy to read and maintain.
-static void set_params_rd_pick_inter_mode(
+static AOM_INLINE void set_params_rd_pick_inter_mode(
     const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
-    BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
-    int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES],
-    unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
-    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+    BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask,
+    unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES],
+    struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) {
   const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   unsigned char segment_id = mbmi->segment_id;
-  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
-                                   MAX_SB_SIZE >> 1 };
-  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
-                                    MAX_SB_SIZE >> 1 };
-  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-  for (int i = 0; i < MB_MODE_COUNT; ++i)
-    for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
-
-  if (is_cur_buf_hbd(xd)) {
-    int len = sizeof(uint16_t);
-    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
-    args->above_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
-    args->above_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
-    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
-    args->left_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
-    args->left_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
-  } else {
-    args->above_pred_buf[0] = x->above_pred_buf;
-    args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
-    args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
-    args->left_pred_buf[0] = x->left_pred_buf;
-    args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
-    args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
-  }
-
+  init_pred_buf(x, args);
   av1_collect_neighbors_ref_counts(xd);
-
   estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
                            ref_costs_comp);
 
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
   MV_REFERENCE_FRAME ref_frame;
+  x->best_pred_mv_sad = INT_MAX;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     x->mbmi_ext->mode_context[ref_frame] = 0;
@@ -11410,281 +3478,97 @@
         }
       }
       assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
-      setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
-                                 yv12_mb);
+      setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb);
     }
+    // Store the best pred_mv_sad across all past frames
+    if (cpi->sf.inter_sf.alt_ref_search_fp &&
+        cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+      x->best_pred_mv_sad =
+          AOMMIN(x->best_pred_mv_sad, x->pred_mv_sad[ref_frame]);
   }
   // ref_frame = ALTREF_FRAME
-  for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
-    x->mbmi_ext->mode_context[ref_frame] = 0;
-    mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
-    const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
-    if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
-          (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) {
-      continue;
-    }
-
-    if (mbmi->partition != PARTITION_NONE &&
-        mbmi->partition != PARTITION_SPLIT) {
-      if (skip_ref_frame_mask & (1 << ref_frame)) {
+  if (!cpi->sf.rt_sf.use_real_time_ref_set) {
+    // No second reference on RT ref set, so no need to initialize
+    for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+      x->mbmi_ext->mode_context[ref_frame] = 0;
+      mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+      const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+      if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
+            (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) {
         continue;
       }
+
+      if (mbmi->partition != PARTITION_NONE &&
+          mbmi->partition != PARTITION_SPLIT) {
+        if (skip_ref_frame_mask & (1 << ref_frame)) {
+          continue;
+        }
+      }
+      av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                       xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                       mbmi_ext->mode_context);
+      // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+      // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+      av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
     }
-    av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
-                     mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
-                     mi_col, mbmi_ext->mode_context);
   }
 
-  av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
-
-  if (check_num_overlappable_neighbors(mbmi) &&
-      is_motion_variation_allowed_bsize(bsize)) {
-    av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
-                                        args->above_pred_buf, dst_width1,
-                                        dst_height1, args->above_pred_stride);
-    av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
-                                       args->left_pred_buf, dst_width2,
-                                       dst_height2, args->left_pred_stride);
-    av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col,
-                         0, num_planes);
-    calc_target_weighted_pred(
-        cm, x, xd, mi_row, mi_col, args->above_pred_buf[0],
-        args->above_pred_stride[0], args->left_pred_buf[0],
-        args->left_pred_stride[0]);
+  av1_count_overlappable_neighbors(cm, xd);
+  const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+  const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] <
+                         cpi->sf.inter_sf.prune_obmc_prob_thresh;
+  if (cpi->oxcf.enable_obmc && !cpi->sf.inter_sf.disable_obmc && !prune_obmc) {
+    if (check_num_overlappable_neighbors(mbmi) &&
+        is_motion_variation_allowed_bsize(bsize)) {
+      int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+      int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                       MAX_SB_SIZE >> 1 };
+      int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                        MAX_SB_SIZE >> 1 };
+      int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+      av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf,
+                                          dst_width1, dst_height1,
+                                          args->above_pred_stride);
+      av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf,
+                                         dst_width2, dst_height2,
+                                         args->left_pred_stride);
+      const int num_planes = av1_num_planes(cm);
+      av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row,
+                           mi_col, 0, num_planes);
+      calc_target_weighted_pred(
+          cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0],
+          args->left_pred_buf[0], args->left_pred_stride[0]);
+    }
   }
 
   init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
 
-  if (cpi->sf.tx_type_search.fast_intra_tx_type_search ||
-      cpi->oxcf.use_intra_default_tx_only)
-    x->use_default_intra_tx_type = 1;
-  else
-    x->use_default_intra_tx_type = 0;
+  // Set params for mode evaluation
+  set_mode_eval_params(cpi, x, MODE_EVAL);
 
-  if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
-    x->use_default_inter_tx_type = 1;
-  else
-    x->use_default_inter_tx_type = 0;
-  if (cpi->sf.skip_repeat_interpolation_filter_search) {
-    x->interp_filter_stats_idx[0] = 0;
-    x->interp_filter_stats_idx[1] = 0;
-  }
   x->comp_rd_stats_idx = 0;
 }
 
-// TODO(kyslov): now this is very similar to set_params_rd_pick_inter_mode
-// (except that doesn't set ALTREF parameters)
-//               consider passing a flag to select non-rd path (similar to
-//               encode_sb_row)
-static void set_params_nonrd_pick_inter_mode(
-    const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
-    BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
-    int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES],
-    unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
-    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  unsigned char segment_id = mbmi->segment_id;
-  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
-                                   MAX_SB_SIZE >> 1 };
-  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
-                                    MAX_SB_SIZE >> 1 };
-  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-
-  for (int i = 0; i < MB_MODE_COUNT; ++i)
-    for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    int len = sizeof(uint16_t);
-    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
-    args->above_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
-    args->above_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
-    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
-    args->left_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
-    args->left_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
-  } else {
-    args->above_pred_buf[0] = x->above_pred_buf;
-    args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
-    args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
-    args->left_pred_buf[0] = x->left_pred_buf;
-    args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
-    args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
-  }
-
-  av1_collect_neighbors_ref_counts(xd);
-
-  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
-                           ref_costs_comp);
-
-  MV_REFERENCE_FRAME ref_frame;
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    x->pred_mv_sad[ref_frame] = INT_MAX;
-    x->mbmi_ext->mode_context[ref_frame] = 0;
-    mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
-    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
-      if (mbmi->partition != PARTITION_NONE &&
-          mbmi->partition != PARTITION_SPLIT) {
-        if (skip_ref_frame_mask & (1 << ref_frame)) {
-          int skip = 1;
-          for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
-            if (!(skip_ref_frame_mask & (1 << r))) {
-              const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
-              if (rf[0] == ref_frame || rf[1] == ref_frame) {
-                skip = 0;
-                break;
-              }
-            }
-          }
-          if (skip) continue;
-        }
-      }
-      assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
-      setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
-                                 yv12_mb);
-    }
-  }
-  av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
-
-  if (check_num_overlappable_neighbors(mbmi) &&
-      is_motion_variation_allowed_bsize(bsize)) {
-    av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
-                                        args->above_pred_buf, dst_width1,
-                                        dst_height1, args->above_pred_stride);
-    av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
-                                       args->left_pred_buf, dst_width2,
-                                       dst_height2, args->left_pred_stride);
-    av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col,
-                         0, num_planes);
-    calc_target_weighted_pred(
-        cm, x, xd, mi_row, mi_col, args->above_pred_buf[0],
-        args->above_pred_stride[0], args->left_pred_buf[0],
-        args->left_pred_stride[0]);
-  }
-  init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
-
-  if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
-    x->use_default_intra_tx_type = 1;
-  else
-    x->use_default_intra_tx_type = 0;
-
-  if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
-    x->use_default_inter_tx_type = 1;
-  else
-    x->use_default_inter_tx_type = 0;
-  if (cpi->sf.skip_repeat_interpolation_filter_search) {
-    x->interp_filter_stats_idx[0] = 0;
-    x->interp_filter_stats_idx[1] = 0;
-  }
+static AOM_INLINE void init_intra_mode_search_state(
+    IntraModeSearchState *intra_search_state) {
+  intra_search_state->skip_intra_modes = 0;
+  intra_search_state->best_intra_mode = DC_PRED;
+  intra_search_state->angle_stats_ready = 0;
+  av1_zero(intra_search_state->directional_mode_skip_mask);
+  intra_search_state->rate_uv_intra = INT_MAX;
+  av1_zero(intra_search_state->pmi_uv);
+  for (int i = 0; i < REFERENCE_MODES; ++i)
+    intra_search_state->best_pred_rd[i] = INT64_MAX;
 }
 
-static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                                int mi_col, RD_STATS *rd_cost,
-                                PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
-                                MB_MODE_INFO *const mbmi,
-                                PALETTE_MODE_INFO *const pmi,
-                                unsigned int *ref_costs_single,
-                                InterModeSearchState *search_state) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int rate2 = 0;
-  int64_t distortion2 = 0, best_rd_palette = search_state->best_rd, this_rd,
-          best_model_rd_palette = INT64_MAX;
-  int skippable = 0, rate_overhead_palette = 0;
-  RD_STATS rd_stats_y;
-  TX_SIZE uv_tx = TX_4X4;
-  uint8_t *const best_palette_color_map =
-      x->palette_buffer->best_palette_color_map;
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  MB_MODE_INFO best_mbmi_palette = *mbmi;
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
-  const int rows = block_size_high[bsize];
-  const int cols = block_size_wide[bsize];
+static AOM_INLINE void init_inter_mode_search_state(
+    InterModeSearchState *search_state, const AV1_COMP *cpi,
+    const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) {
+  init_intra_mode_search_state(&search_state->intra_search_state);
 
-  mbmi->mode = DC_PRED;
-  mbmi->uv_mode = UV_DC_PRED;
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->ref_frame[1] = NONE_FRAME;
-  rate_overhead_palette = rd_pick_palette_intra_sby(
-      cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED],
-      &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
-      &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip);
-  if (pmi->palette_size[0] == 0) return;
-
-  memcpy(x->blk_skip, best_blk_skip,
-         sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
-
-  memcpy(color_map, best_palette_color_map,
-         rows * cols * sizeof(best_palette_color_map[0]));
-  super_block_yrd(cpi, x, &rd_stats_y, bsize, search_state->best_rd);
-  if (rd_stats_y.rate == INT_MAX) return;
-
-  skippable = rd_stats_y.skip;
-  distortion2 = rd_stats_y.dist;
-  rate2 = rd_stats_y.rate + rate_overhead_palette;
-  rate2 += ref_costs_single[INTRA_FRAME];
-  if (num_planes > 1) {
-    uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
-    if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
-      choose_intra_uv_mode(
-          cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
-          &search_state->rate_uv_tokenonly[uv_tx],
-          &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx],
-          &search_state->mode_uv[uv_tx]);
-      search_state->pmi_uv[uv_tx] = *pmi;
-      search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
-    }
-    mbmi->uv_mode = search_state->mode_uv[uv_tx];
-    pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1];
-    if (pmi->palette_size[1] > 0) {
-      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
-             search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
-             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
-    }
-    mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
-    skippable = skippable && search_state->skip_uvs[uv_tx];
-    distortion2 += search_state->dist_uvs[uv_tx];
-    rate2 += search_state->rate_uv_intra[uv_tx];
-  }
-
-  if (skippable) {
-    rate2 -= rd_stats_y.rate;
-    if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx];
-    rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
-  } else {
-    rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
-  }
-  this_rd = RDCOST(x->rdmult, rate2, distortion2);
-  if (this_rd < search_state->best_rd) {
-    search_state->best_mode_index = 3;
-    mbmi->mv[0].as_int = 0;
-    rd_cost->rate = rate2;
-    rd_cost->dist = distortion2;
-    rd_cost->rdcost = this_rd;
-    search_state->best_rd = this_rd;
-    search_state->best_mbmode = *mbmi;
-    search_state->best_skip2 = 0;
-    search_state->best_mode_skippable = skippable;
-    memcpy(ctx->blk_skip, x->blk_skip,
-           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-  }
-}
-
-static void init_inter_mode_search_state(InterModeSearchState *search_state,
-                                         const AV1_COMP *cpi,
-                                         const TileDataEnc *tile_data,
-                                         const MACROBLOCK *x, BLOCK_SIZE bsize,
-                                         int64_t best_rd_so_far) {
   search_state->best_rd = best_rd_so_far;
+  search_state->best_skip_rd[0] = INT64_MAX;
+  search_state->best_skip_rd[1] = INT64_MAX;
 
   av1_zero(search_state->best_mbmode);
 
@@ -11696,14 +3580,12 @@
 
   search_state->best_skip2 = 0;
 
-  search_state->best_mode_index = -1;
+  search_state->best_mode_index = THR_INVALID;
 
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const unsigned char segment_id = mbmi->segment_id;
 
-  search_state->skip_intra_modes = 0;
-
   search_state->num_available_refs = 0;
   memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs));
   memset(search_state->dist_order_refs, -1,
@@ -11714,29 +3596,18 @@
   const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
   for (int i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
     search_state->mode_threshold[i] =
-        ((int64_t)rd_threshes[i] * tile_data->thresh_freq_fact[bsize][i]) >> 5;
+        ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >>
+        RD_THRESH_FAC_FRAC_BITS;
 
-  search_state->best_intra_mode = DC_PRED;
   search_state->best_intra_rd = INT64_MAX;
 
-  search_state->angle_stats_ready = 0;
-  av1_zero(search_state->directional_mode_skip_mask);
-
   search_state->best_pred_sse = UINT_MAX;
 
-  for (int i = 0; i < TX_SIZES_ALL; i++)
-    search_state->rate_uv_intra[i] = INT_MAX;
-
-  av1_zero(search_state->pmi_uv);
-
-  for (int i = 0; i < REFERENCE_MODES; ++i)
-    search_state->best_pred_rd[i] = INT64_MAX;
-
   av1_zero(search_state->single_newmv);
   av1_zero(search_state->single_newmv_rate);
   av1_zero(search_state->single_newmv_valid);
   for (int i = 0; i < MB_MODE_COUNT; ++i) {
-    for (int j = 0; j < MAX_REF_MV_SERCH; ++j) {
+    for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) {
       for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
         search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
         search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
@@ -11770,9 +3641,9 @@
   av1_zero(search_state->single_state_modelled_cnt);
 }
 
-bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
-                    const MV_REFERENCE_FRAME *ref_frame,
-                    const PREDICTION_MODE this_mode) {
+static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
+                           const MV_REFERENCE_FRAME *ref_frame,
+                           const PREDICTION_MODE this_mode) {
   if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) {
     return true;
   }
@@ -11781,46 +3652,44 @@
 }
 
 static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                      BLOCK_SIZE bsize, int mode_index) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const struct segmentation *const seg = &cm->seg;
-  const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
-  const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
-  const CurrentFrame *const current_frame = &cm->current_frame;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const unsigned char segment_id = mbmi->segment_id;
-  const int comp_pred = ref_frame[1] > INTRA_FRAME;
-
+                                      BLOCK_SIZE bsize,
+                                      PREDICTION_MODE curr_mode,
+                                      const MV_REFERENCE_FRAME *ref_frames) {
+  const int comp_pred = ref_frames[1] > INTRA_FRAME;
   if (comp_pred) {
+    if (!is_comp_ref_allowed(bsize)) return 1;
+    if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frames[1]])) {
+      return 1;
+    }
+
+    const AV1_COMMON *const cm = &cpi->common;
     if (frame_is_intra_only(cm)) return 1;
 
+    const CurrentFrame *const current_frame = &cm->current_frame;
     if (current_frame->reference_mode == SINGLE_REFERENCE) return 1;
 
-    // Skip compound inter modes if ARF is not available.
-    if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]]))
-      return 1;
-
+    const struct segmentation *const seg = &cm->seg;
+    const unsigned char segment_id = x->e_mbd.mi[0]->segment_id;
     // Do not allow compound prediction if the segment level reference frame
     // feature is in use as in this case there can only be one reference.
     if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
-
-    if (!is_comp_ref_allowed(bsize)) return 1;
   }
 
-  if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
+  if (ref_frames[0] > INTRA_FRAME && ref_frames[1] == INTRA_FRAME) {
     // Mode must be compatible
-    if (!is_interintra_allowed_mode(this_mode)) return 1;
     if (!is_interintra_allowed_bsize(bsize)) return 1;
+    if (!is_interintra_allowed_mode(curr_mode)) return 1;
   }
 
   return 0;
 }
 
 static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
-                                        BLOCK_SIZE bsize, int mib_size,
-                                        int mi_row, int mi_col) {
+                                        BLOCK_SIZE bsize, int mib_size) {
   const int sb_size_mask = mib_size - 1;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
   const int mi_row_in_sb = mi_row & sb_size_mask;
   const int mi_col_in_sb = mi_col & sb_size_mask;
   const int mi_w = mi_size_wide[bsize];
@@ -11838,32 +3707,41 @@
 // Case 2: return 1, means skip this mode completely
 // Case 3: return 2, means skip compound only, but still try single motion modes
 static int inter_mode_search_order_independent_skip(
-    const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index,
-    int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
-    InterModeSearchState *search_state, int skip_ref_frame_mask) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const AV1_COMMON *const cm = &cpi->common;
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
-  const CurrentFrame *const current_frame = &cm->current_frame;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
-  const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
-  const int comp_pred = ref_frame[1] > INTRA_FRAME;
-  int skip_motion_mode = 0;
-
-  if (mask_says_skip(mode_skip_mask, ref_frame, this_mode)) {
+    const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask,
+    InterModeSearchState *search_state, int skip_ref_frame_mask,
+    PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) {
+  if (mask_says_skip(mode_skip_mask, ref_frame, mode)) {
     return 1;
   }
 
+  const int ref_type = av1_ref_frame_type(ref_frame);
+  if ((cpi->prune_ref_frame_mask >> ref_type) & 1) return 1;
+
+  // This is only used in motion vector unit test.
+  if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
+    return 1;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
+    return 1;
+  }
+
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
+  if ((!cpi->oxcf.enable_onesided_comp ||
+       cpi->sf.inter_sf.disable_onesided_comp) &&
+      comp_pred && cpi->all_one_sided_refs) {
+    return 1;
+  }
+
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
   // If no valid mode has been found so far in PARTITION_NONE when finding a
   // valid partition is required, do not skip mode.
   if (search_state->best_rd == INT64_MAX && mbmi->partition == PARTITION_NONE &&
       x->must_find_valid_partition)
     return 0;
 
+  int skip_motion_mode = 0;
   if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
-    const int ref_type = av1_ref_frame_type(ref_frame);
     int skip_ref = skip_ref_frame_mask & (1 << ref_type);
     if (ref_type <= ALTREF_FRAME && skip_ref) {
       // Since the compound ref modes depends on the motion estimation result of
@@ -11871,418 +3749,65 @@
       // If current single ref mode is marked skip, we need to check if it will
       // be used in compound ref modes.
       for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
-        if (!(skip_ref_frame_mask & (1 << r))) {
-          const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
-          if (rf[0] == ref_type || rf[1] == ref_type) {
-            // Found a not skipped compound ref mode which contains current
-            // single ref. So this single ref can't be skipped completly
-            // Just skip it's motion mode search, still try it's simple
-            // transition mode.
-            skip_motion_mode = 1;
-            skip_ref = 0;
-            break;
-          }
+        if (skip_ref_frame_mask & (1 << r)) continue;
+        const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+        if (rf[0] == ref_type || rf[1] == ref_type) {
+          // Found a not skipped compound ref mode which contains current
+          // single ref. So this single ref can't be skipped completly
+          // Just skip it's motion mode search, still try it's simple
+          // transition mode.
+          skip_motion_mode = 1;
+          skip_ref = 0;
+          break;
         }
       }
     }
     if (skip_ref) return 1;
   }
 
-  if (cpi->two_pass_partition_search && !x->cb_partition_scan) {
-    const int mi_width = mi_size_wide[bsize];
-    const int mi_height = mi_size_high[bsize];
-    int found = 0;
-    // Search in the stats table to see if the ref frames have been used in the
-    // first pass of partition search.
-    for (int row = mi_row; row < mi_row + mi_width && !found;
-         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
-      for (int col = mi_col; col < mi_col + mi_height && !found;
-           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
-        const int index = av1_first_partition_pass_stats_index(row, col);
-        const FIRST_PARTITION_PASS_STATS *const stats =
-            &x->first_partition_pass_stats[index];
-        if (stats->ref0_counts[ref_frame[0]] &&
-            (ref_frame[1] < 0 || stats->ref1_counts[ref_frame[1]])) {
-          found = 1;
-          break;
-        }
-      }
-    }
-    if (!found) return 1;
-  }
-
-  // This is only used in motion vector unit test.
-  if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
-    return 1;
-
+  const SPEED_FEATURES *const sf = &cpi->sf;
   if (ref_frame[0] == INTRA_FRAME) {
-    if (this_mode != DC_PRED) {
+    if (mode != DC_PRED) {
       // Disable intra modes other than DC_PRED for blocks with low variance
       // Threshold for intra skipping based on source variance
       // TODO(debargha): Specialize the threshold for super block sizes
       const unsigned int skip_intra_var_thresh = 64;
-      if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+      if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
           x->source_variance < skip_intra_var_thresh)
         return 1;
     }
   }
 
-  if (sf->selective_ref_frame) {
-    if (sf->selective_ref_frame >= 3 || x->cb_partition_scan) {
-      if (ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME)
-        if (get_relative_dist(
-                order_hint_info,
-                cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME],
-                current_frame->order_hint) < 0)
-          return 1;
-      if (ref_frame[0] == BWDREF_FRAME || ref_frame[1] == BWDREF_FRAME)
-        if (get_relative_dist(
-                order_hint_info,
-                cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME],
-                current_frame->order_hint) < 0)
-          return 1;
-    }
-
-    if (sf->selective_ref_frame >= 2 ||
-        (sf->selective_ref_frame == 1 && comp_pred)) {
-      if (ref_frame[0] == LAST3_FRAME || ref_frame[1] == LAST3_FRAME)
-        if (get_relative_dist(
-                order_hint_info,
-                cm->cur_frame->ref_order_hints[LAST3_FRAME - LAST_FRAME],
-                cm->cur_frame->ref_order_hints[GOLDEN_FRAME - LAST_FRAME]) <= 0)
-          return 1;
-      if (ref_frame[0] == LAST2_FRAME || ref_frame[1] == LAST2_FRAME)
-        if (get_relative_dist(
-                order_hint_info,
-                cm->cur_frame->ref_order_hints[LAST2_FRAME - LAST_FRAME],
-                cm->cur_frame->ref_order_hints[GOLDEN_FRAME - LAST_FRAME]) <= 0)
-          return 1;
-    }
-  }
-
-  // One-sided compound is used only when all reference frames are one-sided.
-  if ((sf->selective_ref_frame >= 2) && comp_pred && !cpi->all_one_sided_refs) {
-    unsigned int ref_offsets[2];
-    for (int i = 0; i < 2; ++i) {
-      const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame[i]);
-      assert(buf != NULL);
-      ref_offsets[i] = buf->order_hint;
-    }
-    if ((get_relative_dist(order_hint_info, ref_offsets[0],
-                           current_frame->order_hint) <= 0 &&
-         get_relative_dist(order_hint_info, ref_offsets[1],
-                           current_frame->order_hint) <= 0) ||
-        (get_relative_dist(order_hint_info, ref_offsets[0],
-                           current_frame->order_hint) > 0 &&
-         get_relative_dist(order_hint_info, ref_offsets[1],
-                           current_frame->order_hint) > 0))
-      return 1;
-  }
-
-  if (sf->selective_ref_frame >= 4 && comp_pred) {
-    // Check if one of the reference is ALTREF2_FRAME and BWDREF_FRAME is a
-    // valid reference.
-    if ((ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME) &&
-        (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) {
-      // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references.
-      if ((get_relative_dist(
-               order_hint_info,
-               cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME],
-               current_frame->order_hint) > 0) &&
-          (get_relative_dist(
-               order_hint_info,
-               cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME],
-               current_frame->order_hint) > 0)) {
-        // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer
-        // reference to the current frame than ALTREF2_FRAME
-        if (get_relative_dist(
-                order_hint_info,
-                cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME],
-                cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME]) >=
-            0) {
-          const RefCntBuffer *const buf_arf2 =
-              get_ref_frame_buf(cm, ALTREF2_FRAME);
-          assert(buf_arf2 != NULL);
-          const RefCntBuffer *const buf_bwd =
-              get_ref_frame_buf(cm, BWDREF_FRAME);
-          assert(buf_bwd != NULL);
-          (void)buf_arf2;
-          (void)buf_bwd;
-          return 1;
-        }
-      }
-    }
-  }
-
-  if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) {
+  if (prune_ref_by_selective_ref_frame(cpi, x, ref_frame,
+                                       cm->cur_frame->ref_display_order_hint))
     return 1;
-  }
-  if (skip_motion_mode) {
-    return 2;
-  }
 
-  if (!cpi->oxcf.enable_global_motion &&
-      (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) {
-    return 1;
-  }
-
-  if (!cpi->oxcf.enable_onesided_comp && comp_pred && cpi->all_one_sided_refs) {
-    return 1;
-  }
+  if (skip_motion_mode) return 2;
 
   return 0;
 }
 
-static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index,
+static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode,
+                             const MV_REFERENCE_FRAME *ref_frames,
                              const AV1_COMMON *cm) {
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
   mbmi->ref_mv_idx = 0;
-  mbmi->mode = this_mode;
+  mbmi->mode = curr_mode;
   mbmi->uv_mode = UV_DC_PRED;
-  mbmi->ref_frame[0] = av1_mode_order[mode_index].ref_frame[0];
-  mbmi->ref_frame[1] = av1_mode_order[mode_index].ref_frame[1];
+  mbmi->ref_frame[0] = ref_frames[0];
+  mbmi->ref_frame[1] = ref_frames[1];
   pmi->palette_size[0] = 0;
   pmi->palette_size[1] = 0;
   mbmi->filter_intra_mode_info.use_filter_intra = 0;
   mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
   mbmi->motion_mode = SIMPLE_TRANSLATION;
   mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
-  set_default_interp_filters(mbmi, cm->interp_filter);
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
 }
 
-static int64_t handle_intra_mode(InterModeSearchState *search_state,
-                                 const AV1_COMP *cpi, MACROBLOCK *x,
-                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                 int ref_frame_cost,
-                                 const PICK_MODE_CONTEXT *ctx, int disable_skip,
-                                 RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-                                 RD_STATS *rd_stats_uv) {
-  const AV1_COMMON *cm = &cpi->common;
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(mbmi->ref_frame[0] == INTRA_FRAME);
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const int try_palette =
-      cpi->oxcf.enable_palette &&
-      av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
-  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
-  const int intra_cost_penalty = av1_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->seq_params.bit_depth);
-  const int rows = block_size_high[bsize];
-  const int cols = block_size_wide[bsize];
-  const int num_planes = av1_num_planes(cm);
-  const int skip_ctx = av1_get_skip_context(xd);
-
-  int known_rate = intra_mode_cost[mbmi->mode];
-  known_rate += ref_frame_cost;
-  if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
-    known_rate += intra_cost_penalty;
-  known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
-  const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
-  if (known_rd > search_state->best_rd) {
-    search_state->skip_intra_modes = 1;
-    return INT64_MAX;
-  }
-
-  TX_SIZE uv_tx;
-  int is_directional_mode = av1_is_directional_mode(mbmi->mode);
-  if (is_directional_mode && av1_use_angle_delta(bsize) &&
-      cpi->oxcf.enable_angle_delta) {
-    int rate_dummy;
-    int64_t model_rd = INT64_MAX;
-    if (sf->intra_angle_estimation && !search_state->angle_stats_ready) {
-      const int src_stride = x->plane[0].src.stride;
-      const uint8_t *src = x->plane[0].src.buf;
-      angle_estimation(src, src_stride, rows, cols, bsize, is_cur_buf_hbd(xd),
-                       search_state->directional_mode_skip_mask);
-      search_state->angle_stats_ready = 1;
-    }
-    if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX;
-    av1_init_rd_stats(rd_stats_y);
-    rd_stats_y->rate = INT_MAX;
-    rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y,
-                            bsize, intra_mode_cost[mbmi->mode],
-                            search_state->best_rd, &model_rd);
-  } else {
-    av1_init_rd_stats(rd_stats_y);
-    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-    super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd);
-  }
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  memcpy(best_blk_skip, x->blk_skip,
-         sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-  int try_filter_intra = 0;
-  int64_t best_rd_tmp = INT64_MAX;
-  if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
-    if (rd_stats_y->rate != INT_MAX) {
-      const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
-                           intra_mode_cost[mbmi->mode];
-      best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
-      try_filter_intra = !((best_rd_tmp / 2) > search_state->best_rd);
-    } else {
-      try_filter_intra = !(search_state->best_mbmode.skip);
-    }
-  }
-  if (try_filter_intra) {
-    RD_STATS rd_stats_y_fi;
-    int filter_intra_selected_flag = 0;
-    TX_SIZE best_tx_size = mbmi->tx_size;
-    TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
-    memcpy(best_txk_type, mbmi->txk_type,
-           sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
-    FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
-
-    mbmi->filter_intra_mode_info.use_filter_intra = 1;
-    for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
-         fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
-      int64_t this_rd_tmp;
-      mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
-      super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd);
-      if (rd_stats_y_fi.rate == INT_MAX) {
-        continue;
-      }
-      const int this_rate_tmp =
-          rd_stats_y_fi.rate +
-          intra_mode_info_cost_y(cpi, x, mbmi, bsize,
-                                 intra_mode_cost[mbmi->mode]);
-      this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
-
-      if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) {
-        break;
-      }
-      if (this_rd_tmp < best_rd_tmp) {
-        best_tx_size = mbmi->tx_size;
-        memcpy(best_txk_type, mbmi->txk_type,
-               sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
-        memcpy(best_blk_skip, x->blk_skip,
-               sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-        best_fi_mode = fi_mode;
-        *rd_stats_y = rd_stats_y_fi;
-        filter_intra_selected_flag = 1;
-        best_rd_tmp = this_rd_tmp;
-      }
-    }
-
-    mbmi->tx_size = best_tx_size;
-    memcpy(mbmi->txk_type, best_txk_type,
-           sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
-    memcpy(x->blk_skip, best_blk_skip,
-           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-
-    if (filter_intra_selected_flag) {
-      mbmi->filter_intra_mode_info.use_filter_intra = 1;
-      mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
-    } else {
-      mbmi->filter_intra_mode_info.use_filter_intra = 0;
-    }
-  }
-  if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
-  const int mode_cost_y =
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
-  av1_init_rd_stats(rd_stats);
-  av1_init_rd_stats(rd_stats_uv);
-  if (num_planes > 1) {
-    uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
-    if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
-      int rate_y =
-          rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
-      const int64_t rdy =
-          RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
-      if (search_state->best_rd < (INT64_MAX / 2) &&
-          rdy > (search_state->best_rd + (search_state->best_rd >> 2))) {
-        search_state->skip_intra_modes = 1;
-        return INT64_MAX;
-      }
-      choose_intra_uv_mode(
-          cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
-          &search_state->rate_uv_tokenonly[uv_tx],
-          &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx],
-          &search_state->mode_uv[uv_tx]);
-      if (try_palette) search_state->pmi_uv[uv_tx] = *pmi;
-      search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
-
-      const int uv_rate = search_state->rate_uv_tokenonly[uv_tx];
-      const int64_t uv_dist = search_state->dist_uvs[uv_tx];
-      const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
-      if (uv_rd > search_state->best_rd) {
-        search_state->skip_intra_modes = 1;
-        return INT64_MAX;
-      }
-    }
-
-    rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx];
-    rd_stats_uv->dist = search_state->dist_uvs[uv_tx];
-    rd_stats_uv->skip = search_state->skip_uvs[uv_tx];
-    rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip;
-    mbmi->uv_mode = search_state->mode_uv[uv_tx];
-    if (try_palette) {
-      pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1];
-      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
-             search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
-             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
-    }
-    mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
-  }
-  rd_stats->rate = rd_stats_y->rate + mode_cost_y;
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
-    // super_block_yrd above includes the cost of the tx_size in the
-    // tokenonly rate, but for intra blocks, tx_size is always coded
-    // (prediction granularity), so we account for it in the full rate,
-    // not the tokenonly rate.
-    rd_stats_y->rate -= tx_size_cost(cm, x, bsize, mbmi->tx_size);
-  }
-  if (num_planes > 1 && !x->skip_chroma_rd) {
-    const int uv_mode_cost =
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mbmi->uv_mode];
-    rd_stats->rate +=
-        rd_stats_uv->rate +
-        intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
-  }
-  if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
-    rd_stats->rate += intra_cost_penalty;
-  rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist;
-
-  // Estimate the reference frame signaling cost and add it
-  // to the rolling cost variable.
-  rd_stats->rate += ref_frame_cost;
-  if (rd_stats->skip) {
-    // Back out the coefficient coding costs
-    rd_stats->rate -= (rd_stats_y->rate + rd_stats_uv->rate);
-    rd_stats_y->rate = 0;
-    rd_stats_uv->rate = 0;
-    // Cost the skip mb case
-    rd_stats->rate += x->skip_cost[skip_ctx][1];
-  } else {
-    // Add in the cost of the no skip flag.
-    rd_stats->rate += x->skip_cost[skip_ctx][0];
-  }
-  // Calculate the final RD estimate for this mode.
-  const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-  // Keep record of best intra rd
-  if (this_rd < search_state->best_intra_rd) {
-    search_state->best_intra_rd = this_rd;
-    search_state->best_intra_mode = mbmi->mode;
-  }
-
-  if (sf->skip_intra_in_interframe) {
-    if (search_state->best_rd < (INT64_MAX / 2) &&
-        this_rd > (search_state->best_rd + (search_state->best_rd >> 1)))
-      search_state->skip_intra_modes = 1;
-  }
-
-  if (!disable_skip) {
-    for (int i = 0; i < REFERENCE_MODES; ++i)
-      search_state->best_pred_rd[i] =
-          AOMMIN(search_state->best_pred_rd[i], this_rd);
-  }
-  return this_rd;
-}
-
-static void collect_single_states(MACROBLOCK *x,
-                                  InterModeSearchState *search_state,
-                                  const MB_MODE_INFO *const mbmi) {
+static AOM_INLINE void collect_single_states(MACROBLOCK *x,
+                                             InterModeSearchState *search_state,
+                                             const MB_MODE_INFO *const mbmi) {
   int i, j;
   const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
   const PREDICTION_MODE this_mode = mbmi->mode;
@@ -12293,12 +3818,13 @@
   // Simple rd
   int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame];
   for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
-    int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame];
+    const int64_t rd =
+        search_state->simple_rd[this_mode][ref_mv_idx][ref_frame];
     if (rd < simple_rd) simple_rd = rd;
   }
 
   // Insertion sort of single_state
-  SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 };
+  const SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 };
   SingleInterModeState *state_s = search_state->single_state[dir][mode_offset];
   i = search_state->single_state_cnt[dir][mode_offset];
   for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j)
@@ -12309,12 +3835,13 @@
   // Modelled rd
   int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame];
   for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
-    int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame];
+    const int64_t rd =
+        search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame];
     if (rd < modelled_rd) modelled_rd = rd;
   }
 
   // Insertion sort of single_state_modelled
-  SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 };
+  const SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 };
   SingleInterModeState *state_m =
       search_state->single_state_modelled[dir][mode_offset];
   i = search_state->single_state_modelled_cnt[dir][mode_offset];
@@ -12324,42 +3851,41 @@
   search_state->single_state_modelled_cnt[dir][mode_offset]++;
 }
 
-static void analyze_single_states(const AV1_COMP *cpi,
-                                  InterModeSearchState *search_state) {
+static AOM_INLINE void analyze_single_states(
+    const AV1_COMP *cpi, InterModeSearchState *search_state) {
+  const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result;
+  assert(prune_level >= 1);
   int i, j, dir, mode;
-  if (cpi->sf.prune_comp_search_by_single_result >= 1) {
-    for (dir = 0; dir < 2; ++dir) {
-      int64_t best_rd;
-      SingleInterModeState(*state)[FWD_REFS];
-      const int prune_factor =
-          cpi->sf.prune_comp_search_by_single_result >= 2 ? 6 : 5;
 
-      // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
-      // reference frames for all the modes (NEARESTMV and NEARMV may not
-      // have same motion vectors). Always keep the best of each mode
-      // because it might form the best possible combination with other mode.
-      state = search_state->single_state[dir];
-      best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
-                       state[INTER_OFFSET(GLOBALMV)][0].rd);
-      for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
-        for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
-          if (state[mode][i].rd != INT64_MAX &&
-              (state[mode][i].rd >> 3) * prune_factor > best_rd) {
-            state[mode][i].valid = 0;
-          }
+  for (dir = 0; dir < 2; ++dir) {
+    int64_t best_rd;
+    SingleInterModeState(*state)[FWD_REFS];
+    const int prune_factor = prune_level >= 2 ? 6 : 5;
+
+    // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+    // reference frames for all the modes (NEARESTMV and NEARMV may not
+    // have same motion vectors). Always keep the best of each mode
+    // because it might form the best possible combination with other mode.
+    state = search_state->single_state[dir];
+    best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+                     state[INTER_OFFSET(GLOBALMV)][0].rd);
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+        if (state[mode][i].rd != INT64_MAX &&
+            (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+          state[mode][i].valid = 0;
         }
       }
+    }
 
-      state = search_state->single_state_modelled[dir];
-      best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
-                       state[INTER_OFFSET(GLOBALMV)][0].rd);
-      for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
-        for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode];
-             ++i) {
-          if (state[mode][i].rd != INT64_MAX &&
-              (state[mode][i].rd >> 3) * prune_factor > best_rd) {
-            state[mode][i].valid = 0;
-          }
+    state = search_state->single_state_modelled[dir];
+    best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+                     state[INTER_OFFSET(GLOBALMV)][0].rd);
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; ++i) {
+        if (state[mode][i].rd != INT64_MAX &&
+            (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+          state[mode][i].valid = 0;
         }
       }
     }
@@ -12378,38 +3904,37 @@
       const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
       for (i = 0; i < state_cnt_s; ++i) {
         if (state_s[i].rd == INT64_MAX) break;
-        if (state_s[i].valid)
+        if (state_s[i].valid) {
           search_state->single_rd_order[dir][mode][count++] =
               state_s[i].ref_frame;
+        }
       }
-      if (count < max_candidates) {
-        for (i = 0; i < state_cnt_m; ++i) {
-          if (state_m[i].rd == INT64_MAX) break;
-          if (state_m[i].valid) {
-            int ref_frame = state_m[i].ref_frame;
-            int match = 0;
-            // Check if existing already
-            for (j = 0; j < count; ++j) {
-              if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
-                match = 1;
-                break;
-              }
-            }
-            if (!match) {
-              // Check if this ref_frame is removed in simple rd
-              int valid = 1;
-              for (j = 0; j < state_cnt_s; j++) {
-                if (ref_frame == state_s[j].ref_frame && !state_s[j].valid) {
-                  valid = 0;
-                  break;
-                }
-              }
-              if (valid)
-                search_state->single_rd_order[dir][mode][count++] = ref_frame;
-            }
-            if (count >= max_candidates) break;
+      if (count >= max_candidates) continue;
+
+      for (i = 0; i < state_cnt_m && count < max_candidates; ++i) {
+        if (state_m[i].rd == INT64_MAX) break;
+        if (!state_m[i].valid) continue;
+        const int ref_frame = state_m[i].ref_frame;
+        int match = 0;
+        // Check if existing already
+        for (j = 0; j < count; ++j) {
+          if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+            match = 1;
+            break;
           }
         }
+        if (match) continue;
+        // Check if this ref_frame is removed in simple rd
+        int valid = 1;
+        for (j = 0; j < state_cnt_s; ++j) {
+          if (ref_frame == state_s[j].ref_frame) {
+            valid = state_s[j].valid;
+            break;
+          }
+        }
+        if (valid) {
+          search_state->single_rd_order[dir][mode][count++] = ref_frame;
+        }
       }
     }
   }
@@ -12423,24 +3948,29 @@
       search_state->single_state[dir][mode_offset];
   const SingleInterModeState *state_modelled =
       search_state->single_state_modelled[dir][mode_offset];
-  int max_candidates = 0;
-  int candidates;
 
+  int max_candidates = 0;
   for (int i = 0; i < FWD_REFS; ++i) {
     if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break;
     max_candidates++;
   }
 
-  candidates = max_candidates;
-  if (cpi->sf.prune_comp_search_by_single_result >= 2) {
+  int candidates = max_candidates;
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 2) {
     candidates = AOMMIN(2, max_candidates);
   }
-  if (cpi->sf.prune_comp_search_by_single_result >= 3) {
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 3) {
     if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
         state[0].ref_frame == state_modelled[0].ref_frame)
       candidates = 1;
     if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
   }
+
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 4) {
+    // Limit the number of candidates to 1 in each direction for compound
+    // prediction
+    candidates = AOMMIN(1, candidates);
+  }
   return candidates;
 }
 
@@ -12473,338 +4003,655 @@
 
   const int ref_set = get_drl_refmv_count(x, refs, this_mode);
   for (i = 0; i < 2; ++i) {
-    if (mode[i] == NEARESTMV || mode[i] == NEARMV) {
-      const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
-      int idential = 1;
-      for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
-        int_mv single_mv;
-        int_mv comp_mv;
-        get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, single_refs,
-                    x->mbmi_ext);
-        get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, refs, x->mbmi_ext);
-
-        idential &= (single_mv.as_int == comp_mv.as_int);
-        if (!idential) {
-          ref_mv_match[i] = 0;
-          break;
-        }
+    if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) {
+      continue;
+    }
+    const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+    for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+      int_mv single_mv;
+      int_mv comp_mv;
+      get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
+                  x->mbmi_ext);
+      get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, x->mbmi_ext);
+      if (single_mv.as_int != comp_mv.as_int) {
+        ref_mv_match[i] = 0;
+        break;
       }
     }
   }
 
   for (i = 0; i < 2; ++i) {
-    if (ref_searched[i] && ref_mv_match[i]) {
-      const int candidates =
-          compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
-      const MV_REFERENCE_FRAME *ref_order =
-          search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
-      int match = 0;
-      for (j = 0; j < candidates; ++j) {
-        if (refs[i] == ref_order[j]) {
-          match = 1;
-          break;
-        }
+    if (!ref_searched[i] || !ref_mv_match[i]) continue;
+    const int candidates =
+        compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+    const MV_REFERENCE_FRAME *ref_order =
+        search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+    int match = 0;
+    for (j = 0; j < candidates; ++j) {
+      if (refs[i] == ref_order[j]) {
+        match = 1;
+        break;
       }
-      if (!match) return 1;
     }
+    if (!match) return 1;
   }
 
   return 0;
 }
 
-static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode,
-                                       InterModeSearchState *search_state) {
-  const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
-  const MV_REFERENCE_FRAME second_ref_frame = mode->ref_frame[1];
-  if (search_state->num_available_refs > 2) {
-    if ((ref_frame == search_state->dist_order_refs[0] &&
-         second_ref_frame == search_state->dist_order_refs[1]) ||
-        (ref_frame == search_state->dist_order_refs[1] &&
-         second_ref_frame == search_state->dist_order_refs[0]))
-      return 1;  // drop this pair of refs
-  }
-  return 0;
-}
-
-static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state,
-                                       const MODE_DEFINITION *mode,
-                                       int64_t distortion2) {
-  const PREDICTION_MODE this_mode = mode->mode;
-  MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
-  const int idx = ref_frame - LAST_FRAME;
-  if (idx && distortion2 > search_state->dist_refs[idx]) {
-    search_state->dist_refs[idx] = distortion2;
-    search_state->dist_order_refs[idx] = ref_frame;
-  }
-
-  // Reach the last single ref prediction mode
-  if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
-    // bubble sort dist_refs and the order index
-    for (int i = 0; i < REF_FRAMES; ++i) {
-      for (int k = i + 1; k < REF_FRAMES; ++k) {
-        if (search_state->dist_refs[i] < search_state->dist_refs[k]) {
-          int64_t tmp_dist = search_state->dist_refs[i];
-          search_state->dist_refs[i] = search_state->dist_refs[k];
-          search_state->dist_refs[k] = tmp_dist;
-
-          int tmp_idx = search_state->dist_order_refs[i];
-          search_state->dist_order_refs[i] = search_state->dist_order_refs[k];
-          search_state->dist_order_refs[k] = tmp_idx;
-        }
-      }
+// Check if the ref frames of the current block match those of a given block.
+static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi,
+                                   const MV_REFERENCE_FRAME *ref_frames,
+                                   int *const is_ref_match) {
+  if (is_inter_block(mbmi)) {
+    is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[0];
+    is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[0];
+    if (has_second_ref(mbmi)) {
+      is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[1];
+      is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[1];
     }
-    for (int i = 0; i < REF_FRAMES; ++i) {
-      if (search_state->dist_refs[i] == -1) break;
-      search_state->num_available_refs = i;
-    }
-    search_state->num_available_refs++;
   }
 }
 
-// sf->prune_single_motion_modes_by_simple_trans
-static int analyze_simple_trans_states(const AV1_COMP *cpi, MACROBLOCK *x) {
-  (void)cpi;
-  int64_t rdcosts[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX,
-                                  INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX };
-  int skip_ref = 0;
-  int64_t min_rd = INT64_MAX;
-  for (int i = 0; i < SINGLE_REF_MODES; ++i) {
-    const MODE_DEFINITION *mode_order = &av1_mode_order[i];
-    const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
-    for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
-      const int64_t rd = x->simple_rd_state[i][k].rd_stats.rdcost;
-      rdcosts[ref_frame] = AOMMIN(rdcosts[ref_frame], rd);
-      min_rd = AOMMIN(min_rd, rd);
-    }
-  }
-  int valid_cnt = 0;
-  for (int i = 1; i < REF_FRAMES; ++i) {
-    if (rdcosts[i] == INT64_MAX) {
-      skip_ref |= (1 << i);
-    } else {
-      valid_cnt++;
-    }
-  }
-  if (valid_cnt < 2) {
+// Prune compound mode using ref frames of neighbor blocks.
+static INLINE int compound_skip_using_neighbor_refs(
+    MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
+    const MV_REFERENCE_FRAME *ref_frames, int prune_compound_using_neighbors) {
+  // Exclude non-extended compound modes from pruning
+  if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+      this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
     return 0;
+
+  int is_ref_match[2] = { 0 };  // 0 - match for forward refs
+                                // 1 - match for backward refs
+  // Check if ref frames of this block matches with left neighbor.
+  if (xd->left_available)
+    match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match);
+
+  // Check if ref frames of this block matches with above neighbor.
+  if (xd->up_available)
+    match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match);
+
+  // Combine ref frame match with neighbors in forward and backward refs.
+  const int track_ref_match = is_ref_match[0] + is_ref_match[1];
+
+  // Pruning based on ref frame match with neighbors.
+  if (track_ref_match >= prune_compound_using_neighbors) return 0;
+  return 1;
+}
+
+static int compare_int64(const void *a, const void *b) {
+  int64_t a64 = *((int64_t *)a);
+  int64_t b64 = *((int64_t *)b);
+  if (a64 < b64) {
+    return -1;
+  } else if (a64 == b64) {
+    return 0;
+  } else {
+    return 1;
   }
-  min_rd += (min_rd >> 1);
-  if (valid_cnt > 2) {
-    for (int i = 1; i < REF_FRAMES; ++i) {
-      if (rdcosts[i] > min_rd) {
-        skip_ref |= (1 << i);
+}
+
+static INLINE void update_search_state(
+    InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst,
+    PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats,
+    const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv,
+    THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int mode_is_intra =
+      (av1_mode_defs[new_best_mode].mode < INTRA_MODE_END);
+  const int skip = mbmi->skip && !mode_is_intra;
+
+  search_state->best_rd = new_best_rd_stats->rdcost;
+  search_state->best_mode_index = new_best_mode;
+  *best_rd_stats_dst = *new_best_rd_stats;
+  search_state->best_mbmode = *mbmi;
+  search_state->best_skip2 = skip;
+  search_state->best_mode_skippable = new_best_rd_stats->skip;
+  // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y and
+  // rate_uv because av1_txfm_search process is replaced by rd estimation.
+  // Therefore, we should avoid updating best_rate_y and best_rate_uv here.
+  // These two values will be updated when av1_txfm_search is called.
+  if (txfm_search_done) {
+    search_state->best_rate_y =
+        new_best_rd_stats_y->rate +
+        x->skip_cost[skip_ctx][new_best_rd_stats->skip || skip];
+    search_state->best_rate_uv = new_best_rd_stats_uv->rate;
+  }
+  memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+  av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+}
+
+// Find the best RD for a reference frame (among single reference modes)
+// and store +10% of it in the 0-th element in ref_frame_rd.
+static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) {
+  assert(ref_frame_rd[0] == INT64_MAX);
+  int64_t ref_copy[REF_FRAMES - 1];
+  memcpy(ref_copy, ref_frame_rd + 1,
+         sizeof(ref_frame_rd[0]) * (REF_FRAMES - 1));
+  qsort(ref_copy, REF_FRAMES - 1, sizeof(int64_t), compare_int64);
+
+  int64_t cutoff = ref_copy[0];
+  // The cut-off is within 10% of the best.
+  if (cutoff != INT64_MAX) {
+    assert(cutoff < INT64_MAX / 200);
+    cutoff = (110 * cutoff) / 100;
+  }
+  ref_frame_rd[0] = cutoff;
+}
+
+// Check if either frame is within the cutoff.
+static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES],
+                                        MV_REFERENCE_FRAME frame1,
+                                        MV_REFERENCE_FRAME frame2) {
+  assert(frame2 > 0);
+  return ref_frame_rd[frame1] <= ref_frame_rd[0] ||
+         ref_frame_rd[frame2] <= ref_frame_rd[0];
+}
+
+static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost,
+    HandleInterModeArgs *const args, TileDataEnc *const tile_data,
+    PICK_MODE_CONTEXT *const ctx,
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+    const motion_mode_best_st_candidate *const best_motion_mode_cands,
+    int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd,
+    InterModeSearchState *const search_state) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  InterModesInfo *const inter_modes_info = x->inter_modes_info;
+  const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand;
+
+  for (int cand = 0; cand < num_best_cand; cand++) {
+    RD_STATS rd_stats;
+    RD_STATS rd_stats_y;
+    RD_STATS rd_stats_uv;
+    av1_init_rd_stats(&rd_stats);
+    av1_init_rd_stats(&rd_stats_y);
+    av1_init_rd_stats(&rd_stats_uv);
+    int disable_skip = 0, rate_mv;
+
+    rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv;
+    args->skip_motion_mode =
+        best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode;
+    *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi;
+    rd_stats.rate =
+        best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff;
+
+    // Continue if the best candidate is compound.
+    if (!is_inter_singleref_mode(mbmi->mode)) continue;
+
+    x->force_skip = 0;
+    const int mode_index = get_prediction_mode_idx(
+        mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    struct macroblockd_plane *p = xd->plane;
+    const BUFFER_SET orig_dst = {
+      { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+      { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+    };
+
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    args->simple_rd_state = x->simple_rd_state[mode_index];
+    // Initialize motion mode to simple translation
+    // Calculation of switchable rate depends on it.
+    mbmi->motion_mode = 0;
+    const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+      if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
+
+    int64_t skip_rd[2] = { search_state->best_skip_rd[0],
+                           search_state->best_skip_rd[1] };
+    int64_t ret_value = motion_mode_rd(
+        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+        &disable_skip, args, search_state->best_rd, skip_rd, &rate_mv,
+        &orig_dst, best_est_rd, do_tx_search, inter_modes_info, 1);
+
+    if (ret_value != INT64_MAX) {
+      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+      const THR_MODES mode_enum = get_prediction_mode_idx(
+          mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+      // Collect mode stats for multiwinner mode processing
+      store_winner_mode_stats(
+          &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
+          mode_enum, NULL, bsize, rd_stats.rdcost,
+          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, do_tx_search);
+      if (rd_stats.rdcost < search_state->best_rd) {
+        update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+                            &rd_stats_uv, mode_enum, x, do_tx_search);
+        if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0];
       }
     }
   }
-  return skip_ref;
 }
 
-static void alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
-                                           CompoundTypeRdBuffers *const bufs) {
-  CHECK_MEM_ERROR(
-      cm, bufs->pred0,
-      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
-  CHECK_MEM_ERROR(
-      cm, bufs->pred1,
-      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
-  CHECK_MEM_ERROR(
-      cm, bufs->residual1,
-      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
-  CHECK_MEM_ERROR(
-      cm, bufs->diff10,
-      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
-  CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
-                  (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
-                                        sizeof(*bufs->tmp_best_mask_buf)));
-}
+// Arguments for speed feature pruning of inter mode search
+typedef struct {
+  int *skip_motion_mode;
+  mode_skip_mask_t *mode_skip_mask;
+  InterModeSearchState *search_state;
+  int skip_ref_frame_mask;
+  int reach_first_comp_mode;
+  int mode_thresh_mul_fact;
+  int *intra_mode_idx_ls;
+  int *intra_mode_num;
+  int prune_cpd_using_sr_stats_ready;
+} InterModeSFArgs;
 
-static void release_compound_type_rd_buffers(
-    CompoundTypeRdBuffers *const bufs) {
-  aom_free(bufs->pred0);
-  aom_free(bufs->pred1);
-  aom_free(bufs->residual1);
-  aom_free(bufs->diff10);
-  aom_free(bufs->tmp_best_mask_buf);
-  av1_zero(*bufs);  // Set all pointers to NULL for safety.
-}
+static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+                           int64_t *ref_frame_rd, int midx,
+                           InterModeSFArgs *args) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  // Get the actual prediction mode we are trying in this iteration
+  const THR_MODES mode_enum = av1_default_mode_order[midx];
+  const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+  const PREDICTION_MODE this_mode = mode_def->mode;
+  const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+  const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+  const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+  const int comp_pred = second_ref_frame > INTRA_FRAME;
+  const int last_single_ref_mode_idx =
+      find_last_single_ref_mode_idx(av1_default_mode_order);
 
-// Enables do_tx_search on a per-mode basis.
-int do_tx_search_mode(int do_tx_search_global, int midx, int adaptive) {
-  if (!adaptive || do_tx_search_global) {
-    return do_tx_search_global;
+  // After we are done with single reference modes, find the 2nd best RD
+  // for a reference frame. Only search compound modes that have a reference
+  // frame at least as good as the 2nd best.
+  if (sf->inter_sf.prune_compound_using_single_ref &&
+      midx == last_single_ref_mode_idx + 1) {
+    find_top_ref(ref_frame_rd);
+    args->prune_cpd_using_sr_stats_ready = 1;
   }
-  // A value of 2 indicates it is being turned on conditionally
-  // for the mode. Turn it on for the first 7 modes.
-  return midx < 7 ? 2 : 0;
+
+  // Check if this mode should be skipped because it is incompatible with the
+  // current frame
+  if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames))
+    return 1;
+  const int ret = inter_mode_search_order_independent_skip(
+      cpi, x, args->mode_skip_mask, args->search_state,
+      args->skip_ref_frame_mask, this_mode, mode_def->ref_frame);
+  if (ret == 1) return 1;
+  *(args->skip_motion_mode) = (ret == 2);
+
+  // We've reached the first compound prediction mode, get stats from the
+  // single reference predictors to help with pruning
+  if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+      args->reach_first_comp_mode == 0) {
+    analyze_single_states(cpi, args->search_state);
+    args->reach_first_comp_mode = 1;
+  }
+
+  // Prune aggressively when best mode is skippable.
+  int mul_fact = args->search_state->best_mode_skippable
+                     ? args->mode_thresh_mul_fact
+                     : (1 << MODE_THRESH_QBITS);
+  int64_t mode_threshold =
+      (args->search_state->mode_threshold[mode_enum] * mul_fact) >>
+      MODE_THRESH_QBITS;
+
+  if (args->search_state->best_rd < mode_threshold) return 1;
+
+  // Skip this compound mode based on the RD results from the single prediction
+  // modes
+  if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+    if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
+                                       ref_frame, second_ref_frame, x))
+      return 1;
+  }
+
+  // Speed features to prune out INTRA frames
+  if (ref_frame == INTRA_FRAME) {
+    if ((!cpi->oxcf.enable_smooth_intra || sf->intra_sf.disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+         mbmi->mode == SMOOTH_V_PRED))
+      return 1;
+    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) return 1;
+    if (sf->inter_sf.adaptive_mode_search > 1)
+      if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+          args->search_state->best_pred_sse)
+        return 1;
+
+    // Intra modes will be handled in another loop later.
+    assert(*args->intra_mode_num < INTRA_MODES);
+    args->intra_mode_idx_ls[(*args->intra_mode_num)++] = mode_enum;
+    return 1;
+  }
+
+  if (sf->inter_sf.prune_compound_using_single_ref &&
+      args->prune_cpd_using_sr_stats_ready && comp_pred &&
+      !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame)) {
+    return 1;
+  }
+
+  if (sf->inter_sf.prune_compound_using_neighbors && comp_pred) {
+    if (compound_skip_using_neighbor_refs(
+            xd, this_mode, ref_frames,
+            sf->inter_sf.prune_compound_using_neighbors))
+      return 1;
+  }
+
+  return 0;
+}
+
+static void record_best_compound(REFERENCE_MODE reference_mode,
+                                 RD_STATS *rd_stats, int comp_pred, int rdmult,
+                                 InterModeSearchState *search_state,
+                                 int compmode_cost) {
+  int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+  if (reference_mode == REFERENCE_MODE_SELECT) {
+    single_rate = rd_stats->rate - compmode_cost;
+    hybrid_rate = rd_stats->rate;
+  } else {
+    single_rate = rd_stats->rate;
+    hybrid_rate = rd_stats->rate + compmode_cost;
+  }
+
+  single_rd = RDCOST(rdmult, single_rate, rd_stats->dist);
+  hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist);
+
+  if (!comp_pred) {
+    if (single_rd <
+        search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE])
+      search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE] =
+          single_rd;
+  } else {
+    if (single_rd <
+        search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE])
+      search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE] =
+          single_rd;
+  }
+  if (hybrid_rd <
+      search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT])
+    search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT] =
+        hybrid_rd;
+}
+
+// Indicates number of winner simple translation modes to be used
+static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 };
+
+// Adds a motion mode to the candidate list for motion_mode_for_winner_cand
+// speed feature. This list consists of modes that have only searched
+// SIMPLE_TRANSLATION. The final list will be used to search other motion
+// modes after the initial RD search.
+static void handle_winner_cand(
+    MB_MODE_INFO *const mbmi,
+    motion_mode_best_st_candidate *best_motion_mode_cands,
+    int max_winner_motion_mode_cand, int64_t this_rd,
+    motion_mode_candidate *motion_mode_cand, int skip_motion_mode) {
+  // Number of current motion mode candidates in list
+  const int num_motion_mode_cand = best_motion_mode_cands->num_motion_mode_cand;
+  int valid_motion_mode_cand_loc = num_motion_mode_cand;
+
+  // find the best location to insert new motion mode candidate
+  for (int j = 0; j < num_motion_mode_cand; j++) {
+    if (this_rd < best_motion_mode_cands->motion_mode_cand[j].rd_cost) {
+      valid_motion_mode_cand_loc = j;
+      break;
+    }
+  }
+
+  // Insert motion mode if location is found
+  if (valid_motion_mode_cand_loc < max_winner_motion_mode_cand) {
+    if (num_motion_mode_cand > 0 &&
+        valid_motion_mode_cand_loc < max_winner_motion_mode_cand - 1)
+      memmove(
+          &best_motion_mode_cands
+               ->motion_mode_cand[valid_motion_mode_cand_loc + 1],
+          &best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc],
+          (AOMMIN(num_motion_mode_cand, max_winner_motion_mode_cand - 1) -
+           valid_motion_mode_cand_loc) *
+              sizeof(best_motion_mode_cands->motion_mode_cand[0]));
+    motion_mode_cand->mbmi = *mbmi;
+    motion_mode_cand->rd_cost = this_rd;
+    motion_mode_cand->skip_motion_mode = skip_motion_mode;
+    best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc] =
+        *motion_mode_cand;
+    best_motion_mode_cands->num_motion_mode_cand =
+        AOMMIN(max_winner_motion_mode_cand,
+               best_motion_mode_cands->num_motion_mode_cand + 1);
+  }
 }
 
 void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
-                               MACROBLOCK *x, int mi_row, int mi_col,
-                               RD_STATS *rd_cost, BLOCK_SIZE bsize,
-                               PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
+                               MACROBLOCK *x, RD_STATS *rd_cost,
+                               const BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far) {
   AV1_COMMON *const cm = &cpi->common;
+  const FeatureFlags *const features = &cm->features;
   const int num_planes = av1_num_planes(cm);
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int try_palette =
-      cpi->oxcf.enable_palette &&
-      av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const struct segmentation *const seg = &cm->seg;
-  PREDICTION_MODE this_mode;
-  unsigned char segment_id = mbmi->segment_id;
   int i;
-  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
-  unsigned int ref_costs_single[REF_FRAMES];
-  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
-  int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
-  mode_skip_mask_t mode_skip_mask;
-  uint8_t motion_mode_skip_mask = 0;  // second pass of single ref modes
+  const int *comp_inter_cost =
+      x->comp_inter_cost[av1_get_reference_mode_context(xd)];
 
   InterModeSearchState search_state;
-  init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
-                               best_rd_so_far);
+  init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far);
   INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
     INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
     INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
   };
-  HandleInterModeArgs args = {
-    { NULL },  { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
-    { NULL },  { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
-    NULL,      NULL,
-    NULL,      search_state.modelled_rd,
-    { { 0 } }, INT_MAX,
-    INT_MAX,   search_state.simple_rd,
-    0,         interintra_modes,
-    1,         NULL
-  };
+  HandleInterModeArgs args = { { NULL },
+                               { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+                               { NULL },
+                               { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                 MAX_SB_SIZE >> 1 },
+                               NULL,
+                               NULL,
+                               NULL,
+                               search_state.modelled_rd,
+                               INT_MAX,
+                               INT_MAX,
+                               search_state.simple_rd,
+                               0,
+                               interintra_modes,
+                               1,
+                               NULL,
+                               { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
+                               0 };
+  // Indicates the appropriate number of simple translation winner modes for
+  // exhaustive motion mode evaluation
+  const int max_winner_motion_mode_cand =
+      num_winner_motion_modes[cpi->sf.winner_mode_sf
+                                  .motion_mode_for_winner_cand];
+  assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES);
+  motion_mode_candidate motion_mode_cand;
+  motion_mode_best_st_candidate best_motion_mode_cands;
+  // Initializing the number of motion mode candidates to zero.
+  best_motion_mode_cands.num_motion_mode_cand = 0;
+  for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i)
+    best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX;
+
   for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
 
   av1_invalid_rd_stats(rd_cost);
 
   // Ref frames that are selected by square partition blocks.
   int picked_ref_frames_mask = 0;
-  if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+  if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions &&
       mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
     // prune_ref_frame_for_rect_partitions = 1 implies prune only extended
     // partition blocks. prune_ref_frame_for_rect_partitions >=2
     // implies prune for vert, horiz and extended partition blocks.
     if ((mbmi->partition != PARTITION_VERT &&
          mbmi->partition != PARTITION_HORZ) ||
-        cpi->sf.prune_ref_frame_for_rect_partitions >= 2) {
-      picked_ref_frames_mask = fetch_picked_ref_frames_mask(
-          x, bsize, cm->seq_params.mib_size, mi_row, mi_col);
+        cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
+      picked_ref_frames_mask =
+          fetch_picked_ref_frames_mask(x, bsize, cm->seq_params.mib_size);
     }
   }
 
   // Skip ref frames that never selected by square blocks.
   const int skip_ref_frame_mask =
       picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
-
+  mode_skip_mask_t mode_skip_mask;
+  unsigned int ref_costs_single[REF_FRAMES];
+  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
   // init params, set frame modes, speed features
-  set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
-                                &mode_skip_mask, skip_ref_frame_mask,
-                                ref_costs_single, ref_costs_comp, yv12_mb);
+  set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask,
+                                skip_ref_frame_mask, ref_costs_single,
+                                ref_costs_comp, yv12_mb);
 
   int64_t best_est_rd = INT64_MAX;
-  // TODO(angiebird): Turn this on when this speed feature is well tested
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
-  // If do_tx_search_global is 0, only estimated RD should be computed.
-  // If do_tx_search_global is 1, all modes have TX search performed.
-  // If do_tx_search_global is 2, some modes will have TX search performed.
-  const int do_tx_search_global =
-      !((cpi->sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
-        (cpi->sf.inter_mode_rd_model_estimation == 2 &&
-         x->source_variance < 512));
+  // If do_tx_search is 0, only estimated RD should be computed.
+  // If do_tx_search is 1, all modes have TX search performed.
+  const int do_tx_search =
+      !((cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+        (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 &&
+         num_pels_log2_lookup[bsize] > 8) ||
+        cpi->sf.rt_sf.force_tx_search_off);
   InterModesInfo *inter_modes_info = x->inter_modes_info;
   inter_modes_info->num = 0;
 
   int intra_mode_num = 0;
-  int intra_mode_idx_ls[MAX_MODES];
-  int reach_first_comp_mode = 0;
+  int intra_mode_idx_ls[INTRA_MODES];
 
   // Temporary buffers used by handle_inter_mode().
   uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
 
-  CompoundTypeRdBuffers rd_buffers;
-  alloc_compound_type_rd_buffers(cm, &rd_buffers);
+  // The best RD found for the reference frame, among single reference modes.
+  // Note that the 0-th element will contain a cut-off that is later used
+  // to determine if we should skip a compound mode.
+  int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+                                       INT64_MAX, INT64_MAX, INT64_MAX,
+                                       INT64_MAX, INT64_MAX };
+  const int skip_ctx = av1_get_skip_context(xd);
 
-  for (int midx = 0; midx < MAX_MODES; ++midx) {
-    const int do_tx_search = do_tx_search_mode(
-        do_tx_search_global, midx, sf->inter_mode_rd_model_estimation_adaptive);
-    const MODE_DEFINITION *mode_order = &av1_mode_order[midx];
-    this_mode = mode_order->mode;
-    const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
-    const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
-    const int comp_pred = second_ref_frame > INTRA_FRAME;
+  // Prepared stats used later to check if we could skip intra mode eval.
+  int64_t inter_cost = -1;
+  int64_t intra_cost = -1;
+  // Need to tweak the threshold for hdres speed 0 & 1.
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
 
-    // When single ref motion search ends:
-    // 1st pass: To evaluate single ref RD results and rewind to the beginning;
-    // 2nd pass: To continue with compound ref search.
-    if (sf->prune_single_motion_modes_by_simple_trans) {
-      if (comp_pred && args.single_ref_first_pass) {
-        args.single_ref_first_pass = 0;
-        // Reach the first comp ref mode
-        // Reset midx to start the 2nd pass for single ref motion search
-        midx = -1;
-        motion_mode_skip_mask = analyze_simple_trans_states(cpi, x);
-        continue;
-      }
-      if (!comp_pred) {  // single ref mode
-        if (args.single_ref_first_pass) {
-          // clear stats
-          for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
-            x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX;
-            x->simple_rd_state[midx][k].early_skipped = 0;
-          }
-        } else {
-          if (motion_mode_skip_mask & (1 << ref_frame)) {
-            continue;
+  // Obtain the relevant tpl stats for pruning inter modes
+  PruneInfoFromTpl inter_cost_info_from_tpl;
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->sf.inter_sf.prune_inter_modes_based_on_tpl) {
+    // x->search_ref_frame[id] = 1 => no pruning in
+    // prune_ref_by_selective_ref_frame()
+    // x->search_ref_frame[id] = 0  => ref frame can be pruned in
+    // prune_ref_by_selective_ref_frame()
+    // Populating valid_refs[idx] = 1 ensures that
+    // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a
+    // pruned ref frame.
+    int valid_refs[INTER_REFS_PER_FRAME];
+    for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+      const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME };
+      valid_refs[frame - 1] =
+          x->search_ref_frame[frame] ||
+          !prune_ref_by_selective_ref_frame(
+              cpi, x, refs, cm->cur_frame->ref_display_order_hint);
+    }
+    av1_zero(inter_cost_info_from_tpl);
+    get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs,
+                              &inter_cost_info_from_tpl);
+  }
+#endif
+  const int do_pruning =
+      (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1;
+  if (do_pruning && sf->intra_sf.skip_intra_in_interframe) {
+    // Only consider full SB.
+    int len = tpl_blocks_in_sb(cm->seq_params.sb_size);
+    if (len == x->valid_cost_b) {
+      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+      const int tplw = mi_size_wide[tpl_bsize];
+      const int tplh = mi_size_high[tpl_bsize];
+      const int nw = mi_size_wide[bsize] / tplw;
+      const int nh = mi_size_high[bsize] / tplh;
+      if (nw >= 1 && nh >= 1) {
+        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
+        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
+        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
+
+        for (int k = 0; k < nh; k++) {
+          for (int l = 0; l < nw; l++) {
+            inter_cost += x->inter_cost_b[start + k * x->cost_stride + l];
+            intra_cost += x->intra_cost_b[start + k * x->cost_stride + l];
           }
         }
+        inter_cost /= nw * nh;
+        intra_cost /= nw * nh;
       }
     }
+  }
 
-    // Reach the first compound prediction mode
-    if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
-        reach_first_comp_mode == 0) {
-      analyze_single_states(cpi, &search_state);
-      reach_first_comp_mode = 1;
-    }
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0;
-    int rate2 = 0, rate_y = 0, rate_uv = 0;
-    int64_t distortion2 = 0;
-    int skippable = 0;
-    int this_skip2 = 0;
+  // Initialize best mode stats for winner mode processing
+  av1_zero(x->winner_mode_stats);
+  x->winner_mode_count = 0;
+  store_winner_mode_stats(
+      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize,
+      best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+      0);
 
-    init_mbmi(mbmi, midx, cm);
+  int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS);
+  if (sf->inter_sf.prune_inter_modes_if_skippable) {
+    // Higher multiplication factor values for lower quantizers.
+    mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex];
+  }
 
-    x->skip = 0;
+  // Initialize arguments for mode loop speed features
+  InterModeSFArgs sf_args = { &args.skip_motion_mode,
+                              &mode_skip_mask,
+                              &search_state,
+                              skip_ref_frame_mask,
+                              0,
+                              mode_thresh_mul_fact,
+                              intra_mode_idx_ls,
+                              &intra_mode_num,
+                              0 };
+
+  // Here midx is just an iterator index that should not be used by itself
+  // except to keep track of the number of modes searched. It should be used
+  // with av1_default_mode_order to get the enum that defines the mode, which
+  // can be used with av1_mode_defs to get the prediction mode and the ref
+  // frames.
+  for (THR_MODES midx = THR_MODE_START; midx < THR_MODE_END; ++midx) {
+    // Get the actual prediction mode we are trying in this iteration
+    const THR_MODES mode_enum = av1_default_mode_order[midx];
+    const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+    const PREDICTION_MODE this_mode = mode_def->mode;
+    const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+
+    const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+    const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+    const int is_single_pred =
+        ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME;
+    const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+    init_mbmi(mbmi, this_mode, ref_frames, cm);
+
+    x->force_skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
-    if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue;
+    // Apply speed features to decide if this inter mode can be skipped
+    if (skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args)) continue;
 
-    const int ret = inter_mode_search_order_independent_skip(
-        cpi, x, bsize, midx, mi_row, mi_col, &mode_skip_mask, &search_state,
-        skip_ref_frame_mask);
-    if (ret == 1) continue;
-    args.skip_motion_mode = (ret == 2);
-
-    if (sf->drop_ref && comp_pred) {
-      if (sf_check_is_drop_ref(mode_order, &search_state)) {
-        continue;
-      }
+    // Select prediction reference frames.
+    for (i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
-    if (search_state.best_rd < search_state.mode_threshold[midx]) continue;
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+    mbmi->ref_mv_idx = 0;
 
-    if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
-      if (compound_skip_by_single_states(cpi, &search_state, this_mode,
-                                         ref_frame, second_ref_frame, x))
-        continue;
-    }
+    const int64_t ref_best_rd = search_state.best_rd;
+    int disable_skip = 0;
+    RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+    av1_init_rd_stats(&rd_stats);
 
     const int ref_frame_cost = comp_pred
                                    ? ref_costs_comp[ref_frame][second_ref_frame]
@@ -12815,186 +4662,93 @@
         cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
             ? compmode_cost
             : 0;
-
-    if (comp_pred) {
-      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
-          search_state.best_mode_index >= 0 &&
-          search_state.best_mbmode.ref_frame[0] == INTRA_FRAME)
-        continue;
+    // Point to variables that are maintained between loop iterations
+    args.single_newmv = search_state.single_newmv;
+    args.single_newmv_rate = search_state.single_newmv_rate;
+    args.single_newmv_valid = search_state.single_newmv_valid;
+    args.single_comp_cost = real_compmode_cost;
+    args.ref_frame_cost = ref_frame_cost;
+    if (is_single_pred) {
+      args.simple_rd_state = x->simple_rd_state[mode_enum];
     }
 
-    if (ref_frame == INTRA_FRAME) {
-      if ((!cpi->oxcf.enable_smooth_intra || sf->disable_smooth_intra) &&
-          (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
-           mbmi->mode == SMOOTH_V_PRED))
-        continue;
-      if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
-      if (sf->adaptive_mode_search > 1)
-        if ((x->source_variance << num_pels_log2_lookup[bsize]) >
-            search_state.best_pred_sse)
-          continue;
+    int64_t skip_rd[2] = { search_state.best_skip_rd[0],
+                           search_state.best_skip_rd[1] };
+    int64_t this_rd = handle_inter_mode(
+        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+        &disable_skip, &args, ref_best_rd, tmp_buf, &x->comp_rd_buffer,
+        &best_est_rd, do_tx_search, inter_modes_info, &motion_mode_cand,
+        skip_rd, &inter_cost_info_from_tpl);
 
-      if (this_mode != DC_PRED) {
-        // Only search the oblique modes if the best so far is
-        // one of the neighboring directional modes
-        if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-            (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
-          if (search_state.best_mode_index >= 0 &&
-              search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
-            continue;
-        }
-        if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-          if (conditional_skipintra(this_mode, search_state.best_intra_mode))
-            continue;
-        }
-      }
+    if (sf->inter_sf.prune_comp_search_by_single_result > 0 &&
+        is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) {
+      collect_single_states(x, &search_state, mbmi);
     }
 
-    // Select prediction reference frames.
-    for (i = 0; i < num_planes; i++) {
-      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
-      if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    if (this_rd == INT64_MAX) continue;
+
+    if (mbmi->skip) {
+      rd_stats_y.rate = 0;
+      rd_stats_uv.rate = 0;
     }
 
-    if (ref_frame == INTRA_FRAME) {
-      intra_mode_idx_ls[intra_mode_num++] = midx;
-      continue;
-    } else {
-      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
-      mbmi->filter_intra_mode_info.use_filter_intra = 0;
-      mbmi->ref_mv_idx = 0;
-      int64_t ref_best_rd = search_state.best_rd;
-      {
-        RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
-        av1_init_rd_stats(&rd_stats);
-        rd_stats.rate = rate2;
-
-        // Point to variables that are maintained between loop iterations
-        args.single_newmv = search_state.single_newmv;
-        args.single_newmv_rate = search_state.single_newmv_rate;
-        args.single_newmv_valid = search_state.single_newmv_valid;
-        args.single_comp_cost = real_compmode_cost;
-        args.ref_frame_cost = ref_frame_cost;
-        if (midx < MAX_SINGLE_REF_MODES) {
-          args.simple_rd_state = x->simple_rd_state[midx];
-        }
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-        start_timing(cpi, handle_inter_mode_time);
-#endif
-        this_rd = handle_inter_mode(
-            cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
-            &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf,
-            &rd_buffers, &best_est_rd, do_tx_search, inter_modes_info);
-#if CONFIG_COLLECT_COMPONENT_TIMING
-        end_timing(cpi, handle_inter_mode_time);
-#endif
-        rate2 = rd_stats.rate;
-        skippable = rd_stats.skip;
-        distortion2 = rd_stats.dist;
-        rate_y = rd_stats_y.rate;
-        rate_uv = rd_stats_uv.rate;
-      }
-
-      if (sf->prune_comp_search_by_single_result > 0 &&
-          is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) {
-        collect_single_states(x, &search_state, mbmi);
-      }
-
-      if (this_rd == INT64_MAX) continue;
-
-      this_skip2 = mbmi->skip;
-      this_rd = RDCOST(x->rdmult, rate2, distortion2);
-      if (this_skip2) {
-        rate_y = 0;
-        rate_uv = 0;
-      }
+    if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred &&
+        this_rd < ref_frame_rd[ref_frame]) {
+      ref_frame_rd[ref_frame] = this_rd;
     }
 
-    // Did this mode help.. i.e. is it the new best mode
-    if (this_rd < search_state.best_rd || x->skip) {
-      int mode_excluded = 0;
-      if (comp_pred) {
-        mode_excluded = cm->current_frame.reference_mode == SINGLE_REFERENCE;
-      }
-      if (!mode_excluded) {
-        // Note index of best mode so far
-        search_state.best_mode_index = midx;
-
-        if (ref_frame == INTRA_FRAME) {
-          /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
-        } else {
-          search_state.best_pred_sse = x->pred_sse[ref_frame];
-        }
-
-        rd_cost->rate = rate2;
-        rd_cost->dist = distortion2;
-        rd_cost->rdcost = this_rd;
-        search_state.best_rd = this_rd;
-        search_state.best_mbmode = *mbmi;
-        search_state.best_skip2 = this_skip2;
-        search_state.best_mode_skippable = skippable;
-        if (do_tx_search) {
-          // When do_tx_search == 0, handle_inter_mode won't provide correct
-          // rate_y and rate_uv because txfm_search process is replaced by
-          // rd estimation.
-          // Therfore, we should avoid updating best_rate_y and best_rate_uv
-          // here. These two values will be updated when txfm_search is called
-          search_state.best_rate_y =
-              rate_y +
-              x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
-          search_state.best_rate_uv = rate_uv;
-        }
-        memcpy(ctx->blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      }
+    // Did this mode help, i.e., is it the new best mode
+    if (this_rd < search_state.best_rd) {
+      assert(IMPLIES(comp_pred,
+                     cm->current_frame.reference_mode != SINGLE_REFERENCE));
+      search_state.best_pred_sse = x->pred_sse[ref_frame];
+      update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+                          &rd_stats_uv, mode_enum, x, do_tx_search);
+      if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0];
+      search_state.best_skip_rd[1] = skip_rd[1];
+    }
+    if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+      // Add this mode to motion mode candidate list for motion mode search
+      // if using motion_mode_for_winner_cand speed feature
+      handle_winner_cand(mbmi, &best_motion_mode_cands,
+                         max_winner_motion_mode_cand, this_rd,
+                         &motion_mode_cand, args.skip_motion_mode);
     }
 
     /* keep record of best compound/single-only prediction */
-    if (!disable_skip && ref_frame != INTRA_FRAME) {
-      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
-
-      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
-
-      single_rd = RDCOST(x->rdmult, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
-
-      if (!comp_pred) {
-        if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE])
-          search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd;
-      } else {
-        if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE])
-          search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd;
-      }
-      if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
-        search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+    if (!disable_skip) {
+      record_best_compound(cm->current_frame.reference_mode, &rd_stats,
+                           comp_pred, x->rdmult, &search_state, compmode_cost);
     }
-    if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
-      // Collect data from single ref mode, and analyze data.
-      sf_drop_ref_analyze(&search_state, mode_order, distortion2);
-    }
-
-    if (x->skip && !comp_pred) break;
   }
 
-  release_compound_type_rd_buffers(&rd_buffers);
+  if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+    // For the single ref winner candidates, evaluate other motion modes (non
+    // simple translation).
+    evaluate_motion_mode_for_winner_candidates(
+        cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb,
+        &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd,
+        &search_state);
+  }
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, do_tx_search_time);
 #endif
-  if (do_tx_search_global != 1) {
+  if (do_tx_search != 1) {
     inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
-    search_state.best_rd = INT64_MAX;
-
-    int64_t top_est_rd =
+    search_state.best_rd = best_rd_so_far;
+    search_state.best_mode_index = THR_INVALID;
+    // Initialize best mode stats for winner mode processing
+    x->winner_mode_count = 0;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize,
+        best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+        do_tx_search);
+    inter_modes_info->num =
+        inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search
+            ? inter_modes_info->num
+            : cpi->sf.rt_sf.num_inter_modes_for_tx_search;
+    const int64_t top_est_rd =
         inter_modes_info->num > 0
             ? inter_modes_info
                   ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
@@ -13005,67 +4759,63 @@
       int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
       if (curr_est_rd * 0.80 > top_est_rd) break;
 
+      x->force_skip = 0;
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+      // Select prediction reference frames.
+      const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+      for (i = 0; i < num_planes; i++) {
+        xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+        if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+      }
+
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                    av1_num_planes(cm) - 1);
+      if (mbmi->motion_mode == OBMC_CAUSAL) {
+        av1_build_obmc_inter_predictors_sb(cm, xd);
+      }
+
       RD_STATS rd_stats;
       RD_STATS rd_stats_y;
       RD_STATS rd_stats_uv;
-
-      bool true_rd = inter_modes_info->true_rd_arr[data_idx];
-      if (true_rd) {
-        rd_stats = inter_modes_info->rd_cost_arr[data_idx];
-        rd_stats_y = inter_modes_info->rd_cost_y_arr[data_idx];
-        rd_stats_uv = inter_modes_info->rd_cost_uv_arr[data_idx];
-        memcpy(x->blk_skip, inter_modes_info->blk_skip_arr[data_idx],
-               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      } else {
-        const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
-
-        x->skip = 0;
-        set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-        // Select prediction reference frames.
-        const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
-        for (i = 0; i < num_planes; i++) {
-          xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
-          if (is_comp_pred)
-            xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
-        }
-
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
-                                      av1_num_planes(cm) - 1);
-        if (mbmi->motion_mode == OBMC_CAUSAL)
-          av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-
-        if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats,
-                         &rd_stats_y, &rd_stats_uv, mode_rate,
-                         search_state.best_rd)) {
-          continue;
-        } else if (cpi->sf.inter_mode_rd_model_estimation == 1) {
-          const int skip_ctx = av1_get_skip_context(xd);
-          inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
-                               rd_stats.dist,
-                               rd_stats_y.rate + rd_stats_uv.rate +
-                                   x->skip_cost[skip_ctx][mbmi->skip]);
-        }
-        rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+      const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+      int64_t skip_rd = INT64_MAX;
+      if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+        // Check if the mode is good enough based on skip RD
+        int64_t curr_sse = inter_modes_info->sse_arr[data_idx];
+        skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse);
+        int eval_txfm =
+            check_txfm_eval(x, bsize, search_state.best_skip_rd[0], skip_rd,
+                            cpi->sf.inter_sf.txfm_rd_gate_level, 0);
+        if (!eval_txfm) continue;
       }
 
+      if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+                           mode_rate, search_state.best_rd)) {
+        continue;
+      } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+                             rd_stats.dist,
+                             rd_stats_y.rate + rd_stats_uv.rate +
+                                 x->skip_cost[skip_ctx][mbmi->skip]);
+      }
+      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+      const THR_MODES mode_enum = get_prediction_mode_idx(
+          mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+      // Collect mode stats for multiwinner mode processing
+      const int txfm_search_done = 1;
+      store_winner_mode_stats(
+          &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
+          mode_enum, NULL, bsize, rd_stats.rdcost,
+          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+          txfm_search_done);
+
       if (rd_stats.rdcost < search_state.best_rd) {
-        search_state.best_rd = rd_stats.rdcost;
-        // Note index of best mode so far
-        const int mode_index = get_prediction_mode_idx(
-            mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-        search_state.best_mode_index = mode_index;
-        *rd_cost = rd_stats;
-        search_state.best_rd = rd_stats.rdcost;
-        search_state.best_mbmode = *mbmi;
-        search_state.best_skip2 = mbmi->skip;
-        search_state.best_mode_skippable = rd_stats.skip;
-        search_state.best_rate_y =
-            rd_stats_y.rate +
-            x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
-        search_state.best_rate_uv = rd_stats_uv.rate;
-        memcpy(ctx->blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+        update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+                            &rd_stats_uv, mode_enum, x, txfm_search_done);
+        search_state.best_skip_rd[0] = skip_rd;
       }
     }
   }
@@ -13076,66 +4826,141 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, handle_intra_mode_time);
 #endif
-  for (int j = 0; j < intra_mode_num; ++j) {
-    const int mode_index = intra_mode_idx_ls[j];
-    const MV_REFERENCE_FRAME ref_frame =
-        av1_mode_order[mode_index].ref_frame[0];
-    assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
-    assert(ref_frame == INTRA_FRAME);
-    if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
-    init_mbmi(mbmi, mode_index, cm);
-    x->skip = 0;
-    set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
 
-    // Select prediction reference frames.
-    for (i = 0; i < num_planes; i++) {
-      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+  // Gate intra mode evaluation if best of inter is skip except when source
+  // variance is extremely low
+  if (sf->intra_sf.skip_intra_in_interframe &&
+      (x->source_variance > sf->intra_sf.src_var_thresh_intra_skip)) {
+    if (inter_cost >= 0 && intra_cost >= 0) {
+      aom_clear_system_state();
+      const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480)
+                                       ? &av1_intrap_nn_config
+                                       : &av1_intrap_hd_nn_config;
+      float nn_features[6];
+      float scores[2] = { 0.0f };
+      float probs[2] = { 0.0f };
+
+      nn_features[0] = (float)search_state.best_mbmode.skip;
+      nn_features[1] = (float)mi_size_wide_log2[bsize];
+      nn_features[2] = (float)mi_size_high_log2[bsize];
+      nn_features[3] = (float)intra_cost;
+      nn_features[4] = (float)inter_cost;
+      const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+      const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
+      nn_features[5] = (float)(ac_q_max / ac_q);
+
+      av1_nn_predict(nn_features, nn_config, 1, scores);
+      aom_clear_system_state();
+      av1_nn_softmax(scores, probs, 2);
+
+      if (probs[1] > 0.8) search_state.intra_search_state.skip_intra_modes = 1;
+    } else if ((search_state.best_mbmode.skip) &&
+               (sf->intra_sf.skip_intra_in_interframe >= 2)) {
+      search_state.intra_search_state.skip_intra_modes = 1;
+    }
+  }
+
+  const int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
+  for (int j = 0; j < intra_mode_num; ++j) {
+    if (sf->intra_sf.skip_intra_in_interframe &&
+        search_state.intra_search_state.skip_intra_modes)
+      break;
+    const THR_MODES mode_enum = intra_mode_idx_ls[j];
+    const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+    const PREDICTION_MODE this_mode = mode_def->mode;
+
+    assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
+    assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
+    init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm);
+    x->force_skip = 0;
+
+    if (this_mode != DC_PRED) {
+      // Only search the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+          (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+        if (search_state.best_mode_index != THR_INVALID &&
+            search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
+          continue;
+      }
+      if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(
+                this_mode, search_state.intra_search_state.best_intra_mode))
+          continue;
+      }
     }
 
     RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
-
-    const int ref_frame_cost = ref_costs_single[ref_frame];
-    intra_rd_stats.rdcost = handle_intra_mode(
-        &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
-        &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+    intra_rd_stats.rdcost = av1_handle_intra_mode(
+        &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
+        ctx, 0, &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv,
+        search_state.best_rd, &search_state.best_intra_rd,
+        search_state.best_mbmode.skip);
+    // Collect mode stats for multiwinner mode processing
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, &intra_rd_stats, &intra_rd_stats_y,
+        &intra_rd_stats_uv, mode_enum, NULL, bsize, intra_rd_stats.rdcost,
+        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+        txfm_search_done);
     if (intra_rd_stats.rdcost < search_state.best_rd) {
-      search_state.best_rd = intra_rd_stats.rdcost;
-      // Note index of best mode so far
-      search_state.best_mode_index = mode_index;
-      *rd_cost = intra_rd_stats;
-      search_state.best_rd = intra_rd_stats.rdcost;
-      search_state.best_mbmode = *mbmi;
-      search_state.best_skip2 = 0;
-      search_state.best_mode_skippable = intra_rd_stats.skip;
-      search_state.best_rate_y =
-          intra_rd_stats_y.rate +
-          x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
-      search_state.best_rate_uv = intra_rd_stats_uv.rate;
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      update_search_state(&search_state, rd_cost, ctx, &intra_rd_stats,
+                          &intra_rd_stats_y, &intra_rd_stats_uv, mode_enum, x,
+                          txfm_search_done);
     }
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, handle_intra_mode_time);
 #endif
 
-  // In effect only when speed >= 2.
-  sf_refine_fast_tx_type_search(
-      cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, search_state.best_mode_index,
+  int winner_mode_count = cpi->sf.winner_mode_sf.enable_multiwinner_mode_process
+                              ? x->winner_mode_count
+                              : 1;
+  // In effect only when fast tx search speed features are enabled.
+  refine_winner_mode_tx(
+      cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index,
       &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
-      search_state.best_rate_uv, &search_state.best_skip2);
+      search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count);
+
+  // Initialize default mode evaluation params
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
 
   // Only try palette mode when the best mode so far is an intra mode.
-  if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) {
-    search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
-                        ref_costs_single, &search_state);
+  const int try_palette =
+      cpi->oxcf.enable_palette &&
+      av1_allow_palette(features->allow_screen_content_tools, mbmi->sb_type) &&
+      !is_inter_mode(search_state.best_mbmode.mode);
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  RD_STATS this_rd_cost;
+  int this_skippable = 0;
+  if (try_palette) {
+    this_skippable = av1_search_palette_mode(
+        cpi, x, &this_rd_cost, ctx, bsize, mbmi, pmi, ref_costs_single,
+        &search_state.intra_search_state, search_state.best_rd);
+    if (this_rd_cost.rdcost < search_state.best_rd) {
+      search_state.best_mode_index = THR_DC;
+      mbmi->mv[0].as_int = 0;
+      rd_cost->rate = this_rd_cost.rate;
+      rd_cost->dist = this_rd_cost.dist;
+      rd_cost->rdcost = this_rd_cost.rdcost;
+      search_state.best_rd = rd_cost->rdcost;
+      search_state.best_mbmode = *mbmi;
+      search_state.best_skip2 = 0;
+      search_state.best_mode_skippable = this_skippable;
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    }
   }
+
   search_state.best_mbmode.skip_mode = 0;
   if (cm->current_frame.skip_mode_info.skip_mode_flag &&
-      !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
       is_comp_ref_allowed(bsize)) {
-    rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col,
-                      yv12_mb);
+    const struct segmentation *const seg = &cm->seg;
+    unsigned char segment_id = mbmi->segment_id;
+    if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+      rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb);
+    }
   }
 
   // Make sure that the ref_mv_idx is only nonzero when we're
@@ -13147,556 +4972,74 @@
     search_state.best_mbmode.ref_mv_idx = 0;
   }
 
-  if (search_state.best_mode_index < 0 ||
+  if (search_state.best_mode_index == THR_INVALID ||
       search_state.best_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
     return;
   }
 
-  assert(
-      (cm->interp_filter == SWITCHABLE) ||
-      (cm->interp_filter ==
-       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) ||
-      !is_inter_block(&search_state.best_mbmode));
-  assert(
-      (cm->interp_filter == SWITCHABLE) ||
-      (cm->interp_filter ==
-       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) ||
-      !is_inter_block(&search_state.best_mbmode));
+  const InterpFilter interp_filter = features->interp_filter;
+  assert((interp_filter == SWITCHABLE) ||
+         (interp_filter ==
+          search_state.best_mbmode.interp_filters.as_filters.y_filter) ||
+         !is_inter_block(&search_state.best_mbmode));
+  assert((interp_filter == SWITCHABLE) ||
+         (interp_filter ==
+          search_state.best_mbmode.interp_filters.as_filters.x_filter) ||
+         !is_inter_block(&search_state.best_mbmode));
 
-  if (!cpi->rc.is_src_frame_alt_ref)
-    av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
-                              sf->adaptive_rd_thresh, bsize,
+  if (!cpi->rc.is_src_frame_alt_ref && cpi->sf.inter_sf.adaptive_rd_thresh) {
+    av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
+                              sf->inter_sf.adaptive_rd_thresh, bsize,
                               search_state.best_mode_index);
+  }
 
   // macroblock modes
   *mbmi = search_state.best_mbmode;
-  x->skip |= search_state.best_skip2;
+  x->force_skip |= search_state.best_skip2;
 
   // Note: this section is needed since the mode may have been forced to
   // GLOBALMV by the all-zero mode handling of ref-mv.
   if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
     // Correct the interp filters for GLOBALMV
     if (is_nontrans_global_motion(xd, xd->mi[0])) {
-      assert(mbmi->interp_filters ==
-             av1_broadcast_interp_filter(
-                 av1_unswitchable_filter(cm->interp_filter)));
+      int_interpfilters filters =
+          av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+      assert(mbmi->interp_filters.as_int == filters.as_int);
+      (void)filters;
     }
   }
 
   for (i = 0; i < REFERENCE_MODES; ++i) {
-    if (search_state.best_pred_rd[i] == INT64_MAX)
+    if (search_state.intra_search_state.best_pred_rd[i] == INT64_MAX) {
       search_state.best_pred_diff[i] = INT_MIN;
-    else
+    } else {
       search_state.best_pred_diff[i] =
-          search_state.best_rd - search_state.best_pred_rd[i];
+          search_state.best_rd -
+          search_state.intra_search_state.best_pred_rd[i];
+    }
   }
 
-  x->skip |= search_state.best_mode_skippable;
+  x->force_skip |= search_state.best_mode_skippable;
 
-  assert(search_state.best_mode_index >= 0);
+  assert(search_state.best_mode_index != THR_INVALID);
 
+#if CONFIG_INTERNAL_STATS
   store_coding_context(x, ctx, search_state.best_mode_index,
                        search_state.best_pred_diff,
                        search_state.best_mode_skippable);
+#else
+  store_coding_context(x, ctx, search_state.best_pred_diff,
+                       search_state.best_mode_skippable);
+#endif  // CONFIG_INTERNAL_STATS
 
   if (pmi->palette_size[1] > 0) {
     assert(try_palette);
-    restore_uv_color_map(cpi, x);
+    av1_restore_uv_color_map(cpi, x);
   }
 }
 
-// TODO(kyslov): now this is very similar to av1_rd_pick_inter_mode_sb except:
-//                 it only checks non-compound mode and
-//                 it doesn't check palette mode
-//                 it doesn't refine tx search
-//               this function is likely to be heavily modified with nonrd mode
-//               decision
-void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
-                                  MACROBLOCK *x, int mi_row, int mi_col,
-                                  RD_STATS *rd_cost, BLOCK_SIZE bsize,
-                                  PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far) {
-  AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const struct segmentation *const seg = &cm->seg;
-  PREDICTION_MODE this_mode;
-  unsigned char segment_id = mbmi->segment_id;
-  int i;
-  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
-  unsigned int ref_costs_single[REF_FRAMES];
-  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
-  int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
-  mode_skip_mask_t mode_skip_mask;
-  uint8_t motion_mode_skip_mask = 0;  // second pass of single ref modes
-
-  InterModeSearchState search_state;
-  init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
-                               best_rd_so_far);
-  INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
-    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
-    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
-  };
-  HandleInterModeArgs args = {
-    { NULL },  { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
-    { NULL },  { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
-    NULL,      NULL,
-    NULL,      search_state.modelled_rd,
-    { { 0 } }, INT_MAX,
-    INT_MAX,   search_state.simple_rd,
-    0,         interintra_modes,
-    1,         NULL
-  };
-  for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
-
-  av1_invalid_rd_stats(rd_cost);
-
-  // Ref frames that are selected by square partition blocks.
-  int picked_ref_frames_mask = 0;
-  if (cpi->sf.prune_ref_frame_for_rect_partitions &&
-      mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
-    // Don't enable for vert and horz partition blocks if current frame
-    // will be used as bwd or arf2.
-    if ((!cpi->refresh_bwd_ref_frame && !cpi->refresh_alt2_ref_frame) ||
-        (mbmi->partition != PARTITION_VERT &&
-         mbmi->partition != PARTITION_HORZ)) {
-      picked_ref_frames_mask = fetch_picked_ref_frames_mask(
-          x, bsize, cm->seq_params.mib_size, mi_row, mi_col);
-    }
-  }
-
-  // Skip ref frames that never selected by square blocks.
-  const int skip_ref_frame_mask =
-      picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
-
-  // init params, set frame modes, speed features
-  set_params_nonrd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
-                                   &mode_skip_mask, skip_ref_frame_mask,
-                                   ref_costs_single, ref_costs_comp, yv12_mb);
-
-  int64_t best_est_rd = INT64_MAX;
-  InterModesInfo *inter_modes_info = x->inter_modes_info;
-  inter_modes_info->num = 0;
-
-  int intra_mode_num = 0;
-  int intra_mode_idx_ls[MAX_MODES];
-  int reach_first_comp_mode = 0;
-
-  // Temporary buffers used by handle_inter_mode().
-  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
-
-  CompoundTypeRdBuffers rd_buffers;
-  alloc_compound_type_rd_buffers(cm, &rd_buffers);
-
-  for (int midx = 0; midx < MAX_MODES; ++midx) {
-    const MODE_DEFINITION *mode_order = &av1_mode_order[midx];
-    this_mode = mode_order->mode;
-    const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
-    const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
-    const int comp_pred = second_ref_frame > INTRA_FRAME;
-
-    if (second_ref_frame != NONE_FRAME) continue;
-
-    // When single ref motion search ends:
-    // 1st pass: To evaluate single ref RD results and rewind to the beginning;
-    // 2nd pass: To continue with compound ref search.
-    if (sf->prune_single_motion_modes_by_simple_trans) {
-      if (comp_pred && args.single_ref_first_pass) {
-        args.single_ref_first_pass = 0;
-        // Reach the first comp ref mode
-        // Reset midx to start the 2nd pass for single ref motion search
-        midx = -1;
-        motion_mode_skip_mask = analyze_simple_trans_states(cpi, x);
-        continue;
-      }
-      if (!comp_pred && ref_frame != INTRA_FRAME) {  // single ref mode
-        if (args.single_ref_first_pass) {
-          // clear stats
-          for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
-            x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX;
-            x->simple_rd_state[midx][k].early_skipped = 0;
-          }
-        } else {
-          if (motion_mode_skip_mask & (1 << ref_frame)) {
-            continue;
-          }
-        }
-      }
-    }
-
-    // Reach the first compound prediction mode
-    if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
-        reach_first_comp_mode == 0) {
-      analyze_single_states(cpi, &search_state);
-      reach_first_comp_mode = 1;
-    }
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0;
-    int rate2 = 0;
-    int64_t distortion2 = 0;
-    int skippable = 0;
-    int this_skip2 = 0;
-
-    init_mbmi(mbmi, midx, cm);
-
-    x->skip = 0;
-    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
-
-    if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue;
-
-    const int ret = inter_mode_search_order_independent_skip(
-        cpi, x, bsize, midx, mi_row, mi_col, &mode_skip_mask, &search_state,
-        skip_ref_frame_mask);
-    if (ret == 1) continue;
-    args.skip_motion_mode = (ret == 2);
-
-    if (sf->drop_ref && comp_pred) {
-      if (sf_check_is_drop_ref(mode_order, &search_state)) {
-        continue;
-      }
-    }
-
-    if (search_state.best_rd < search_state.mode_threshold[midx]) continue;
-
-    if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
-      if (compound_skip_by_single_states(cpi, &search_state, this_mode,
-                                         ref_frame, second_ref_frame, x))
-        continue;
-    }
-
-    const int ref_frame_cost = comp_pred
-                                   ? ref_costs_comp[ref_frame][second_ref_frame]
-                                   : ref_costs_single[ref_frame];
-    const int compmode_cost =
-        is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
-    const int real_compmode_cost =
-        cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
-            ? compmode_cost
-            : 0;
-
-    if (comp_pred) {
-      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
-          search_state.best_mode_index >= 0 &&
-          search_state.best_mbmode.ref_frame[0] == INTRA_FRAME)
-        continue;
-    }
-
-    if (ref_frame == INTRA_FRAME) {
-      if (!cpi->oxcf.enable_smooth_intra &&
-          (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
-           mbmi->mode == SMOOTH_V_PRED))
-        continue;
-      if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
-      if (sf->adaptive_mode_search > 1)
-        if ((x->source_variance << num_pels_log2_lookup[bsize]) >
-            search_state.best_pred_sse)
-          continue;
-
-      if (this_mode != DC_PRED) {
-        // Only search the oblique modes if the best so far is
-        // one of the neighboring directional modes
-        if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-            (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
-          if (search_state.best_mode_index >= 0 &&
-              search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
-            continue;
-        }
-        if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-          if (conditional_skipintra(this_mode, search_state.best_intra_mode))
-            continue;
-        }
-      }
-    }
-
-    // Select prediction reference frames.
-    for (i = 0; i < num_planes; i++) {
-      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
-      if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
-    }
-
-    if (ref_frame == INTRA_FRAME) {
-      intra_mode_idx_ls[intra_mode_num++] = midx;
-      continue;
-    } else {
-      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
-      mbmi->filter_intra_mode_info.use_filter_intra = 0;
-      mbmi->ref_mv_idx = 0;
-      int64_t ref_best_rd = search_state.best_rd;
-      {
-        RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
-        av1_init_rd_stats(&rd_stats);
-        rd_stats.rate = rate2;
-
-        // Point to variables that are maintained between loop iterations
-        args.single_newmv = search_state.single_newmv;
-        args.single_newmv_rate = search_state.single_newmv_rate;
-        args.single_newmv_valid = search_state.single_newmv_valid;
-        args.single_comp_cost = real_compmode_cost;
-        args.ref_frame_cost = ref_frame_cost;
-        if (midx < MAX_SINGLE_REF_MODES) {
-          args.simple_rd_state = x->simple_rd_state[midx];
-        }
-        this_rd = handle_inter_mode(
-            cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
-            &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf,
-            &rd_buffers, &best_est_rd, 0, inter_modes_info);
-        rate2 = rd_stats.rate;
-        skippable = rd_stats.skip;
-        distortion2 = rd_stats.dist;
-      }
-
-      if (sf->prune_comp_search_by_single_result > 0 &&
-          is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) {
-        collect_single_states(x, &search_state, mbmi);
-      }
-
-      if (this_rd == INT64_MAX) continue;
-
-      this_skip2 = mbmi->skip;
-      this_rd = RDCOST(x->rdmult, rate2, distortion2);
-    }
-
-    // Did this mode help.. i.e. is it the new best mode
-    if (this_rd < search_state.best_rd || x->skip) {
-      int mode_excluded = 0;
-      if (comp_pred) {
-        mode_excluded = cm->current_frame.reference_mode == SINGLE_REFERENCE;
-      }
-      if (!mode_excluded) {
-        // Note index of best mode so far
-        search_state.best_mode_index = midx;
-
-        if (ref_frame == INTRA_FRAME) {
-          /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
-        } else {
-          search_state.best_pred_sse = x->pred_sse[ref_frame];
-        }
-
-        rd_cost->rate = rate2;
-        rd_cost->dist = distortion2;
-        rd_cost->rdcost = this_rd;
-        search_state.best_rd = this_rd;
-        search_state.best_mbmode = *mbmi;
-        search_state.best_skip2 = this_skip2;
-        search_state.best_mode_skippable = skippable;
-        memcpy(ctx->blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      }
-    }
-
-    /* keep record of best compound/single-only prediction */
-    if (!disable_skip && ref_frame != INTRA_FRAME) {
-      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
-
-      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
-
-      single_rd = RDCOST(x->rdmult, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
-
-      if (!comp_pred) {
-        if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE])
-          search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd;
-      } else {
-        if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE])
-          search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd;
-      }
-      if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
-        search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
-    }
-    if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
-      // Collect data from single ref mode, and analyze data.
-      sf_drop_ref_analyze(&search_state, mode_order, distortion2);
-    }
-
-    if (x->skip && !comp_pred) break;
-  }
-
-  release_compound_type_rd_buffers(&rd_buffers);
-
-  inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
-  search_state.best_rd = INT64_MAX;
-
-  if (inter_modes_info->num > 0) {
-    const int data_idx = inter_modes_info->rd_idx_pair_arr[0].idx;
-    *mbmi = inter_modes_info->mbmi_arr[data_idx];
-    const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
-
-    x->skip = 0;
-    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-    // Select prediction reference frames.
-    const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
-    for (i = 0; i < num_planes; i++) {
-      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
-      if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
-    }
-
-    RD_STATS rd_stats;
-    RD_STATS rd_stats_y;
-    RD_STATS rd_stats_uv;
-
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
-                                  av1_num_planes(cm) - 1);
-    if (mbmi->motion_mode == OBMC_CAUSAL)
-      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-
-    if (txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats,
-                    &rd_stats_y, &rd_stats_uv, mode_rate,
-                    search_state.best_rd)) {
-      if (cpi->sf.inter_mode_rd_model_estimation == 1) {
-        const int skip_ctx = av1_get_skip_context(xd);
-        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
-                             rd_stats.dist,
-                             rd_stats_y.rate + rd_stats_uv.rate +
-                                 x->skip_cost[skip_ctx][mbmi->skip]);
-      }
-      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
-
-      if (rd_stats.rdcost < search_state.best_rd) {
-        search_state.best_rd = rd_stats.rdcost;
-        // Note index of best mode so far
-        const int mode_index = get_prediction_mode_idx(
-            mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-        search_state.best_mode_index = mode_index;
-        *rd_cost = rd_stats;
-        search_state.best_rd = rd_stats.rdcost;
-        search_state.best_mbmode = *mbmi;
-        search_state.best_skip2 = mbmi->skip;
-        search_state.best_mode_skippable = rd_stats.skip;
-        search_state.best_rate_y =
-            rd_stats_y.rate +
-            x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
-        search_state.best_rate_uv = rd_stats_uv.rate;
-        memcpy(ctx->blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      }
-    }
-  }
-
-  for (int j = 0; j < intra_mode_num; ++j) {
-    const int mode_index = intra_mode_idx_ls[j];
-    const MV_REFERENCE_FRAME ref_frame =
-        av1_mode_order[mode_index].ref_frame[0];
-    assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
-    assert(ref_frame == INTRA_FRAME);
-    if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
-    init_mbmi(mbmi, mode_index, cm);
-    x->skip = 0;
-    set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
-
-    // Select prediction reference frames.
-    for (i = 0; i < num_planes; i++) {
-      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
-    }
-
-    RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
-
-    const int ref_frame_cost = ref_costs_single[ref_frame];
-    intra_rd_stats.rdcost = handle_intra_mode(
-        &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
-        &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
-    if (intra_rd_stats.rdcost < search_state.best_rd) {
-      search_state.best_rd = intra_rd_stats.rdcost;
-      // Note index of best mode so far
-      search_state.best_mode_index = mode_index;
-      *rd_cost = intra_rd_stats;
-      search_state.best_rd = intra_rd_stats.rdcost;
-      search_state.best_mbmode = *mbmi;
-      search_state.best_skip2 = 0;
-      search_state.best_mode_skippable = intra_rd_stats.skip;
-      search_state.best_rate_y =
-          intra_rd_stats_y.rate +
-          x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
-      search_state.best_rate_uv = intra_rd_stats_uv.rate;
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-    }
-  }
-
-  search_state.best_mbmode.skip_mode = 0;
-  if (cm->current_frame.skip_mode_info.skip_mode_flag &&
-      !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-      is_comp_ref_allowed(bsize)) {
-    rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col,
-                      yv12_mb);
-  }
-
-  // Make sure that the ref_mv_idx is only nonzero when we're
-  // using a mode which can support ref_mv_idx
-  if (search_state.best_mbmode.ref_mv_idx != 0 &&
-      !(search_state.best_mbmode.mode == NEWMV ||
-        search_state.best_mbmode.mode == NEW_NEWMV ||
-        have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
-    search_state.best_mbmode.ref_mv_idx = 0;
-  }
-
-  if (search_state.best_mode_index < 0 ||
-      search_state.best_rd >= best_rd_so_far) {
-    rd_cost->rate = INT_MAX;
-    rd_cost->rdcost = INT64_MAX;
-    return;
-  }
-
-  assert(
-      (cm->interp_filter == SWITCHABLE) ||
-      (cm->interp_filter ==
-       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) ||
-      !is_inter_block(&search_state.best_mbmode));
-  assert(
-      (cm->interp_filter == SWITCHABLE) ||
-      (cm->interp_filter ==
-       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) ||
-      !is_inter_block(&search_state.best_mbmode));
-
-  if (!cpi->rc.is_src_frame_alt_ref)
-    av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
-                              sf->adaptive_rd_thresh, bsize,
-                              search_state.best_mode_index);
-
-  // macroblock modes
-  *mbmi = search_state.best_mbmode;
-  x->skip |= search_state.best_skip2;
-
-  // Note: this section is needed since the mode may have been forced to
-  // GLOBALMV by the all-zero mode handling of ref-mv.
-  if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
-    // Correct the interp filters for GLOBALMV
-    if (is_nontrans_global_motion(xd, xd->mi[0])) {
-      assert(mbmi->interp_filters ==
-             av1_broadcast_interp_filter(
-                 av1_unswitchable_filter(cm->interp_filter)));
-    }
-  }
-
-  for (i = 0; i < REFERENCE_MODES; ++i) {
-    if (search_state.best_pred_rd[i] == INT64_MAX)
-      search_state.best_pred_diff[i] = INT_MIN;
-    else
-      search_state.best_pred_diff[i] =
-          search_state.best_rd - search_state.best_pred_rd[i];
-  }
-
-  x->skip |= search_state.best_mode_skippable;
-
-  assert(search_state.best_mode_index >= 0);
-
-  store_coding_context(x, ctx, search_state.best_mode_index,
-                       search_state.best_pred_diff,
-                       search_state.best_mode_skippable);
-}
-
 void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
                                         TileDataEnc *tile_data, MACROBLOCK *x,
                                         int mi_row, int mi_col,
@@ -13704,6 +5047,7 @@
                                         PICK_MODE_CONTEXT *ctx,
                                         int64_t best_rd_so_far) {
   const AV1_COMMON *const cm = &cpi->common;
+  const FeatureFlags *const features = &cm->features;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   unsigned char segment_id = mbmi->segment_id;
@@ -13719,6 +5063,7 @@
   const int64_t distortion2 = 0;
   (void)mi_row;
   (void)mi_col;
+  (void)tile_data;
 
   av1_collect_neighbors_ref_counts(xd);
 
@@ -13745,48 +5090,50 @@
   mbmi->ref_frame[1] = NONE_FRAME;
   mbmi->mv[0].as_int =
       gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
-                           cm->allow_high_precision_mv, bsize, mi_col, mi_row,
-                           cm->cur_frame_force_integer_mv)
+                           features->allow_high_precision_mv, bsize, mi_col,
+                           mi_row, features->cur_frame_force_integer_mv)
           .as_int;
   mbmi->tx_size = max_txsize_lookup[bsize];
-  x->skip = 1;
+  x->force_skip = 1;
 
   mbmi->ref_mv_idx = 0;
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
-  av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+  av1_count_overlappable_neighbors(cm, xd);
   if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
     int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+    mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
     // Select the samples according to motion vector difference
     if (mbmi->num_proj_ref > 1)
-      mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
-                                         mbmi->num_proj_ref, bsize);
+      mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                             mbmi->num_proj_ref, bsize);
   }
 
-  set_default_interp_filters(mbmi, cm->interp_filter);
+  const InterpFilter interp_filter = features->interp_filter;
+  set_default_interp_filters(mbmi, interp_filter);
 
-  if (cm->interp_filter != SWITCHABLE) {
-    best_filter = cm->interp_filter;
+  if (interp_filter != SWITCHABLE) {
+    best_filter = interp_filter;
   } else {
     best_filter = EIGHTTAP_REGULAR;
-    if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd) &&
-        x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+    if (av1_is_interp_needed(xd) &&
+        x->source_variance >=
+            cpi->sf.interp_sf.disable_filter_search_var_thresh) {
       int rs;
       int best_rs = INT_MAX;
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         mbmi->interp_filters = av1_broadcast_interp_filter(i);
-        rs = av1_get_switchable_rate(cm, x, xd);
+        rs = av1_get_switchable_rate(x, xd, interp_filter);
         if (rs < best_rs) {
           best_rs = rs;
-          best_filter = av1_extract_interp_filter(mbmi->interp_filters, 0);
+          best_filter = mbmi->interp_filters.as_filters.y_filter;
         }
       }
     }
   }
   // Set the appropriate filter
   mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
-  rate2 += av1_get_switchable_rate(cm, x, xd);
+  rate2 += av1_get_switchable_rate(x, xd, interp_filter);
 
   if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT)
     rate2 += comp_inter_cost[comp_pred];
@@ -13806,16 +5153,22 @@
     return;
   }
 
-  assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter ==
-          av1_extract_interp_filter(mbmi->interp_filters, 0)));
+  assert((interp_filter == SWITCHABLE) ||
+         (interp_filter == mbmi->interp_filters.as_filters.y_filter));
 
-  av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
-                            cpi->sf.adaptive_rd_thresh, bsize, THR_GLOBALMV);
+  if (cpi->sf.inter_sf.adaptive_rd_thresh) {
+    av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
+                              cpi->sf.inter_sf.adaptive_rd_thresh, bsize,
+                              THR_GLOBALMV);
+  }
 
   av1_zero(best_pred_diff);
 
+#if CONFIG_INTERNAL_STATS
   store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0);
+#else
+  store_coding_context(x, ctx, best_pred_diff, 0);
+#endif  // CONFIG_INTERNAL_STATS
 }
 
 struct calc_target_weighted_pred_ctxt {
@@ -13826,15 +5179,17 @@
 };
 
 static INLINE void calc_target_weighted_pred_above(
-    MACROBLOCKD *xd, int rel_mi_col, uint8_t nb_mi_width, MB_MODE_INFO *nb_mi,
-    void *fun_ctxt, const int num_planes) {
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) {
   (void)nb_mi;
   (void)num_planes;
+  (void)rel_mi_row;
+  (void)dir;
 
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-  const int bw = xd->n4_w << MI_SIZE_LOG2;
+  const int bw = xd->width << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
@@ -13846,7 +5201,7 @@
     for (int row = 0; row < ctxt->overlap; ++row) {
       const uint8_t m0 = mask1d[row];
       const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-      for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
+      for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
         wsrc[col] = m1 * tmp[col];
         mask[col] = m0;
       }
@@ -13860,7 +5215,7 @@
     for (int row = 0; row < ctxt->overlap; ++row) {
       const uint8_t m0 = mask1d[row];
       const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-      for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
+      for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
         wsrc[col] = m1 * tmp16[col];
         mask[col] = m0;
       }
@@ -13872,15 +5227,17 @@
 }
 
 static INLINE void calc_target_weighted_pred_left(
-    MACROBLOCKD *xd, int rel_mi_row, uint8_t nb_mi_height, MB_MODE_INFO *nb_mi,
-    void *fun_ctxt, const int num_planes) {
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) {
   (void)nb_mi;
   (void)num_planes;
+  (void)rel_mi_col;
+  (void)dir;
 
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-  const int bw = xd->n4_w << MI_SIZE_LOG2;
+  const int bw = xd->width << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
@@ -13889,7 +5246,7 @@
   const int is_hbd = is_cur_buf_hbd(xd);
 
   if (!is_hbd) {
-    for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
+    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
       for (int col = 0; col < ctxt->overlap; ++col) {
         const uint8_t m0 = mask1d[col];
         const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
@@ -13904,7 +5261,7 @@
   } else {
     const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
-    for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
+    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
       for (int col = 0; col < ctxt->overlap; ++col) {
         const uint8_t m0 = mask1d[col];
         const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
@@ -13957,21 +5314,20 @@
 //  error(x, y) =
 //    wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
 //
-static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
-                                      const MACROBLOCKD *xd, int mi_row,
-                                      int mi_col, const uint8_t *above,
-                                      int above_stride, const uint8_t *left,
-                                      int left_stride) {
+static AOM_INLINE void calc_target_weighted_pred(
+    const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+    const uint8_t *above, int above_stride, const uint8_t *left,
+    int left_stride) {
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  const int bw = xd->n4_w << MI_SIZE_LOG2;
-  const int bh = xd->n4_h << MI_SIZE_LOG2;
+  const int bw = xd->width << MI_SIZE_LOG2;
+  const int bh = xd->height << MI_SIZE_LOG2;
   int32_t *mask_buf = x->mask_buf;
   int32_t *wsrc_buf = x->wsrc_buf;
 
   const int is_hbd = is_cur_buf_hbd(xd);
   const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
 
-  // plane 0 should not be subsampled
+  // plane 0 should not be sub-sampled
   assert(xd->plane[0].subsampling_x == 0);
   assert(xd->plane[0].subsampling_y == 0);
 
@@ -13984,7 +5340,7 @@
         AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
     struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
                                                    overlap };
-    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col,
+    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
                                   max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                   calc_target_weighted_pred_above, &ctxt);
   }
@@ -14000,7 +5356,7 @@
         AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
     struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
                                                    overlap };
-    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row,
+    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
                                  max_neighbor_obmc[mi_size_high_log2[bsize]],
                                  calc_target_weighted_pred_left, &ctxt);
   }
@@ -14045,7 +5401,8 @@
    2 * (src)[(i) + (stride) * ((j) + 1)] -  /* NOLINT */ \
    (src)[((i) + 1) + (stride) * ((j) + 1)]) /* NOLINT */
 
-sobel_xy sobel(const uint8_t *input, int stride, int i, int j, bool high_bd) {
+sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
+                   bool high_bd) {
   int16_t s_x;
   int16_t s_y;
   if (high_bd) {
@@ -14065,8 +5422,8 @@
 DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 2,  12, 30, 40,
                                                                30, 12, 2,  0 };
 
-void gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
-                   uint8_t *dst, bool high_bd, int bd) {
+void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
+                       uint8_t *dst, bool high_bd, int bd) {
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
   InterpFilterParams filter = { .filter_ptr = gauss_filter,
                                 .taps = 8,
@@ -14077,6 +5434,7 @@
   assert(w % 8 == 0);
   // Because we use an eight tap filter, the stride should be at least 7 + w.
   assert(src_stride >= w + 7);
+#if CONFIG_AV1_HIGHBITDEPTH
   if (high_bd) {
     av1_highbd_convolve_2d_sr(CONVERT_TO_SHORTPTR(src), src_stride,
                               CONVERT_TO_SHORTPTR(dst), w, w, h, &filter,
@@ -14085,6 +5443,11 @@
     av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0,
                        &conv_params);
   }
+#else
+  (void)high_bd;
+  av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0,
+                     &conv_params);
+#endif
 }
 
 static EdgeInfo edge_probability(const uint8_t *input, int w, int h,
@@ -14098,7 +5461,7 @@
   // Ignore the 1 pixel border around the image for the computation.
   for (int j = 1; j < h - 1; ++j) {
     for (int i = 1; i < w - 1; ++i) {
-      sobel_xy g = sobel(input, w, i, j, high_bd);
+      sobel_xy g = av1_sobel(input, w, i, j, high_bd);
       // Scale down to 8-bit to get same output regardless of bit depth.
       int16_t g_x = g.x >> (bd - 8);
       int16_t g_y = g.y >> (bd - 8);
@@ -14127,7 +5490,7 @@
   } else {
     blurred = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * w * h);
   }
-  gaussian_blur(src, src_stride, w, h, blurred, high_bd, bd);
+  av1_gaussian_blur(src, src_stride, w, h, blurred, high_bd, bd);
   // Skip the non-maximum suppression step in Canny edge detection. We just
   // want a probability of an edge existing in the buffer, which is determined
   // by the strongest edge in it -- we don't need to eliminate the weaker
diff --git a/libaom/av1/encoder/rdopt.h b/libaom/av1/encoder/rdopt.h
index 7ba1b18..c7c99ac 100644
--- a/libaom/av1/encoder/rdopt.h
+++ b/libaom/av1/encoder/rdopt.h
@@ -21,45 +21,20 @@
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt_utils.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define MAX_REF_MV_SERCH 3
-#define DEFAULT_LUMA_INTERP_SKIP_FLAG 1
-#define DEFAULT_CHROMA_INTERP_SKIP_FLAG 2
-#define DEFAULT_INTERP_SKIP_FLAG \
-  (DEFAULT_LUMA_INTERP_SKIP_FLAG | DEFAULT_CHROMA_INTERP_SKIP_FLAG)
+#define COMP_TYPE_RD_THRESH_SCALE 11
+#define COMP_TYPE_RD_THRESH_SHIFT 4
+#define MAX_WINNER_MOTION_MODES 10
 
 struct TileInfo;
 struct macroblock;
 struct RD_STATS;
 
-#if CONFIG_RD_DEBUG
-static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
-                                             TX_SIZE tx_size, int blk_row,
-                                             int blk_col, int txb_coeff_cost) {
-  (void)blk_row;
-  (void)blk_col;
-  (void)tx_size;
-  rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
-
-  {
-    const int txb_h = tx_size_high_unit[tx_size];
-    const int txb_w = tx_size_wide_unit[tx_size];
-    int idx, idy;
-    for (idy = 0; idy < txb_h; ++idy)
-      for (idx = 0; idx < txb_w; ++idx)
-        rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0;
-
-    rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost;
-  }
-  assert(blk_row < TXB_COEFF_COST_MAP_SIZE);
-  assert(blk_col < TXB_COEFF_COST_MAP_SIZE);
-}
-#endif
-
 // Returns the number of colors in 'src'.
 int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
                      int *val_count);
@@ -67,13 +42,6 @@
 int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
                             int bit_depth, int *val_count);
 
-#if CONFIG_DIST_8X8
-int64_t av1_dist_8x8(const struct AV1_COMP *const cpi, const MACROBLOCK *x,
-                     const uint8_t *src, int src_stride, const uint8_t *dst,
-                     int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
-                     int bsh, int visible_w, int visible_h, int qindex);
-#endif
-
 static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
                                     int plane, TX_SIZE tx_size) {
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
@@ -83,32 +51,9 @@
   return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
 }
 
-static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
-                                  int plane, int block, TX_SIZE tx_size,
-                                  const TX_TYPE tx_type,
-                                  const TXB_CTX *const txb_ctx,
-                                  int use_fast_coef_costing) {
-#if TXCOEFF_COST_TIMER
-  struct aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-#endif
-  (void)use_fast_coef_costing;
-  const int cost =
-      av1_cost_coeffs_txb(cm, x, plane, block, tx_size, tx_type, txb_ctx);
-#if TXCOEFF_COST_TIMER
-  AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
-  aom_usec_timer_mark(&timer);
-  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
-  tmp_cm->txcoeff_cost_timer += elapsed_time;
-  ++tmp_cm->txcoeff_cost_count;
-#endif
-  return cost;
-}
-
 void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
-                               int mi_row, int mi_col, struct RD_STATS *rd_cost,
-                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                               int64_t best_rd);
+                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
 unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
                                            const struct buf_2d *ref,
@@ -119,13 +64,16 @@
 
 void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
                                struct TileDataEnc *tile_data,
-                               struct macroblock *x, int mi_row, int mi_col,
-                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
-                               PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+                               struct macroblock *x, struct RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far);
+
+void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
 
 void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
                                   struct TileDataEnc *tile_data,
-                                  struct macroblock *x, int mi_row, int mi_col,
+                                  struct macroblock *x,
                                   struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                   PICK_MODE_CONTEXT *ctx,
                                   int64_t best_rd_so_far);
@@ -155,8 +103,8 @@
 /** Applies a Gaussian blur with sigma = 1.3. Used by av1_edge_exists and
  * tests.
  */
-void gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
-                   uint8_t *dst, bool high_bd, int bd);
+void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
+                       uint8_t *dst, bool high_bd, int bd);
 
 /* Applies standard 3x3 Sobel matrix. */
 typedef struct {
@@ -164,11 +112,131 @@
   int16_t y;
 } sobel_xy;
 
-sobel_xy sobel(const uint8_t *input, int stride, int i, int j, bool high_bd);
+sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
+                   bool high_bd);
 
 void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
 
+#if !CONFIG_REALTIME_ONLY
+static INLINE int coded_to_superres_mi(int mi_col, int denom) {
+  return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
+}
+#endif
+
+static INLINE int av1_encoder_get_relative_dist(const OrderHintInfo *oh, int a,
+                                                int b) {
+  if (!oh->enable_order_hint) return 0;
+
+  assert(a >= 0 && b >= 0);
+  return (a - b);
+}
+
+// This function will return number of mi's in a superblock.
+static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
+  const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
+  int sb_mi_rows =
+      (mi_size_wide[cm->seq_params.sb_size] + mi_alloc_size_1d - 1) /
+      mi_alloc_size_1d;
+  assert(mi_size_wide[cm->seq_params.sb_size] ==
+         mi_size_high[cm->seq_params.sb_size]);
+  int sb_mi_size = sb_mi_rows * sb_mi_rows;
+
+  return sb_mi_size;
+}
+
+// This function will copy usable ref_mv_stack[ref_frame][4] and
+// weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and
+// weight[ref_frame][8].
+static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
+    const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext,
+    MV_REFERENCE_FRAME ref_frame) {
+  memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame],
+         USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0]));
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame],
+         USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0]));
+}
+
+// This function prunes the mode if either of the reference frame falls in the
+// pruning list
+static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
+                            const OrderHintInfo *const order_hint_info,
+                            const unsigned int *const ref_display_order_hint,
+                            const unsigned int frame_display_order_hint,
+                            const int *ref_frame_list) {
+  for (int i = 0; i < 2; i++) {
+    if (ref_frame_list[i] == NONE_FRAME) continue;
+
+    if (ref_frame[0] == ref_frame_list[i] ||
+        ref_frame[1] == ref_frame_list[i]) {
+      if (av1_encoder_get_relative_dist(
+              order_hint_info,
+              ref_display_order_hint[ref_frame_list[i] - LAST_FRAME],
+              frame_display_order_hint) < 0)
+        return 1;
+    }
+  }
+  return 0;
+}
+
+static INLINE int prune_ref_by_selective_ref_frame(
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const MV_REFERENCE_FRAME *const ref_frame,
+    const unsigned int *const ref_display_order_hint) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  if (!sf->inter_sf.selective_ref_frame) return 0;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
+
+  if (sf->inter_sf.selective_ref_frame >= 2 ||
+      (sf->inter_sf.selective_ref_frame == 1 && comp_pred)) {
+    int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME };
+
+    if (x != NULL) {
+      if (x->search_ref_frame[LAST3_FRAME]) ref_frame_list[0] = NONE_FRAME;
+      if (x->search_ref_frame[LAST2_FRAME]) ref_frame_list[1] = NONE_FRAME;
+    }
+
+    if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint,
+                  ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME],
+                  ref_frame_list))
+      return 1;
+  }
+
+  if (sf->inter_sf.selective_ref_frame >= 3) {
+    int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME };
+
+    if (x != NULL) {
+      if (x->search_ref_frame[ALTREF2_FRAME]) ref_frame_list[0] = NONE_FRAME;
+      if (x->search_ref_frame[BWDREF_FRAME]) ref_frame_list[1] = NONE_FRAME;
+    }
+
+    if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint,
+                  ref_display_order_hint[LAST_FRAME - LAST_FRAME],
+                  ref_frame_list))
+      return 1;
+  }
+
+  return 0;
+}
+
+// This function will copy the best reference mode information from
+// MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME.
+static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame(
+    MB_MODE_INFO_EXT_FRAME *mbmi_ext_best,
+    const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) {
+  memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type],
+         sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+  memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type],
+         sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+  mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type];
+  mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+  memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs,
+         sizeof(mbmi_ext->global_mvs));
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/rdopt_data_defs.h b/libaom/av1/encoder/rdopt_data_defs.h
new file mode 100644
index 0000000..ca7ef81
--- /dev/null
+++ b/libaom/av1/encoder/rdopt_data_defs.h
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
+#define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = {
+  THR_DC,         // DC_PRED,
+  THR_V_PRED,     // V_PRED,
+  THR_H_PRED,     // H_PRED,
+  THR_D45_PRED,   // D45_PRED,
+  THR_D135_PRED,  // D135_PRED,
+  THR_D113_PRED,  // D113_PRED,
+  THR_D157_PRED,  // D157_PRED,
+  THR_D203_PRED,  // D203_PRED,
+  THR_D67_PRED,   // D67_PRED,
+  THR_SMOOTH,     // SMOOTH_PRED,
+  THR_SMOOTH_V,   // SMOOTH_V_PRED,
+  THR_SMOOTH_H,   // SMOOTH_H_PRED,
+  THR_PAETH,      // PAETH_PRED,
+};
+
+/* clang-format off */
+static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
+    [REF_FRAMES] = {
+    // NEARESTMV,
+        { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3,
+        THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, },
+        // NEARMV,
+        { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3,
+        THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, },
+        // GLOBALMV,
+        { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3,
+        THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, },
+        // NEWMV,
+        { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3,
+        THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, },
+};
+/* clang-format on */
+
+/* clang-format off */
+static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
+    [REF_FRAMES] = {
+    // NEAREST_NEARESTMV,
+        {
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID,
+            THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3,
+            THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB,
+            THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAREST_NEARESTL2B,
+            THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAREST_NEARESTL3B,
+            THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAREST_NEARESTGB,
+            THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAREST_NEARESTBA, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+        },
+        // NEAR_NEARMV,
+        {
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID,
+            THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3,
+            THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB,
+            THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAR_NEARL2B,
+            THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAR_NEARL3B,
+            THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAR_NEARGB,
+            THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAR_NEARBA, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+        },
+        // NEAREST_NEWMV,
+        {
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID,
+            THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3,
+            THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB,
+            THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAREST_NEWL2B,
+            THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAREST_NEWL3B,
+            THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAREST_NEWGB,
+            THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAREST_NEWBA, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+        },
+        // NEW_NEARESTMV,
+        {
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID,
+            THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3,
+            THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB,
+            THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEARESTL2B,
+            THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEARESTL3B,
+            THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEARESTGB,
+            THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEARESTBA, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+        },
+        // NEAR_NEWMV,
+        {
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID,
+            THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3,
+            THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB,
+            THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAR_NEWL2B,
+            THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAR_NEWL3B,
+            THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAR_NEWGB,
+            THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEAR_NEWBA, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+        },
+        // NEW_NEARMV,
+        {
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID,
+            THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3,
+            THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB,
+            THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEARL2B,
+            THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEARL3B,
+            THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEARGB,
+            THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEARBA, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+        },
+        // GLOBAL_GLOBALMV,
+        {
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID,
+            THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3,
+            THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB,
+            THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B,
+            THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B,
+            THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_GLOBAL_GLOBALGB,
+            THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+        },
+        // NEW_NEWMV,
+        {
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID,
+            THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3,
+            THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB,
+            THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEWL2B,
+            THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEWL3B,
+            THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEWGB,
+            THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, },
+            { THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_COMP_NEW_NEWBA, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+            { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+            THR_INVALID, THR_INVALID, THR_INVALID, },
+        },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
diff --git a/libaom/av1/encoder/rdopt_utils.h b/libaom/av1/encoder/rdopt_utils.h
new file mode 100644
index 0000000..53b410a
--- /dev/null
+++ b/libaom/av1/encoder/rdopt_utils.h
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_RDOPT_UTILS_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/common/cfl.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/rdopt_data_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_REF_MV_SEARCH 3
+#define INTER_INTRA_RD_THRESH_SCALE 9
+#define INTER_INTRA_RD_THRESH_SHIFT 4
+
+typedef struct {
+  PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// This array defines the mapping from the enums in THR_MODES to the actual
+// prediction modes and refrence frames
+static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
+  { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+  { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { NEWMV, { LAST_FRAME, NONE_FRAME } },
+  { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { NEARMV, { LAST_FRAME, NONE_FRAME } },
+  { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+  { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  // TODO(zoeliu): May need to reconsider the order on the modes to check
+
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+  { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+  { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
+
+  { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+  // intra modes
+  { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+};
+
+static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
+                                       const int num_planes) {
+  for (int i = 0; i < num_planes; i++) {
+    xd->plane[i].dst.buf = dst.plane[i];
+    xd->plane[i].dst.stride = dst.stride[i];
+  }
+}
+
+/* clang-format on */
+// Calculate rd threshold based on ref best rd and relevant scaling factors
+static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd,
+                                                     int mul_factor,
+                                                     int div_factor) {
+  int64_t rd_thresh = ref_best_rd;
+  if (div_factor != 0) {
+    rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor))
+                    ? ((ref_best_rd / div_factor) * mul_factor)
+                    : INT64_MAX;
+  }
+  return rd_thresh;
+}
+
+static AOM_INLINE THR_MODES
+get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame,
+                        MV_REFERENCE_FRAME second_ref_frame) {
+  if (this_mode < INTRA_MODE_END) {
+    assert(ref_frame == INTRA_FRAME);
+    assert(second_ref_frame == NONE_FRAME);
+    return intra_to_mode_idx[this_mode - INTRA_MODE_START];
+  }
+  if (this_mode >= SINGLE_INTER_MODE_START &&
+      this_mode < SINGLE_INTER_MODE_END) {
+    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+    return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
+                                   [ref_frame];
+  }
+  if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) {
+    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+    assert((second_ref_frame > INTRA_FRAME) &&
+           (second_ref_frame <= ALTREF_FRAME));
+    return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame]
+                                 [second_ref_frame];
+  }
+  assert(0);
+  return THR_INVALID;
+}
+
+static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+      bsize == BLOCK_4X16 || bsize == BLOCK_16X4) {
+    return -1;
+  }
+  return 1;
+}
+
+// Get transform block visible dimensions cropped to the MI units.
+static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
+                                          BLOCK_SIZE plane_bsize, int blk_row,
+                                          int blk_col, BLOCK_SIZE tx_bsize,
+                                          int *width, int *height,
+                                          int *visible_width,
+                                          int *visible_height) {
+  assert(tx_bsize <= plane_bsize);
+  const int txb_height = block_size_high[tx_bsize];
+  const int txb_width = block_size_wide[tx_bsize];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  // TODO(aconverse@google.com): Investigate using crop_width/height here rather
+  // than the MI size
+  if (xd->mb_to_bottom_edge >= 0) {
+    *visible_height = txb_height;
+  } else {
+    const int block_height = block_size_high[plane_bsize];
+    const int block_rows =
+        (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
+    *visible_height =
+        clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, txb_height);
+  }
+  if (height) *height = txb_height;
+
+  if (xd->mb_to_right_edge >= 0) {
+    *visible_width = txb_width;
+  } else {
+    const int block_width = block_size_wide[plane_bsize];
+    const int block_cols =
+        (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
+    *visible_width =
+        clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, txb_width);
+  }
+  if (width) *width = txb_width;
+}
+
+static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
+  int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2);
+  return num_blk;
+}
+
+static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                  int64_t best_skip_rd, int64_t skip_rd,
+                                  int level, int is_luma_only) {
+  int eval_txfm = 1;
+  // Derive aggressiveness factor for gating the transform search
+  // Lower value indicates more aggressiveness. Be more conservative (high
+  // value) for (i) low quantizers (ii) regions where prediction is poor
+  const int scale[5] = { INT_MAX, 4, 3, 3, 2 };
+  const int qslope = 2 * (!is_luma_only);
+  int aggr_factor = 1;
+  if (!is_luma_only) {
+    aggr_factor = AOMMAX(
+        1, ((MAXQ - x->qindex) * qslope + QINDEX_RANGE / 2) >> QINDEX_BITS);
+  }
+  if (best_skip_rd >
+      (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS)))
+    aggr_factor *= scale[level];
+  // For level setting 1, be more conservative for luma only case even when
+  // prediction is good
+  else if ((level <= 1) && !is_luma_only)
+    aggr_factor *= 2;
+
+  // Be more conservative for luma only cases (called from compound type rd)
+  // since best_skip_rd is computed after and skip_rd is computed (with 8-bit
+  // prediction signals blended for WEDGE/DIFFWTD rather than 16-bit) before
+  // interpolation filter search
+  const int luma_mul[5] = { INT_MAX, 32, 29, 20, 17 };
+  int mul_factor = is_luma_only ? luma_mul[level] : 16;
+  int64_t rd_thresh =
+      (best_skip_rd == INT64_MAX)
+          ? best_skip_rd
+          : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 4);
+  if (skip_rd > rd_thresh) eval_txfm = 0;
+  return eval_txfm;
+}
+
+static TX_MODE select_tx_mode(
+    const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) {
+  if (cm->features.coded_lossless) return ONLY_4X4;
+  if (tx_size_search_method == USE_LARGESTALL) {
+    return TX_MODE_LARGEST;
+  } else {
+    assert(tx_size_search_method == USE_FULL_RD ||
+           tx_size_search_method == USE_FAST_RD);
+    return TX_MODE_SELECT;
+  }
+}
+// Checks the conditions to enable winner mode processing
+static INLINE int is_winner_mode_processing_enabled(
+    const struct AV1_COMP *cpi, MB_MODE_INFO *const mbmi,
+    const PREDICTION_MODE best_mode) {
+  const SPEED_FEATURES *sf = &cpi->sf;
+
+  // TODO(any): Move block independent condition checks to frame level
+  if (is_inter_block(mbmi)) {
+    if (is_inter_mode(best_mode) &&
+        sf->tx_sf.tx_type_search.fast_inter_tx_type_search &&
+        !cpi->oxcf.use_inter_dct_only)
+      return 1;
+  } else {
+    if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search &&
+        !cpi->oxcf.use_intra_default_tx_only && !cpi->oxcf.use_intra_dct_only)
+      return 1;
+  }
+
+  // Check speed feature related to winner mode processing
+  if (sf->winner_mode_sf.enable_winner_mode_for_coeff_opt &&
+      cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT &&
+      cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT)
+    return 1;
+  if (sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch) return 1;
+
+  return 0;
+}
+
+static INLINE void set_tx_size_search_method(
+    const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params,
+    MACROBLOCK *x, int enable_winner_mode_for_tx_size_srch,
+    int is_winner_mode) {
+  // Populate transform size search method/transform mode appropriately
+  x->tx_size_search_method =
+      winner_mode_params->tx_size_search_methods[DEFAULT_EVAL];
+  if (enable_winner_mode_for_tx_size_srch) {
+    if (is_winner_mode)
+      x->tx_size_search_method =
+          winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL];
+    else
+      x->tx_size_search_method =
+          winner_mode_params->tx_size_search_methods[MODE_EVAL];
+  }
+  x->tx_mode_search_type = select_tx_mode(cm, x->tx_size_search_method);
+}
+
+static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, MACROBLOCK *x,
+                                     int enable_winner_mode_tx_type_pruning,
+                                     int is_winner_mode) {
+  // Populate prune transform mode appropriately
+  x->prune_mode = sf->tx_sf.tx_type_search.prune_mode;
+  if (enable_winner_mode_tx_type_pruning) {
+    if (is_winner_mode)
+      x->prune_mode = NO_PRUNE;
+    else
+      x->prune_mode = PRUNE_2D_AGGRESSIVE;
+  }
+}
+
+static INLINE void set_tx_domain_dist_params(
+    const WinnerModeParams *winner_mode_params, MACROBLOCK *x,
+    int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) {
+  if (!enable_winner_mode_for_tx_domain_dist) {
+    x->use_transform_domain_distortion =
+        winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL];
+    x->tx_domain_dist_threshold =
+        winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL];
+    return;
+  }
+
+  if (is_winner_mode) {
+    x->use_transform_domain_distortion =
+        winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL];
+    x->tx_domain_dist_threshold =
+        winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL];
+  } else {
+    x->use_transform_domain_distortion =
+        winner_mode_params->use_transform_domain_distortion[MODE_EVAL];
+    x->tx_domain_dist_threshold =
+        winner_mode_params->tx_domain_dist_threshold[MODE_EVAL];
+  }
+}
+
+// This function sets mode parameters for different mode evaluation stages
+static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi,
+                                        MACROBLOCK *x,
+                                        MODE_EVAL_TYPE mode_eval_type) {
+  const AV1_COMMON *cm = &cpi->common;
+  const SPEED_FEATURES *sf = &cpi->sf;
+  const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params;
+
+  switch (mode_eval_type) {
+    case DEFAULT_EVAL:
+      x->use_default_inter_tx_type = 0;
+      x->use_default_intra_tx_type = 0;
+      x->predict_skip_level =
+          winner_mode_params->predict_skip_level[DEFAULT_EVAL];
+      // Set default transform domain distortion type
+      set_tx_domain_dist_params(winner_mode_params, x, 0, 0);
+
+      // Get default threshold for R-D optimization of coefficients
+      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_dist_threshold, 0, 0);
+      // Set default transform size search method
+      set_tx_size_search_method(cm, winner_mode_params, x, 0, 0);
+      // Set default transform type prune
+      set_tx_type_prune(sf, x, 0, 0);
+      break;
+    case MODE_EVAL:
+      x->use_default_intra_tx_type =
+          (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search ||
+           cpi->oxcf.use_intra_default_tx_only);
+      x->use_default_inter_tx_type =
+          cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_search;
+      x->predict_skip_level = winner_mode_params->predict_skip_level[MODE_EVAL];
+
+      // Set transform domain distortion type for mode evaluation
+      set_tx_domain_dist_params(
+          winner_mode_params, x,
+          sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0);
+
+      // Get threshold for R-D optimization of coefficients during mode
+      // evaluation
+      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_dist_threshold,
+          sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+      // Set the transform size search method for mode evaluation
+      set_tx_size_search_method(
+          cm, winner_mode_params, x,
+          sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0);
+      // Set transform type prune for mode evaluation
+      set_tx_type_prune(
+          sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning,
+          0);
+      break;
+    case WINNER_MODE_EVAL:
+      x->use_default_inter_tx_type = 0;
+      x->use_default_intra_tx_type = 0;
+      x->predict_skip_level =
+          winner_mode_params->predict_skip_level[WINNER_MODE_EVAL];
+
+      // Set transform domain distortion type for winner mode evaluation
+      set_tx_domain_dist_params(
+          winner_mode_params, x,
+          sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1);
+
+      // Get threshold for R-D optimization of coefficients for winner mode
+      // evaluation
+      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_dist_threshold,
+          sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+      // Set the transform size search method for winner mode evaluation
+      set_tx_size_search_method(
+          cm, winner_mode_params, x,
+          sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+      // Set default transform type prune mode for winner mode evaluation
+      set_tx_type_prune(
+          sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning,
+          1);
+
+      // Reset hash state for winner mode processing. Winner mode and subsequent
+    // transform/mode evaluations (palette/IntraBC) can't reuse old data as
+      // the decisions would have been sub-optimal
+      // TODO(any): Move the evaluation of palette/IntraBC modes before winner
+      // mode is processed and clean-up the code below
+      reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash);
+
+      break;
+    default: assert(0);
+  }
+}
+
+// Similar to store_cfl_required(), but for use during the RDO process,
+// where we haven't yet determined whether this block uses CfL.
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+                                                      const MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (cm->seq_params.monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED;
+
+  if (!xd->is_chroma_ref) {
+    // For non-chroma-reference blocks, we should always store the luma pixels,
+    // in case the corresponding chroma-reference block uses CfL.
+    // Note that this can only happen for block sizes which are <8 on
+    // their shortest side, as otherwise they would be chroma reference
+    // blocks.
+    return CFL_ALLOWED;
+  }
+
+  // For chroma reference blocks, we should store data in the encoder iff we're
+  // allowed to try out CfL.
+  return is_cfl_allowed(xd);
+}
+
+static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+}
+
+// Store best mode stats for winner mode processing
+static INLINE void store_winner_mode_stats(
+    const AV1_COMMON *const cm, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv,
+    THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd,
+    int enable_multiwinner_mode_process, int txfm_search_done) {
+  WinnerModeStats *winner_mode_stats = x->winner_mode_stats;
+  int mode_idx = 0;
+  int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0;
+  // Mode stat is not required when multiwinner mode processing is disabled
+  if (!enable_multiwinner_mode_process) return;
+  // Ignore mode with maximum rd
+  if (this_rd == INT64_MAX) return;
+  // TODO(any): Winner mode processing is currently not applicable for palette
+  // mode in Inter frames. Clean-up the following code, once support is added
+  if (!frame_is_intra_only(cm) && is_palette_mode) return;
+
+  const int max_winner_mode_count = frame_is_intra_only(cm)
+                                        ? MAX_WINNER_MODE_COUNT_INTRA
+                                        : MAX_WINNER_MODE_COUNT_INTER;
+  assert(x->winner_mode_count >= 0 &&
+         x->winner_mode_count <= max_winner_mode_count);
+
+  if (x->winner_mode_count) {
+    // Find the mode which has higher rd cost than this_rd
+    for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++)
+      if (winner_mode_stats[mode_idx].rd > this_rd) break;
+
+    if (mode_idx == max_winner_mode_count) {
+      // No mode has higher rd cost than this_rd
+      return;
+    } else if (mode_idx < max_winner_mode_count - 1) {
+      // Create a slot for current mode and move others to the next slot
+      memmove(
+          &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx],
+          (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats));
+    }
+  }
+  // Add a mode stat for winner mode processing
+  winner_mode_stats[mode_idx].mbmi = *mbmi;
+  winner_mode_stats[mode_idx].rd = this_rd;
+  winner_mode_stats[mode_idx].mode_index = mode_index;
+
+  // Update rd stats required for inter frame
+  if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) {
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int skip_ctx = av1_get_skip_context(xd);
+    const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END;
+    const int skip = mbmi->skip && !is_intra_mode;
+
+    winner_mode_stats[mode_idx].rd_cost = *rd_cost;
+    if (txfm_search_done) {
+      winner_mode_stats[mode_idx].rate_y =
+          rd_cost_y->rate + x->skip_cost[skip_ctx][rd_cost->skip || skip];
+      winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate;
+    }
+  }
+
+  if (color_map) {
+    // Store color_index_map for palette mode
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    int block_width, block_height;
+    av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+                             &block_height, NULL, NULL);
+    memcpy(winner_mode_stats[mode_idx].color_index_map, color_map,
+           block_width * block_height * sizeof(color_map[0]));
+  }
+
+  x->winner_mode_count =
+      AOMMIN(x->winner_mode_count + 1, max_winner_mode_count);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RDOPT_UTILS_H_
diff --git a/libaom/av1/encoder/reconinter_enc.c b/libaom/av1/encoder/reconinter_enc.c
index 4b477ce..231b020 100644
--- a/libaom/av1/encoder/reconinter_enc.c
+++ b/libaom/av1/encoder/reconinter_enc.c
@@ -20,404 +20,201 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/obmc.h"
 #include "av1/encoder/reconinter_enc.h"
 
-static INLINE void calc_subpel_params(
-    MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
-    int plane, const int pre_x, const int pre_y, int x, int y,
-    struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
-    int bw, int bh) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int is_scaled = av1_is_scaled(sf);
-  if (is_scaled) {
-    int ssx = pd->subsampling_x;
-    int ssy = pd->subsampling_y;
-    int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
-    orig_pos_y += mv.row * (1 << (1 - ssy));
-    int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
-    orig_pos_x += mv.col * (1 << (1 - ssx));
-    int pos_y = sf->scale_value_y(orig_pos_y, sf);
-    int pos_x = sf->scale_value_x(orig_pos_x, sf);
-    pos_x += SCALE_EXTRA_OFF;
-    pos_y += SCALE_EXTRA_OFF;
+static void enc_calc_subpel_params(const MV *const src_mv,
+                                   InterPredParams *const inter_pred_params,
+                                   MACROBLOCKD *xd, int mi_x, int mi_y, int ref,
+                                   uint8_t **pre, SubpelParams *subpel_params,
+                                   int *src_stride) {
+  // These are part of the function signature to use this function through a
+  // function pointer. See typedef of 'CalcSubpelParamsFunc'.
+  (void)xd;
+  (void)mi_x;
+  (void)mi_y;
+  (void)ref;
 
-    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                       << SCALE_SUBPEL_BITS;
-    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
-    pos_y = clamp(pos_y, top, bottom);
-    pos_x = clamp(pos_x, left, right);
+  const struct scale_factors *sf = inter_pred_params->scale_factors;
 
-    *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-           (pos_x >> SCALE_SUBPEL_BITS);
-    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
-    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
-    subpel_params->xs = sf->x_step_q4;
-    subpel_params->ys = sf->y_step_q4;
-  } else {
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(
-        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
-    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
-    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-    *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
-           (x + (mv_q4.col >> SUBPEL_BITS));
-  }
+  struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+  int ssx = inter_pred_params->subsampling_x;
+  int ssy = inter_pred_params->subsampling_y;
+  int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+  orig_pos_y += src_mv->row * (1 << (1 - ssy));
+  int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+  orig_pos_x += src_mv->col * (1 << (1 - ssx));
+  int pos_y = sf->scale_value_y(orig_pos_y, sf);
+  int pos_x = sf->scale_value_x(orig_pos_x, sf);
+  pos_x += SCALE_EXTRA_OFF;
+  pos_y += SCALE_EXTRA_OFF;
+
+  const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+  const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+  const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+  const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+  pos_y = clamp(pos_y, top, bottom);
+  pos_x = clamp(pos_x, left, right);
+
+  subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+  subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+  subpel_params->xs = sf->x_step_q4;
+  subpel_params->ys = sf->y_step_q4;
+  *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+         (pos_x >> SCALE_SUBPEL_BITS);
+  *src_stride = pre_buf->stride;
 }
 
-static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          int plane, const MB_MODE_INFO *mi,
-                                          int build_for_obmc, int bw, int bh,
-                                          int mi_x, int mi_y) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  int is_compound = has_second_ref(mi);
-  int ref;
-  const int is_intrabc = is_intrabc_block(mi);
-  assert(IMPLIES(is_intrabc, !is_compound));
-  int is_global[2] = { 0, 0 };
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
-  }
-
-  const BLOCK_SIZE bsize = mi->sb_type;
-  const int ss_x = pd->subsampling_x;
-  const int ss_y = pd->subsampling_y;
-  int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
-                     (block_size_high[bsize] < 8 && ss_y);
-
-  if (is_intrabc) sub8x8_inter = 0;
-
-  // For sub8x8 chroma blocks, we may be covering more than one luma block's
-  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
-  // the top-left corner of the prediction source - the correct top-left corner
-  // is at (pre_x, pre_y).
-  const int row_start =
-      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
-  const int col_start =
-      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
-  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
-  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
-  sub8x8_inter = sub8x8_inter && !build_for_obmc;
-  if (sub8x8_inter) {
-    for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
-      for (int col = col_start; col <= 0; ++col) {
-        const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-        if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
-        if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
-      }
-    }
-  }
-
-  if (sub8x8_inter) {
-    // block size
-    const int b4_w = block_size_wide[bsize] >> ss_x;
-    const int b4_h = block_size_high[bsize] >> ss_y;
-    const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
-    const int b8_w = block_size_wide[plane_bsize] >> ss_x;
-    const int b8_h = block_size_high[plane_bsize] >> ss_y;
-    assert(!is_compound);
-
-    const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
-
-    int row = row_start;
-    for (int y = 0; y < b8_h; y += b4_h) {
-      int col = col_start;
-      for (int x = 0; x < b8_w; x += b4_w) {
-        MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-        is_compound = has_second_ref(this_mbmi);
-        int tmp_dst_stride = 8;
-        assert(bw < 8 || bh < 8);
-        ConvolveParams conv_params = get_conv_params_no_round(
-            0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
-        conv_params.use_dist_wtd_comp_avg = 0;
-        struct buf_2d *const dst_buf = &pd->dst;
-        uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
-
-        ref = 0;
-        const RefCntBuffer *ref_buf =
-            get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
-        const struct scale_factors *ref_scale_factors =
-            get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
-
-        pd->pre[ref].buf0 =
-            (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer;
-        pd->pre[ref].buf =
-            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
-                                                     ref_buf->buf.uv_stride,
-                                                     ref_scale_factors);
-        pd->pre[ref].width = ref_buf->buf.uv_crop_width;
-        pd->pre[ref].height = ref_buf->buf.uv_crop_height;
-        pd->pre[ref].stride = ref_buf->buf.uv_stride;
-
-        const struct scale_factors *const sf =
-            is_intrabc ? &cm->sf_identity : ref_scale_factors;
-        struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-
-        const MV mv = this_mbmi->mv[ref].as_mv;
-
-        uint8_t *pre;
-        SubpelParams subpel_params;
-        WarpTypesAllowed warp_types;
-        warp_types.global_warp_allowed = is_global[ref];
-        warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
-
-        calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
-                           &subpel_params, bw, bh);
-        conv_params.do_average = ref;
-        if (is_masked_compound_type(mi->interinter_comp.type)) {
-          // masked compound type has its own average mechanism
-          conv_params.do_average = 0;
-        }
-
-        av1_make_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
-            b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
-            (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
-            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
-
-        ++col;
-      }
-      ++row;
-    }
-
-    for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
-    return;
-  }
-
-  {
-    ConvolveParams conv_params = get_conv_params_no_round(
-        0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
-    av1_dist_wtd_comp_weight_assign(
-        cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset,
-        &conv_params.use_dist_wtd_comp_avg, is_compound);
-
-    struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf;
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf =
-          is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
-      struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-      const MV mv = mi->mv[ref].as_mv;
-
-      uint8_t *pre;
-      SubpelParams subpel_params;
-      calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
-                         &subpel_params, bw, bh);
-
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global[ref];
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
-        // masked compound type has its own average mechanism
-        conv_params.do_average = 0;
-        av1_make_masked_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
-            bh, &conv_params, mi->interp_filters, plane, &warp_types,
-            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
-            cm->allow_warped_motion);
-      } else {
-        conv_params.do_average = ref;
-        av1_make_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
-            bh, &conv_params, mi->interp_filters, &warp_types,
-            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
-            mi, build_for_obmc, xd, cm->allow_warped_motion);
-      }
-    }
-  }
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+                                       const MV *src_mv,
+                                       InterPredParams *inter_pred_params) {
+  av1_build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params,
+                                NULL /* xd */, 0 /* mi_x */, 0 /* mi_y */,
+                                0 /* ref */, enc_calc_subpel_params);
 }
 
-static void build_inter_predictors_for_plane(const AV1_COMMON *cm,
-                                             MACROBLOCKD *xd, int mi_row,
-                                             int mi_col, const BUFFER_SET *ctx,
-                                             BLOCK_SIZE bsize, int plane_idx) {
-  const struct macroblockd_plane *pd = &xd->plane[plane_idx];
-  if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                           pd->subsampling_y))
-    return;
+static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                       int plane, const MB_MODE_INFO *mi,
+                                       int bw, int bh, int mi_x, int mi_y) {
+  av1_build_inter_predictors(cm, xd, plane, mi, 0 /* build_for_obmc */, bw, bh,
+                             mi_x, mi_y, enc_calc_subpel_params);
+}
 
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
   const int mi_x = mi_col * MI_SIZE;
   const int mi_y = mi_row * MI_SIZE;
-  build_inter_predictors(cm, xd, plane_idx, xd->mi[0], 0, pd->width, pd->height,
-                         mi_x, mi_y);
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  InterPredParams inter_pred_params;
 
-  if (is_interintra_pred(xd->mi[0])) {
-    BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
-    if (!ctx) {
-      default_ctx.plane[plane_idx] = xd->plane[plane_idx].dst.buf;
-      default_ctx.stride[plane_idx] = xd->plane[plane_idx].dst.stride;
-      ctx = &default_ctx;
-    }
-    av1_build_interintra_predictors_sbp(cm, xd, xd->plane[plane_idx].dst.buf,
-                                        xd->plane[plane_idx].dst.stride, ctx,
-                                        plane_idx, bsize);
-  }
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf;
+  const MV mv = xd->mi[0]->mv[0].as_mv;
+  const struct scale_factors *const sf = xd->block_ref_scale_factors[0];
+
+  av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x,
+                        pd->subsampling_x, pd->subsampling_y, xd->bd,
+                        is_cur_buf_hbd(xd), false, sf, pd->pre,
+                        xd->mi[0]->interp_filters);
+
+  inter_pred_params.conv_params = get_conv_params_no_round(
+      0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd);
+
+  inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0;
+  av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv,
+                                    &inter_pred_params);
 }
 
 void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                    int mi_row, int mi_col,
                                    const BUFFER_SET *ctx, BLOCK_SIZE bsize,
                                    int plane_from, int plane_to) {
-  for (int plane_idx = plane_from; plane_idx <= plane_to; ++plane_idx) {
-    build_inter_predictors_for_plane(cm, xd, mi_row, mi_col, ctx, bsize,
-                                     plane_idx);
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    const int mi_x = mi_col * MI_SIZE;
+    const int mi_y = mi_row * MI_SIZE;
+    enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width,
+                               xd->plane[plane].height, mi_x, mi_y);
+
+    if (is_interintra_pred(xd->mi[0])) {
+      BUFFER_SET default_ctx = {
+        { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
+        { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+          xd->plane[2].dst.stride }
+      };
+      if (!ctx) {
+        ctx = &default_ctx;
+      }
+      av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf,
+                                     xd->plane[plane].dst.stride, ctx, plane,
+                                     bsize);
+    }
   }
 }
 
-// TODO(sarahparker):
-// av1_build_inter_predictor should be combined with
-// av1_make_inter_predictor
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, const MV *src_mv,
-                               const struct scale_factors *sf, int w, int h,
-                               ConvolveParams *conv_params,
-                               InterpFilters interp_filters,
-                               const WarpTypesAllowed *warp_types, int p_col,
-                               int p_row, int plane, int ref,
-                               mv_precision precision, int x, int y,
-                               const MACROBLOCKD *xd, int can_use_previous) {
-  const int is_q4 = precision == MV_PRECISION_Q4;
-  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
-                     is_q4 ? src_mv->col : src_mv->col * 2 };
-  MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
-  mv.col += SCALE_EXTRA_OFF;
-  mv.row += SCALE_EXTRA_OFF;
-
-  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                       mv.col & SCALE_SUBPEL_MASK,
-                                       mv.row & SCALE_SUBPEL_MASK };
-  src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
-         (mv.col >> SCALE_SUBPEL_BITS);
-
-  av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
-                           w, h, conv_params, interp_filters, warp_types, p_col,
-                           p_row, plane, ref, xd->mi[0], 0, xd,
-                           can_use_previous);
-}
-
-static INLINE void build_prediction_by_above_pred(
-    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
-    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row,
+                                         int rel_mi_col, uint8_t op_mi_size,
+                                         int dir, MB_MODE_INFO *above_mbmi,
+                                         void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  const int above_mi_col = ctxt->mi_col + rel_mi_col;
-  int mi_x, mi_y;
-  MB_MODE_INFO backup_mbmi = *above_mbmi;
+  av1_setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt,
+                             num_planes);
 
-  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
-                                           &backup_mbmi, ctxt, num_planes);
-  mi_x = above_mi_col << MI_SIZE_LOG2;
-  mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+  const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2;
+  const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2;
 
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
 
+  InterPredParams inter_pred_params;
+
   for (int j = 0; j < num_planes; ++j) {
     const struct macroblockd_plane *pd = &xd->plane[j];
-    int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
-    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
-                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+    int bw = 0, bh = 0;
 
-    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
-    build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
-                           mi_y);
+    if (dir) {
+      // prepare left reference block size
+      bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+                 block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+      bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
+    } else {
+      // prepare above reference block size
+      bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
+      bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+                 block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+    }
+
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue;
+
+    const struct buf_2d *const pre_buf = &pd->pre[0];
+    const MV mv = above_mbmi->mv[0].as_mv;
+
+    av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+                          mi_x >> pd->subsampling_x, pd->subsampling_x,
+                          pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
+                          xd->block_ref_scale_factors[0], pre_buf,
+                          above_mbmi->interp_filters);
+    inter_pred_params.conv_params = get_conv_params(0, j, xd->bd);
+
+    av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv,
+                                      &inter_pred_params);
   }
 }
 
 void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
                                          uint8_t *tmp_buf[MAX_MB_PLANE],
                                          int tmp_width[MAX_MB_PLANE],
                                          int tmp_height[MAX_MB_PLANE],
                                          int tmp_stride[MAX_MB_PLANE]) {
   if (!xd->up_available) return;
-
-  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
-  // prediction block. This is half the height of the original block,
-  // except for 128-wide blocks, where we only use a height of 32.
-  int this_height = xd->n4_h * MI_SIZE;
-  int pred_height = AOMMIN(this_height / 2, 32);
-  xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
-
-  struct build_prediction_ctxt ctxt = { cm,         mi_row,
-                                        mi_col,     tmp_buf,
+  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
                                         tmp_width,  tmp_height,
                                         tmp_stride, xd->mb_to_right_edge };
   BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  foreach_overlappable_nb_above(cm, xd, mi_col,
+  foreach_overlappable_nb_above(cm, xd,
                                 max_neighbor_obmc[mi_size_wide_log2[bsize]],
-                                build_prediction_by_above_pred, &ctxt);
-
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
-  xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
-}
-
-static INLINE void build_prediction_by_left_pred(
-    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
-    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
-  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  const int left_mi_row = ctxt->mi_row + rel_mi_row;
-  int mi_x, mi_y;
-  MB_MODE_INFO backup_mbmi = *left_mbmi;
-
-  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
-                                          &backup_mbmi, ctxt, num_planes);
-  mi_x = ctxt->mi_col << MI_SIZE_LOG2;
-  mi_y = left_mi_row << MI_SIZE_LOG2;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
-  for (int j = 0; j < num_planes; ++j) {
-    const struct macroblockd_plane *pd = &xd->plane[j];
-    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
-                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
-    int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
-    build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
-                           mi_y);
-  }
+                                build_obmc_prediction, &ctxt);
 }
 
 void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col,
                                         uint8_t *tmp_buf[MAX_MB_PLANE],
                                         int tmp_width[MAX_MB_PLANE],
                                         int tmp_height[MAX_MB_PLANE],
                                         int tmp_stride[MAX_MB_PLANE]) {
   if (!xd->left_available) return;
-
-  // Adjust mb_to_right_edge to have the correct value for the OBMC
-  // prediction block. This is half the width of the original block,
-  // except for 128-wide blocks, where we only use a width of 32.
-  int this_width = xd->n4_w * MI_SIZE;
-  int pred_width = AOMMIN(this_width / 2, 32);
-  xd->mb_to_right_edge += (this_width - pred_width) * 8;
-
-  struct build_prediction_ctxt ctxt = { cm,         mi_row,
-                                        mi_col,     tmp_buf,
+  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
                                         tmp_width,  tmp_height,
                                         tmp_stride, xd->mb_to_bottom_edge };
   BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  foreach_overlappable_nb_left(cm, xd, mi_row,
+  foreach_overlappable_nb_left(cm, xd,
                                max_neighbor_obmc[mi_size_high_log2[bsize]],
-                               build_prediction_by_left_pred, &ctxt);
-
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_right_edge -= (this_width - pred_width) * 8;
-  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+                               build_obmc_prediction, &ctxt);
 }
 
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col) {
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) {
   const int num_planes = av1_num_planes(cm);
   uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
   int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -447,65 +244,55 @@
     dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
     dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
   }
-  av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
-                                      dst_width1, dst_height1, dst_stride1);
-  av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
-                                     dst_width2, dst_height2, dst_stride2);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1,
+                                      dst_stride1);
+  av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
+                                     dst_stride2);
   av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf,
                        mi_row, mi_col, 0, num_planes);
-  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
-                                  dst_buf2, dst_stride2);
+  av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
+                                  dst_stride2);
 }
 
-// Builds the inter-predictor for the single ref case
-// for use in the encoder to search the wedges efficiently.
-static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
-                                              int bw, int bh, int x, int y,
-                                              int w, int h, int mi_x, int mi_y,
-                                              int ref, uint8_t *const ext_dst,
-                                              int ext_dst_stride,
-                                              int can_use_previous) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
+void av1_build_inter_predictors_for_planes_single_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+    uint8_t *ext_dst[3], int ext_dst_stride[3]) {
+  assert(bsize < BLOCK_SIZES_ALL);
   const MB_MODE_INFO *mi = xd->mi[0];
-
-  const struct scale_factors *const sf = xd->block_ref_scale_factors[ref];
-  struct buf_2d *const pre_buf = &pd->pre[ref];
-  uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
-  const MV mv = mi->mv[ref].as_mv;
-
-  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
   WarpTypesAllowed warp_types;
   const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
   warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
   warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-  const int pre_x = (mi_x) >> pd->subsampling_x;
-  const int pre_y = (mi_y) >> pd->subsampling_y;
-  uint8_t *pre;
-  SubpelParams subpel_params;
-  calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
-                     &subpel_params, bw, bh);
 
-  av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
-                           &subpel_params, sf, w, h, &conv_params,
-                           mi->interp_filters, &warp_types, pre_x + x,
-                           pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
-}
-
-void av1_build_inter_predictors_for_planes_single_buf(
-    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
-    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
-    int can_use_previous) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(
-        bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
     const int bw = block_size_wide[plane_bsize];
     const int bh = block_size_high[plane_bsize];
-    build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
-                                      mi_y, ref, ext_dst[plane],
-                                      ext_dst_stride[plane], can_use_previous);
+
+    InterPredParams inter_pred_params;
+
+    av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+                          mi_x >> pd->subsampling_x, pd->subsampling_x,
+                          pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
+                          xd->block_ref_scale_factors[ref], &pd->pre[ref],
+                          mi->interp_filters);
+    inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+    av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+    uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]);
+    const MV mv = mi->mv[ref].as_mv;
+
+    av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv,
+                                      &inter_pred_params);
   }
 }
 
@@ -523,6 +310,7 @@
                      mask, block_size_wide[sb_type], w, h, subw, subh);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void build_masked_compound_highbd(
     uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
     const uint8_t *src1_8, int src1_stride,
@@ -539,6 +327,7 @@
                             src1_stride, mask, block_size_wide[sb_type], w, h,
                             subw, subh, bd);
 }
+#endif
 
 static void build_wedge_inter_predictor_from_buf(
     MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
@@ -565,7 +354,7 @@
             ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
       }
     }
-
+#if CONFIG_AV1_HIGHBITDEPTH
     if (is_hbd) {
       build_masked_compound_highbd(
           dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
@@ -576,7 +365,13 @@
                             ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
                             h, w);
     }
+#else
+    build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+                          ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+                          h, w);
+#endif
   } else {
+#if CONFIG_AV1_HIGHBITDEPTH
     if (is_hbd) {
       aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
                                dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
@@ -585,6 +380,10 @@
       aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
                         0, NULL, 0, w, h);
     }
+#else
+    aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, 0,
+                      NULL, 0, w, h);
+#endif
   }
 }
 
@@ -595,6 +394,7 @@
                                               uint8_t *ext_dst1[3],
                                               int ext_dst_stride1[3]) {
   int plane;
+  assert(bsize < BLOCK_SIZES_ALL);
   for (plane = plane_from; plane <= plane_to; ++plane) {
     const BLOCK_SIZE plane_bsize = get_plane_block_size(
         bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
diff --git a/libaom/av1/encoder/reconinter_enc.h b/libaom/av1/encoder/reconinter_enc.h
index 5687168..fdc1f31 100644
--- a/libaom/av1/encoder/reconinter_enc.h
+++ b/libaom/av1/encoder/reconinter_enc.h
@@ -13,94 +13,50 @@
 #define AOM_AV1_ENCODER_RECONINTER_ENC_H_
 
 #include "aom/aom_integer.h"
-#include "av1/common/filter.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
 #include "av1/common/warped_motion.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+// Build single or compound reference inter predictors for all planes.
+// Can build inter-intra predictors, masked predictors etc as well.
 void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                    int mi_row, int mi_col,
                                    const BUFFER_SET *ctx, BLOCK_SIZE bsize,
                                    int plane_from, int plane_to);
 
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, const MV *src_mv,
-                               const struct scale_factors *sf, int w, int h,
-                               ConvolveParams *conv_params,
-                               InterpFilters interp_filters,
-                               const WarpTypesAllowed *warp_types, int p_col,
-                               int p_row, int plane, int ref,
-                               mv_precision precision, int x, int y,
-                               const MACROBLOCKD *xd, int can_use_previous);
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col);
 
-// Detect if the block have sub-pixel level motion vectors
-// per component.
-#define CHECK_SUBPEL 0
-static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
-                                          const MACROBLOCKD *const xd,
-                                          int dir) {
-#if CHECK_SUBPEL
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int plane;
-  int ref = (dir >> 1);
-
-  if (dir & 0x01) {
-    if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
-  } else {
-    if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
-  }
-
-  return 0;
-#else
-  (void)mbmi;
-  (void)xd;
-  (void)dir;
-  return 1;
-#endif
-}
-
-static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
-  MB_MODE_INFO *const mi = xd->mi[0];
-  const int is_compound = has_second_ref(mi);
-  int ref;
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    int row_col;
-    for (row_col = 0; row_col < 2; ++row_col) {
-      const int dir = (ref << 1) + row_col;
-      if (has_subpel_mv_component(mi, xd, dir)) {
-        return 1;
-      }
-    }
-  }
-  return 0;
-}
+// Build one inter predictor. It is called for building predictor for single
+// reference case, or just the 1st or 2nd reference in compound reference case.
+// Can build both regular and masked predictors.
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+                                       const MV *src_mv,
+                                       InterPredParams *inter_pred_params);
 
 void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
                                          uint8_t *tmp_buf[MAX_MB_PLANE],
                                          int tmp_width[MAX_MB_PLANE],
                                          int tmp_height[MAX_MB_PLANE],
                                          int tmp_stride[MAX_MB_PLANE]);
 
 void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col,
                                         uint8_t *tmp_buf[MAX_MB_PLANE],
                                         int tmp_width[MAX_MB_PLANE],
                                         int tmp_height[MAX_MB_PLANE],
                                         int tmp_stride[MAX_MB_PLANE]);
 
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col);
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd);
 
 void av1_build_inter_predictors_for_planes_single_buf(
-    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
-    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
-    int can_use_previous);
+    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+    uint8_t *ext_dst[3], int ext_dst_stride[3]);
 
 void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                               int plane_from, int plane_to,
diff --git a/libaom/av1/encoder/segmentation.c b/libaom/av1/encoder/segmentation.c
index 6d0c654..0c029c0 100644
--- a/libaom/av1/encoder/segmentation.c
+++ b/libaom/av1/encoder/segmentation.c
@@ -49,16 +49,15 @@
                        unsigned (*temporal_predictor_count)[2],
                        unsigned *t_unpred_seg_counts, int bw, int bh,
                        int mi_row, int mi_col) {
-  int segment_id;
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
   xd->mi = mi;
-  segment_id = xd->mi[0]->segment_id;
-
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+                 mi_params->mi_cols);
 
   // Count the number of hits on each segment with no prediction
+  const int segment_id = xd->mi[0]->segment_id;
   no_pred_segcounts[segment_id]++;
 
   // Temporal prediction not allowed on key frames
@@ -67,7 +66,8 @@
     // Test to see if the segment id matches the predicted value.
     const int pred_segment_id =
         cm->last_frame_seg_map
-            ? get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col)
+            ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row,
+                             mi_col)
             : 0;
     const int pred_flag = pred_segment_id == segment_id;
     const int pred_context = av1_get_pred_context_seg_id(xd);
@@ -88,12 +88,13 @@
                           unsigned (*temporal_predictor_count)[2],
                           unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
                           BLOCK_SIZE bsize) {
-  const int mis = cm->mi_stride;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mis = mi_params->mi_stride;
   const int bs = mi_size_wide[bsize], hbs = bs / 2;
   PARTITION_TYPE partition;
   const int qbs = bs / 4;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
 #define CSEGS(cs_bw, cs_bh, cs_rowoff, cs_coloff)                              \
   count_segs(cm, xd, tile, mi + mis * (cs_rowoff) + (cs_coloff),               \
@@ -138,19 +139,20 @@
       CSEGS(bs, qbs, 0, 0);
       CSEGS(bs, qbs, qbs, 0);
       CSEGS(bs, qbs, 2 * qbs, 0);
-      if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
+      if (mi_row + 3 * qbs < mi_params->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
       break;
 
     case PARTITION_VERT_4:
       CSEGS(qbs, bs, 0, 0);
       CSEGS(qbs, bs, 0, qbs);
       CSEGS(qbs, bs, 0, 2 * qbs);
-      if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
+      if (mi_col + 3 * qbs < mi_params->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
       break;
 
     case PARTITION_SPLIT: {
       const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
       int n;
+      assert(subsize < BLOCK_SIZES_ALL);
 
       for (n = 0; n < 4; n++) {
         const int mi_dc = hbs * (n & 1);
@@ -177,26 +179,31 @@
   unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
   unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
   (void)xd;
-
+  int scale_up = cm->prev_frame && (cm->width > cm->prev_frame->width ||
+                                    cm->height > cm->prev_frame->height);
   // First of all generate stats regarding how well the last segment map
   // predicts this one
-  for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
-    TileInfo tile_info;
-    av1_tile_set_row(&tile_info, cm, tile_row);
-    for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
-      MB_MODE_INFO **mi_ptr;
-      av1_tile_set_col(&tile_info, cm, tile_col);
-      mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
-               tile_info.mi_col_start;
-      for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-           mi_row += cm->seq_params.mib_size,
-          mi_ptr += cm->seq_params.mib_size * cm->mi_stride) {
-        MB_MODE_INFO **mi = mi_ptr;
-        for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-             mi_col += cm->seq_params.mib_size, mi += cm->seq_params.mib_size) {
-          count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
-                        temporal_predictor_count, t_unpred_seg_counts, mi_row,
-                        mi_col, cm->seq_params.sb_size);
+  if (!scale_up) {
+    for (tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
+      TileInfo tile_info;
+      av1_tile_set_row(&tile_info, cm, tile_row);
+      for (tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
+        MB_MODE_INFO **mi_ptr;
+        av1_tile_set_col(&tile_info, cm, tile_col);
+        mi_ptr = cm->mi_params.mi_grid_base +
+                 tile_info.mi_row_start * cm->mi_params.mi_stride +
+                 tile_info.mi_col_start;
+        for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+             mi_row += cm->seq_params.mib_size,
+            mi_ptr += cm->seq_params.mib_size * cm->mi_params.mi_stride) {
+          MB_MODE_INFO **mi = mi_ptr;
+          for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+               mi_col += cm->seq_params.mib_size,
+              mi += cm->seq_params.mib_size) {
+            count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
+                          temporal_predictor_count, t_unpred_seg_counts, mi_row,
+                          mi_col, cm->seq_params.sb_size);
+          }
         }
       }
     }
@@ -209,7 +216,7 @@
     no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i];
 
   // Frames without past dependency cannot use temporal prediction
-  if (cm->primary_ref_frame != PRIMARY_REF_NONE) {
+  if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
     int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2];
     for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i)
       av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL);
@@ -226,7 +233,7 @@
 
   // Now choose which coding method to use.
   if (t_pred_cost < no_pred_cost) {
-    assert(!cm->error_resilient_mode);
+    assert(!cm->features.error_resilient_mode);
     seg->temporal_update = 1;
   } else {
     seg->temporal_update = 0;
diff --git a/libaom/av1/encoder/speed_features.c b/libaom/av1/encoder/speed_features.c
index 5dfc585..e03faec 100644
--- a/libaom/av1/encoder/speed_features.c
+++ b/libaom/av1/encoder/speed_features.c
@@ -11,6 +11,8 @@
 
 #include <limits.h>
 
+#include "av1/common/reconintra.h"
+
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/rdopt.h"
@@ -29,9 +31,6 @@
       { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
       { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
     };
-static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = {
-  50, 50, 25, 15, 5, 1
-};
 
 // TODO(huisu@google.com): These settings are pretty relaxed, tune them for
 // each speed setting
@@ -43,24 +42,68 @@
   { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
   { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
 };
-static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100,
-                                                            25,  25,  10 };
 
 // Threshold values to be used for pruning the txfm_domain_distortion
 // based on block MSE
+// Index 0: Default mode evaluation, Winner mode processing is not
+// applicable (Eg : IntraBc). Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation. Index 1 and 2 are applicable when
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON
 // TODO(any): Experiment the threshold logic based on variance metric
-static unsigned int tx_domain_dist_thresholds[MAX_TX_DOMAIN_EVAL_SPEED + 1] = {
-  UINT_MAX, 162754, 22026, 22026, 22026, 0
+static unsigned int tx_domain_dist_thresholds[3][MODE_EVAL_TYPES] = {
+  { UINT_MAX, UINT_MAX, UINT_MAX }, { 22026, 22026, 22026 }, { 0, 0, 0 }
 };
+
+// Transform domain distortion type to be used for default, mode and winner mode
+// evaluation Index 0: Default mode evaluation, Winner mode processing is not
+// applicable (Eg : IntraBc). Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation. Index 1 and 2 are applicable when
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON
+static unsigned int tx_domain_dist_types[3][MODE_EVAL_TYPES] = { { 0, 2, 0 },
+                                                                 { 1, 2, 0 },
+                                                                 { 2, 2, 0 } };
+
 // Threshold values to be used for disabling coeff RD-optimization
-// based on block MSE
-// TODO(any): Experiment the threshold logic based on variance metric
-static unsigned int coeff_opt_dist_thresholds[5] = { UINT_MAX, 162754, 162754,
-                                                     22026, 22026 };
-// scaling values to be used for gating wedge/compound segment based on best
-// approximate rd
-static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
-static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
+// based on block MSE / qstep^2.
+// TODO(any): Experiment the threshold logic based on variance metric.
+// For each row, the indices are as follows.
+// Index 0: Default mode evaluation, Winner mode processing is not applicable
+// (Eg : IntraBc)
+// Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation.
+// Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
+// feature is ON
+// There are 6 levels with increasing speed, mapping to vertical indices.
+static unsigned int coeff_opt_dist_thresholds[6][MODE_EVAL_TYPES] = {
+  { UINT_MAX, UINT_MAX, UINT_MAX },
+  { 3200, 250, UINT_MAX },
+  { 1728, 142, UINT_MAX },
+  { 864, 142, UINT_MAX },
+  { 432, 86, UINT_MAX },
+  { 216, 86, UINT_MAX }
+};
+
+// Transform size to be used for default, mode and winner mode evaluation
+// Index 0: Default mode evaluation, Winner mode processing is not applicable
+// (Eg : IntraBc) Index 1: Mode evaluation. Index 2: Winner mode evaluation.
+// Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed
+// feature is ON
+static TX_SIZE_SEARCH_METHOD tx_size_search_methods[3][MODE_EVAL_TYPES] = {
+  { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD },
+  { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD },
+  { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD }
+};
+
+// Predict transform skip levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+// Values indicate the aggressiveness of skip flag prediction.
+// 0 : no early skip prediction
+// 1 : conservative early skip prediction using DCT_DCT
+// 2 : early skip prediction based on SSE
+static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 },
+                                                                { 1, 1, 1 },
+                                                                { 1, 2, 1 } };
 
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
@@ -68,25 +111,15 @@
   return frame_is_kf_gf_arf(cpi);
 }
 
-// Sets a partition size down to which the auto partition code will always
-// search (can go lower), based on the image dimensions. The logic here
-// is that the extent to which ringing artefacts are offensive, depends
-// partly on the screen area that over which they propogate. Propogation is
-// limited by transform block size but the screen area take up by a given block
-// size will be larger for a small image format stretched to full screen.
-static BLOCK_SIZE set_partition_min_limit(const AV1_COMMON *const cm) {
-  unsigned int screen_area = (cm->width * cm->height);
-
-  // Select block size based on image format size.
-  if (screen_area < 1280 * 720) {
-    // Formats smaller in area than 720P
-    return BLOCK_4X4;
-  } else if (screen_area < 1920 * 1080) {
-    // Format >= 720P and < 1080P
-    return BLOCK_8X8;
-  } else {
-    // Formats 1080P and up
-    return BLOCK_16X16;
+static BLOCK_SIZE dim_to_size(int dim) {
+  switch (dim) {
+    case 4: return BLOCK_4X4;
+    case 8: return BLOCK_8X8;
+    case 16: return BLOCK_16X16;
+    case 32: return BLOCK_32X32;
+    case 64: return BLOCK_64X64;
+    case 128: return BLOCK_128X128;
+    default: assert(0); return 0;
   }
 }
 
@@ -95,93 +128,146 @@
   const AV1_COMMON *const cm = &cpi->common;
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
 
   if (is_480p_or_larger) {
-    sf->use_square_partition_only_threshold = BLOCK_128X128;
+    sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
     if (is_720p_or_larger)
-      sf->auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+      sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED;
     else
-      sf->auto_max_partition_based_on_simple_motion = RELAXED_PRED;
+      sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
   } else {
-    sf->use_square_partition_only_threshold = BLOCK_64X64;
-    sf->auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+    sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+    sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+  }
+
+  if (is_4k_or_larger) {
+    sf->part_sf.default_min_partition_size = BLOCK_8X8;
   }
 
   // TODO(huisu@google.com): train models for 720P and above.
   if (!is_720p_or_larger) {
-    sf->ml_partition_search_breakout_thresh[0] = 200;  // BLOCK_8X8
-    sf->ml_partition_search_breakout_thresh[1] = 250;  // BLOCK_16X16
-    sf->ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
-    sf->ml_partition_search_breakout_thresh[3] = 500;  // BLOCK_64X64
-    sf->ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
-  }
-
-  if (is_720p_or_larger && speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL_START &&
-      speed < CONFIG_2PASS_PARTITION_SEARCH_LVL_END) {
-    sf->two_pass_partition_search = 1;
+    sf->part_sf.ml_partition_search_breakout_thresh[0] = 200;  // BLOCK_8X8
+    sf->part_sf.ml_partition_search_breakout_thresh[1] = 250;  // BLOCK_16X16
+    sf->part_sf.ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
+    sf->part_sf.ml_partition_search_breakout_thresh[3] = 500;  // BLOCK_64X64
+    sf->part_sf.ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
+    sf->part_sf.ml_early_term_after_part_split_level = 1;
   }
 
   if (speed >= 1) {
     if (is_720p_or_larger) {
-      sf->use_square_partition_only_threshold = BLOCK_128X128;
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
     } else if (is_480p_or_larger) {
-      sf->use_square_partition_only_threshold = BLOCK_64X64;
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
     } else {
-      sf->use_square_partition_only_threshold = BLOCK_32X32;
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
     }
 
     if (!is_720p_or_larger) {
-      sf->ml_partition_search_breakout_thresh[0] = 200;  // BLOCK_8X8
-      sf->ml_partition_search_breakout_thresh[1] = 250;  // BLOCK_16X16
-      sf->ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
-      sf->ml_partition_search_breakout_thresh[3] = 300;  // BLOCK_64X64
-      sf->ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
-
-      sf->firstpass_simple_motion_search_early_term = 1;
+      sf->part_sf.ml_partition_search_breakout_thresh[0] = 200;  // BLOCK_8X8
+      sf->part_sf.ml_partition_search_breakout_thresh[1] = 250;  // BLOCK_16X16
+      sf->part_sf.ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
+      sf->part_sf.ml_partition_search_breakout_thresh[3] = 300;  // BLOCK_64X64
+      sf->part_sf.ml_partition_search_breakout_thresh[4] = -1;  // BLOCK_128X128
     }
+    sf->part_sf.ml_early_term_after_part_split_level = 2;
   }
 
   if (speed >= 2) {
     if (is_720p_or_larger) {
-      sf->use_square_partition_only_threshold = BLOCK_64X64;
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
     } else if (is_480p_or_larger) {
-      sf->use_square_partition_only_threshold = BLOCK_32X32;
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
     } else {
-      // TODO(chiyotsai@google.com): Setting the threshold to BLOCK_16X16 incurs
-      // a large loss (about 0.584%). Try increasing the threshold on boosted
-      // frame and see if it improves the performance.
-      sf->use_square_partition_only_threshold = BLOCK_32X32;
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
     }
 
     if (is_720p_or_larger) {
-      sf->adaptive_pred_interp_filter = 0;
-      sf->partition_search_breakout_dist_thr = (1 << 24);
-      sf->partition_search_breakout_rate_thr = 120;
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+      sf->part_sf.partition_search_breakout_rate_thr = 120;
     } else {
-      sf->partition_search_breakout_dist_thr = (1 << 22);
-      sf->partition_search_breakout_rate_thr = 100;
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 22);
+      sf->part_sf.partition_search_breakout_rate_thr = 100;
     }
-    sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.prune_obmc_prob_thresh = 16;
+    } else {
+      sf->inter_sf.prune_obmc_prob_thresh = 8;
+    }
+
+    if (is_480p_or_larger) {
+      sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+    }
   }
 
   if (speed >= 3) {
+    sf->part_sf.ml_early_term_after_part_split_level = 0;
+
     if (is_720p_or_larger) {
-      sf->partition_search_breakout_dist_thr = (1 << 25);
-      sf->partition_search_breakout_rate_thr = 200;
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+      sf->part_sf.partition_search_breakout_rate_thr = 200;
     } else {
-      sf->max_intra_bsize = BLOCK_32X32;
-      sf->partition_search_breakout_dist_thr = (1 << 23);
-      sf->partition_search_breakout_rate_thr = 120;
+      sf->part_sf.max_intra_bsize = BLOCK_32X32;
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
+      sf->part_sf.partition_search_breakout_rate_thr = 120;
     }
-    sf->use_first_partition_pass_interintra_stats =
-        sf->two_pass_partition_search;
   }
 
   if (speed >= 4) {
     if (is_720p_or_larger) {
-      sf->partition_search_breakout_dist_thr = (1 << 26);
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
     } else {
-      sf->partition_search_breakout_dist_thr = (1 << 24);
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+    }
+
+    if (is_480p_or_larger) {
+      sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
+    }
+
+    sf->inter_sf.prune_obmc_prob_thresh = 16;
+  }
+
+  if (speed >= 5) {
+    if (is_720p_or_larger) {
+      sf->inter_sf.prune_warped_prob_thresh = 16;
+    } else if (is_480p_or_larger) {
+      sf->inter_sf.prune_warped_prob_thresh = 8;
+    }
+  }
+}
+
+static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
+                                                     SPEED_FEATURES *const sf,
+                                                     int speed) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+
+  (void)is_720p_or_larger;  // Not used so far
+
+  if (!is_360p_or_larger) {
+    if (speed >= 6) sf->rt_sf.force_tx_search_off = 1;
+    if (speed >= 8) {
+      sf->rt_sf.use_modeled_non_rd_cost = 0;
+      sf->rt_sf.use_nonrd_filter_search = 0;
+    }
+  }
+  if (is_360p_or_larger) {
+    if (speed >= 7) {
+      sf->interp_sf.disable_filter_search_var_thresh = 0;
+    }
+  }
+  if (!is_480p_or_larger) {
+    if (speed == 7) {
+      sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+    }
+    if (speed >= 8) {
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+
+      sf->rt_sf.estimate_motion_for_var_based_partition = 1;
     }
   }
 }
@@ -189,227 +275,330 @@
 static void set_good_speed_features_framesize_independent(
     const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
   const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
   const int boosted = frame_is_boosted(cpi);
   const int is_boosted_arf2_bwd_type =
-      boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame;
+      boosted || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+  const int allow_screen_content_tools =
+      cm->features.allow_screen_content_tools;
+  if (!cpi->oxcf.large_scale_tile) {
+    sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA;
+  }
 
   // Speed 0 for all speed features that give neutral coding performance change.
-  sf->reduce_inter_modes = 1;
-  sf->prune_ext_partition_types_search_level = 1;
-  sf->ml_prune_rect_partition = 1;
-  sf->ml_prune_ab_partition = 1;
-  sf->ml_prune_4_partition = 1;
-  sf->simple_motion_search_prune_rect = 1;
-  sf->adaptive_txb_search_level = 1;
-  sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
-  sf->model_based_prune_tx_search_level = 1;
-  sf->model_based_post_interp_filter_breakout = 1;
-  sf->model_based_motion_mode_rd_breakout = 1;
+  sf->gm_sf.gm_disable_recode = 1;
+  sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
 
+  sf->part_sf.less_rectangular_check_level = 1;
+  sf->part_sf.ml_prune_4_partition = 1;
+  sf->part_sf.ml_prune_ab_partition = 1;
+  sf->part_sf.ml_prune_rect_partition = 1;
+  sf->part_sf.prune_ext_partition_types_search_level = 1;
+  sf->part_sf.simple_motion_search_prune_rect = 1;
+
+  sf->inter_sf.disable_wedge_search_edge_thresh = 0;
+  sf->inter_sf.disable_wedge_search_var_thresh = 0;
   // TODO(debargha): Test, tweak and turn on either 1 or 2
-  sf->inter_mode_rd_model_estimation = 1;
-  sf->inter_mode_rd_model_estimation_adaptive = 0;
+  sf->inter_sf.inter_mode_rd_model_estimation = 1;
+  sf->inter_sf.model_based_post_interp_filter_breakout = 1;
+  sf->inter_sf.prune_compound_using_single_ref = 1;
+  sf->inter_sf.prune_mode_search_simple_translation = 1;
+  sf->inter_sf.prune_motion_mode_level = 1;
+  sf->inter_sf.prune_ref_frame_for_rect_partitions =
+      (boosted || (allow_screen_content_tools))
+          ? 0
+          : (is_boosted_arf2_bwd_type ? 1 : 2);
+  sf->inter_sf.prune_wedge_pred_diff_based = 1;
+  sf->inter_sf.reduce_inter_modes = 1;
+  sf->inter_sf.selective_ref_frame = 1;
+  sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
 
-  sf->two_loop_comp_search = 0;
-  sf->prune_ref_frame_for_rect_partitions =
-      boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
-  sf->less_rectangular_check_level = 1;
-  sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
-  sf->gm_disable_recode = 1;
-  sf->use_fast_interpolation_filter_search = 1;
-  sf->intra_tx_size_search_init_depth_sqr = 1;
-  sf->intra_angle_estimation = 1;
-  sf->selective_ref_frame = 1;
-  sf->prune_wedge_pred_diff_based = 1;
-  sf->disable_wedge_search_var_thresh = 0;
-  sf->disable_wedge_search_edge_thresh = 0;
-  sf->prune_motion_mode_level = 1;
-  sf->cb_pred_filter_search = 0;
-  sf->use_nonrd_pick_mode = 0;
-  sf->use_real_time_ref_set = 0;
+  sf->interp_sf.cb_pred_filter_search = 0;
+  sf->interp_sf.use_fast_interpolation_filter_search = 1;
+
+  sf->intra_sf.intra_pruning_with_hog = 1;
+  sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f;
+
+  sf->tx_sf.adaptive_txb_search_level = 1;
+  sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+  sf->tx_sf.model_based_prune_tx_search_level = 1;
+  sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+
+  sf->rt_sf.use_nonrd_pick_mode = 0;
+  sf->rt_sf.use_real_time_ref_set = 0;
+
+  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+    sf->mv_sf.exhaustive_searches_thresh = (1 << 24);
+  else
+    sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+
+  sf->rd_sf.perform_coeff_opt = 1;
 
   if (speed >= 1) {
-    sf->gm_erroradv_type = GM_ERRORADV_TR_1;
-    sf->selective_ref_frame = 2;
+    sf->gm_sf.disable_adaptive_warp_error_thresh = 0;
+    sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
+    sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1;
 
-    sf->intra_tx_size_search_init_depth_rect = 1;
-    sf->tx_size_search_lgr_block = 1;
+    sf->part_sf.intra_cnn_split = 1;
+    sf->part_sf.simple_motion_search_early_term_none = 1;
+    // TODO(Venkat): Clean-up frame type dependency for
+    // simple_motion_search_split in partition search function and set the
+    // speed feature accordingly
+    sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
 
-    sf->prune_ext_partition_types_search_level = 2;
-    sf->skip_repeat_interpolation_filter_search = 1;
-    sf->tx_type_search.skip_tx_search = 1;
-    sf->tx_type_search.ml_tx_split_thresh = 40;
-    sf->model_based_prune_tx_search_level = 0;
-    sf->adaptive_txb_search_level = 2;
-    sf->use_intra_txb_hash = 1;
-    sf->optimize_b_precheck = 1;
-    sf->dual_sgr_penalty_level = 1;
-    sf->use_accurate_subpel_search = USE_4_TAPS;
-    sf->reuse_inter_intra_mode = 1;
-    sf->prune_comp_search_by_single_result = 1;
-    sf->skip_repeated_newmv = 1;
-    sf->obmc_full_pixel_search_level = 1;
-    // TODO(anyone): Following speed feature will be further explored to
-    // identify the appropriate tradeoff between encoder performance and its
-    // speed.
-    sf->prune_single_motion_modes_by_simple_trans = 1;
+    sf->mv_sf.exhaustive_searches_thresh <<= 1;
+    sf->mv_sf.obmc_full_pixel_search_level = 1;
+    sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS;
 
-    sf->simple_motion_search_split_only = 1;
-    sf->simple_motion_search_early_term_none = 1;
+    sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+    sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1;
+    sf->inter_sf.prune_comp_type_by_comp_avg = 1;
+    sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1;
+    sf->inter_sf.prune_motion_mode_level = 2;
+    sf->inter_sf.prune_ref_frame_for_rect_partitions =
+        (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools))
+            ? 0
+            : (boosted ? 1 : 2);
+    sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2;
+    sf->inter_sf.reuse_inter_intra_mode = 1;
+    sf->inter_sf.selective_ref_frame = 2;
+    sf->inter_sf.skip_repeated_newmv = 1;
 
-    sf->disable_wedge_search_var_thresh = 0;
-    sf->disable_wedge_search_edge_thresh = 0;
-    sf->disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
-    sf->prune_comp_type_by_comp_avg = 1;
-    sf->prune_motion_mode_level = 2;
-    sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
-    sf->cb_pred_filter_search = 1;
-    sf->use_transform_domain_distortion = boosted ? 0 : 1;
-    sf->perform_coeff_opt = boosted ? 0 : 1;
-    sf->use_inter_txb_hash = 0;
+    sf->interp_sf.cb_pred_filter_search = 0;
+    sf->interp_sf.use_interp_filter = 1;
+    sf->intra_sf.prune_palette_search_level = 1;
+
+    sf->tx_sf.adaptive_txb_search_level = 2;
+    sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+    sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+    sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+    sf->tx_sf.model_based_prune_tx_search_level = 0;
+    sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST;
+    sf->tx_sf.tx_type_search.skip_tx_search = 1;
+    sf->tx_sf.use_intra_txb_hash = 1;
+
+    sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3;
+    sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+    sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
+    sf->lpf_sf.dual_sgr_penalty_level = 1;
+    sf->lpf_sf.enable_sgr_ep_pruning = 1;
+
+    // TODO(any, yunqing): move this feature to speed 0.
+    sf->tpl_sf.skip_alike_starting_mv = 1;
   }
 
   if (speed >= 2) {
-    sf->gm_erroradv_type = GM_ERRORADV_TR_2;
+    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2;
 
-    sf->selective_ref_frame = 3;
-    sf->inter_tx_size_search_init_depth_rect = 1;
-    sf->inter_tx_size_search_init_depth_sqr = 1;
+    sf->part_sf.allow_partition_search_skip = 1;
 
-    sf->fast_cdef_search = 1;
+    sf->mv_sf.auto_mv_step_size = 1;
+    sf->mv_sf.subpel_iters_per_step = 1;
 
-    sf->adaptive_rd_thresh = 1;
-    sf->mv.auto_mv_step_size = 1;
-    sf->mv.subpel_iters_per_step = 1;
-    sf->disable_filter_search_var_thresh = 100;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+    // TODO(chiyotsai@google.com): We can get 10% speed up if we move
+    // adaptive_rd_thresh to speed 1. But currently it performs poorly on some
+    // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a
+    // bit more closely to figure out why.
+    sf->inter_sf.adaptive_rd_thresh = 1;
+    sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+    sf->inter_sf.disable_interinter_wedge_newmv_search = 1;
+    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
+    sf->inter_sf.disable_wedge_search_var_thresh = 100;
+    sf->inter_sf.fast_interintra_wedge_search = 1;
+    sf->inter_sf.fast_wedge_sign_estimate = 1;
+    sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1;
+    sf->inter_sf.prune_compound_using_neighbors = 1;
+    sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+    sf->inter_sf.prune_warp_using_wmtype = 1;
+    sf->inter_sf.selective_ref_frame = 3;
+    sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
 
-    sf->partition_search_breakout_rate_thr = 80;
-    sf->allow_partition_search_skip = 1;
-    sf->disable_wedge_search_var_thresh = 100;
-    sf->disable_wedge_search_edge_thresh = 0;
-    sf->disable_interinter_wedge_newmv_search = 1;
-    sf->fast_wedge_sign_estimate = 1;
-    sf->disable_dual_filter = 1;
-    sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
-    sf->prune_comp_type_by_comp_avg = 2;
     // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3
-    sf->cb_pred_filter_search = 0;
-    sf->adaptive_interp_filter_search = 1;
-    sf->perform_coeff_opt = boosted ? 0 : 2;
+    sf->interp_sf.adaptive_interp_filter_search = 1;
+    sf->interp_sf.disable_dual_filter = 1;
+    sf->interp_sf.disable_filter_search_var_thresh = 100;
+
+    sf->intra_sf.disable_smooth_intra =
+        !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1);
+
+    sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4;
+
+    sf->lpf_sf.prune_wiener_based_on_src_var = 1;
+    sf->lpf_sf.prune_sgr_based_on_wiener = !allow_screen_content_tools;
   }
 
   if (speed >= 3) {
-    sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
-    sf->less_rectangular_check_level = 2;
-    sf->adaptive_pred_interp_filter = 1;
+    sf->hl_sf.high_precision_mv_usage = CURRENT_Q;
+    sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+    sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+
+    sf->part_sf.less_rectangular_check_level = 2;
+    sf->part_sf.simple_motion_search_prune_agg = 1;
+    sf->part_sf.prune_4_partition_using_split_info =
+        !allow_screen_content_tools;
+
     // adaptive_motion_search breaks encoder multi-thread tests.
     // The values in x->pred_mv[] differ for single and multi-thread cases.
     // See aomedia:1778.
-    // sf->adaptive_motion_search = 1;
-    sf->recode_loop = ALLOW_RECODE_KFARFGF;
-    sf->use_transform_domain_distortion = boosted ? 1 : 2;
-    sf->use_accurate_subpel_search = USE_2_TAPS;
-    sf->adaptive_rd_thresh = 2;
-    if (cpi->oxcf.enable_smooth_interintra) {
-      sf->disable_smooth_interintra =
-          (boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame)
-              ? 0
-              : 1;
-    }
-    sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
-    sf->gm_search_type = GM_DISABLE_SEARCH;
-    sf->prune_comp_search_by_single_result = 2;
-    sf->prune_motion_mode_level = boosted ? 2 : 3;
-    sf->prune_warp_using_wmtype = 1;
+    // sf->mv_sf.adaptive_motion_search = 1;
+    sf->mv_sf.full_pixel_search_level = 1;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+    sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
+    sf->mv_sf.search_method = DIAMOND;
+
+    sf->inter_sf.disable_sb_level_mv_cost_upd = 1;
     // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
     // it with cpi->sf.disable_wedge_search_var_thresh.
-    sf->disable_wedge_interintra_search = 1;
+    sf->inter_sf.disable_wedge_interintra_search = 1;
     // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
     // and clean-up the speed feature
-    sf->perform_best_rd_based_gating_for_chroma = 1;
-    sf->prune_ref_frame_for_rect_partitions =
-        frame_is_intra_only(&cpi->common) ? 0 : (boosted ? 1 : 2);
-    sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 3;
-    sf->prune_comp_type_by_model_rd = boosted ? 0 : 1;
-    // TODO(Venkat): Clean-up frame type dependency for
-    // simple_motion_search_split_only in partition search function and set the
-    // speed feature accordingly
-    // TODO(Venkat): Evaluate this speed feature for speed 1 & 2
-    sf->simple_motion_search_split_only =
-        cm->allow_screen_content_tools ? 1 : 2;
-    sf->disable_smooth_intra =
-        !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1);
+    sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1;
+    sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1;
+    sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2;
+    sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3;
+    sf->inter_sf.selective_ref_frame = 4;
+    sf->inter_sf.skip_repeated_ref_mv = 1;
+    sf->inter_sf.skip_repeated_full_newmv = 1;
+    if (cpi->oxcf.enable_smooth_interintra)
+      sf->inter_sf.disable_smooth_interintra = boosted ? 0 : 1;
+    sf->inter_sf.reuse_compound_type_decision = 1;
+    sf->inter_sf.txfm_rd_gate_level = (boosted || allow_screen_content_tools)
+                                          ? 0
+                                          : (is_boosted_arf2_bwd_type ? 1 : 2);
+
+    sf->intra_sf.prune_palette_search_level = 2;
+
+    sf->tpl_sf.skip_alike_starting_mv = 2;
+    sf->tpl_sf.prune_intra_modes = 1;
+    sf->tpl_sf.reduce_first_step_size = 6;
+
+    sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
+    sf->tx_sf.tx_type_search.use_skip_flag_prediction =
+        allow_screen_content_tools ? 1 : 2;
+
+    // TODO(any): Refactor the code related to following winner mode speed
+    // features
+    sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+    // TODO(any): Experiment with this speed feature by enabling for key frames
+    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch =
+        frame_is_intra_only(&cpi->common) ? 0 : 1;
+    sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist =
+        !allow_screen_content_tools;
+    sf->winner_mode_sf.motion_mode_for_winner_cand =
+        boosted
+            ? 0
+            : gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE ? 1
+                                                                         : 2;
+
+    // TODO(any): evaluate if these lpf features can be moved to speed 2.
+    sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 0 : 2;
+    sf->lpf_sf.disable_loop_restoration_chroma =
+        (boosted || allow_screen_content_tools) ? 0 : 1;
+    sf->lpf_sf.reduce_wiener_window_size = !boosted;
+    sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+
+    sf->hl_sf.second_alt_ref_filtering = 0;
   }
 
   if (speed >= 4) {
-    sf->use_intra_txb_hash = 0;
-    sf->tx_type_search.fast_intra_tx_type_search = 1;
-    sf->disable_loop_restoration_chroma =
-        (boosted || cm->allow_screen_content_tools) ? 0 : 1;
-    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
-    sf->adaptive_pred_interp_filter = 0;
-    sf->cb_pred_filter_search = 1;
-    sf->adaptive_mode_search = 1;
-    sf->alt_ref_search_fp = 1;
-    sf->skip_sharp_interp_filter_search = 1;
-    sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 4;
-    sf->adaptive_txb_search_level = boosted ? 2 : 3;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+    sf->part_sf.simple_motion_search_prune_agg = 2;
+    sf->part_sf.prune_ab_partition_using_split_info =
+        !allow_screen_content_tools;
+
+    sf->inter_sf.adaptive_mode_search = 1;
+    sf->inter_sf.alt_ref_search_fp = 1;
+    sf->inter_sf.prune_ref_mv_idx_search = 1;
+    sf->inter_sf.txfm_rd_gate_level =
+        (boosted || allow_screen_content_tools) ? 0 : 3;
+
+    sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
+    sf->inter_sf.prune_compound_using_neighbors = 2;
+    sf->inter_sf.disable_smooth_interintra = 1;
+
+    sf->interp_sf.cb_pred_filter_search = 1;
+    sf->interp_sf.skip_sharp_interp_filter_search = 1;
+    sf->interp_sf.use_interp_filter = 2;
+    sf->interp_sf.adaptive_interp_filter_search = 2;
+
+    sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+    // TODO(any): Experiment with this speed feature set to 2 for higher quality
+    // presets as well
+    sf->intra_sf.skip_intra_in_interframe = 2;
+
+    sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning = 1;
+    sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE;
+    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
+    // TODO(any): Experiment with enabling of this speed feature as hash state
+    // is reset during winner mode processing
+    sf->tx_sf.use_intra_txb_hash = 0;
+
+    sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 5;
+    sf->rd_sf.tx_domain_dist_thres_level = 2;
+
+    // TODO(any): Extend multi-winner mode processing support for inter frames
+    sf->winner_mode_sf.enable_multiwinner_mode_process =
+        frame_is_intra_only(&cpi->common) ? 1 : 0;
+    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+
+    sf->lpf_sf.cdef_pick_method = allow_screen_content_tools
+                                      ? CDEF_FAST_SEARCH_LVL1
+                                      : CDEF_FAST_SEARCH_LVL2;
+
+    // TODO(any): The following features have no impact on quality and speed,
+    // and are disabled.
+    // sf->part_sf.partition_search_breakout_rate_thr = 300;
+    // sf->interp_sf.disable_filter_search_var_thresh = 200;
+    // sf->rd_sf.use_fast_coef_costing = 1;
+
+    // TODO(any): The following features give really bad quality/speed trade
+    // off. Needs to be re-worked.
+    // sf->mv_sf.search_method = BIGDIA;
+    // sf->inter_sf.adaptive_rd_thresh = 4;
+    // sf->rd_sf.tx_domain_dist_level = 2;
+    // sf->rt_sf.mode_search_skip_flags =
+    //     (cm->current_frame.frame_type == KEY_FRAME)
+    //     ? 0
+    //     : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+    //     FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+    //     FLAG_EARLY_TERMINATE;
   }
 
   if (speed >= 5) {
-    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
-    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
-    sf->tx_size_search_method = USE_LARGESTALL;
-    sf->mv.search_method = BIGDIA;
-    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
-    sf->adaptive_rd_thresh = 4;
-    sf->mode_search_skip_flags =
-        (cm->current_frame.frame_type == KEY_FRAME)
-            ? 0
-            : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
-                  FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
-                  FLAG_EARLY_TERMINATE;
-    sf->disable_filter_search_var_thresh = 200;
-    sf->use_fast_coef_costing = 1;
-    sf->partition_search_breakout_rate_thr = 300;
-    sf->use_transform_domain_distortion = 2;
+    sf->part_sf.simple_motion_search_prune_agg = 3;
+    sf->part_sf.ext_partition_eval_thresh =
+        allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+
+    sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
+    sf->inter_sf.disable_interinter_wedge = 1;
+    sf->inter_sf.disable_obmc = 1;
+    sf->inter_sf.disable_onesided_comp = 1;
+    sf->inter_sf.txfm_rd_gate_level =
+        (boosted || allow_screen_content_tools) ? 0 : 4;
+    sf->inter_sf.prune_inter_modes_if_skippable = 1;
+
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+    sf->lpf_sf.disable_lr_filter = 1;
+
+    sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL;
+    sf->mv_sf.prune_mesh_search = 1;
+    sf->mv_sf.reduce_search_range = 1;
+
+    sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
   }
 
   if (speed >= 6) {
-    int i;
-    sf->optimize_coefficients = NO_TRELLIS_OPT;
-    sf->mv.search_method = HEX;
-    sf->disable_filter_search_var_thresh = 500;
-    for (i = 0; i < TX_SIZES; ++i) {
-      sf->intra_y_mode_mask[i] = INTRA_DC;
-      sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
-    }
-    sf->partition_search_breakout_rate_thr = 500;
-    sf->mv.reduce_first_step_size = 1;
-    sf->simple_model_rd_from_var = 1;
-  }
-  if (speed >= 7) {
-    sf->default_max_partition_size = BLOCK_32X32;
-    sf->default_min_partition_size = BLOCK_8X8;
-    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
-    sf->frame_parameter_update = 0;
-    sf->mv.search_method = FAST_HEX;
-    sf->partition_search_type = REFERENCE_PARTITION;
-    sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
-    // TODO(any): evaluate adaptive_mode_search=1 for speed 7 & 8
-    sf->adaptive_mode_search = 2;
-  }
-  if (speed >= 8) {
-    sf->mv.search_method = FAST_DIAMOND;
-    sf->mv.subpel_force_stop = HALF_PEL;
-    sf->lpf_pick = LPF_PICK_FROM_Q;
   }
 }
 
@@ -424,374 +613,559 @@
   const int boosted = frame_is_boosted(cpi);
 
   // Speed 0 for all speed features that give neutral coding performance change.
-  sf->reduce_inter_modes = 1;
-  sf->prune_ext_partition_types_search_level = 1;
-  sf->ml_prune_rect_partition = 1;
-  sf->ml_prune_ab_partition = 1;
-  sf->ml_prune_4_partition = 1;
-  sf->adaptive_txb_search_level = 1;
-  sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
-  sf->model_based_prune_tx_search_level = 1;
-  sf->model_based_post_interp_filter_breakout = 1;
-  sf->model_based_motion_mode_rd_breakout = 1;
+  sf->gm_sf.gm_disable_recode = 1;
+  sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
+
+  sf->part_sf.less_rectangular_check_level = 1;
+  sf->part_sf.ml_prune_4_partition = 1;
+  sf->part_sf.ml_prune_ab_partition = 1;
+  sf->part_sf.ml_prune_rect_partition = 1;
+  sf->part_sf.prune_ext_partition_types_search_level = 1;
 
   // TODO(debargha): Test, tweak and turn on either 1 or 2
-  sf->inter_mode_rd_model_estimation = 0;
-  sf->inter_mode_rd_model_estimation_adaptive = 0;
-  sf->two_loop_comp_search = 0;
+  sf->inter_sf.inter_mode_rd_model_estimation = 0;
+  sf->inter_sf.disable_wedge_search_edge_thresh = 0;
+  sf->inter_sf.disable_wedge_search_var_thresh = 0;
+  sf->inter_sf.model_based_post_interp_filter_breakout = 1;
+  sf->inter_sf.prune_compound_using_single_ref = 0;
+  sf->inter_sf.prune_mode_search_simple_translation = 1;
+  sf->inter_sf.prune_motion_mode_level = 1;
+  sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
+  sf->inter_sf.prune_wedge_pred_diff_based = 1;
+  sf->inter_sf.reduce_inter_modes = 1;
+  sf->inter_sf.selective_ref_frame = 1;
+  sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
 
-  sf->prune_ref_frame_for_rect_partitions = !boosted;
-  sf->less_rectangular_check_level = 1;
-  sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
-  sf->gm_disable_recode = 1;
-  sf->use_fast_interpolation_filter_search = 1;
-  sf->intra_tx_size_search_init_depth_sqr = 1;
-  sf->intra_angle_estimation = 1;
-  sf->selective_ref_frame = 1;
-  sf->prune_wedge_pred_diff_based = 1;
-  sf->disable_wedge_search_var_thresh = 0;
-  sf->disable_wedge_search_edge_thresh = 0;
-  sf->prune_motion_mode_level = 1;
-  sf->cb_pred_filter_search = 0;
-  sf->use_nonrd_pick_mode = 0;
-  sf->use_real_time_ref_set = 0;
+  sf->interp_sf.cb_pred_filter_search = 0;
+  sf->interp_sf.use_fast_interpolation_filter_search = 1;
+
+  sf->intra_sf.intra_pruning_with_hog = 1;
+  sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f;
+
+  sf->mv_sf.full_pixel_search_level = 1;
+  sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
+
+  sf->rt_sf.check_intra_pred_nonrd = 1;
+  sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+  sf->rt_sf.hybrid_intra_pickmode = 0;
+  sf->rt_sf.nonrd_prune_ref_frame_search = 0;
+  sf->rt_sf.reuse_inter_pred_nonrd = 0;
+  sf->rt_sf.use_comp_ref_nonrd = 1;
+  sf->rt_sf.use_nonrd_filter_search = 1;
+  sf->rt_sf.use_nonrd_pick_mode = 0;
+  sf->rt_sf.use_real_time_ref_set = 0;
+  sf->tx_sf.adaptive_txb_search_level = 1;
+  sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+  sf->tx_sf.model_based_prune_tx_search_level = 1;
+  sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
 
   if (speed >= 1) {
-    sf->gm_erroradv_type = GM_ERRORADV_TR_1;
-    sf->selective_ref_frame = 2;
+    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_1;
+    sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
 
-    sf->intra_tx_size_search_init_depth_rect = 1;
-    sf->tx_size_search_lgr_block = 1;
-    sf->prune_ext_partition_types_search_level = 2;
-    sf->skip_repeat_interpolation_filter_search = 1;
-    sf->tx_type_search.skip_tx_search = 1;
-    sf->tx_type_search.ml_tx_split_thresh = 40;
-    sf->model_based_prune_tx_search_level = 0;
-    sf->adaptive_txb_search_level = 2;
-    sf->use_intra_txb_hash = 1;
-    sf->optimize_b_precheck = 1;
-    sf->dual_sgr_penalty_level = 1;
-    sf->use_accurate_subpel_search = USE_4_TAPS;
-    sf->reuse_inter_intra_mode = 1;
-    sf->prune_comp_search_by_single_result = 1;
-    sf->skip_repeated_newmv = 1;
-    sf->obmc_full_pixel_search_level = 1;
-    // TODO(anyone): Following speed feature will be further explored to
-    // identify the appropriate tradeoff between encoder performance and its
-    // speed.
-    sf->prune_single_motion_modes_by_simple_trans = 1;
+    sf->part_sf.prune_ext_partition_types_search_level = 2;
+    sf->part_sf.simple_motion_search_prune_rect = 1;
 
-    sf->simple_motion_search_prune_rect = 1;
+    sf->mv_sf.obmc_full_pixel_search_level = 1;
+    sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS;
 
-    sf->disable_wedge_search_var_thresh = 0;
-    sf->disable_wedge_search_edge_thresh = 0;
-    sf->prune_comp_type_by_comp_avg = 1;
-    sf->prune_motion_mode_level = 2;
-    sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
-    sf->cb_pred_filter_search = 1;
-    sf->use_transform_domain_distortion = boosted ? 0 : 1;
+    sf->inter_sf.prune_comp_search_by_single_result = 1;
+    sf->inter_sf.reuse_inter_intra_mode = 1;
+    sf->inter_sf.selective_ref_frame = 2;
+    sf->inter_sf.skip_repeated_newmv = 1;
+    sf->inter_sf.disable_wedge_search_var_thresh = 0;
+    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
+    sf->inter_sf.prune_comp_type_by_comp_avg = 1;
+    sf->inter_sf.prune_motion_mode_level = 2;
+    sf->inter_sf.prune_single_motion_modes_by_simple_trans = 1;
+
+    sf->interp_sf.cb_pred_filter_search = 1;
+    sf->interp_sf.use_interp_filter = 1;
+
+    sf->tx_sf.adaptive_txb_search_level = 2;
+    sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+    sf->tx_sf.tx_size_search_lgr_block = 1;
+    sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+    sf->tx_sf.tx_type_search.skip_tx_search = 1;
+    sf->tx_sf.use_intra_txb_hash = 1;
+
+    sf->rd_sf.optimize_b_precheck = 1;
+    sf->rd_sf.tx_domain_dist_level = boosted ? 0 : 1;
+    sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+    sf->lpf_sf.dual_sgr_penalty_level = 1;
   }
 
   if (speed >= 2) {
-    sf->gm_erroradv_type = GM_ERRORADV_TR_2;
+    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2;
 
-    sf->selective_ref_frame = 3;
-    sf->inter_tx_size_search_init_depth_rect = 1;
-    sf->inter_tx_size_search_init_depth_sqr = 1;
-    sf->fast_cdef_search = 1;
+    sf->part_sf.allow_partition_search_skip = 1;
+    sf->part_sf.partition_search_breakout_rate_thr = 80;
 
-    sf->adaptive_rd_thresh = 1;
-    sf->mv.auto_mv_step_size = 1;
-    sf->mv.subpel_iters_per_step = 1;
-    sf->disable_filter_search_var_thresh = 100;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+    sf->mv_sf.auto_mv_step_size = 1;
+    sf->mv_sf.subpel_iters_per_step = 1;
 
-    sf->partition_search_breakout_rate_thr = 80;
-    sf->allow_partition_search_skip = 1;
-    sf->disable_wedge_search_var_thresh = 100;
-    sf->disable_wedge_search_edge_thresh = 0;
-    sf->fast_wedge_sign_estimate = 1;
-    sf->disable_dual_filter = 1;
-    sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
-    sf->prune_comp_type_by_comp_avg = 2;
-    sf->cb_pred_filter_search = 0;
-    sf->adaptive_interp_filter_search = 1;
+    sf->inter_sf.adaptive_rd_thresh = 1;
+    sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
+    sf->inter_sf.disable_wedge_search_var_thresh = 100;
+    sf->inter_sf.fast_wedge_sign_estimate = 1;
+    sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+    sf->inter_sf.selective_ref_frame = 3;
+    sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+
+    sf->interp_sf.adaptive_interp_filter_search = 1;
+    sf->interp_sf.cb_pred_filter_search = 0;
+    sf->interp_sf.disable_dual_filter = 1;
+    sf->interp_sf.disable_filter_search_var_thresh = 100;
+
+    sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+    sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+    sf->tx_sf.model_based_prune_tx_search_level = 0;
+
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
   }
 
   if (speed >= 3) {
-    sf->selective_ref_frame = 4;
-    sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
-    sf->less_rectangular_check_level = 2;
-    sf->adaptive_pred_interp_filter = 1;
+    sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+    sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+
+    sf->part_sf.less_rectangular_check_level = 2;
+
+    sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
     // adaptive_motion_search breaks encoder multi-thread tests.
     // The values in x->pred_mv[] differ for single and multi-thread cases.
     // See aomedia:1778.
-    // sf->adaptive_motion_search = 1;
-    sf->recode_loop = ALLOW_RECODE_KFARFGF;
-    sf->use_transform_domain_distortion = 1;
-    sf->use_accurate_subpel_search = USE_2_TAPS;
-    sf->adaptive_rd_thresh = 2;
-    sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
-    sf->gm_search_type = GM_DISABLE_SEARCH;
-    sf->prune_comp_search_by_single_result = 2;
-    sf->prune_motion_mode_level = boosted ? 2 : 3;
-    sf->prune_warp_using_wmtype = 1;
+    // sf->mv_sf.adaptive_motion_search = 1;
+
+    sf->inter_sf.adaptive_rd_thresh = 2;
+    sf->inter_sf.disable_sb_level_mv_cost_upd = 1;
     // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
     // it with cpi->sf.disable_wedge_search_var_thresh.
-    sf->disable_wedge_interintra_search = 1;
+    sf->inter_sf.disable_wedge_interintra_search = 1;
+    sf->inter_sf.prune_comp_search_by_single_result = 2;
+    sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3;
+    sf->inter_sf.prune_warp_using_wmtype = 1;
+    sf->inter_sf.selective_ref_frame = 4;
+
+    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST;
+
+    sf->rd_sf.tx_domain_dist_level = 1;
+
+    sf->winner_mode_sf.tx_size_search_level = boosted ? 0 : 2;
   }
 
   if (speed >= 4) {
-    sf->use_intra_txb_hash = 0;
-    sf->use_mb_rd_hash = 0;
-    sf->tx_type_search.fast_intra_tx_type_search = 1;
-    sf->tx_type_search.fast_inter_tx_type_search = 1;
-    sf->tx_size_search_method =
-        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
-    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
-    sf->adaptive_pred_interp_filter = 0;
-    sf->adaptive_mode_search = 1;
-    sf->alt_ref_search_fp = 1;
-    sf->skip_sharp_interp_filter_search = 1;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+
+    sf->inter_sf.adaptive_mode_search = 1;
+    sf->inter_sf.alt_ref_search_fp = 1;
+
+    sf->interp_sf.skip_sharp_interp_filter_search = 1;
+
+    sf->tx_sf.tx_type_search.fast_inter_tx_type_search = 1;
+    sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_sf.use_intra_txb_hash = 0;
+
+    sf->rd_sf.use_mb_rd_hash = 0;
+
+    sf->winner_mode_sf.tx_size_search_level = frame_is_intra_only(cm) ? 0 : 2;
   }
 
   if (speed >= 5) {
-    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
-    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
-    sf->tx_size_search_method = USE_LARGESTALL;
-    sf->mv.search_method = BIGDIA;
-    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
-    sf->adaptive_rd_thresh = 4;
-    sf->mode_search_skip_flags =
+    sf->hl_sf.recode_loop = ALLOW_RECODE_KFMAXBW;
+
+    sf->inter_sf.adaptive_rd_thresh = 4;
+    sf->interp_sf.disable_filter_search_var_thresh = 200;
+
+    sf->rd_sf.use_fast_coef_costing = 1;
+    sf->rd_sf.tx_domain_dist_level = 2;
+    sf->rd_sf.tx_domain_dist_thres_level = 2;
+    sf->winner_mode_sf.tx_size_search_level = 1;
+
+    sf->rt_sf.mode_search_skip_flags =
         (cm->current_frame.frame_type == KEY_FRAME)
             ? 0
             : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
                   FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
                   FLAG_EARLY_TERMINATE;
-    sf->disable_filter_search_var_thresh = 200;
-    sf->use_fast_coef_costing = 1;
-    sf->partition_search_breakout_rate_thr = 300;
-    sf->use_transform_domain_distortion = 2;
+    sf->hl_sf.frame_parameter_update = 0;
+
+    sf->part_sf.default_max_partition_size = BLOCK_128X128;
+    sf->part_sf.default_min_partition_size = BLOCK_8X8;
+    sf->part_sf.max_intra_bsize = BLOCK_32X32;
+    sf->part_sf.partition_search_breakout_rate_thr = 500;
+    sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+    sf->part_sf.adjust_var_based_rd_partitioning = 2;
+
+    sf->mv_sf.search_method = FAST_DIAMOND;
+    sf->mv_sf.subpel_force_stop = QUARTER_PEL;
+    sf->mv_sf.use_fullpel_costlist = 1;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+    sf->inter_sf.adaptive_mode_search = 2;
+    sf->inter_sf.inter_mode_rd_model_estimation = 2;
+
+    for (int i = 0; i < TX_SIZES; ++i) {
+      sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC;
+      sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+    }
+
+    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE;
+    sf->tx_sf.use_inter_txb_hash = 0;
+    sf->tx_sf.refine_fast_tx_search_results = 0;
+
+    sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
+    sf->rd_sf.simple_model_rd_from_var = 1;
+
+    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+    sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+    sf->rt_sf.num_inter_modes_for_tx_search = 5;
+    sf->rt_sf.skip_interp_filter_search = 1;
+    sf->rt_sf.use_comp_ref_nonrd = 0;
+    sf->rt_sf.use_real_time_ref_set = 1;
+    sf->rt_sf.use_simple_rd_model = 1;
   }
 
   if (speed >= 6) {
-    int i;
-    sf->optimize_coefficients = NO_TRELLIS_OPT;
-    sf->mv.search_method = HEX;
-    sf->disable_filter_search_var_thresh = 500;
-    for (i = 0; i < TX_SIZES; ++i) {
-      sf->intra_y_mode_mask[i] = INTRA_DC;
-      sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
-    }
-    sf->partition_search_breakout_rate_thr = 500;
-    sf->mv.reduce_first_step_size = 1;
-    sf->simple_model_rd_from_var = 1;
+    sf->part_sf.adjust_var_based_rd_partitioning = 1;
   }
+
   if (speed >= 7) {
-    sf->default_max_partition_size = BLOCK_32X32;
-    sf->default_min_partition_size = BLOCK_8X8;
-    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
-    sf->frame_parameter_update = 0;
-    sf->mv.search_method = FAST_HEX;
-    sf->partition_search_type = REFERENCE_PARTITION;
-    sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+    sf->hl_sf.frame_parameter_update = 0;
+
+    sf->part_sf.default_max_partition_size = BLOCK_128X128;
+    sf->part_sf.default_min_partition_size = BLOCK_8X8;
+    sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+
+    sf->mv_sf.search_method = FAST_DIAMOND;
+    sf->mv_sf.subpel_force_stop = QUARTER_PEL;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+
+    sf->inter_sf.inter_mode_rd_model_estimation = 2;
+
+    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+    sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+    sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+    sf->rt_sf.reuse_inter_pred_nonrd = 0;
+    sf->rt_sf.short_circuit_low_temp_var = 0;
+    sf->rt_sf.skip_interp_filter_search = 0;
+    sf->rt_sf.use_comp_ref_nonrd = 0;
+    sf->rt_sf.use_nonrd_altref_frame = 1;
+    sf->rt_sf.use_nonrd_pick_mode = 1;
+    sf->rt_sf.nonrd_check_partition_merge_mode = 1;
+    sf->rt_sf.nonrd_check_partition_split = 0;
+    sf->rt_sf.hybrid_intra_pickmode = 1;
   }
+
   if (speed >= 8) {
-    sf->mv.search_method = FAST_DIAMOND;
-    sf->lpf_pick = LPF_PICK_FROM_Q;
-    sf->default_max_partition_size = BLOCK_128X128;
-    sf->default_min_partition_size = BLOCK_8X8;
-    sf->partition_search_type = VAR_BASED_PARTITION;
-    sf->use_real_time_ref_set = 1;
-    // Can't use LARGEST TX mode with pre-calculated partition
-    // and disabled TX64
-    if (!cpi->oxcf.enable_tx64) sf->tx_size_search_method = USE_FAST_RD;
-    sf->use_nonrd_pick_mode = 1;
-    sf->inter_mode_rd_model_estimation = 2;
+    sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    sf->rt_sf.short_circuit_low_temp_var = 1;
+    sf->rt_sf.reuse_inter_pred_nonrd = 1;
+    sf->rt_sf.use_nonrd_altref_frame = 0;
+    sf->rt_sf.nonrd_prune_ref_frame_search = 2;
+    sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+    sf->rt_sf.nonrd_check_partition_split = 0;
+    sf->rt_sf.use_modeled_non_rd_cost = 1;
+    sf->rt_sf.source_metrics_sb_nonrd = 1;
+    sf->interp_sf.cb_pred_filter_search = 1;
   }
 }
 
+static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
+  // best quality defaults
+  hl_sf->frame_parameter_update = 1;
+  hl_sf->recode_loop = ALLOW_RECODE;
+  hl_sf->disable_overlay_frames = 0;
+  hl_sf->adaptive_overlay_encoding = 1;
+  // Recode loop tolerance %.
+  hl_sf->recode_tolerance = 25;
+  hl_sf->high_precision_mv_usage = CURRENT_Q;
+  hl_sf->second_alt_ref_filtering = 1;
+}
+
+static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) {
+  tpl_sf->prune_intra_modes = 0;
+  tpl_sf->reduce_first_step_size = 0;
+  tpl_sf->skip_alike_starting_mv = 0;
+  tpl_sf->subpel_force_stop = EIGHTH_PEL;
+}
+
+static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
+  gm_sf->gm_erroradv_type = GM_ERRORADV_TR_0;
+  gm_sf->disable_adaptive_warp_error_thresh = 1;
+  gm_sf->selective_ref_gm = 1;
+  gm_sf->gm_search_type = GM_FULL_SEARCH;
+  gm_sf->gm_disable_recode = 0;
+  gm_sf->prune_ref_frame_for_gm_search = 0;
+}
+
+static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+  part_sf->partition_search_type = SEARCH_PARTITION;
+  part_sf->less_rectangular_check_level = 0;
+  part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+  part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+  part_sf->auto_min_partition_based_on_simple_motion = 0;
+  part_sf->default_max_partition_size = BLOCK_LARGEST;
+  part_sf->default_min_partition_size = BLOCK_4X4;
+  part_sf->adjust_var_based_rd_partitioning = 0;
+  part_sf->allow_partition_search_skip = 0;
+  part_sf->max_intra_bsize = BLOCK_LARGEST;
+  // This setting only takes effect when partition_search_type is set
+  // to FIXED_PARTITION.
+  part_sf->always_this_block_size = BLOCK_16X16;
+  // Recode loop tolerance %.
+  part_sf->partition_search_breakout_dist_thr = 0;
+  part_sf->partition_search_breakout_rate_thr = 0;
+  part_sf->prune_ext_partition_types_search_level = 0;
+  part_sf->ml_prune_rect_partition = 0;
+  part_sf->ml_prune_ab_partition = 0;
+  part_sf->ml_prune_4_partition = 0;
+  part_sf->ml_early_term_after_part_split_level = 0;
+  for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
+    part_sf->ml_partition_search_breakout_thresh[i] =
+        -1;  // -1 means not enabled.
+  }
+  part_sf->simple_motion_search_prune_agg = 0;
+  part_sf->simple_motion_search_split = 0;
+  part_sf->simple_motion_search_prune_rect = 0;
+  part_sf->simple_motion_search_early_term_none = 0;
+  part_sf->intra_cnn_split = 0;
+  part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+  part_sf->prune_4_partition_using_split_info = 0;
+  part_sf->prune_ab_partition_using_split_info = 0;
+}
+
+static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
+  mv_sf->full_pixel_search_level = 0;
+  mv_sf->adaptive_motion_search = 0;
+  mv_sf->auto_mv_step_size = 0;
+  mv_sf->exhaustive_searches_thresh = 0;
+  mv_sf->obmc_full_pixel_search_level = 0;
+  mv_sf->prune_mesh_search = 0;
+  mv_sf->reduce_search_range = 0;
+  mv_sf->search_method = NSTEP;
+  mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL;
+  mv_sf->subpel_force_stop = EIGHTH_PEL;
+  mv_sf->subpel_iters_per_step = 2;
+  mv_sf->subpel_search_method = SUBPEL_TREE;
+  mv_sf->use_accurate_subpel_search = USE_8_TAPS;
+  mv_sf->use_fullpel_costlist = 0;
+}
+
+static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
+  inter_sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+  inter_sf->adaptive_rd_thresh = 0;
+  inter_sf->model_based_post_interp_filter_breakout = 0;
+  inter_sf->reduce_inter_modes = 0;
+  inter_sf->adaptive_mode_search = 0;
+  inter_sf->alt_ref_search_fp = 0;
+  inter_sf->selective_ref_frame = 0;
+  inter_sf->prune_ref_frame_for_rect_partitions = 0;
+  inter_sf->disable_wedge_search_edge_thresh = 0;
+  inter_sf->disable_wedge_search_var_thresh = 0;
+  inter_sf->fast_wedge_sign_estimate = 0;
+  inter_sf->prune_wedge_pred_diff_based = 0;
+  inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
+  inter_sf->reuse_inter_intra_mode = 0;
+  inter_sf->disable_sb_level_coeff_cost_upd = 0;
+  inter_sf->disable_sb_level_mv_cost_upd = 0;
+  inter_sf->prune_inter_modes_based_on_tpl = 0;
+  inter_sf->prune_comp_search_by_single_result = 0;
+  inter_sf->skip_repeated_ref_mv = 0;
+  inter_sf->skip_repeated_newmv = 0;
+  inter_sf->skip_repeated_full_newmv = 0;
+  inter_sf->prune_single_motion_modes_by_simple_trans = 0;
+  inter_sf->inter_mode_rd_model_estimation = 0;
+  inter_sf->prune_compound_using_single_ref = 0;
+  inter_sf->prune_compound_using_neighbors = 0;
+  inter_sf->disable_onesided_comp = 0;
+  inter_sf->prune_mode_search_simple_translation = 0;
+  inter_sf->prune_comp_type_by_comp_avg = 0;
+  inter_sf->disable_interinter_wedge_newmv_search = 0;
+  inter_sf->enable_interinter_diffwtd_newmv_search = 0;
+  inter_sf->disable_smooth_interintra = 0;
+  inter_sf->prune_motion_mode_level = 0;
+  inter_sf->prune_warp_using_wmtype = 0;
+  inter_sf->disable_wedge_interintra_search = 0;
+  inter_sf->fast_interintra_wedge_search = 0;
+  inter_sf->prune_comp_type_by_model_rd = 0;
+  inter_sf->perform_best_rd_based_gating_for_chroma = 0;
+  inter_sf->prune_obmc_prob_thresh = 0;
+  inter_sf->disable_obmc = 0;
+  inter_sf->disable_interinter_wedge = 0;
+  inter_sf->prune_ref_mv_idx_search = 0;
+  inter_sf->prune_warped_prob_thresh = 0;
+  inter_sf->reuse_compound_type_decision = 0;
+  inter_sf->txfm_rd_gate_level = 0;
+  inter_sf->prune_inter_modes_if_skippable = 0;
+}
+
+static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
+  interp_sf->disable_filter_search_var_thresh = 0;
+  interp_sf->adaptive_interp_filter_search = 0;
+  interp_sf->use_fast_interpolation_filter_search = 0;
+  interp_sf->disable_dual_filter = 0;
+  interp_sf->use_interp_filter = 0;
+  interp_sf->skip_sharp_interp_filter_search = 0;
+}
+
+static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
+  intra_sf->skip_intra_in_interframe = 1;
+  intra_sf->intra_pruning_with_hog = 0;
+  intra_sf->src_var_thresh_intra_skip = 1;
+  intra_sf->prune_palette_search_level = 0;
+
+  for (int i = 0; i < TX_SIZES; i++) {
+    intra_sf->intra_y_mode_mask[i] = INTRA_ALL;
+    intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
+  }
+  intra_sf->disable_smooth_intra = 0;
+}
+
+static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
+  tx_sf->inter_tx_size_search_init_depth_sqr = 0;
+  tx_sf->inter_tx_size_search_init_depth_rect = 0;
+  tx_sf->intra_tx_size_search_init_depth_rect = 0;
+  tx_sf->intra_tx_size_search_init_depth_sqr = 0;
+  tx_sf->tx_size_search_lgr_block = 0;
+  tx_sf->model_based_prune_tx_search_level = 0;
+  tx_sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
+  tx_sf->tx_type_search.ml_tx_split_thresh = 8500;
+  tx_sf->tx_type_search.use_skip_flag_prediction = 1;
+  tx_sf->tx_type_search.use_reduced_intra_txset = 0;
+  tx_sf->tx_type_search.fast_intra_tx_type_search = 0;
+  tx_sf->tx_type_search.fast_inter_tx_type_search = 0;
+  tx_sf->tx_type_search.skip_tx_search = 0;
+  tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
+  tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
+  tx_sf->tx_type_search.enable_winner_mode_tx_type_pruning = 0;
+  tx_sf->txb_split_cap = 1;
+  tx_sf->adaptive_txb_search_level = 0;
+  tx_sf->use_intra_txb_hash = 0;
+  tx_sf->use_inter_txb_hash = 1;
+  tx_sf->refine_fast_tx_search_results = 1;
+}
+
+static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf,
+                                  const AV1_COMP *cpi) {
+  if (cpi->oxcf.disable_trellis_quant == 3) {
+    rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+                                       ? NO_ESTIMATE_YRD_TRELLIS_OPT
+                                       : NO_TRELLIS_OPT;
+  } else if (cpi->oxcf.disable_trellis_quant == 2) {
+    rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+                                       ? FINAL_PASS_TRELLIS_OPT
+                                       : NO_TRELLIS_OPT;
+  } else if (cpi->oxcf.disable_trellis_quant == 0) {
+    if (is_lossless_requested(&cpi->oxcf)) {
+      rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
+    } else {
+      rd_sf->optimize_coefficients = FULL_TRELLIS_OPT;
+    }
+  } else if (cpi->oxcf.disable_trellis_quant == 1) {
+    rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
+  } else {
+    assert(0 && "Invalid disable_trellis_quant value");
+  }
+  // TODO(sarahparker) Pair this with a speed setting once experiments are done
+  rd_sf->trellis_eob_fast = 0;
+  rd_sf->use_mb_rd_hash = 1;
+  rd_sf->optimize_b_precheck = 0;
+  rd_sf->use_fast_coef_costing = 0;
+  rd_sf->simple_model_rd_from_var = 0;
+  rd_sf->tx_domain_dist_level = 0;
+  rd_sf->tx_domain_dist_thres_level = 0;
+  rd_sf->use_hash_based_trellis = 0;
+  rd_sf->perform_coeff_opt = 0;
+}
+
+static AOM_INLINE void init_winner_mode_sf(
+    WINNER_MODE_SPEED_FEATURES *winner_mode_sf) {
+  winner_mode_sf->motion_mode_for_winner_cand = 0;
+  // Set this at the appropriate speed levels
+  winner_mode_sf->tx_size_search_level = USE_FULL_RD;
+  winner_mode_sf->enable_winner_mode_for_coeff_opt = 0;
+  winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
+  winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
+  winner_mode_sf->enable_multiwinner_mode_process = 0;
+}
+
+static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
+  lpf_sf->disable_loop_restoration_chroma = 0;
+  lpf_sf->prune_wiener_based_on_src_var = 0;
+  lpf_sf->prune_sgr_based_on_wiener = 0;
+  lpf_sf->enable_sgr_ep_pruning = 0;
+  lpf_sf->reduce_wiener_window_size = 0;
+  lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+  lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH;
+  // Set decoder side speed feature to use less dual sgr modes
+  lpf_sf->dual_sgr_penalty_level = 0;
+  lpf_sf->disable_lr_filter = 0;
+}
+
+static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
+  rt_sf->mode_search_skip_flags = 0;
+  rt_sf->skip_interp_filter_search = 0;
+  rt_sf->force_tx_search_off = 0;
+  rt_sf->num_inter_modes_for_tx_search = INT_MAX;
+  rt_sf->use_simple_rd_model = 0;
+  rt_sf->nonrd_check_partition_merge_mode = 0;
+  rt_sf->nonrd_check_partition_split = 0;
+}
+
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
   SPEED_FEATURES *const sf = &cpi->sf;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
   if (oxcf->mode == GOOD) {
     set_good_speed_feature_framesize_dependent(cpi, sf, speed);
+  } else if (oxcf->mode == REALTIME) {
+    set_rt_speed_feature_framesize_dependent(cpi, sf, speed);
   }
 
   // This is only used in motion vector unit test.
   if (cpi->oxcf.motion_vector_unit_test == 1)
-    cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+    cpi->mv_search_params.find_fractional_mv_step = av1_return_max_sub_pixel_mv;
   else if (cpi->oxcf.motion_vector_unit_test == 2)
-    cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+    cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+
+  MACROBLOCK *const x = &cpi->td.mb;
+  AV1_COMMON *const cm = &cpi->common;
+  x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size,
+                                 dim_to_size(cpi->oxcf.min_partition_size));
+  x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size,
+                                 dim_to_size(cpi->oxcf.max_partition_size));
+  x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
+  x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
 }
 
 void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
   AV1_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCK *const x = &cpi->td.mb;
+  WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   int i;
 
-  // best quality defaults
-  sf->frame_parameter_update = 1;
-  sf->mv.search_method = NSTEP;
-  sf->recode_loop = ALLOW_RECODE;
-  sf->mv.subpel_search_method = SUBPEL_TREE;
-  sf->mv.subpel_iters_per_step = 2;
-  sf->mv.subpel_force_stop = EIGHTH_PEL;
-  if (cpi->oxcf.disable_trellis_quant == 3) {
-    sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
-                                    ? NO_ESTIMATE_YRD_TRELLIS_OPT
-                                    : NO_TRELLIS_OPT;
-  } else if (cpi->oxcf.disable_trellis_quant == 2) {
-    sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
-                                    ? FINAL_PASS_TRELLIS_OPT
-                                    : NO_TRELLIS_OPT;
-  } else if (cpi->oxcf.disable_trellis_quant == 0) {
-    if (is_lossless_requested(&cpi->oxcf))
-      sf->optimize_coefficients = NO_TRELLIS_OPT;
-    else
-      sf->optimize_coefficients = FULL_TRELLIS_OPT;
-  } else if (cpi->oxcf.disable_trellis_quant == 1) {
-    sf->optimize_coefficients = NO_TRELLIS_OPT;
-  } else {
-    assert(0 && "Invalid disable_trellis_quant value");
-  }
-  sf->gm_erroradv_type = GM_ERRORADV_TR_0;
-  sf->mv.reduce_first_step_size = 0;
-  sf->mv.auto_mv_step_size = 0;
-  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
-  sf->adaptive_rd_thresh = 0;
-  // TODO(sarahparker) Pair this with a speed setting once experiments are done
-  sf->trellis_eob_fast = 0;
-  sf->tx_size_search_method = cpi->oxcf.tx_size_search_method;
-  sf->inter_tx_size_search_init_depth_sqr = 0;
-  sf->inter_tx_size_search_init_depth_rect = 0;
-  sf->intra_tx_size_search_init_depth_rect = 0;
-  sf->intra_tx_size_search_init_depth_sqr = 0;
-  sf->tx_size_search_lgr_block = 0;
-  sf->model_based_prune_tx_search_level = 0;
-  sf->model_based_post_interp_filter_breakout = 0;
-  sf->model_based_motion_mode_rd_breakout = 0;
-  sf->reduce_inter_modes = 0;
-  sf->selective_ref_gm = 1;
-  sf->adaptive_motion_search = 0;
-  sf->adaptive_pred_interp_filter = 0;
-  sf->adaptive_mode_search = 0;
-  sf->alt_ref_search_fp = 0;
-  sf->partition_search_type = SEARCH_PARTITION;
-  sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
-  sf->tx_type_search.ml_tx_split_thresh = 30;
-  sf->tx_type_search.use_skip_flag_prediction = 1;
-  sf->tx_type_search.fast_intra_tx_type_search = 0;
-  sf->tx_type_search.fast_inter_tx_type_search = 0;
-  sf->tx_type_search.skip_tx_search = 0;
-  sf->selective_ref_frame = 0;
-  sf->less_rectangular_check_level = 0;
-  sf->use_square_partition_only_threshold = BLOCK_128X128;
-  sf->prune_ref_frame_for_rect_partitions = 0;
-  sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
-  sf->auto_min_partition_based_on_simple_motion = 0;
-  sf->rd_auto_partition_min_limit = BLOCK_4X4;
-  sf->default_max_partition_size = BLOCK_LARGEST;
-  sf->default_min_partition_size = BLOCK_4X4;
-  sf->adjust_partitioning_from_last_frame = 0;
-  sf->mode_search_skip_flags = 0;
-  sf->disable_filter_search_var_thresh = 0;
-  sf->allow_partition_search_skip = 0;
-  sf->use_accurate_subpel_search = USE_8_TAPS;
-  sf->disable_wedge_search_edge_thresh = 0;
-  sf->use_first_partition_pass_interintra_stats = 0;
-  sf->disable_wedge_search_var_thresh = 0;
-  sf->disable_loop_restoration_chroma = 0;
-  sf->fast_wedge_sign_estimate = 0;
-  sf->prune_wedge_pred_diff_based = 0;
-  sf->drop_ref = 0;
-  sf->skip_intra_in_interframe = 1;
-  sf->txb_split_cap = 1;
-  sf->adaptive_txb_search_level = 0;
-  sf->two_pass_partition_search = 0;
-  sf->firstpass_simple_motion_search_early_term = 0;
-  sf->use_intra_txb_hash = 0;
-  sf->use_inter_txb_hash = 1;
-  sf->use_mb_rd_hash = 1;
-  sf->optimize_b_precheck = 0;
-  sf->two_loop_comp_search = 1;
-  sf->second_loop_comp_fast_tx_search = 0;
-  sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
-  sf->reuse_inter_intra_mode = 0;
-  sf->intra_angle_estimation = 0;
-  sf->skip_obmc_in_uniform_mv_field = 0;
-  sf->skip_wm_in_uniform_mv_field = 0;
-  sf->adaptive_interp_filter_search = 0;
-
-  for (i = 0; i < TX_SIZES; i++) {
-    sf->intra_y_mode_mask[i] = INTRA_ALL;
-    sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
-  }
-  sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
-  sf->use_fast_coef_costing = 0;
-  sf->max_intra_bsize = BLOCK_LARGEST;
-  // This setting only takes effect when partition_search_type is set
-  // to FIXED_PARTITION.
-  sf->always_this_block_size = BLOCK_16X16;
-  // Recode loop tolerance %.
-  sf->recode_tolerance = 25;
-  sf->partition_search_breakout_dist_thr = 0;
-  sf->partition_search_breakout_rate_thr = 0;
-  sf->simple_model_rd_from_var = 0;
-  sf->prune_ext_partition_types_search_level = 0;
-  sf->ml_prune_rect_partition = 0;
-  sf->ml_prune_ab_partition = 0;
-  sf->ml_prune_4_partition = 0;
-  sf->fast_cdef_search = 0;
-  for (i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
-    sf->ml_partition_search_breakout_thresh[i] = -1;  // -1 means not enabled.
-  }
-  sf->simple_motion_search_split_only = 0;
-  sf->simple_motion_search_prune_rect = 0;
-  sf->simple_motion_search_early_term_none = 0;
-
-  // Set this at the appropriate speed levels
-  sf->use_transform_domain_distortion = 0;
-  sf->gm_search_type = GM_FULL_SEARCH;
-  sf->gm_disable_recode = 0;
-  sf->use_fast_interpolation_filter_search = 0;
-  sf->disable_dual_filter = 0;
-  sf->skip_repeat_interpolation_filter_search = 0;
-  sf->use_hash_based_trellis = 0;
-  sf->prune_comp_search_by_single_result = 0;
-  sf->skip_repeated_newmv = 0;
-  sf->prune_single_motion_modes_by_simple_trans = 0;
-
-  // Set decoder side speed feature to use less dual sgr modes
-  sf->dual_sgr_penalty_level = 0;
-
-  // TODO(angiebird, debargha): Re-evaluate the impact of
-  // inter_mode_rd_model_estimation in conjunction with
-  // model_based_motion_mode_rd_breakout
-  sf->inter_mode_rd_model_estimation = 0;
-  sf->inter_mode_rd_model_estimation_adaptive = 0;
-
-  sf->obmc_full_pixel_search_level = 0;
-  sf->skip_sharp_interp_filter_search = 0;
-  sf->prune_comp_type_by_comp_avg = 0;
-  sf->disable_interinter_wedge_newmv_search = 0;
-  sf->disable_smooth_interintra = 0;
-  sf->prune_motion_mode_level = 0;
-  sf->prune_warp_using_wmtype = 0;
-  sf->disable_wedge_interintra_search = 0;
-  sf->perform_coeff_opt = 0;
-  sf->prune_comp_type_by_model_rd = 0;
-  sf->disable_smooth_intra = 0;
-  sf->perform_best_rd_based_gating_for_chroma = 0;
+  init_hl_sf(&sf->hl_sf);
+  init_tpl_sf(&sf->tpl_sf);
+  init_gm_sf(&sf->gm_sf);
+  init_part_sf(&sf->part_sf);
+  init_mv_sf(&sf->mv_sf);
+  init_inter_sf(&sf->inter_sf);
+  init_interp_sf(&sf->interp_sf);
+  init_intra_sf(&sf->intra_sf);
+  init_tx_sf(&sf->tx_sf);
+  init_rd_sf(&sf->rd_sf, cpi);
+  init_winner_mode_sf(&sf->winner_mode_sf);
+  init_lpf_sf(&sf->lpf_sf);
+  init_rt_sf(&sf->rt_sf);
 
   if (oxcf->mode == GOOD)
     set_good_speed_features_framesize_independent(cpi, sf, speed);
@@ -799,97 +1173,150 @@
     set_rt_speed_features_framesize_independent(cpi, sf, speed);
 
   if (!cpi->seq_params_locked) {
-    cpi->common.seq_params.enable_dual_filter &= !sf->disable_dual_filter;
+    cpi->common.seq_params.enable_dual_filter &=
+        !sf->interp_sf.disable_dual_filter;
+    cpi->common.seq_params.enable_restoration &= !sf->lpf_sf.disable_lr_filter;
   }
 
-  // sf->partition_search_breakout_dist_thr is set assuming max 64x64
+  // sf->part_sf.partition_search_breakout_dist_thr is set assuming max 64x64
   // blocks. Normalise this if the blocks are bigger.
   if (MAX_SB_SIZE_LOG2 > 6) {
-    sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6);
+    sf->part_sf.partition_search_breakout_dist_thr <<=
+        2 * (MAX_SB_SIZE_LOG2 - 6);
   }
 
-  cpi->diamond_search_sad = av1_diamond_search_sad;
-
-  sf->allow_exhaustive_searches = 1;
-
   const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED);
-  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
-    sf->exhaustive_searches_thresh = (1 << 24);
-  else
-    sf->exhaustive_searches_thresh = (1 << 25);
-  sf->max_exaustive_pct = good_quality_max_mesh_pct[mesh_speed];
-  if (mesh_speed > 0)
-    sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
-
   for (i = 0; i < MAX_MESH_STEP; ++i) {
-    sf->mesh_patterns[i].range =
+    sf->mv_sf.mesh_patterns[i].range =
         good_quality_mesh_patterns[mesh_speed][i].range;
-    sf->mesh_patterns[i].interval =
+    sf->mv_sf.mesh_patterns[i].interval =
         good_quality_mesh_patterns[mesh_speed][i].interval;
   }
-  if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
-      (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
-       cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
-    for (i = 0; i < MAX_MESH_STEP; ++i) {
-      sf->mesh_patterns[i].range = intrabc_mesh_patterns[mesh_speed][i].range;
-      sf->mesh_patterns[i].interval =
-          intrabc_mesh_patterns[mesh_speed][i].interval;
-    }
-    sf->max_exaustive_pct = intrabc_max_mesh_pct[mesh_speed];
+
+  // Update the mesh pattern of exhaustive motion search for intraBC
+  // Though intraBC mesh pattern is populated for all frame types, it is used
+  // only for intra frames of screen contents
+  for (i = 0; i < MAX_MESH_STEP; ++i) {
+    sf->mv_sf.intrabc_mesh_patterns[i].range =
+        intrabc_mesh_patterns[mesh_speed][i].range;
+    sf->mv_sf.intrabc_mesh_patterns[i].interval =
+        intrabc_mesh_patterns[mesh_speed][i].interval;
   }
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
-  if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT;
+  if (is_stat_generation_stage(cpi))
+    sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
 
   // No recode or trellis for 1 pass.
-  if (oxcf->pass == 0) {
-    sf->recode_loop = DISALLOW_RECODE;
-    sf->optimize_coefficients = NO_TRELLIS_OPT;
-  }
-  // FIXME: trellis not very efficient for quantization matrices
-  if (oxcf->using_qm) sf->optimize_coefficients = NO_TRELLIS_OPT;
+  if (oxcf->pass == 0) sf->hl_sf.recode_loop = DISALLOW_RECODE;
 
-  if (sf->mv.subpel_search_method == SUBPEL_TREE) {
-    cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
-  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
-    cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned;
-  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
-    cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_more;
-  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
-    cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore;
+  MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
+  if (sf->mv_sf.subpel_search_method == SUBPEL_TREE) {
+    mv_search_params->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
+  } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED) {
+    mv_search_params->find_fractional_mv_step =
+        av1_find_best_sub_pixel_tree_pruned;
+  } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+    mv_search_params->find_fractional_mv_step =
+        av1_find_best_sub_pixel_tree_pruned_more;
+  } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+    mv_search_params->find_fractional_mv_step =
+        av1_find_best_sub_pixel_tree_pruned_evenmore;
   }
 
-  x->min_partition_size = sf->default_min_partition_size;
-  x->max_partition_size = sf->default_max_partition_size;
+  x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size,
+                                 dim_to_size(cpi->oxcf.min_partition_size));
+  x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size,
+                                 dim_to_size(cpi->oxcf.max_partition_size));
+  x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
+  x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
 
   // This is only used in motion vector unit test.
   if (cpi->oxcf.motion_vector_unit_test == 1)
-    cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+    mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
   else if (cpi->oxcf.motion_vector_unit_test == 2)
-    cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
-  cpi->max_comp_type_rd_threshold_mul =
-      comp_type_rd_threshold_mul[sf->prune_comp_type_by_comp_avg];
-  cpi->max_comp_type_rd_threshold_div =
-      comp_type_rd_threshold_div[sf->prune_comp_type_by_comp_avg];
-  const int tx_domain_speed = AOMMIN(speed, MAX_TX_DOMAIN_EVAL_SPEED);
-  cpi->tx_domain_dist_threshold = tx_domain_dist_thresholds[tx_domain_speed];
+    mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+
+  // assert ensures that tx_domain_dist_level is accessed correctly
+  assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 &&
+         cpi->sf.rd_sf.tx_domain_dist_thres_level < 3);
+  memcpy(winner_mode_params->tx_domain_dist_threshold,
+         tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level],
+         sizeof(winner_mode_params->tx_domain_dist_threshold));
+
+  assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 &&
+         cpi->sf.rd_sf.tx_domain_dist_level < 3);
+  memcpy(winner_mode_params->use_transform_domain_distortion,
+         tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level],
+         sizeof(winner_mode_params->use_transform_domain_distortion));
 
   // assert ensures that coeff_opt_dist_thresholds is accessed correctly
-  assert(cpi->sf.perform_coeff_opt >= 0 && cpi->sf.perform_coeff_opt < 5);
-  cpi->coeff_opt_dist_threshold =
-      coeff_opt_dist_thresholds[cpi->sf.perform_coeff_opt];
+  assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 &&
+         cpi->sf.rd_sf.perform_coeff_opt < 6);
+  memcpy(winner_mode_params->coeff_opt_dist_threshold,
+         coeff_opt_dist_thresholds[cpi->sf.rd_sf.perform_coeff_opt],
+         sizeof(winner_mode_params->coeff_opt_dist_threshold));
 
-#if CONFIG_DIST_8X8
-  if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
+  // assert ensures that predict_skip_levels is accessed correctly
+  assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 &&
+         cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3);
+  memcpy(winner_mode_params->predict_skip_level,
+         predict_skip_levels[cpi->sf.tx_sf.tx_type_search
+                                 .use_skip_flag_prediction],
+         sizeof(winner_mode_params->predict_skip_level));
 
-  if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8;
-#endif  // CONFIG_DIST_8X8
+  // assert ensures that tx_size_search_level is accessed correctly
+  assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 &&
+         cpi->sf.winner_mode_sf.tx_size_search_level < 3);
+  memcpy(winner_mode_params->tx_size_search_methods,
+         tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level],
+         sizeof(winner_mode_params->tx_size_search_methods));
+
   if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) {
-    sf->adaptive_rd_thresh = 0;
-    if (sf->inter_mode_rd_model_estimation == 1) {
-      sf->inter_mode_rd_model_estimation = 0;
-      sf->inter_mode_rd_model_estimation_adaptive = 0;
+    if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
+      // Revert to type 2
+      sf->inter_sf.inter_mode_rd_model_estimation = 2;
+    }
+  }
+}
+
+// Override some speed features based on qindex
+void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
+  AV1_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
+  const int boosted = frame_is_boosted(cpi);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  if (is_720p_or_larger && cpi->oxcf.mode == GOOD && speed == 0) {
+    if (cm->quant_params.base_qindex <= 80) {
+      sf->rd_sf.perform_coeff_opt = 2;
+      memcpy(winner_mode_params->coeff_opt_dist_threshold,
+             coeff_opt_dist_thresholds[sf->rd_sf.perform_coeff_opt],
+             sizeof(winner_mode_params->coeff_opt_dist_threshold));
+      sf->part_sf.simple_motion_search_split =
+          cm->features.allow_screen_content_tools ? 1 : 2;
+      sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+      sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+      sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+    }
+  }
+
+  if (cpi->oxcf.mode == GOOD && speed >= 3) {
+    // Disable extended partitions for lower quantizers
+    if (cm->quant_params.base_qindex <= 100 &&
+        !cm->features.allow_screen_content_tools && !boosted) {
+      sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+    }
+  }
+
+  if (cpi->oxcf.mode == GOOD && speed >= 4) {
+    // Disable extended partitions for lower quantizers
+    const int qindex_thresh = boosted ? 80 : 120;
+    if (cm->quant_params.base_qindex <= qindex_thresh &&
+        !cm->features.allow_screen_content_tools &&
+        !frame_is_intra_only(&cpi->common)) {
+      sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
     }
   }
 }
diff --git a/libaom/av1/encoder/speed_features.h b/libaom/av1/encoder/speed_features.h
index a321192..d12c3c0 100644
--- a/libaom/av1/encoder/speed_features.h
+++ b/libaom/av1/encoder/speed_features.h
@@ -18,6 +18,38 @@
 extern "C" {
 #endif
 
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+  int range;
+  int interval;
+} MESH_PATTERN;
+
+enum {
+  GM_FULL_SEARCH,
+  GM_REDUCED_REF_SEARCH_SKIP_L2_L3,
+  GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2,
+  GM_DISABLE_SEARCH
+} UENUM1BYTE(GM_SEARCH_TYPE);
+
+enum {
+  GM_ERRORADV_TR_0,
+  GM_ERRORADV_TR_1,
+  GM_ERRORADV_TR_2,
+  GM_ERRORADV_TR_TYPES,
+} UENUM1BYTE(GM_ERRORADV_TYPE);
+
+enum {
+  FULL_TXFM_RD,
+  LOW_TXFM_RD,
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+  DIST_WTD_COMP_ENABLED,
+  DIST_WTD_COMP_SKIP_MV_SEARCH,
+  DIST_WTD_COMP_DISABLED,
+} UENUM1BYTE(DIST_WTD_COMP_FLAG);
+
 enum {
   INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
               (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) |
@@ -85,16 +117,6 @@
 } UENUM1BYTE(DEV_SPEED_FEATURES);
 
 enum {
-  DIAMOND = 0,
-  NSTEP = 1,
-  HEX = 2,
-  BIGDIA = 3,
-  SQUARE = 4,
-  FAST_HEX = 5,
-  FAST_DIAMOND = 6
-} UENUM1BYTE(SEARCH_METHODS);
-
-enum {
   // No recode.
   DISALLOW_RECODE = 0,
   // Allow recode for KF and exceeding maximum frame bandwidth.
@@ -122,6 +144,8 @@
 enum {
   // Try the full image with different values.
   LPF_PICK_FROM_FULL_IMAGE,
+  // Try the full image filter search with non-dual filter only.
+  LPF_PICK_FROM_FULL_IMAGE_NON_DUAL,
   // Try a small portion of the image with different values.
   LPF_PICK_FROM_SUBIMAGE,
   // Estimate the level based on quantizer and frame type
@@ -131,6 +155,14 @@
 } UENUM1BYTE(LPF_PICK_METHOD);
 
 enum {
+  CDEF_FULL_SEARCH,
+  CDEF_FAST_SEARCH_LVL1,  // Search among a subset of all possible filters.
+  CDEF_FAST_SEARCH_LVL2,  // Search reduced subset of filters than Level 1.
+  CDEF_PICK_FROM_Q,       // Estimate filter strength based on quantizer.
+  CDEF_PICK_METHODS
+} UENUM1BYTE(CDEF_PICK_METHOD);
+
+enum {
   // Terminate search early based on distortion so far compared to
   // qp step, distortion in the neighborhood of the frame, etc.
   FLAG_EARLY_TERMINATE = 1 << 0,
@@ -151,15 +183,14 @@
 
 enum {
   NO_PRUNE = 0,
-  // eliminates one tx type in vertical and horizontal direction
-  PRUNE_ONE = 1,
-  // eliminates two tx types in each direction
-  PRUNE_TWO = 2,
   // adaptively prunes the least perspective tx types out of all 16
   // (tuned to provide negligible quality loss)
-  PRUNE_2D_ACCURATE = 3,
+  PRUNE_2D_ACCURATE = 1,
   // similar, but applies much more aggressive pruning to get better speed-up
-  PRUNE_2D_FAST = 4,
+  PRUNE_2D_FAST = 2,
+  PRUNE_2D_MORE = 3,
+  // More aggressive pruning based on tx type score and allowed tx count
+  PRUNE_2D_AGGRESSIVE = 4,
 } UENUM1BYTE(TX_TYPE_PRUNE_MODE);
 
 typedef struct {
@@ -167,6 +198,9 @@
   int fast_intra_tx_type_search;
   int fast_inter_tx_type_search;
 
+  // prune two least frequently chosen transforms for each intra mode
+  int use_reduced_intra_txset;
+
   // Use a skip flag prediction model to detect blocks with skip = 1 early
   // and avoid doing full TX type search for such blocks.
   int use_skip_flag_prediction;
@@ -177,6 +211,17 @@
   // skip remaining transform type search when we found the rdcost of skip is
   // better than applying transform
   int skip_tx_search;
+
+  // Prune tx type search using previous frame stats.
+  int prune_tx_type_using_stats;
+  // Prune tx type search using estimated RDcost
+  int prune_tx_type_est_rd;
+
+  // Flag used to control the winner mode processing for tx type pruning for
+  // inter blocks. It enables further tx type mode pruning based on ML model for
+  // mode evaluation and disables tx type mode pruning for winner mode
+  // processing.
+  int enable_winner_mode_tx_type_pruning;
 } TX_TYPE_SEARCH;
 
 enum {
@@ -192,27 +237,197 @@
 } UENUM1BYTE(PARTITION_SEARCH_TYPE);
 
 enum {
-  EIGHTH_PEL,
-  QUARTER_PEL,
-  HALF_PEL,
-  FULL_PEL
-} UENUM1BYTE(SUBPEL_FORCE_STOP);
-
-enum {
   NOT_IN_USE,
   DIRECT_PRED,
   RELAXED_PRED,
   ADAPT_PRED
 } UENUM1BYTE(MAX_PART_PRED_MODE);
 
+enum {
+  LAST_MV_DATA,
+  CURRENT_Q,
+  QTR_ONLY,
+} UENUM1BYTE(MV_PREC_LOGIC);
+
+typedef struct HIGH_LEVEL_SPEED_FEATURES {
+  // Frame level coding parameter update
+  int frame_parameter_update;
+
+  RECODE_LOOP_TYPE recode_loop;
+
+  // This feature controls the tolerance vs target used in deciding whether to
+  // recode a frame. It has no meaning if recode is disabled.
+  int recode_tolerance;
+
+  // Determine how motion vector precision is chosen. The possibilities are:
+  // LAST_MV_DATA: use the mv data from the last coded frame
+  // CURRENT_Q: use the current q as a threshold
+  // QTR_ONLY: use quarter pel precision only.
+  MV_PREC_LOGIC high_precision_mv_usage;
+
+  // Whether to disable overlay frames for filtered Altref frames,
+  // overriding oxcf->enable_overlay flag set as 1.
+  int disable_overlay_frames;
+
+  // Enable/disable adaptively deciding whether or not to encode ALTREF overlay
+  // frame.
+  int adaptive_overlay_encoding;
+
+  // Always set to 0. If on it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes make the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
+  int static_segmentation;
+
+  // Enable/disable second_alt_ref temporal filtering.
+  int second_alt_ref_filtering;
+} HIGH_LEVEL_SPEED_FEATURES;
+
+typedef struct TPL_SPEED_FEATURES {
+  // Prune the intra modes search by tpl.
+  // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
+  // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
+  int prune_intra_modes;
+  // This parameter controls which step in the n-step process we start at.
+  int reduce_first_step_size;
+  // Skip motion estimation based on the precision of center MVs and the
+  // difference between center MVs.
+  // If set to 0, motion estimation is skipped for duplicate center MVs
+  // (default). If set to 1, motion estimation is skipped for duplicate
+  // full-pixel center MVs. If set to 2, motion estimation is skipped if the
+  // difference between center MVs is less than the threshold.
+  int skip_alike_starting_mv;
+
+  // When to stop subpel search.
+  SUBPEL_FORCE_STOP subpel_force_stop;
+} TPL_SPEED_FEATURES;
+
+typedef struct GLOBAL_MOTION_SPEED_FEATURES {
+  // Global motion warp error threshold
+  GM_ERRORADV_TYPE gm_erroradv_type;
+
+  // Disable adaptive threshold for global motion warp error
+  int disable_adaptive_warp_error_thresh;
+
+  // Do not compute the global motion parameters for a LAST2_FRAME or
+  // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity
+  // global model.
+  int selective_ref_gm;
+
+  GM_SEARCH_TYPE gm_search_type;
+
+  // whether to disable the global motion recode loop
+  int gm_disable_recode;
+
+  // During global motion estimation, prune remaining reference frames in a
+  // given direction(past/future), if the evaluated ref_frame in that direction
+  // yields gm_type as INVALID/TRANSLATION/IDENTITY
+  int prune_ref_frame_for_gm_search;
+} GLOBAL_MOTION_SPEED_FEATURES;
+
+typedef struct PARTITION_SPEED_FEATURES {
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_SIZE_PARTITION
+  BLOCK_SIZE always_this_block_size;
+
+  // Prune extended partition types search
+  // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
+  // aggressiveness of pruning in order.
+  int prune_ext_partition_types_search_level;
+
+  // Use a ML model to prune horz and vert partitions
+  int ml_prune_rect_partition;
+
+  // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
+  int ml_prune_ab_partition;
+
+  // Use a ML model to prune horz4 and vert4 partitions.
+  int ml_prune_4_partition;
+
+  // Use a ML model to adaptively terminate partition search after trying
+  // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and
+  // 1 - 2 increasing aggressiveness in order.
+  int ml_early_term_after_part_split_level;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split. Can take values 0 - 2, 0 referring to no
+  // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+  int less_rectangular_check_level;
+
+  // Use square partition only beyond this block size.
+  BLOCK_SIZE use_square_partition_only_threshold;
+
+  // Sets min and max square partition levels for this superblock based on
+  // motion vector and prediction error distribution produced from 16x16
+  // simple motion search
+  MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+  int auto_min_partition_based_on_simple_motion;
+
+  // Min and max square partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
+  BLOCK_SIZE default_min_partition_size;
+  BLOCK_SIZE default_max_partition_size;
+
+  // Sets level of adjustment of variance-based partitioning during
+  // rd_use_partition 0 - no partition adjustment, 1 - try to merge partitions
+  // for small blocks and high QP, 2 - always try to merge leaf partitions, 3 -
+  // try to merge and split leaf partitions
+  int adjust_var_based_rd_partitioning;
+
+  // Partition search early breakout thresholds.
+  int64_t partition_search_breakout_dist_thr;
+  int partition_search_breakout_rate_thr;
+
+  // Thresholds for ML based partition search breakout.
+  int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+  // Allow skipping partition search for still image frame
+  int allow_partition_search_skip;
+
+  // The aggressiveness of pruning with simple_motion_search.
+  // Currently 0 is the lowest, and 2 the highest.
+  int simple_motion_search_prune_agg;
+
+  // Perform simple_motion_search on each possible subblock and use it to prune
+  // PARTITION_HORZ and PARTITION_VERT.
+  int simple_motion_search_prune_rect;
+
+  // Perform simple motion search before none_partition to decide if we
+  // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this
+  // model is disabled. If set to 1, the model attempts to perform
+  // PARTITION_SPLIT only. If set to 2, the model also attempts to prune
+  // PARTITION_SPLIT.
+  int simple_motion_search_split;
+
+  // Use features from simple_motion_search to terminate prediction block
+  // partition after PARTITION_NONE
+  int simple_motion_search_early_term_none;
+
+  // This variable controls the maximum block size where intra blocks can be
+  // used in inter frames.
+  // TODO(aconverse): Fold this into one of the other many mode skips
+  BLOCK_SIZE max_intra_bsize;
+
+  // Use CNN with luma pixels on source frame on each of the 64x64 subblock to
+  // perform split/no_split decision on intra-frames.
+  int intra_cnn_split;
+
+  // Disable extended partition search for lower block sizes.
+  int ext_partition_eval_thresh;
+
+  // Prune 1:4 partition search based on winner info from split partitions
+  int prune_4_partition_using_split_info;
+
+  // Prune AB partition search using split and HORZ/VERT info
+  int prune_ab_partition_using_split_info;
+} PARTITION_SPEED_FEATURES;
+
 typedef struct MV_SPEED_FEATURES {
   // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
   SEARCH_METHODS search_method;
 
-  // This parameter controls which step in the n-step process we start at.
-  // It's changed adaptively based on circumstances.
-  int reduce_first_step_size;
-
   // If this is set to 1, we limit the motion search range to 2 times the
   // largest motion vector found in the last frame.
   int auto_mv_step_size;
@@ -228,81 +443,127 @@
 
   // When to stop subpel search.
   SUBPEL_FORCE_STOP subpel_force_stop;
+
+  // When to stop subpel search in simple motion search.
+  SUBPEL_FORCE_STOP simple_motion_subpel_force_stop;
+
+  // If true, sub-pixel search uses the exact convolve function used for final
+  // encoding and decoding; otherwise, it uses bilinear interpolation.
+  SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
+
+  // TODO(jingning): combine the related motion search speed features
+  // This allows us to use motion search at other sizes as a starting
+  // point for this motion search and limits the search range around it.
+  int adaptive_motion_search;
+
+  // Threshold for allowing exhaustive motion search.
+  int exhaustive_searches_thresh;
+
+  // Pattern to be used for any exhaustive mesh searches (except intraBC ME).
+  MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+  // Pattern to be used for exhaustive mesh searches of intraBC ME.
+  MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP];
+
+  // Reduce single motion search range based on MV result of prior ref_mv_idx.
+  int reduce_search_range;
+
+  // Prune mesh search.
+  int prune_mesh_search;
+
+  // Use the rd cost around the best FULLPEL_MV to speed up subpel search
+  int use_fullpel_costlist;
+
+  // Set the full pixel search level of obmc
+  // 0: obmc_full_pixel_diamond
+  // 1: obmc_refining_search_sad (faster)
+  int obmc_full_pixel_search_level;
+
+  // Accurate full pixel motion search based on TPL stats.
+  int full_pixel_search_level;
 } MV_SPEED_FEATURES;
 
-#define MAX_MESH_STEP 4
+typedef struct INTER_MODE_SPEED_FEATURES {
+  // 2-pass inter mode model estimation where the preliminary pass skips
+  // transform search and uses a model to estimate rd, while the final pass
+  // computes the full transform search. Two types of models are supported:
+  // 0: not used
+  // 1: used with online dynamic rd model
+  // 2: used with static rd model
+  int inter_mode_rd_model_estimation;
 
-typedef struct MESH_PATTERN {
-  int range;
-  int interval;
-} MESH_PATTERN;
-
-enum {
-  GM_FULL_SEARCH,
-  GM_REDUCED_REF_SEARCH_SKIP_L2_L3,
-  GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2,
-  GM_DISABLE_SEARCH
-} UENUM1BYTE(GM_SEARCH_TYPE);
-
-enum {
-  GM_ERRORADV_TR_0,
-  GM_ERRORADV_TR_1,
-  GM_ERRORADV_TR_2,
-  GM_ERRORADV_TR_TYPES,
-} UENUM1BYTE(GM_ERRORADV_TYPE);
-
-enum {
-  NO_TRELLIS_OPT,          // No trellis optimization
-  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
-  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode pass
-  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
-} UENUM1BYTE(TRELLIS_OPT_TYPE);
-
-enum {
-  FULL_TXFM_RD,
-  LOW_TXFM_RD,
-} UENUM1BYTE(TXFM_RD_MODEL);
-
-enum {
-  DIST_WTD_COMP_ENABLED,
-  DIST_WTD_COMP_SKIP_MV_SEARCH,
-  DIST_WTD_COMP_DISABLED,
-} UENUM1BYTE(DIST_WTD_COMP_FLAG);
-
-typedef enum {
-  FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP_REGULAR,
-  FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
-  FLAG_SKIP_EIGHTTAP_SHARP = 1 << MULTITAP_SHARP,
-} INTERP_FILTER_MASK;
-
-typedef struct SPEED_FEATURES {
-  MV_SPEED_FEATURES mv;
-
-  // Frame level coding parameter update
-  int frame_parameter_update;
-
-  RECODE_LOOP_TYPE recode_loop;
-
-  // Trellis (dynamic programming) optimization of quantized values
-  TRELLIS_OPT_TYPE optimize_coefficients;
-
-  // Global motion warp error threshold
-  GM_ERRORADV_TYPE gm_erroradv_type;
-
-  // Always set to 0. If on it enables 0 cost background transmission
-  // (except for the initial transmission of the segmentation). The feature is
-  // disabled because the addition of very large block sizes make the
-  // backgrounds very to cheap to encode, and the segmentation we have
-  // adds overhead.
-  int static_segmentation;
+  // Bypass transform search based on skip rd
+  int txfm_rd_gate_level;
 
   // Limit the inter mode tested in the RD loop
   int reduce_inter_modes;
 
-  // Do not compute the global motion parameters for a LAST2_FRAME or
-  // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity
-  // global model.
-  int selective_ref_gm;
+  // Adaptive prediction mode search
+  int adaptive_mode_search;
+
+  // This variable is used to cap the maximum number of times we skip testing a
+  // mode to be evaluated. A high value means we will be faster.
+  int adaptive_rd_thresh;
+
+  // Aggressively prune inter modes when best mode is skippable.
+  int prune_inter_modes_if_skippable;
+
+  // Drop less likely to be picked reference frames in the RD search.
+  // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more
+  // aggressively than lower ones. (0 means no pruning).
+  int selective_ref_frame;
+
+  // Prune reference frames for rectangular partitions.
+  // 0 implies no pruning
+  // 1 implies prune for extended partition
+  // 2 implies prune horiz, vert and extended partition
+  int prune_ref_frame_for_rect_partitions;
+
+  int alt_ref_search_fp;
+
+  // flag to skip NEWMV mode in drl if the motion search result is the same
+  int skip_repeated_newmv;
+
+  // Skip the current ref_mv in NEW_MV mode if we have already encountered
+  // another ref_mv in the drl such that:
+  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
+  //     search process as the current fullpel_mv.
+  //  2. The rate needed to encode the current fullpel_mv is larger than that
+  //     for the other ref_mv.
+  int skip_repeated_full_newmv;
+
+  // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV,
+  // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found
+  // TODO(any): Instead of skipping repeated ref mv, use the recalculated
+  // rd-cost based on mode rate and skip the mode evaluation
+  int skip_repeated_ref_mv;
+
+  // Flag used to control the ref_best_rd based gating for chroma
+  int perform_best_rd_based_gating_for_chroma;
+
+  // Skip certain motion modes (OBMC, warped, interintra) for single reference
+  // motion search, using the results of single ref SIMPLE_TRANSLATION
+  int prune_single_motion_modes_by_simple_trans;
+
+  // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+  // single ref modes
+  int reuse_inter_intra_mode;
+
+  // prune wedge and compound segment approximate rd evaluation based on
+  // compound average modeled rd
+  int prune_comp_type_by_model_rd;
+
+  // prune wedge and compound segment approximate rd evaluation based on
+  // compound average rd/ref_best_rd
+  int prune_comp_type_by_comp_avg;
+
+  // Skip some ref frames in compound motion search by single motion search
+  // result. Has three levels for now: 0 referring to no skipping, and 1 - 3
+  // increasing aggressiveness of skipping in order.
+  // Note: The search order might affect the result. It assumes that the single
+  // reference modes are searched before compound modes. It is better to search
+  // same single inter mode as a group.
+  int prune_comp_search_by_single_result;
 
   // If 1 we iterate finding a best reference for 2 ref frames together - via
   // a log search that iterates 4 times (check around mv for last for best
@@ -310,15 +571,167 @@
   // we just use the best motion vector found for each frame by itself.
   BLOCK_SIZE comp_inter_joint_search_thresh;
 
-  // This variable is used to cap the maximum number of times we skip testing a
-  // mode to be evaluated. A high value means we will be faster.
-  int adaptive_rd_thresh;
+  // Instead of performing a full MV search, do a simple translation first
+  // and only perform a full MV search on the motion vectors that performed
+  // well.
+  int prune_mode_search_simple_translation;
 
-  // Determine which method we use to determine transform size. We can choose
-  // between options like full rd, largest for prediction size, largest
-  // for intra and model coefs for the rest.
-  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+  // Only search compound modes with at least one "good" reference frame.
+  // A reference frame is good if, after looking at its performance among
+  // the single reference modes, it is one of the two best performers.
+  int prune_compound_using_single_ref;
 
+  // Skip extended compound mode using ref frames of above and left neighbor
+  // blocks.
+  // 0 : no pruning
+  // 1 : prune extended compound mode (less aggressiveness)
+  // 2 : prune extended compound mode (high aggressiveness)
+  int prune_compound_using_neighbors;
+
+  // Based on previous ref_mv_idx search result, prune the following search.
+  int prune_ref_mv_idx_search;
+
+  // Disable one sided compound modes.
+  int disable_onesided_comp;
+
+  // Prune/gate motion mode evaluation based on token based rd
+  // during transform search for inter blocks
+  // Values are 0 (not used) , 1 - 3 with progressively increasing
+  // aggressiveness
+  int prune_motion_mode_level;
+
+  // Prune obmc search using previous frame stats.
+  int prune_obmc_prob_thresh;
+
+  // Disable obmc.
+  int disable_obmc;
+
+  // Gate warp evaluation for motions of type IDENTITY,
+  // TRANSLATION and AFFINE(based on number of warp neighbors)
+  int prune_warp_using_wmtype;
+
+  // Prune warped motion search using previous frame stats.
+  int prune_warped_prob_thresh;
+
+  // Enable/disable interintra wedge search.
+  int disable_wedge_interintra_search;
+
+  // De-couple wedge and mode search during interintra RDO.
+  int fast_interintra_wedge_search;
+
+  // Only enable wedge search if the edge strength is greater than
+  // this threshold. A value of 0 signals that this check is disabled.
+  unsigned int disable_wedge_search_edge_thresh;
+
+  // Only enable wedge search if the variance is above this threshold.
+  unsigned int disable_wedge_search_var_thresh;
+
+  // Whether fast wedge sign estimate is used
+  int fast_wedge_sign_estimate;
+
+  // Whether to prune wedge search based on predictor difference
+  int prune_wedge_pred_diff_based;
+
+  // Enable/disable ME for interinter wedge search.
+  int disable_interinter_wedge_newmv_search;
+
+  // Enable/disable ME for interinter diffwtd search. PSNR BD-rate gain of
+  // ~0.1 on the lowres test set, but ~15% slower computation.
+  int enable_interinter_diffwtd_newmv_search;
+
+  // Enable/disable smooth inter-intra mode
+  int disable_smooth_interintra;
+
+  // Disable interinter_wedge
+  int disable_interinter_wedge;
+
+  // Decide when and how to use joint_comp.
+  DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
+
+  // Whether to override and disable sb level coeff cost updates, if
+  // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level)
+  int disable_sb_level_coeff_cost_upd;
+
+  // Whether to override and disable sb level mv cost updates, if
+  // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level)
+  int disable_sb_level_mv_cost_upd;
+
+  // Prune inter modes based on tpl stats
+  // 0 : no pruning
+  // 1 - 3 indicate increasing aggressiveness in order.
+  int prune_inter_modes_based_on_tpl;
+
+  // Model based breakout after interpolation filter search
+  // 0: no breakout
+  // 1: use model based rd breakout
+  int model_based_post_interp_filter_breakout;
+
+  // Reuse compound type rd decision when exact match is found
+  // 0: No reuse
+  // 1: Reuse the compound type decision
+  int reuse_compound_type_decision;
+} INTER_MODE_SPEED_FEATURES;
+
+typedef struct INTERP_FILTER_SPEED_FEATURES {
+  // A source variance threshold below which filter search is disabled
+  // Choose a very large value (UINT_MAX) to use 8-tap always
+  unsigned int disable_filter_search_var_thresh;
+
+  // Do limited interpolation filter search for dual filters, since best choice
+  // usually includes EIGHTTAP_REGULAR.
+  int use_fast_interpolation_filter_search;
+
+  // Disable dual filter
+  int disable_dual_filter;
+
+  // Save results of av1_interpolation_filter_search for a block
+  // Check mv and ref_frames before search, if they are very close with previous
+  // saved results, filter search can be skipped.
+  int use_interp_filter;
+
+  // skip sharp_filter evaluation based on regular and smooth filter rd for
+  // dual_filter=0 case
+  int skip_sharp_interp_filter_search;
+
+  int cb_pred_filter_search;
+
+  // adaptive interp_filter search to allow skip of certain filter types.
+  int adaptive_interp_filter_search;
+} INTERP_FILTER_SPEED_FEATURES;
+
+typedef struct INTRA_MODE_SPEED_FEATURES {
+  // These bit masks allow you to enable or disable intra modes for each
+  // transform size separately.
+  int intra_y_mode_mask[TX_SIZES];
+  int intra_uv_mode_mask[TX_SIZES];
+
+  // flag to allow skipping intra mode for inter frame prediction
+  int skip_intra_in_interframe;
+
+  // variance threshold for intra mode gating when inter turned out to be skip
+  // in inter frame prediction
+  unsigned int src_var_thresh_intra_skip;
+
+  // Prune intra mode candidates based on source block histogram of gradient.
+  int intra_pruning_with_hog;
+
+  // TODO(anyone): tune intra_pruning_with_hog_thresh for various speeds.
+  float intra_pruning_with_hog_thresh;
+
+  // Enable/disable smooth intra modes.
+  int disable_smooth_intra;
+
+  // prune palette search
+  // 0: No pruning
+  // 1: Perform coarse search to prune the palette colors. For winner colors,
+  // neighbors are also evaluated using a finer search.
+  // 2: Perform 2 way palette search from max colors to min colors (and min
+  // colors to remaining colors) and terminate the search if current number of
+  // palette colors is not the winner.
+  int prune_palette_search_level;
+} INTRA_MODE_SPEED_FEATURES;
+
+typedef struct TX_SPEED_FEATURES {
   // Init search depth for square and rectangular transform partitions.
   // Values:
   // 0 - search full tree, 1: search 1 level, 2: search the highest level only
@@ -326,12 +739,11 @@
   int inter_tx_size_search_init_depth_rect;
   int intra_tx_size_search_init_depth_sqr;
   int intra_tx_size_search_init_depth_rect;
+
   // If any dimension of a coding block size above 64, always search the
   // largest transform only, since the largest transform block size is 64x64.
   int tx_size_search_lgr_block;
 
-  PARTITION_SEARCH_TYPE partition_search_type;
-
   TX_TYPE_SEARCH tx_type_search;
 
   // Skip split transform block partition when the collocated bigger block
@@ -349,208 +761,6 @@
   // 1-2: progressively increasing aggressiveness of pruning
   int model_based_prune_tx_search_level;
 
-  // Model based breakout after interpolation filter search
-  // 0: no breakout
-  // 1: use model based rd breakout
-  int model_based_post_interp_filter_breakout;
-
-  // Model based breakout in motion_mode_rd
-  // 0: no breakout
-  // 1: use model based rd breakout
-  int model_based_motion_mode_rd_breakout;
-
-  // Used if partition_search_type = FIXED_SIZE_PARTITION
-  BLOCK_SIZE always_this_block_size;
-
-  // Drop less likely to be picked reference frames in the RD search.
-  // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more
-  // aggressively than lower ones. (0 means no pruning).
-  int selective_ref_frame;
-
-  // Prune extended partition types search
-  // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
-  // aggressiveness of pruning in order.
-  int prune_ext_partition_types_search_level;
-
-  // Use a ML model to prune horz and vert partitions
-  int ml_prune_rect_partition;
-
-  // Disable/Enable interintra motion mode based on stats collected during
-  // first_partition_search_pass
-  int use_first_partition_pass_interintra_stats;
-
-  // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
-  int ml_prune_ab_partition;
-
-  // Use a ML model to prune horz4 and vert4 partitions.
-  int ml_prune_4_partition;
-
-  int fast_cdef_search;
-
-  // 2-pass coding block partition search, and also use the mode decisions made
-  // in the initial partition search to prune mode candidates, e.g. ref frames.
-  int two_pass_partition_search;
-
-  // Terminate early in firstpass of two_pass partition search for faster
-  // firstpass.
-  int firstpass_simple_motion_search_early_term;
-
-  // Skip rectangular partition test when partition type none gives better
-  // rd than partition type split. Can take values 0 - 2, 0 referring to no
-  // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
-  int less_rectangular_check_level;
-
-  // Use square partition only beyond this block size.
-  BLOCK_SIZE use_square_partition_only_threshold;
-
-  // Prune reference frames for rectangular partitions.
-  // 0 implies no pruning
-  // 1 implies prune for extended partition
-  // 2 implies prune horiz, vert and extended partition
-  int prune_ref_frame_for_rect_partitions;
-
-  // Sets min and max square partition levels for this superblock based on
-  // motion vector and prediction error distribution produced from 16x16
-  // simple motion search
-  MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
-  int auto_min_partition_based_on_simple_motion;
-
-  // Ensures the rd based auto partition search will always
-  // go down at least to the specified level.
-  BLOCK_SIZE rd_auto_partition_min_limit;
-
-  // Min and max partition size we enable (block_size) as per auto
-  // min max, but also used by adjust partitioning, and pick_partitioning.
-  BLOCK_SIZE default_min_partition_size;
-  BLOCK_SIZE default_max_partition_size;
-
-  // Whether or not we allow partitions one smaller or one greater than the last
-  // frame's partitioning. Only used if use_lastframe_partitioning is set.
-  int adjust_partitioning_from_last_frame;
-
-  // TODO(jingning): combine the related motion search speed features
-  // This allows us to use motion search at other sizes as a starting
-  // point for this motion search and limits the search range around it.
-  int adaptive_motion_search;
-
-  // Flag for allowing some use of exhaustive searches;
-  int allow_exhaustive_searches;
-
-  // Threshold for allowing exhaistive motion search.
-  int exhaustive_searches_thresh;
-
-  // Maximum number of exhaustive searches for a frame.
-  int max_exaustive_pct;
-
-  // Pattern to be used for any exhaustive mesh searches.
-  MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
-
-  // Allows sub 8x8 modes to use the prediction filter that was determined
-  // best for 8x8 mode. If set to 0 we always re check all the filters for
-  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
-  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
-  int adaptive_pred_interp_filter;
-
-  // Adaptive prediction mode search
-  int adaptive_mode_search;
-
-  int alt_ref_search_fp;
-
-  // Implements various heuristics to skip searching modes
-  // The heuristics selected are based on  flags
-  // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
-  unsigned int mode_search_skip_flags;
-
-  // A source variance threshold below which filter search is disabled
-  // Choose a very large value (UINT_MAX) to use 8-tap always
-  unsigned int disable_filter_search_var_thresh;
-
-  // Only enable wedge search if the edge strength is greater than
-  // this threshold. A value of 0 signals that this check is disabled.
-  unsigned int disable_wedge_search_edge_thresh;
-
-  // Only enable wedge search if the variance is above this threshold.
-  unsigned int disable_wedge_search_var_thresh;
-
-  // Whether fast wedge sign estimate is used
-  int fast_wedge_sign_estimate;
-
-  // Whether to prune wedge search based on predictor difference
-  int prune_wedge_pred_diff_based;
-
-  // These bit masks allow you to enable or disable intra modes for each
-  // transform size separately.
-  int intra_y_mode_mask[TX_SIZES];
-  int intra_uv_mode_mask[TX_SIZES];
-
-  // This feature controls how the loop filter level is determined.
-  LPF_PICK_METHOD lpf_pick;
-
-  // This feature controls whether we do the expensive context update and
-  // calculation in the rd coefficient costing loop.
-  int use_fast_coef_costing;
-
-  // This feature controls the tolerence vs target used in deciding whether to
-  // recode a frame. It has no meaning if recode is disabled.
-  int recode_tolerance;
-
-  // This variable controls the maximum block size where intra blocks can be
-  // used in inter frames.
-  // TODO(aconverse): Fold this into one of the other many mode skips
-  BLOCK_SIZE max_intra_bsize;
-
-  // Partition search early breakout thresholds.
-  int64_t partition_search_breakout_dist_thr;
-  int partition_search_breakout_rate_thr;
-
-  // Thresholds for ML based partition search breakout.
-  int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
-
-  // Allow skipping partition search for still image frame
-  int allow_partition_search_skip;
-
-  // Fast approximation of av1_model_rd_from_var_lapndz
-  int simple_model_rd_from_var;
-
-  // If true, sub-pixel search uses the exact convolve function used for final
-  // encoding and decoding; otherwise, it uses bilinear interpolation.
-  SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
-
-  // Whether to compute distortion in the image domain (slower but
-  // more accurate), or in the transform domain (faster but less acurate).
-  // 0: use image domain
-  // 1: use transform domain in tx_type search, and use image domain for
-  // RD_STATS
-  // 2: use transform domain
-  int use_transform_domain_distortion;
-
-  GM_SEARCH_TYPE gm_search_type;
-
-  // whether to disable the global motion recode loop
-  int gm_disable_recode;
-
-  // Do limited interpolation filter search for dual filters, since best choice
-  // usually includes EIGHTTAP_REGULAR.
-  int use_fast_interpolation_filter_search;
-
-  // Disable dual filter
-  int disable_dual_filter;
-
-  // Save results of interpolation_filter_search for a block
-  // Check mv and ref_frames before search, if they are same with previous
-  // saved results, it can be skipped.
-  int skip_repeat_interpolation_filter_search;
-
-  // Use a hash table to store previously computed optimized qcoeffs from
-  // expensive calls to optimize_txb.
-  int use_hash_based_trellis;
-
-  // flag to drop some ref frames in compound motion search
-  int drop_ref;
-
-  // flag to allow skipping intra mode for inter frame prediction
-  int skip_intra_in_interframe;
-
   // Use hash table to store intra(keyframe only) txb transform search results
   // to avoid repeated search on the same residue signal.
   int use_intra_txb_hash;
@@ -559,142 +769,254 @@
   // to avoid repeated search on the same residue signal.
   int use_inter_txb_hash;
 
+  // Refine TX type after fast TX search.
+  int refine_fast_tx_search_results;
+} TX_SPEED_FEATURES;
+
+typedef struct RD_CALC_SPEED_FEATURES {
+  // This feature controls whether we do the expensive context update and
+  // calculation in the rd coefficient costing loop.
+  int use_fast_coef_costing;
+
+  // Fast approximation of av1_model_rd_from_var_lapndz
+  int simple_model_rd_from_var;
+
+  // Whether to compute distortion in the image domain (slower but
+  // more accurate), or in the transform domain (faster but less acurate).
+  // 0: use image domain
+  // 1: use transform domain in tx_type search, and use image domain for
+  // RD_STATS
+  // 2: use transform domain
+  int tx_domain_dist_level;
+
+  // Transform domain distortion threshold level
+  int tx_domain_dist_thres_level;
+
+  // Trellis (dynamic programming) optimization of quantized values
+  TRELLIS_OPT_TYPE optimize_coefficients;
+
+  // Use a hash table to store previously computed optimized qcoeffs from
+  // expensive calls to optimize_txb.
+  int use_hash_based_trellis;
+
   // Use hash table to store macroblock RD search results
   // to avoid repeated search on the same residue signal.
   int use_mb_rd_hash;
 
+  // Flag used to control the speed of the eob selection in trellis.
+  int trellis_eob_fast;
+
   // Calculate RD cost before doing optimize_b, and skip if the cost is large.
   int optimize_b_precheck;
 
-  // Use two-loop compound search
-  int two_loop_comp_search;
+  // Flag used to control the extent of coeff R-D optimization
+  int perform_coeff_opt;
+} RD_CALC_SPEED_FEATURES;
 
-  // Use model rd instead of transform search in second loop of compound search
-  int second_loop_comp_fast_tx_search;
+typedef struct WINNER_MODE_SPEED_FEATURES {
+  // Flag used to control the winner mode processing for better R-D optimization
+  // of quantized coeffs
+  int enable_winner_mode_for_coeff_opt;
 
-  // Decide when and how to use joint_comp.
-  DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
+  // Flag used to control the winner mode processing for transform size
+  // search method
+  int enable_winner_mode_for_tx_size_srch;
+
+  // Control transform size search level
+  // Eval type: Default       Mode        Winner
+  // Level 0  : FULL RD     LARGEST ALL   FULL RD
+  // Level 1  : FAST RD     LARGEST ALL   FULL RD
+  // Level 2  : LARGEST ALL LARGEST ALL   FULL RD
+  int tx_size_search_level;
+
+  // Flag used to control the winner mode processing for use transform
+  // domain distortion
+  int enable_winner_mode_for_use_tx_domain_dist;
+
+  // Flag used to enable processing of multiple winner modes
+  int enable_multiwinner_mode_process;
+
+  // Motion mode for winner candidates:
+  // 0: speed feature OFF
+  // 1 / 2 : Use configured number of winner candidates
+  int motion_mode_for_winner_cand;
+} WINNER_MODE_SPEED_FEATURES;
+
+typedef struct LOOP_FILTER_SPEED_FEATURES {
+  // This feature controls how the loop filter level is determined.
+  LPF_PICK_METHOD lpf_pick;
+
+  // Control how the CDEF strength is determined.
+  CDEF_PICK_METHOD cdef_pick_method;
 
   // Decoder side speed feature to add penalty for use of dual-sgr filters.
   // Takes values 0 - 10, 0 indicating no penalty and each additional level
   // adding a penalty of 1%
   int dual_sgr_penalty_level;
 
-  // 2-pass inter mode model estimation where the preliminary pass skips
-  // transform search and uses a model to estimate rd, while the final pass
-  // computes the full transform search. Two types of models are supported:
-  // 0: not used
-  // 1: used with online dynamic rd model
-  // 2: used with static rd model
-  int inter_mode_rd_model_estimation;
-
-  // Skip some ref frames in compound motion search by single motion search
-  // result. Has three levels for now: 0 referring to no skipping, and 1 - 3
-  // increasing aggressiveness of skipping in order.
-  // Note: The search order might affect the result. It is better to search same
-  // single inter mode as a group.
-  int prune_comp_search_by_single_result;
-
-  // Skip certain motion modes (OBMC, warped, interintra) for single reference
-  // motion search, using the results of single ref SIMPLE_TRANSLATION
-  int prune_single_motion_modes_by_simple_trans;
-
-  // Reuse the inter_intra_mode search result from NEARESTMV mode to other
-  // single ref modes
-  int reuse_inter_intra_mode;
-
-  // Set the full pixel search level of obmc
-  // 0: obmc_full_pixel_diamond
-  // 1: obmc_refining_search_sad (faster)
-  int obmc_full_pixel_search_level;
-
-  // flag to skip NEWMV mode in drl if the motion search result is the same
-  int skip_repeated_newmv;
-
-  // Prune intra mode candidates based on source block gradient stats.
-  int intra_angle_estimation;
-
-  // Skip obmc or warped motion mode when neighborhood motion field is
-  // identical
-  int skip_obmc_in_uniform_mv_field;
-  int skip_wm_in_uniform_mv_field;
-
-  // Enable/disable ME for interinter wedge search.
-  int disable_interinter_wedge_newmv_search;
-
-  // Enable/disable smooth inter-intra mode
-  int disable_smooth_interintra;
-
-  // skip sharp_filter evaluation based on regular and smooth filter rd for
-  // dual_filter=0 case
-  int skip_sharp_interp_filter_search;
-
-  // prune wedge and compound segment approximate rd evaluation based on
-  // compound average rd/ref_best_rd
-  int prune_comp_type_by_comp_avg;
-
-  // Prune/gate motion mode evaluation based on token based rd
-  // during transform search for inter blocks
-  // Values are 0 (not used) , 1 - 3 with progressively increasing
-  // aggressiveness
-  int prune_motion_mode_level;
-
-  // Gate warp evaluation for motions of type IDENTITY,
-  // TRANSLATION and AFFINE(based on number of warp neighbors)
-  int prune_warp_using_wmtype;
-
-  // Perform simple_motion_search on each possible subblock and use it to prune
-  // PARTITION_HORZ and PARTITION_VERT.
-  int simple_motion_search_prune_rect;
-
-  // Perform simple motion search before none_partition to decide if we
-  // want to split directly without trying other partition types.
-  int simple_motion_search_split_only;
-
-  // Use features from simple_motion_search to terminate prediction block
-  // partition after PARTITION_NONE
-  int simple_motion_search_early_term_none;
-
-  int cb_pred_filter_search;
-
-  // adaptive interp_filter search to allow skip of certain filter types.
-  int adaptive_interp_filter_search;
-
-  // mask for skip evaluation of certain interp_filter type.
-  INTERP_FILTER_MASK interp_filter_search_mask;
-
-  // Flag used to control the ref_best_rd based gating for chroma
-  int perform_best_rd_based_gating_for_chroma;
-
-  // Enable/disable interintra wedge search.
-  int disable_wedge_interintra_search;
+  // prune sgr ep using binary search like mechanism
+  int enable_sgr_ep_pruning;
 
   // Disable loop restoration for Chroma plane
   int disable_loop_restoration_chroma;
 
-  // Flag used to control the extent of coeff R-D optimization
-  int perform_coeff_opt;
+  // Prune RESTORE_WIENER evaluation based on source variance
+  // 0 : no pruning
+  // 1 : conservative pruning
+  // 2 : aggressive pruning
+  int prune_wiener_based_on_src_var;
 
-  // Flag used to control the speed of the eob selection in trellis.
-  int trellis_eob_fast;
+  // Prune self-guided loop restoration based on wiener search results
+  // 0 : no pruning
+  // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE
+  // 2 : pruning based on winner restoration type among RESTORE_WIENER and
+  // RESTORE_NONE
+  int prune_sgr_based_on_wiener;
+
+  // Reduce the wiener filter win size for luma
+  int reduce_wiener_window_size;
+
+  // Disable loop restoration filter
+  int disable_lr_filter;
+} LOOP_FILTER_SPEED_FEATURES;
+
+typedef struct REAL_TIME_SPEED_FEATURES {
+  // check intra prediction for non-RD mode.
+  int check_intra_pred_nonrd;
+
+  // Perform coarse ME before calculating variance in variance-based partition
+  int estimate_motion_for_var_based_partition;
+
+  // For nonrd_use_partition: mode of extra check of leaf partition
+  // 0 - don't check merge
+  // 1 - always check merge
+  // 2 - check merge and prune checking final split
+  int nonrd_check_partition_merge_mode;
+
+  // For nonrd_use_partition: check of leaf partition extra split
+  int nonrd_check_partition_split;
+
+  // Implements various heuristics to skip searching modes
+  // The heuristics selected are based on  flags
+  // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
+  unsigned int mode_search_skip_flags;
+
+  // For nonrd: Reduces ref frame search.
+  // 0 - low level of search prune in non last frames
+  // 1 - pruned search in non last frames
+  // 2 - more pruned search in non last frames
+  int nonrd_prune_ref_frame_search;
 
   // This flag controls the use of non-RD mode decision.
   int use_nonrd_pick_mode;
 
-  // prune wedge and compound segment approximate rd evaluation based on
-  // compound average modeled rd
-  int prune_comp_type_by_model_rd;
+  // Use ALTREF frame in non-RD mode decision.
+  int use_nonrd_altref_frame;
 
-  // Enable/disable smooth intra modes.
-  int disable_smooth_intra;
+  // Use compound reference for non-RD mode.
+  int use_comp_ref_nonrd;
 
   // use reduced ref set for real-time mode
   int use_real_time_ref_set;
 
-  // Perform a full TX search on some modes while using the
-  // inter-mode RD model for others. Only enabled when
-  // inter_mode_rd_model_estimation != 0
-  int inter_mode_rd_model_estimation_adaptive;
+  // Skip a number of expensive mode evaluations for blocks with very low
+  // temporal variance.
+  int short_circuit_low_temp_var;
+
+  // Use modeled (currently CurvFit model) RDCost for fast non-RD mode
+  int use_modeled_non_rd_cost;
+
+  // Reuse inter prediction in fast non-rd mode.
+  int reuse_inter_pred_nonrd;
+
+  // Number of best inter modes to search transform. INT_MAX - search all.
+  int num_inter_modes_for_tx_search;
+
+  // Forces TX search off for RDCost calulation.
+  int force_tx_search_off;
+
+  // Use interpolation filter search in non-RD mode decision.
+  int use_nonrd_filter_search;
+
+  // Use simplified RD model for interpolation search and Intra
+  int use_simple_rd_model;
+
+  // If set forces interpolation filter to EIGHTTAP_REGULAR
+  int skip_interp_filter_search;
+
+  // Use hybrid (rd for bsize < 16x16, otherwise nonrd) intra search for intra
+  // only frames.
+  int hybrid_intra_pickmode;
+
+  // Compute variance/sse on source difference, prior to encoding superblock.
+  int source_metrics_sb_nonrd;
+} REAL_TIME_SPEED_FEATURES;
+
+typedef struct SPEED_FEATURES {
+  /*
+   * Sequence/frame level speed features:
+   */
+  HIGH_LEVEL_SPEED_FEATURES hl_sf;
+
+  /*
+   * Speed features related to how tpl's searches are done.
+   */
+  TPL_SPEED_FEATURES tpl_sf;
+
+  /*
+   * Global motion speed features:
+   */
+  GLOBAL_MOTION_SPEED_FEATURES gm_sf;
+
+  /*
+   * Partition search speed features:
+   */
+  PARTITION_SPEED_FEATURES part_sf;
+
+  /*
+   * Motion search speed features:
+   */
+  MV_SPEED_FEATURES mv_sf;
+
+  /*
+   * Inter mode search speed features:
+   */
+  INTER_MODE_SPEED_FEATURES inter_sf;
+
+  /*
+   * Interpolation filter search speed features:
+   */
+  INTERP_FILTER_SPEED_FEATURES interp_sf;
+
+  /*
+   * Intra mode search speed features:
+   */
+  INTRA_MODE_SPEED_FEATURES intra_sf;
+
+  /*
+   * Transform size/type search speed features:
+   */
+  TX_SPEED_FEATURES tx_sf;
+
+  /*
+   * RD calculation speed features:
+   */
+  RD_CALC_SPEED_FEATURES rd_sf;
+
+  /*
+   * Two-pass mode evaluation features:
+   */
+  WINNER_MODE_SPEED_FEATURES winner_mode_sf;
+
+  /*
+   * In-loop filter speed features:
+   */
+  LOOP_FILTER_SPEED_FEATURES lpf_sf;
+
+  /*
+   * Real-time mode speed features:
+   */
+  REAL_TIME_SPEED_FEATURES rt_sf;
 } SPEED_FEATURES;
 
 struct AV1_COMP;
@@ -703,6 +1025,7 @@
                                                   int speed);
 void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
                                                 int speed);
+void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/svc_layercontext.c b/libaom/av1/encoder/svc_layercontext.c
new file mode 100644
index 0000000..b72d8aa
--- /dev/null
+++ b/libaom/av1/encoder/svc_layercontext.c
@@ -0,0 +1,288 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "av1/encoder/encoder.h"
+
+static void swap_ptr(void *a, void *b) {
+  void **a_p = (void **)a;
+  void **b_p = (void **)b;
+  void *c = *a_p;
+  *a_p = *b_p;
+  *b_p = c;
+}
+
+void av1_init_layer_context(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  SVC *const svc = &cpi->svc;
+  int mi_rows = cpi->common.mi_params.mi_rows;
+  int mi_cols = cpi->common.mi_params.mi_cols;
+  svc->base_framerate = 30.0;
+  svc->current_superframe = 0;
+
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lrc->ni_av_qi = oxcf->worst_allowed_q;
+      lrc->total_actual_bits = 0;
+      lrc->total_target_vs_actual = 0;
+      lrc->ni_tot_qi = 0;
+      lrc->tot_q = 0.0;
+      lrc->avg_q = 0.0;
+      lrc->ni_frames = 0;
+      lrc->decimation_count = 0;
+      lrc->decimation_factor = 0;
+      lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+      lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+      for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+        lrc->rate_correction_factors[i] = 1.0;
+      }
+      lc->target_bandwidth = lc->layer_target_bitrate;
+      lrc->last_q[INTER_FRAME] = lrc->worst_quality;
+      lrc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+      lrc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+      lrc->buffer_level =
+          oxcf->starting_buffer_level_ms * lc->target_bandwidth / 1000;
+      lrc->bits_off_target = lrc->buffer_level;
+      // Initialize the cyclic refresh parameters. If spatial layers are used
+      // (i.e., ss_number_layers > 1), these need to be updated per spatial
+      // layer. Cyclic refresh is only applied on base temporal layer.
+      if (svc->number_spatial_layers > 1 && tl == 0) {
+        size_t last_coded_q_map_size;
+        lc->sb_index = 0;
+        lc->actual_num_seg1_blocks = 0;
+        lc->actual_num_seg2_blocks = 0;
+        lc->counter_encode_maxq_scene_change = 0;
+        CHECK_MEM_ERROR(cm, lc->map,
+                        aom_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
+        memset(lc->map, 0, mi_rows * mi_cols);
+        last_coded_q_map_size =
+            mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
+        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+                        aom_malloc(last_coded_q_map_size));
+        assert(MAXQ <= 255);
+        memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
+      }
+    }
+  }
+}
+
+// Update the layer context from a change_config() call.
+void av1_update_layer_context_change_config(AV1_COMP *const cpi,
+                                            const int64_t target_bandwidth) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
+  int layer = 0;
+  int64_t spatial_layer_target = 0;
+  float bitrate_alloc = 1.0;
+
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate;
+    }
+    spatial_layer_target = svc->layer_context[layer].target_bandwidth;
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      LAYER_CONTEXT *const lc =
+          &svc->layer_context[sl * svc->number_temporal_layers + tl];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lc->spatial_layer_target_bandwidth = spatial_layer_target;
+      bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      lrc->starting_buffer_level =
+          (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+      lrc->optimal_buffer_level =
+          (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+      lrc->maximum_buffer_size =
+          (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+      lrc->bits_off_target =
+          AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+      lrc->buffer_level = AOMMIN(lrc->buffer_level, lrc->maximum_buffer_size);
+      lc->framerate = cpi->framerate / lc->framerate_factor;
+      lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+      lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+      lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+      lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+    }
+  }
+}
+
+static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) {
+  return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+                                     cpi->svc.number_temporal_layers +
+                                 cpi->svc.temporal_layer_id];
+}
+
+void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
+  RATE_CONTROL *const lrc = &lc->rc;
+  const int tl = svc->temporal_layer_id;
+  lc->framerate = cpi->framerate / lc->framerate_factor;
+  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
+  // Update the average layer frame size (non-cumulative per-frame-bw).
+  if (tl == 0) {
+    lc->avg_frame_size = lrc->avg_frame_bandwidth;
+  } else {
+    int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers +
+                     svc->temporal_layer_id - 1;
+    LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer];
+    const double prev_layer_framerate =
+        cpi->framerate / lcprev->framerate_factor;
+    const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate;
+    lc->avg_frame_size =
+        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
+              (lc->framerate - prev_layer_framerate));
+  }
+}
+
+void av1_restore_layer_context(AV1_COMP *const cpi) {
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
+  const int old_frame_since_key = cpi->rc.frames_since_key;
+  const int old_frame_to_key = cpi->rc.frames_to_key;
+  // Restore layer rate control.
+  cpi->rc = lc->rc;
+  cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+  gf_group->index = lc->group_index;
+  // Reset the frames_since_key and frames_to_key counters to their values
+  // before the layer restore. Keep these defined for the stream (not layer).
+  cpi->rc.frames_since_key = old_frame_since_key;
+  cpi->rc.frames_to_key = old_frame_to_key;
+  // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+  // for the base temporal layer.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+    swap_ptr(&cr->map, &lc->map);
+    swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map);
+    cr->sb_index = lc->sb_index;
+    cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
+    cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
+  }
+  svc->skip_nonzeromv_last = 0;
+  svc->skip_nonzeromv_gf = 0;
+  // For each reference (LAST/GOLDEN) set the skip_nonzero_last/gf frame flags.
+  // This is to skip testing nonzero-mv for that reference if it was last
+  // refreshed (i.e., buffer slot holding that reference was refreshed) on the
+  // previous spatial layer at the same time (current_superframe).
+  if (svc->external_ref_frame_config) {
+    int ref_frame_idx = svc->ref_idx[LAST_FRAME - 1];
+    if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
+        svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1)
+      svc->skip_nonzeromv_last = 1;
+    ref_frame_idx = svc->ref_idx[GOLDEN_FRAME - 1];
+    if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
+        svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1)
+      svc->skip_nonzeromv_gf = 1;
+  }
+}
+
+void av1_save_layer_context(AV1_COMP *const cpi) {
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *lc = get_layer_context(cpi);
+  lc->rc = cpi->rc;
+  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
+  lc->group_index = gf_group->index;
+  if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate;
+  // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+  // for the base temporal layer.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+    signed char *temp = lc->map;
+    uint8_t *temp2 = lc->last_coded_q_map;
+    lc->map = cr->map;
+    cr->map = temp;
+    lc->last_coded_q_map = cr->last_coded_q_map;
+    cr->last_coded_q_map = temp2;
+    lc->sb_index = cr->sb_index;
+    lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
+    lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
+  }
+  // For any buffer slot that is refreshed, update it with
+  // the spatial_layer_id and the current_superframe.
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+    // All slots are refreshed on KEY.
+    for (unsigned int i = 0; i < REF_FRAMES; i++) {
+      svc->buffer_time_index[i] = svc->current_superframe;
+      svc->buffer_spatial_layer[i] = svc->spatial_layer_id;
+    }
+  } else if (cpi->svc.external_ref_frame_config) {
+    for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+      int ref_frame_map_idx = svc->ref_idx[i];
+      if (cpi->svc.refresh[ref_frame_map_idx]) {
+        svc->buffer_time_index[ref_frame_map_idx] = svc->current_superframe;
+        svc->buffer_spatial_layer[ref_frame_map_idx] = svc->spatial_layer_id;
+      }
+    }
+  }
+  if (svc->spatial_layer_id == svc->number_spatial_layers - 1)
+    svc->current_superframe++;
+}
+
+void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      if (lc->map) aom_free(lc->map);
+      if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
+    }
+  }
+}
+
+// Reset on key frame: reset counters, references and buffer updates.
+void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) {
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *lc = NULL;
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
+      if (is_key) lc->frames_from_key_frame = 0;
+    }
+  }
+  av1_update_temporal_layer_framerate(cpi);
+  av1_restore_layer_context(cpi);
+}
+
+static void get_layer_resolution(const int width_org, const int height_org,
+                                 const int num, const int den, int *width_out,
+                                 int *height_out) {
+  int w, h;
+  if (width_out == NULL || height_out == NULL || den == 0) return;
+  w = width_org * num / den;
+  h = height_org * num / den;
+  // Make height and width even.
+  w += w % 2;
+  h += h % 2;
+  *width_out = w;
+  *height_out = h;
+}
+
+void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *lc = NULL;
+  int width = 0, height = 0;
+  lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+                           svc->temporal_layer_id];
+  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
+                       lc->scaling_factor_num, lc->scaling_factor_den, &width,
+                       &height);
+  av1_set_size_literal(cpi, width, height);
+}
diff --git a/libaom/av1/encoder/svc_layercontext.h b/libaom/av1/encoder/svc_layercontext.h
new file mode 100644
index 0000000..7cb85a3
--- /dev/null
+++ b/libaom/av1/encoder/svc_layercontext.h
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  RATE_CONTROL rc;
+  int framerate_factor;
+  int64_t layer_target_bitrate;
+  int scaling_factor_num;
+  int scaling_factor_den;
+  int64_t target_bandwidth;
+  int64_t spatial_layer_target_bandwidth;
+  double framerate;
+  int avg_frame_size;
+  int max_q;
+  int min_q;
+  int frames_from_key_frame;
+  // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+  int sb_index;
+  int8_t *map;
+  uint8_t *last_coded_q_map;
+  int actual_num_seg1_blocks;
+  int actual_num_seg2_blocks;
+  int counter_encode_maxq_scene_change;
+  uint8_t speed;
+  unsigned char group_index;
+} LAYER_CONTEXT;
+
+typedef struct SVC {
+  int spatial_layer_id;
+  int temporal_layer_id;
+  int number_spatial_layers;
+  int number_temporal_layers;
+  int external_ref_frame_config;
+  int non_reference_frame;
+  // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+  // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  int reference[INTER_REFS_PER_FRAME];
+  int ref_idx[INTER_REFS_PER_FRAME];
+  int refresh[REF_FRAMES];
+  double base_framerate;
+  unsigned int current_superframe;
+  unsigned int buffer_time_index[REF_FRAMES];
+  unsigned char buffer_spatial_layer[REF_FRAMES];
+  int skip_nonzeromv_last;
+  int skip_nonzeromv_gf;
+  // Layer context used for rate control in one pass temporal CBR mode or
+  // two pass spatial mode.
+  LAYER_CONTEXT layer_context[AOM_MAX_LAYERS];
+} SVC;
+
+struct AV1_COMP;
+
+// Initialize layer context data from init_config().
+void av1_init_layer_context(struct AV1_COMP *const cpi);
+
+// Update the layer context from a change_config() call.
+void av1_update_layer_context_change_config(struct AV1_COMP *const cpi,
+                                            const int64_t target_bandwidth);
+
+// Prior to encoding the frame, update framerate-related quantities
+// for the current temporal layer.
+void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi);
+
+// Prior to encoding the frame, set the layer context, for the current layer
+// to be encoded, to the cpi struct.
+void av1_restore_layer_context(struct AV1_COMP *const cpi);
+
+// Save the layer context after encoding the frame.
+void av1_save_layer_context(struct AV1_COMP *const cpi);
+
+void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi);
+
+void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key);
+
+void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
diff --git a/libaom/av1/encoder/temporal_filter.c b/libaom/av1/encoder/temporal_filter.c
index ba883d7..a637df5 100644
--- a/libaom/av1/encoder/temporal_filter.c
+++ b/libaom/av1/encoder/temporal_filter.c
@@ -15,1289 +15,1324 @@
 #include "config/aom_config.h"
 
 #include "av1/common/alloccommon.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/odintrin.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
-#include "av1/common/odintrin.h"
 #include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/mcomp.h"
-#include "av1/encoder/encoder.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/temporal_filter.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
 #include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
 
-#define EDGE_THRESHOLD 50
-#define SQRT_PI_BY_2 1.25331413732
+// NOTE: All `tf` in this file means `temporal filtering`.
 
-static unsigned int index_mult[14] = {
-  0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
-};
+// Does motion search for blocks in temporal filtering. This is the first step
+// for temporal filtering. More specifically, given a frame to be filtered and
+// another frame as reference, this function searches the reference frame to
+// find out the most alike block as that from the frame to be filtered. This
+// found block will be further used for weighted averaging.
+// NOTE: Besides doing motion search for the entire block, this function will
+// also do motion search for each 1/4 sub-block to get more precise prediction.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   frame_to_filter: Pointer to the frame to be filtered.
+//   ref_frame: Pointer to the reference frame.
+//   block_size: Block size used for motion search.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   ref_mv: Reference motion vector, which is commonly inherited from the
+//           motion search result of previous frame.
+//   subblock_mvs: Pointer to the result motion vectors for 4 sub-blocks.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+// Returns:
+//   Search error (MSE) of the entire block.
+static int tf_motion_search(AV1_COMP *cpi,
+                            const YV12_BUFFER_CONFIG *frame_to_filter,
+                            const YV12_BUFFER_CONFIG *ref_frame,
+                            const BLOCK_SIZE block_size, const int mb_row,
+                            const int mb_col, MV *ref_mv, MV *subblock_mvs,
+                            int *subblock_mses) {
+  // Frame information
+  const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
 
-static int64_t highbd_index_mult[14] = { 0U,          0U,          0U,
-                                         0U,          3221225472U, 2576980378U,
-                                         2147483648U, 1840700270U, 1610612736U,
-                                         1431655766U, 1288490189U, 1171354718U,
-                                         0U,          991146300U };
+  // Block information (ONLY Y-plane is used for motion search).
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int y_stride = frame_to_filter->y_stride;
+  assert(y_stride == ref_frame->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
 
-static void temporal_filter_predictors_mb_c(
-    MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
-    int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
-    uint8_t *pred, struct scale_factors *scale, int x, int y,
-    int can_use_previous, int num_planes, MV *blk_mvs, int use_32x32) {
-  mv_precision mv_precision_uv;
-  int uv_stride;
-  // TODO(angiebird): change plane setting accordingly
-  ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
-  const InterpFilters interp_filters =
-      av1_make_interp_filters(MULTITAP_SHARP, MULTITAP_SHARP);
-  WarpTypesAllowed warp_types;
-  memset(&warp_types, 0, sizeof(WarpTypesAllowed));
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  const struct buf_2d ori_src_buf = mb->plane[0].src;
+  const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+  const MV_COST_TYPE ori_mv_cost_type = mb->mv_cost_type;
 
-  const int ssx = (uv_block_width == (BW >> 1)) ? 1 : 0;
-  if (ssx) {
-    uv_stride = (stride + 1) >> 1;
-    mv_precision_uv = MV_PRECISION_Q4;
-  } else {
-    uv_stride = stride;
-    mv_precision_uv = MV_PRECISION_Q3;
-  }
+  // Parameters used for motion search.
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  SUBPEL_MOTION_SEARCH_PARAMS ms_params;
 
-  if (use_32x32) {
-    assert(mv_row >= INT16_MIN && mv_row <= INT16_MAX && mv_col >= INT16_MIN &&
-           mv_col <= INT16_MAX);
-    const MV mv = { (int16_t)mv_row, (int16_t)mv_col };
+  const search_site_config ss_cfg =
+      cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
+  const SEARCH_METHODS full_search_method = NSTEP;
+  const int step_param = av1_init_search_range(
+      AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
+  const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
+  const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv;
+  const MV_COST_TYPE mv_cost_type =
+      min_frame_size >= 720
+          ? MV_COST_L1_HDRES
+          : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES);
 
-    av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW,
-                              BH, &conv_params, interp_filters, &warp_types, x,
-                              y, 0, 0, MV_PRECISION_Q3, x, y, xd,
-                              can_use_previous);
-    if (num_planes > 1) {
-      av1_build_inter_predictor(
-          u_mb_ptr, uv_stride, &pred[BLK_PELS], uv_block_width, &mv, scale,
-          uv_block_width, uv_block_height, &conv_params, interp_filters,
-          &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
-      av1_build_inter_predictor(
-          v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], uv_block_width, &mv,
-          scale, uv_block_width, uv_block_height, &conv_params, interp_filters,
-          &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
-    }
+  // Starting position for motion search.
+  FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
+  // Baseline position for motion search (used for rate distortion comparison).
+  const MV baseline_mv = kZeroMv;
 
-    return;
-  }
-
-  // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16
-  // predictors.
-  int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1);
-  // Y predictor
-  for (i = 0; i < BH; i += ys) {
-    for (j = 0; j < BW; j += xs) {
-      const MV mv = blk_mvs[k];
-      const int y_offset = i * stride + j;
-      const int p_offset = i * BW + j;
-
-      av1_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset],
-                                BW, &mv, scale, xs, ys, &conv_params,
-                                interp_filters, &warp_types, x, y, 0, 0,
-                                MV_PRECISION_Q3, x, y, xd, can_use_previous);
-      k++;
-    }
-  }
-
-  // U and V predictors
-  if (num_planes > 1) {
-    ys = (uv_block_height >> 1);
-    xs = (uv_block_width >> 1);
-    k = 0;
-
-    for (i = 0; i < uv_block_height; i += ys) {
-      for (j = 0; j < uv_block_width; j += xs) {
-        const MV mv = blk_mvs[k];
-        const int uv_offset = i * uv_stride + j;
-        const int p_offset = i * uv_block_width + j;
-
-        av1_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride,
-                                  &pred[BLK_PELS + p_offset], uv_block_width,
-                                  &mv, scale, xs, ys, &conv_params,
-                                  interp_filters, &warp_types, x, y, 1, 0,
-                                  mv_precision_uv, x, y, xd, can_use_previous);
-        av1_build_inter_predictor(
-            v_mb_ptr + uv_offset, uv_stride, &pred[(BLK_PELS << 1) + p_offset],
-            uv_block_width, &mv, scale, xs, ys, &conv_params, interp_filters,
-            &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd,
-            can_use_previous);
-        k++;
-      }
-    }
-  }
-}
-
-static void apply_temporal_filter_self(const uint8_t *pred, int buf_stride,
-                                       unsigned int block_width,
-                                       unsigned int block_height,
-                                       int filter_weight, uint32_t *accumulator,
-                                       uint16_t *count) {
-  const int modifier = filter_weight * 16;
-  unsigned int i, j, k = 0;
-  assert(filter_weight == 2);
-
-  for (i = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++) {
-      const int pixel_value = pred[i * buf_stride + j];
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
-      ++k;
-    }
-  }
-}
-
-static void highbd_apply_temporal_filter_self(
-    const uint8_t *pred8, int buf_stride, unsigned int block_width,
-    unsigned int block_height, int filter_weight, uint32_t *accumulator,
-    uint16_t *count) {
-  const int modifier = filter_weight * 16;
-  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  unsigned int i, j, k = 0;
-  assert(filter_weight == 2);
-
-  for (i = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++) {
-      const int pixel_value = pred[i * buf_stride + j];
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
-      ++k;
-    }
-  }
-}
-
-static INLINE int mod_index(int sum_dist, int index, int rounding, int strength,
-                            int filter_weight) {
-  assert(index >= 0 && index <= 13);
-  assert(index_mult[index] != 0);
-
-  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-static INLINE int highbd_mod_index(int64_t sum_dist, int index, int rounding,
-                                   int strength, int filter_weight) {
-  assert(index >= 0 && index <= 13);
-  assert(highbd_index_mult[index] != 0);
-
-  int mod =
-      (int)((AOMMIN(sum_dist, INT32_MAX) * highbd_index_mult[index]) >> 32);
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-static INLINE void calculate_squared_errors(const uint8_t *s, int s_stride,
-                                            const uint8_t *p, int p_stride,
-                                            uint16_t *diff_sse, unsigned int w,
-                                            unsigned int h) {
-  int idx = 0;
-  unsigned int i, j;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int16_t diff = s[i * s_stride + j] - p[i * p_stride + j];
-      diff_sse[idx] = diff * diff;
-      idx++;
-    }
-  }
-}
-
-static INLINE int get_filter_weight(unsigned int i, unsigned int j,
-                                    unsigned int block_height,
-                                    unsigned int block_width, const int *blk_fw,
-                                    int use_32x32) {
-  if (use_32x32)
-    // blk_fw[0] ~ blk_fw[3] are the same.
-    return blk_fw[0];
-
-  int filter_weight = 0;
-  if (i < block_height / 2) {
-    if (j < block_width / 2)
-      filter_weight = blk_fw[0];
-    else
-      filter_weight = blk_fw[1];
-  } else {
-    if (j < block_width / 2)
-      filter_weight = blk_fw[2];
-    else
-      filter_weight = blk_fw[3];
-  }
-  return filter_weight;
-}
-
-void av1_apply_temporal_filter_c(
-    const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred,
-    int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1,
-    int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred,
-    int uv_buf_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
-    uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
-    uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) {
-  unsigned int i, j, k, m;
-  int modifier;
-  const int rounding = (1 << strength) >> 1;
-  const unsigned int uv_block_width = block_width >> ss_x;
-  const unsigned int uv_block_height = block_height >> ss_y;
-  DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]);
-  DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]);
-  DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]);
-
-  int idx = 0, idy;
-
-  memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
-  memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
-  memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
-
-  // Calculate diff^2 for each pixel of the block.
-  // TODO(yunqing): the following code needs to be optimized.
-  calculate_squared_errors(y_frame1, y_stride, y_pred, y_buf_stride, y_diff_sse,
-                           block_width, block_height);
-  calculate_squared_errors(u_frame1, uv_stride, u_pred, uv_buf_stride,
-                           u_diff_sse, uv_block_width, uv_block_height);
-  calculate_squared_errors(v_frame1, uv_stride, v_pred, uv_buf_stride,
-                           v_diff_sse, uv_block_width, uv_block_height);
-
-  for (i = 0, k = 0, m = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++) {
-      const int pixel_value = y_pred[i * y_buf_stride + j];
-      int filter_weight =
-          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
-
-      // non-local mean approach
-      int y_index = 0;
-
-      const int uv_r = i >> ss_y;
-      const int uv_c = j >> ss_x;
-      modifier = 0;
-
-      for (idy = -1; idy <= 1; ++idy) {
-        for (idx = -1; idx <= 1; ++idx) {
-          const int row = (int)i + idy;
-          const int col = (int)j + idx;
-
-          if (row >= 0 && row < (int)block_height && col >= 0 &&
-              col < (int)block_width) {
-            modifier += y_diff_sse[row * (int)block_width + col];
-            ++y_index;
-          }
-        }
-      }
-
-      assert(y_index > 0);
-
-      modifier += u_diff_sse[uv_r * uv_block_width + uv_c];
-      modifier += v_diff_sse[uv_r * uv_block_width + uv_c];
-
-      y_index += 2;
-
-      modifier =
-          (int)mod_index(modifier, y_index, rounding, strength, filter_weight);
-
-      y_count[k] += modifier;
-      y_accumulator[k] += modifier * pixel_value;
-
-      ++k;
-
-      // Process chroma component
-      if (!(i & ss_y) && !(j & ss_x)) {
-        const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c];
-        const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c];
-
-        // non-local mean approach
-        int cr_index = 0;
-        int u_mod = 0, v_mod = 0;
-        int y_diff = 0;
-
-        for (idy = -1; idy <= 1; ++idy) {
-          for (idx = -1; idx <= 1; ++idx) {
-            const int row = uv_r + idy;
-            const int col = uv_c + idx;
-
-            if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
-                col < (int)uv_block_width) {
-              u_mod += u_diff_sse[row * uv_block_width + col];
-              v_mod += v_diff_sse[row * uv_block_width + col];
-              ++cr_index;
-            }
-          }
-        }
-
-        assert(cr_index > 0);
-
-        for (idy = 0; idy < 1 + ss_y; ++idy) {
-          for (idx = 0; idx < 1 + ss_x; ++idx) {
-            const int row = (uv_r << ss_y) + idy;
-            const int col = (uv_c << ss_x) + idx;
-            y_diff += y_diff_sse[row * (int)block_width + col];
-            ++cr_index;
-          }
-        }
-
-        u_mod += y_diff;
-        v_mod += y_diff;
-
-        u_mod =
-            (int)mod_index(u_mod, cr_index, rounding, strength, filter_weight);
-        v_mod =
-            (int)mod_index(v_mod, cr_index, rounding, strength, filter_weight);
-
-        u_count[m] += u_mod;
-        u_accumulator[m] += u_mod * u_pixel_value;
-        v_count[m] += v_mod;
-        v_accumulator[m] += v_mod * v_pixel_value;
-
-        ++m;
-      }  // Complete YUV pixel
-    }
-  }
-}
-
-static INLINE void highbd_calculate_squared_errors(
-    const uint16_t *s, int s_stride, const uint16_t *p, int p_stride,
-    uint32_t *diff_sse, unsigned int w, unsigned int h) {
-  int idx = 0;
-  unsigned int i, j;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int16_t diff = s[i * s_stride + j] - p[i * p_stride + j];
-      diff_sse[idx] = diff * diff;
-      idx++;
-    }
-  }
-}
-
-void av1_highbd_apply_temporal_filter_c(
-    const uint8_t *yf, int y_stride, const uint8_t *yp, int y_buf_stride,
-    const uint8_t *uf, const uint8_t *vf, int uv_stride, const uint8_t *up,
-    const uint8_t *vp, int uv_buf_stride, unsigned int block_width,
-    unsigned int block_height, int ss_x, int ss_y, int strength,
-    const int *blk_fw, int use_32x32, uint32_t *y_accumulator,
-    uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count,
-    uint32_t *v_accumulator, uint16_t *v_count) {
-  unsigned int i, j, k, m;
-  int64_t modifier;
-  const int rounding = (1 << strength) >> 1;
-  const unsigned int uv_block_width = block_width >> ss_x;
-  const unsigned int uv_block_height = block_height >> ss_y;
-  DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]);
-  DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]);
-  DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]);
-
-  const uint16_t *y_frame1 = CONVERT_TO_SHORTPTR(yf);
-  const uint16_t *u_frame1 = CONVERT_TO_SHORTPTR(uf);
-  const uint16_t *v_frame1 = CONVERT_TO_SHORTPTR(vf);
-  const uint16_t *y_pred = CONVERT_TO_SHORTPTR(yp);
-  const uint16_t *u_pred = CONVERT_TO_SHORTPTR(up);
-  const uint16_t *v_pred = CONVERT_TO_SHORTPTR(vp);
-  int idx = 0, idy;
-
-  memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
-  memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
-  memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
-
-  // Calculate diff^2 for each pixel of the block.
-  // TODO(yunqing): the following code needs to be optimized.
-  highbd_calculate_squared_errors(y_frame1, y_stride, y_pred, y_buf_stride,
-                                  y_diff_sse, block_width, block_height);
-  highbd_calculate_squared_errors(u_frame1, uv_stride, u_pred, uv_buf_stride,
-                                  u_diff_sse, uv_block_width, uv_block_height);
-  highbd_calculate_squared_errors(v_frame1, uv_stride, v_pred, uv_buf_stride,
-                                  v_diff_sse, uv_block_width, uv_block_height);
-
-  for (i = 0, k = 0, m = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++) {
-      const int pixel_value = y_pred[i * y_buf_stride + j];
-      int filter_weight =
-          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
-
-      // non-local mean approach
-      int y_index = 0;
-
-      const int uv_r = i >> ss_y;
-      const int uv_c = j >> ss_x;
-      modifier = 0;
-
-      for (idy = -1; idy <= 1; ++idy) {
-        for (idx = -1; idx <= 1; ++idx) {
-          const int row = (int)i + idy;
-          const int col = (int)j + idx;
-
-          if (row >= 0 && row < (int)block_height && col >= 0 &&
-              col < (int)block_width) {
-            modifier += y_diff_sse[row * (int)block_width + col];
-            ++y_index;
-          }
-        }
-      }
-
-      assert(y_index > 0);
-
-      modifier += u_diff_sse[uv_r * uv_block_width + uv_c];
-      modifier += v_diff_sse[uv_r * uv_block_width + uv_c];
-
-      y_index += 2;
-
-      const int final_y_mod = highbd_mod_index(modifier, y_index, rounding,
-                                               strength, filter_weight);
-
-      y_count[k] += final_y_mod;
-      y_accumulator[k] += final_y_mod * pixel_value;
-
-      ++k;
-
-      // Process chroma component
-      if (!(i & ss_y) && !(j & ss_x)) {
-        const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c];
-        const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c];
-
-        // non-local mean approach
-        int cr_index = 0;
-        int64_t u_mod = 0, v_mod = 0;
-        int y_diff = 0;
-
-        for (idy = -1; idy <= 1; ++idy) {
-          for (idx = -1; idx <= 1; ++idx) {
-            const int row = uv_r + idy;
-            const int col = uv_c + idx;
-
-            if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
-                col < (int)uv_block_width) {
-              u_mod += u_diff_sse[row * uv_block_width + col];
-              v_mod += v_diff_sse[row * uv_block_width + col];
-              ++cr_index;
-            }
-          }
-        }
-
-        assert(cr_index > 0);
-
-        for (idy = 0; idy < 1 + ss_y; ++idy) {
-          for (idx = 0; idx < 1 + ss_x; ++idx) {
-            const int row = (uv_r << ss_y) + idy;
-            const int col = (uv_c << ss_x) + idx;
-            y_diff += y_diff_sse[row * (int)block_width + col];
-            ++cr_index;
-          }
-        }
-
-        u_mod += y_diff;
-        v_mod += y_diff;
-
-        const int final_u_mod = highbd_mod_index(u_mod, cr_index, rounding,
-                                                 strength, filter_weight);
-        const int final_v_mod = highbd_mod_index(v_mod, cr_index, rounding,
-                                                 strength, filter_weight);
-
-        u_count[m] += final_u_mod;
-        u_accumulator[m] += final_u_mod * u_pixel_value;
-        v_count[m] += final_v_mod;
-        v_accumulator[m] += final_v_mod * v_pixel_value;
-
-        ++m;
-      }  // Complete YUV pixel
-    }
-  }
-}
-
-// Only used in single plane case
-void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
-                                 uint8_t *frame2, unsigned int block_width,
-                                 unsigned int block_height, int strength,
-                                 const int *blk_fw, int use_32x32,
-                                 unsigned int *accumulator, uint16_t *count) {
-  unsigned int i, j, k;
-  int modifier;
-  int byte = 0;
-  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
-
-  for (i = 0, k = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++, k++) {
-      int pixel_value = *frame2;
-      int filter_weight =
-          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
-
-      // non-local mean approach
-      int diff_sse[9] = { 0 };
-      int idx, idy, index = 0;
-
-      for (idy = -1; idy <= 1; ++idy) {
-        for (idx = -1; idx <= 1; ++idx) {
-          int row = (int)i + idy;
-          int col = (int)j + idx;
-
-          if (row >= 0 && row < (int)block_height && col >= 0 &&
-              col < (int)block_width) {
-            int diff = frame1[byte + idy * (int)stride + idx] -
-                       frame2[idy * (int)block_width + idx];
-            diff_sse[index] = diff * diff;
-            ++index;
-          }
-        }
-      }
-
-      assert(index > 0);
-
-      modifier = 0;
-      for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
-
-      modifier *= 3;
-      modifier /= index;
-
-      ++frame2;
-
-      modifier += rounding;
-      modifier >>= strength;
-
-      if (modifier > 16) modifier = 16;
-
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
-
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
-
-      byte++;
-    }
-
-    byte += stride - block_width;
-  }
-}
-
-// Only used in single plane case
-void av1_highbd_temporal_filter_apply_c(
-    uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
-    unsigned int block_width, unsigned int block_height, int strength,
-    int *blk_fw, int use_32x32, unsigned int *accumulator, uint16_t *count) {
-  uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
-  uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
-  unsigned int i, j, k;
-  int modifier;
-  int byte = 0;
-  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
-
-  for (i = 0, k = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++, k++) {
-      int pixel_value = *frame2;
-      int filter_weight =
-          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
-
-      // non-local mean approach
-      int diff_sse[9] = { 0 };
-      int idx, idy, index = 0;
-
-      for (idy = -1; idy <= 1; ++idy) {
-        for (idx = -1; idx <= 1; ++idx) {
-          int row = (int)i + idy;
-          int col = (int)j + idx;
-
-          if (row >= 0 && row < (int)block_height && col >= 0 &&
-              col < (int)block_width) {
-            int diff = frame1[byte + idy * (int)stride + idx] -
-                       frame2[idy * (int)block_width + idx];
-            diff_sse[index] = diff * diff;
-            ++index;
-          }
-        }
-      }
-
-      assert(index > 0);
-
-      modifier = 0;
-      for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
-
-      modifier *= 3;
-      modifier /= index;
-
-      ++frame2;
-
-      modifier += rounding;
-      modifier >>= strength;
-
-      if (modifier > 16) modifier = 16;
-
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
-
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
-
-      byte++;
-    }
-
-    byte += stride - block_width;
-  }
-}
-
-static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
-                                              uint8_t *arf_frame_buf,
-                                              uint8_t *frame_ptr_buf,
-                                              int stride, int x_pos, int y_pos,
-                                              MV *blk_mvs, int *blk_bestsme) {
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  int step_param;
-  int sadpb = x->sadperbit16;
-  int bestsme = INT_MAX;
+  // Setup.
+  mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
+  mb->plane[0].src.stride = y_stride;
+  mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
+  mbd->plane[0].pre[0].stride = y_stride;
+  // Unused intermediate results for motion search.
+  unsigned int sse, error;
   int distortion;
-  unsigned int sse;
   int cost_list[5];
-  MvLimits tmp_mv_limits = x->mv_limits;
 
-  MV best_ref_mv1 = kZeroMv;
-  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+  // Do motion search.
+  // NOTE: In `av1_full_pixel_search()` and `find_fractional_mv_step()`, the
+  // searched result will be stored in `mb->best_mv`.
+  int_mv best_mv;
+  int block_mse = INT_MAX;
+  mb->mv_cost_type = mv_cost_type;
 
-  // Save input state
-  struct buf_2d src = x->plane[0].src;
-  struct buf_2d pre = xd->plane[0].pre[0];
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+                                     &baseline_mv, &ss_cfg);
+  full_ms_params.run_mesh_search = 1;
+  full_ms_params.search_method = full_search_method;
+  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
+                        NULL);
 
-  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
-  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+  // Since we are merely refining the result from full pixel search, we don't
+  // need regularization for subpel search
+  mb->mv_cost_type = MV_COST_NONE;
+  if (force_integer_mv == 1) {  // Only do full search on the entire block.
+    const int mv_row = best_mv.as_mv.row;
+    const int mv_col = best_mv.as_mv.col;
+    best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
+    best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
+    const int mv_offset = mv_row * y_stride + mv_col;
+    error = cpi->fn_ptr[block_size].vf(
+        ref_frame->y_buffer + y_offset + mv_offset, y_stride,
+        frame_to_filter->y_buffer + y_offset, y_stride, &sse);
+    block_mse = DIVIDE_AND_ROUND(error, mb_pels);
+    mb->e_mbd.mi[0]->mv[0] = best_mv;
+  } else {  // Do fractional search on the entire block and all sub-blocks.
+    av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
+                                      &baseline_mv, cost_list);
+    ms_params.forced_stop = EIGHTH_PEL;
+    ms_params.var_params.subpel_search_type = subpel_search_type;
+    MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+    error = cpi->mv_search_params.find_fractional_mv_step(
+        &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
+        &distortion, &sse, NULL);
+    block_mse = DIVIDE_AND_ROUND(error, mb_pels);
+    mb->e_mbd.mi[0]->mv[0] = best_mv;
+    *ref_mv = best_mv.as_mv;
+    // On 4 sub-blocks.
+    const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1];
+    const int subblock_height = block_size_high[subblock_size];
+    const int subblock_width = block_size_wide[subblock_size];
+    const int subblock_pels = subblock_height * subblock_width;
+    start_mv = get_fullmv_from_mv(ref_mv);
 
-  // Setup frame pointers
-  x->plane[0].src.buf = arf_frame_buf;
-  x->plane[0].src.stride = stride;
-  xd->plane[0].pre[0].buf = frame_ptr_buf;
-  xd->plane[0].pre[0].stride = stride;
+    int subblock_idx = 0;
+    for (int i = 0; i < mb_height; i += subblock_height) {
+      for (int j = 0; j < mb_width; j += subblock_width) {
+        const int offset = i * y_stride + j;
+        mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
+        mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
+        mb->mv_cost_type = mv_cost_type;
 
-  step_param = mv_sf->reduce_first_step_size;
-  step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+        av1_make_default_fullpel_ms_params(
+            &full_ms_params, cpi, mb, subblock_size, &baseline_mv, &ss_cfg);
+        full_ms_params.run_mesh_search = 1;
+        full_ms_params.search_method = full_search_method;
+        av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                              cond_cost_list(cpi, cost_list),
+                              &best_mv.as_fullmv, NULL);
 
-  av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
-  // av1_full_pixel_search() parameters: best_ref_mv1_full is the start mv, and
-  // best_ref_mv1 is for mv rate calculation. The search result is stored in
-  // x->best_mv.
-  av1_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, NSTEP,
-                        1, sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1,
-                        0, 0, x_pos, y_pos, 0, &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
-  x->mv_limits = tmp_mv_limits;
-
-  // Ignore mv costing by sending NULL pointer instead of cost array
-  if (cpi->common.cur_frame_force_integer_mv == 1) {
-    const uint8_t *const src_address = x->plane[0].src.buf;
-    const int src_stride = x->plane[0].src.stride;
-    const uint8_t *const y = xd->plane[0].pre[0].buf;
-    const int y_stride = xd->plane[0].pre[0].stride;
-    const int offset = x->best_mv.as_mv.row * y_stride + x->best_mv.as_mv.col;
-
-    x->best_mv.as_mv.row *= 8;
-    x->best_mv.as_mv.col *= 8;
-
-    bestsme = cpi->fn_ptr[TF_BLOCK].vf(y + offset, y_stride, src_address,
-                                       src_stride, &sse);
-
-    x->e_mbd.mi[0]->mv[0] = x->best_mv;
-
-    // Restore input state
-    x->plane[0].src = src;
-    xd->plane[0].pre[0] = pre;
-
-    return bestsme;
-  }
-
-  // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost
-  // calculation. The start full mv and the search result are stored in
-  // x->best_mv. mi_row and mi_col are only needed for "av1_is_scaled(sf)=1"
-  // case.
-  bestsme = cpi->find_fractional_mv_step(
-      x, &cpi->common, 0, 0, &best_ref_mv1, cpi->common.allow_high_precision_mv,
-      x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_iters_per_step,
-      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
-      0, 0, BW, BH, USE_8_TAPS, 1);
-
-  x->e_mbd.mi[0]->mv[0] = x->best_mv;
-
-  // DO motion search on 4 16x16 sub_blocks.
-  int i, j, k = 0;
-  best_ref_mv1.row = x->e_mbd.mi[0]->mv[0].as_mv.row;
-  best_ref_mv1.col = x->e_mbd.mi[0]->mv[0].as_mv.col;
-  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
-  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
-  for (i = 0; i < BH; i += SUB_BH) {
-    for (j = 0; j < BW; j += SUB_BW) {
-      // Setup frame pointers
-      x->plane[0].src.buf = arf_frame_buf + i * stride + j;
-      x->plane[0].src.stride = stride;
-      xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j;
-      xd->plane[0].pre[0].stride = stride;
-
-      av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-      av1_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full,
-                            step_param, NSTEP, 1, sadpb,
-                            cond_cost_list(cpi, cost_list), &best_ref_mv1, 0, 0,
-                            x_pos, y_pos, 0, &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
-      x->mv_limits = tmp_mv_limits;
-
-      blk_bestsme[k] = cpi->find_fractional_mv_step(
-          x, &cpi->common, 0, 0, &best_ref_mv1,
-          cpi->common.allow_high_precision_mv, x->errorperbit,
-          &cpi->fn_ptr[TF_SUB_BLOCK], 0, mv_sf->subpel_iters_per_step,
-          cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
-          NULL, 0, 0, SUB_BW, SUB_BH, USE_8_TAPS, 1);
-
-      blk_mvs[k] = x->best_mv.as_mv;
-      k++;
+        // Since we are merely refining the result from full pixel search, we
+        // don't need regularization for subpel search
+        mb->mv_cost_type = MV_COST_NONE;
+        av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
+                                          &baseline_mv, cost_list);
+        ms_params.forced_stop = EIGHTH_PEL;
+        ms_params.var_params.subpel_search_type = subpel_search_type;
+        subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+        error = cpi->mv_search_params.find_fractional_mv_step(
+            &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
+            &best_mv.as_mv, &distortion, &sse, NULL);
+        subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
+        subblock_mvs[subblock_idx] = best_mv.as_mv;
+        ++subblock_idx;
+      }
     }
   }
 
-  // Restore input state
-  x->plane[0].src = src;
-  xd->plane[0].pre[0] = pre;
+  // Restore input state.
+  mb->plane[0].src = ori_src_buf;
+  mbd->plane[0].pre[0] = ori_pre_buf;
+  mb->mv_cost_type = ori_mv_cost_type;
 
-  return bestsme;
+  return block_mse;
 }
 
-static void temporal_filter_iterate_c(AV1_COMP *cpi,
-                                      YV12_BUFFER_CONFIG **frames,
-                                      int frame_count, int alt_ref_index,
-                                      int strength,
-                                      struct scale_factors *ref_scale_factors) {
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  int byte;
-  int frame;
-  int mb_col, mb_row;
-  int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2;
-  int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2;
-  int mb_y_offset = 0;
-  int mb_y_src_offset = 0;
-  int mb_uv_offset = 0;
-  int mb_uv_src_offset = 0;
-  DECLARE_ALIGNED(16, unsigned int, accumulator[BLK_PELS * 3]);
-  DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]);
-  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
-  YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
-  uint8_t *dst1, *dst2;
-  DECLARE_ALIGNED(32, uint16_t, predictor16[BLK_PELS * 3]);
-  DECLARE_ALIGNED(32, uint8_t, predictor8[BLK_PELS * 3]);
-  uint8_t *predictor;
-  const int mb_uv_height = BH >> mbd->plane[1].subsampling_y;
-  const int mb_uv_width = BW >> mbd->plane[1].subsampling_x;
+// Helper function to get weight according to thresholds.
+static INLINE int get_weight_by_thresh(const int value, const int low,
+                                       const int high) {
+  return value < low ? 2 : value < high ? 1 : 0;
+}
 
-  // Save input state
-  uint8_t *input_buffer[MAX_MB_PLANE];
-  int i;
-  const int is_hbd = is_cur_buf_hbd(mbd);
-  if (is_hbd) {
-    predictor = CONVERT_TO_BYTEPTR(predictor16);
-  } else {
-    predictor = predictor8;
+// Gets filter weight for blocks in temporal filtering. The weights will be
+// assigned based on the motion search errors.
+// NOTE: Besides assigning filter weight for the block, this function will also
+// determine whether to split the entire block into 4 sub-blocks for further
+// filtering.
+// TODO(any): Many magic numbers are used in this function. They may be tuned
+// to improve the performance.
+// Inputs:
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  will affect the filter weight for the to-filter frame.
+//   subblock_filter_weights: Pointer to the assigned filter weight for each
+//                            sub-block. If not using sub-blocks, the first
+//                            element will be used for the entire block.
+// Returns: Whether to use 4 sub-blocks to replace the original block.
+static int tf_get_filter_weight(const int block_mse, const int *subblock_mses,
+                                const int is_second_arf,
+                                int *subblock_filter_weights) {
+  // `block_mse` is initialized as INT_MAX and will be overwritten after the
+  // motion search with reference frame, therefore INT_MAX can ONLY be accessed
+  // by to-filter frame.
+  if (block_mse == INT_MAX) {
+    const int weight = TF_ENABLE_PLANEWISE_STRATEGY
+                           ? TF_PLANEWISE_FILTER_WEIGHT_SCALE
+                           : is_second_arf ? 64 : 32;
+    subblock_filter_weights[0] = subblock_filter_weights[1] =
+        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
+    return 0;
   }
 
-  mbd->block_ref_scale_factors[0] = ref_scale_factors;
-  mbd->block_ref_scale_factors[1] = ref_scale_factors;
+  const int thresh_low = is_second_arf ? 20 : 40;
+  const int thresh_high = is_second_arf ? 40 : 80;
 
-  for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
+  int min_subblock_mse = INT_MAX;
+  int max_subblock_mse = INT_MIN;
+  int sum_subblock_mse = 0;
+  for (int i = 0; i < 4; ++i) {
+    sum_subblock_mse += subblock_mses[i];
+    min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
+    max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
+    subblock_filter_weights[i] =
+        get_weight_by_thresh(subblock_mses[i], thresh_low, thresh_high);
+  }
 
-  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
-    // Source frames are extended to 16 pixels. This is different than
-    //  L/A/G reference frames that have a border of 32 (AV1ENCBORDERINPIXELS)
-    // A 6/8 tap filter is used for motion search.  This requires 2 pixels
-    //  before and 3 pixels after.  So the largest Y mv on a border would
-    //  then be 16 - AOM_INTERP_EXTEND. The UV blocks are half the size of the
-    //  Y and therefore only extended by 8.  The largest mv that a UV block
-    //  can support is 8 - AOM_INTERP_EXTEND.  A UV mv is half of a Y mv.
-    //  (16 - AOM_INTERP_EXTEND) >> 1 which is greater than
-    //  8 - AOM_INTERP_EXTEND.
-    // To keep the mv in play for both Y and UV planes the max that it
-    //  can be on a border is therefore 16 - (2*AOM_INTERP_EXTEND+1).
-    cpi->td.mb.mv_limits.row_min =
-        -((mb_row * BH) + (17 - 2 * AOM_INTERP_EXTEND));
-    cpi->td.mb.mv_limits.row_max =
-        ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * AOM_INTERP_EXTEND);
+  if (((block_mse * 15 < sum_subblock_mse * 4) &&
+       max_subblock_mse - min_subblock_mse < 48) ||
+      ((block_mse * 14 < sum_subblock_mse * 4) &&
+       max_subblock_mse - min_subblock_mse < 24)) {  // No split.
+    const int weight = get_weight_by_thresh(block_mse, thresh_low, thresh_high);
+    subblock_filter_weights[0] = subblock_filter_weights[1] =
+        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
+    return 0;
+  } else {  // Do split.
+    return 1;
+  }
+}
 
-    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
-      int j, k;
-      int stride;
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+  return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
 
-      memset(accumulator, 0, BLK_PELS * 3 * sizeof(accumulator[0]));
-      memset(count, 0, BLK_PELS * 3 * sizeof(count[0]));
+// Builds predictor for blocks in temporal filtering. This is the second step
+// for temporal filtering, which is to construct predictions from all reference
+// frames INCLUDING the frame to be filtered itself. These predictors are built
+// based on the motion search results (motion vector is set as 0 for the frame
+// to be filtered), and will be futher used for weighted averaging.
+// Inputs:
+//   ref_frame: Pointer to the reference frame (or the frame to be filtered).
+//   mbd: Pointer to the block for filtering. Besides containing the subsampling
+//        information of all planes, this field also gives the searched motion
+//        vector for the entire block, i.e., `mbd->mi[0]->mv[0]`. This vector
+//        should be 0 if the `ref_frame` itself is the frame to be filtered.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   scale: Scaling factor.
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   subblock_mvs: The motion vectors for each sub-block (row-major order).
+//   pred: Pointer to the predictor to build.
+// Returns:
+//   Nothing will be returned. But the content to which `pred` points will be
+//   modified.
+static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
+                               const MACROBLOCKD *mbd,
+                               const BLOCK_SIZE block_size, const int mb_row,
+                               const int mb_col, const int num_planes,
+                               const struct scale_factors *scale,
+                               const int use_subblock, const MV *subblock_mvs,
+                               uint8_t *pred) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
-      cpi->td.mb.mv_limits.col_min =
-          -((mb_col * BW) + (17 - 2 * AOM_INTERP_EXTEND));
-      cpi->td.mb.mv_limits.col_max =
-          ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * AOM_INTERP_EXTEND);
+  // Information of the entire block.
+  const int mb_height = block_size_high[block_size];  // Height.
+  const int mb_width = block_size_wide[block_size];   // Width.
+  const int mb_pels = mb_height * mb_width;           // Number of pixels.
+  const int mb_y = mb_height * mb_row;                // Y-coord (Top-left).
+  const int mb_x = mb_width * mb_col;                 // X-coord (Top-left).
+  const int bit_depth = mbd->bd;                      // Bit depth.
+  const int is_intrabc = 0;                           // Is intra-copied?
+  const int mb_mv_row = mbd->mi[0]->mv[0].as_mv.row;  // Motion vector (y).
+  const int mb_mv_col = mbd->mi[0]->mv[0].as_mv.col;  // Motion vector (x).
+  const MV mb_mv = { (int16_t)mb_mv_row, (int16_t)mb_mv_col };
+  const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
 
-      for (frame = 0; frame < frame_count; frame++) {
-        // MVs for 4 16x16 sub blocks.
-        MV blk_mvs[4];
-        // Filter weights for 4 16x16 sub blocks.
-        int blk_fw[4] = { 0, 0, 0, 0 };
-        int use_32x32 = 0;
+  // Information of each sub-block (actually in use).
+  const int num_blocks = use_subblock ? 2 : 1;  // Num of blocks on each side.
+  const int block_height = mb_height >> (num_blocks - 1);  // Height.
+  const int block_width = mb_width >> (num_blocks - 1);    // Width.
 
+  // Default interpolation filters.
+  const int_interpfilters interp_filters =
+      av1_broadcast_interp_filter(MULTITAP_SHARP);
+
+  // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    // Information of each sub-block in current plane.
+    const int plane_h = mb_height >> subsampling_y;  // Plane height.
+    const int plane_w = mb_width >> subsampling_x;   // Plane width.
+    const int plane_y = mb_y >> subsampling_y;       // Y-coord (Top-left).
+    const int plane_x = mb_x >> subsampling_x;       // X-coord (Top-left).
+    const int h = block_height >> subsampling_y;     // Sub-block height.
+    const int w = block_width >> subsampling_x;      // Sub-block width.
+    const int is_y_plane = (plane == 0);             // Is Y-plane?
+
+    const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
+                                    ref_frame->widths[is_y_plane ? 0 : 1],
+                                    ref_frame->heights[is_y_plane ? 0 : 1],
+                                    ref_frame->strides[is_y_plane ? 0 : 1] };
+
+    // Handle entire block or sub-blocks if needed.
+    int subblock_idx = 0;
+    for (int i = 0; i < plane_h; i += h) {
+      for (int j = 0; j < plane_w; j += w) {
+        // Choose proper motion vector.
+        const MV mv = use_subblock ? subblock_mvs[subblock_idx] : mb_mv;
+        assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+               mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+
+        const int y = plane_y + i;
+        const int x = plane_x + j;
+
+        // Build predictor for each sub-block on current plane.
+        InterPredParams inter_pred_params;
+        av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                              subsampling_y, bit_depth, is_high_bitdepth,
+                              is_intrabc, scale, &ref_buf, interp_filters);
+        inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+        av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
+                                          plane_w, &mv, &inter_pred_params);
+
+        ++subblock_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Computes temporal filter weights and accumulators for the frame to be
+// filtered. More concretely, the filter weights for all pixels are the same.
+// Inputs:
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes as well as the bit-depth.
+//   block_size: Size of the block.
+//   num_planes: Number of planes in the frame.
+//   filter_weight: Weight used for filtering.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_self(const MACROBLOCKD *mbd,
+                                    const BLOCK_SIZE block_size,
+                                    const int num_planes,
+                                    const int filter_weight,
+                                    const uint8_t *pred, uint32_t *accum,
+                                    uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_cur_buf_hbd(mbd);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        accum[idx] += filter_weight * pred_value;
+        count[idx] += filter_weight;
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Function to compute pixel-wise squared difference between two buffers.
+// Inputs:
+//   ref: Pointer to reference buffer.
+//   ref_offset: Start position of reference buffer for computation.
+//   ref_stride: Stride for reference buffer.
+//   tgt: Pointer to target buffer.
+//   tgt_offset: Start position of target buffer for computation.
+//   tgt_stride: Stride for target buffer.
+//   height: Height of block for computation.
+//   width: Width of block for computation.
+//   is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
+//   square_diff: Pointer to save the squared differences.
+// Returns:
+//   Nothing will be returned. But the content to which `square_diff` points
+//   will be modified.
+static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
+                                       const int ref_stride, const uint8_t *tgt,
+                                       const int tgt_offset,
+                                       const int tgt_stride, const int height,
+                                       const int width,
+                                       const int is_high_bitdepth,
+                                       uint32_t *square_diff) {
+  const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+  const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
+
+  int ref_idx = 0;
+  int tgt_idx = 0;
+  int idx = 0;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
+                                                  : ref[ref_offset + ref_idx];
+      const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
+                                                  : tgt[tgt_offset + tgt_idx];
+      const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
+                                                    : (tgt_value - ref_value);
+      square_diff[idx] = diff * diff;
+
+      ++ref_idx;
+      ++tgt_idx;
+      ++idx;
+    }
+    ref_idx += (ref_stride - width);
+    tgt_idx += (tgt_stride - width);
+  }
+}
+
+// Function to adjust the filter weight when use YUV strategy.
+// Inputs:
+//   filter_weight: Original filter weight.
+//   sum_square_diff: Sum of squared difference between input frame and
+//                    prediction. This field is computed pixel by pixel, and
+//                    is used as a reference for the filter weight adjustment.
+//   num_ref_pixels: Number of pixels used to compute the `sum_square_diff`.
+//                   This field should align with the above lookup tables
+//                   `filter_weight_adjustment_lookup_table_yuv` and
+//                   `highbd_filter_weight_adjustment_lookup_table_yuv`.
+//   strength: Strength for filter weight adjustment.
+// Returns:
+//   Adjusted filter weight which will finally be used for filtering.
+static INLINE int adjust_filter_weight_yuv(const int filter_weight,
+                                           const uint64_t sum_square_diff,
+                                           const int num_ref_pixels,
+                                           const int strength) {
+  int modifier =
+      (int)(AOMMIN(sum_square_diff * TF_YUV_FILTER_WEIGHT_SCALE, INT32_MAX)) /
+      num_ref_pixels;
+  const int rounding = (1 << strength) >> 1;
+  modifier = (modifier + rounding) >> strength;
+  return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
+}
+
+// Applies temporal filter with YUV strategy.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all YUV planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   strength: Strength for filter weight adjustment.
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   subblock_filter_weights: The filter weights for each sub-block (row-major
+//                            order). If `use_subblock` is set as 0, the first
+//                            weight will be applied to the entire block.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_yuv_c(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  // Allocate memory for pixel-wise squared differences for all planes. They,
+  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
+  uint32_t *square_diff =
+      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    // Locate pixel on reference frame.
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    const uint8_t *ref = frame_to_filter->buffers[plane];
+    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
+                        plane_w, plane_h, plane_w, is_high_bitdepth,
+                        square_diff + plane_offset);
+    plane_offset += mb_pels;
+  }
+
+  // Get window size for pixel-wise filtering.
+  assert(TF_YUV_FILTER_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_YUV_FILTER_WINDOW_LENGTH >> 1;
+
+  // Handle planes in sequence.
+  plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    // Perform filtering.
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        // non-local mean approach
+        uint64_t sum_square_diff = 0;
+        int num_ref_pixels = 0;
+
+        for (int wi = -half_window; wi <= half_window; ++wi) {
+          for (int wj = -half_window; wj <= half_window; ++wj) {
+            const int y = i + wi;  // Y-coord on the current plane.
+            const int x = j + wj;  // X-coord on the current plane.
+            if (y >= 0 && y < h && x >= 0 && x < w) {
+              sum_square_diff += square_diff[plane_offset + y * w + x];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        if (plane == 0) {  // Filter Y-plane using both U-plane and V-plane.
+          for (int p = 1; p < num_planes; ++p) {
+            const int ss_y_shift = mbd->plane[p].subsampling_y - subsampling_y;
+            const int ss_x_shift = mbd->plane[p].subsampling_x - subsampling_x;
+            const int yy = i >> ss_y_shift;  // Y-coord on UV-plane.
+            const int xx = j >> ss_x_shift;  // X-coord on UV-plane.
+            const int ww = w >> ss_x_shift;  // Width of UV-plane.
+            sum_square_diff += square_diff[p * mb_pels + yy * ww + xx];
+            ++num_ref_pixels;
+          }
+        } else {  // Filter U-plane and V-plane using Y-plane.
+          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
+          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              const int ww = w << ss_x_shift;         // Width of Y-plane.
+              sum_square_diff += square_diff[yy * ww + xx];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        // Base filter weight estimated by motion search error.
+        const int subblock_idx =
+            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
+        const int filter_weight = subblock_filter_weights[subblock_idx];
+
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        const int adjusted_weight = adjust_filter_weight_yuv(
+            filter_weight, sum_square_diff, num_ref_pixels, strength);
+        accum[idx] += adjusted_weight * pred_value;
+        count[idx] += adjusted_weight;
+
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+
+  aom_free(square_diff);
+}
+
+// Applies temporal filter with plane-wise strategy.
+// The strategy of filter weight adjustment is different from the function
+// `av1_apply_temporal_filter_yuv_c()`.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
+//                 with each plane (in Y, U, V order).
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   q_factor: Quantization factor. This is actually the `q` defined in libaom,
+//             which is converted from `qindex`.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_planewise_c(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  // Allocate memory for pixel-wise squared differences for all planes. They,
+  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
+  uint32_t *square_diff =
+      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    // Locate pixel on reference frame.
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    const uint8_t *ref = frame_to_filter->buffers[plane];
+    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
+                        plane_w, plane_h, plane_w, is_high_bitdepth,
+                        square_diff + plane_offset);
+    plane_offset += mb_pels;
+  }
+
+  // Get window size for pixel-wise filtering.
+  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_PLANEWISE_FILTER_WINDOW_LENGTH >> 1;
+
+  // Hyper-parameter for filter weight adjustment.
+  const int frame_height = frame_to_filter->heights[0]
+                           << mbd->plane[0].subsampling_y;
+  const int decay_control = frame_height >= 720 ? 4 : 3;
+
+  // Handle planes in sequence.
+  plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    // Perform filtering.
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        // non-local mean approach
+        uint64_t sum_square_diff = 0;
+        int num_ref_pixels = 0;
+
+        for (int wi = -half_window; wi <= half_window; ++wi) {
+          for (int wj = -half_window; wj <= half_window; ++wj) {
+            const int y = CLIP(i + wi, 0, h - 1);  // Y-coord on current plane.
+            const int x = CLIP(j + wj, 0, w - 1);  // X-coord on current plane.
+            sum_square_diff += square_diff[plane_offset + y * w + x];
+            ++num_ref_pixels;
+          }
+        }
+
+        // Filter U-plane and V-plane using Y-plane. This is because motion
+        // search is only done on Y-plane, so the information from Y-plane will
+        // be more accurate.
+        if (plane != 0) {
+          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
+          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              const int ww = w << ss_x_shift;         // Width of Y-plane.
+              sum_square_diff += square_diff[yy * ww + xx];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        // Scale down the difference for high bit depth input.
+        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
+        const double window_error = (double)(sum_square_diff) / num_ref_pixels;
+        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+        const double block_error =
+            (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
+        // Control factor for non-local mean approach.
+        const double r =
+            (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
+        const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
+
+        // Compute filter weight.
+        const double scaled_diff =
+            AOMMAX(-(window_error + block_error / 10) / (2 * r * r * q), -15.0);
+        const int adjusted_weight =
+            (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        accum[idx] += adjusted_weight * pred_value;
+        count[idx] += adjusted_weight;
+
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+
+  aom_free(square_diff);
+}
+
+// Computes temporal filter weights and accumulators from all reference frames
+// excluding the current frame to be filtered.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the
+//                    predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes and the bit-depth.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   strength: Strength for filter weight adjustment. (Used in YUV strategy)
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//                 (Used in YUV strategy)
+//   subblock_filter_weights: The filter weights for each sub-block (row-major
+//                            order). If `use_subblock` is set as 0, the first
+//                            weight will be applied to the entire block. (Used
+//                            in YUV strategy)
+//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
+//                 with each plane (in Y, U, V order). (Used in plane-wise
+//                 strategy)
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   q_factor: Quantization factor.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `count`
+//   point will be modified.
+void av1_apply_temporal_filter_others(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const double *noise_levels,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  if (TF_ENABLE_PLANEWISE_STRATEGY) {
+    // TODO(any): avx2 and sse2 version should be changed to align with C
+    // function before using.
+    // The SIMD path only covers low bit-depth 32x32 blocks; fall back to the
+    // C implementation for everything else.
+    if (is_frame_high_bitdepth(frame_to_filter) || block_size != BLOCK_32X32) {
+      av1_apply_temporal_filter_planewise_c(
+          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
+          accum, count);
+    } else {
+      av1_apply_temporal_filter_planewise(
+          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
+          accum, count);
+    }
+  } else {  // Commonly used for low-resolution video.
+    // All filter weights are zero: nothing would be accumulated, so return
+    // early.
+    if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
+        subblock_filter_weights[2] == 0 && subblock_filter_weights[3] == 0) {
+      return;
+    }
+    const int adj_strength = strength + 2 * (mbd->bd - 8);
+    if (num_planes == 3 && TF_YUV_FILTER_WEIGHT_SCALE == 3 &&
+        block_size != BLOCK_32X32) {
+      av1_apply_temporal_filter_yuv(frame_to_filter, mbd, block_size, mb_row,
+                                    mb_col, num_planes, adj_strength,
+                                    use_subblock, subblock_filter_weights, pred,
+                                    accum, count);
+    } else {
+      // TODO(any): sse4 version should be changed to align with C function
+      // before using.
+      av1_apply_temporal_filter_yuv_c(frame_to_filter, mbd, block_size, mb_row,
+                                      mb_col, num_planes, adj_strength,
+                                      use_subblock, subblock_filter_weights,
+                                      pred, accum, count);
+    }
+  }
+}
+
+// Normalizes the accumulated filtering result to produce the filtered frame.
+// Inputs:
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   accum: Pointer to the pre-computed accumulator.
+//   count: Pointer to the pre-computed count.
+//   result_buffer: Pointer to result buffer.
+// Returns:
+//   Nothing will be returned. But the content to which `result_buffer` points
+//   will be modified.
+static void tf_normalize_filtered_frame(
+    const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
+    const int mb_col, const int num_planes, const uint32_t *accum,
+    const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    // Both chroma planes share the stride stored at index 1.
+    const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    uint8_t *const buf = result_buffer->buffers[plane];
+    uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf);
+
+    int plane_idx = 0;             // Pixel index on current plane (block-base).
+    int frame_idx = frame_offset;  // Pixel index on the entire frame.
+    for (int i = 0; i < plane_h; ++i) {
+      for (int j = 0; j < plane_w; ++j) {
+        const int idx = plane_idx + plane_offset;
+        // Adding half of `count` before dividing gives round-to-nearest.
+        const uint16_t rounding = count[idx] >> 1;
+        if (is_high_bitdepth) {
+          buf16[frame_idx] =
+              (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+        } else {
+          buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+        }
+        ++plane_idx;
+        ++frame_idx;
+      }
+      frame_idx += (frame_stride - plane_w);
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Helper function to compute the number of blocks needed to cover one
+// dimension of the frame, i.e. ceil(frame_length / mb_length).
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + mb_length - 1) / mb_length;
+}
+
+// Accumulated difference between the filtered frame and the original frame,
+// gathered block by block in `tf_do_filtering()`.
+typedef struct {
+  int64_t sum;  // Sum of per-block SSE values.
+  int64_t sse;  // Sum of squared per-block SSE values.
+} FRAME_DIFF;
+
+// Does temporal filter for a particular frame.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   frames: Frame buffers used for temporal filtering.
+//   num_frames: Number of frames in the frame buffer.
+//   filter_frame_idx: Index of the frame to be filtered.
+//   is_key_frame: Whether the to-filter is a key frame.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  is ONLY used for assigning filter weight.
+//   block_size: Block size used for temporal filtering.
+//   scale: Scaling factor.
+//   strength: Pre-estimated strength for filter weight adjustment.
+//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
+//                 with each plane (in Y, U, V order).
+// Returns:
+//   Difference between filtered frame and the original frame.
+static FRAME_DIFF tf_do_filtering(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, const int num_frames,
+    const int filter_frame_idx, const int is_key_frame, const int is_second_arf,
+    const BLOCK_SIZE block_size, const struct scale_factors *scale,
+    const int strength, const double *noise_levels) {
+  // Basic information.
+  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int mb_rows = get_num_blocks(frame_height, mb_height);
+  const int mb_cols = get_num_blocks(frame_width, mb_width);
+  const int num_planes = av1_num_planes(&cpi->common);
+  const int mi_h = mi_size_high_log2[block_size];
+  const int mi_w = mi_size_wide_log2[block_size];
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+  // Setup.
+  mbd->block_ref_scale_factors[0] = scale;
+  mbd->block_ref_scale_factors[1] = scale;
+  // A temporary block info used to store state in temporal filtering process.
+  MB_MODE_INFO *tmp_mb_mode_info = (MB_MODE_INFO *)malloc(sizeof(MB_MODE_INFO));
+  memset(tmp_mb_mode_info, 0, sizeof(MB_MODE_INFO));
+  mbd->mi = &tmp_mb_mode_info;
+  mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+  // Allocate memory for predictor, accumulator and count.
+  // NOTE(review): allocation results are not checked before use -- presumably
+  // the encoder relies on aom_memalign/malloc aborting elsewhere on OOM;
+  // confirm against the project's allocation policy.
+  uint8_t *pred8 = aom_memalign(32, num_planes * mb_pels * sizeof(uint8_t));
+  uint16_t *pred16 = aom_memalign(32, num_planes * mb_pels * sizeof(uint16_t));
+  uint32_t *accum = aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  uint16_t *count = aom_memalign(16, num_planes * mb_pels * sizeof(uint16_t));
+  memset(pred8, 0, num_planes * mb_pels * sizeof(pred8[0]));
+  memset(pred16, 0, num_planes * mb_pels * sizeof(pred16[0]));
+  uint8_t *const pred = is_high_bitdepth ? CONVERT_TO_BYTEPTR(pred16) : pred8;
+
+  // Do filtering.
+  FRAME_DIFF diff = { 0, 0 };
+  // Perform temporal filtering block by block.
+  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
+    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+                            cpi->oxcf.border_in_pixels);
+      memset(accum, 0, num_planes * mb_pels * sizeof(accum[0]));
+      memset(count, 0, num_planes * mb_pels * sizeof(count[0]));
+      MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
+      // Perform temporal filtering frame by frame.
+      for (int frame = 0; frame < num_frames; frame++) {
         if (frames[frame] == NULL) continue;
 
-        mbd->mi[0]->mv[0].as_mv.row = 0;
-        mbd->mi[0]->mv[0].as_mv.col = 0;
-        mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
-        blk_mvs[0] = kZeroMv;
-        blk_mvs[1] = kZeroMv;
-        blk_mvs[2] = kZeroMv;
-        blk_mvs[3] = kZeroMv;
+        // Motion search.
+        MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+        int subblock_filter_weights[4] = { 0, 0, 0, 0 };
+        int block_mse = INT_MAX;
+        int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
 
-        if (frame == alt_ref_index) {
-          blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2;
-          use_32x32 = 1;
-        } else {
-          int thresh_low = 10000;
-          int thresh_high = 20000;
-          int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
-
-          // Find best match in this frame by MC
-          int err = temporal_filter_find_matching_mb_c(
-              cpi, frames[alt_ref_index]->y_buffer + mb_y_src_offset,
-              frames[frame]->y_buffer + mb_y_src_offset,
-              frames[frame]->y_stride, mb_col * BW, mb_row * BH, blk_mvs,
-              blk_bestsme);
-
-          int err16 =
-              blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3];
-          int max_err = INT_MIN, min_err = INT_MAX;
-          for (k = 0; k < 4; k++) {
-            if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k];
-            if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k];
-          }
-
-          if (((err * 15 < (err16 << 4)) && max_err - min_err < 12000) ||
-              ((err * 14 < (err16 << 4)) && max_err - min_err < 6000)) {
-            use_32x32 = 1;
-            // Assign higher weight to matching MB if it's error
-            // score is lower. If not applying MC default behavior
-            // is to weight all MBs equal.
-            blk_fw[0] = err < (thresh_low << THR_SHIFT)
-                            ? 2
-                            : err < (thresh_high << THR_SHIFT) ? 1 : 0;
-            blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0];
-          } else {
-            use_32x32 = 0;
-            for (k = 0; k < 4; k++)
-              blk_fw[k] = blk_bestsme[k] < thresh_low
-                              ? 2
-                              : blk_bestsme[k] < thresh_high ? 1 : 0;
+        if (frame == filter_frame_idx) {  // Frame to be filtered.
+          // Set motion vector as 0 for the frame to be filtered.
+          mbd->mi[0]->mv[0].as_mv = kZeroMv;
+          // Change ref_mv sign for following frames.
+          ref_mv.row *= -1;
+          ref_mv.col *= -1;
+        } else {  // Other reference frames.
+          block_mse = tf_motion_search(cpi, frame_to_filter, frames[frame],
+                                       block_size, mb_row, mb_col, &ref_mv,
+                                       subblock_mvs, subblock_mses);
+          // Do not pass down the reference motion vector if error is too large.
+          const int thresh = AOMMIN(frame_height, frame_width) >= 720 ? 12 : 3;
+          if (block_mse > (thresh << (mbd->bd - 8))) {
+            ref_mv = kZeroMv;
           }
         }
 
-        if (blk_fw[0] || blk_fw[1] || blk_fw[2] || blk_fw[3]) {
-          // Construct the predictors
-          temporal_filter_predictors_mb_c(
-              mbd, frames[frame]->y_buffer + mb_y_src_offset,
-              frames[frame]->u_buffer + mb_uv_src_offset,
-              frames[frame]->v_buffer + mb_uv_src_offset,
-              frames[frame]->y_stride, mb_uv_width, mb_uv_height,
-              mbd->mi[0]->mv[0].as_mv.row, mbd->mi[0]->mv[0].as_mv.col,
-              predictor, ref_scale_factors, mb_col * BW, mb_row * BH,
-              cm->allow_warped_motion, num_planes, blk_mvs, use_32x32);
+        // Build predictor.
+        int use_subblock = tf_get_filter_weight(
+            block_mse, subblock_mses, is_second_arf, subblock_filter_weights);
+        tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
+                           num_planes, scale, use_subblock, subblock_mvs, pred);
 
-          // Apply the filter (YUV)
-          if (frame == alt_ref_index) {
-            uint8_t *pred = predictor;
-            uint32_t *accum = accumulator;
-            uint16_t *cnt = count;
-            int plane;
-
-            // All 4 blk_fws are equal to 2.
-            for (plane = 0; plane < num_planes; ++plane) {
-              const int pred_stride = plane ? mb_uv_width : BW;
-              const unsigned int w = plane ? mb_uv_width : BW;
-              const unsigned int h = plane ? mb_uv_height : BH;
-
-              if (is_hbd) {
-                highbd_apply_temporal_filter_self(pred, pred_stride, w, h,
-                                                  blk_fw[0], accum, cnt);
-              } else {
-                apply_temporal_filter_self(pred, pred_stride, w, h, blk_fw[0],
-                                           accum, cnt);
-              }
-
-              pred += BLK_PELS;
-              accum += BLK_PELS;
-              cnt += BLK_PELS;
-            }
-          } else {
-            if (is_hbd) {
-              const int adj_strength = strength + 2 * (mbd->bd - 8);
-
-              if (num_planes <= 1) {
-                // Single plane case
-                av1_highbd_temporal_filter_apply_c(
-                    f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
-                    BH, adj_strength, blk_fw, use_32x32, accumulator, count);
-              } else {
-                // Process 3 planes together.
-                av1_highbd_apply_temporal_filter(
-                    f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
-                    f->u_buffer + mb_uv_src_offset,
-                    f->v_buffer + mb_uv_src_offset, f->uv_stride,
-                    predictor + BLK_PELS, predictor + (BLK_PELS << 1),
-                    mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
-                    mbd->plane[1].subsampling_y, adj_strength, blk_fw,
-                    use_32x32, accumulator, count, accumulator + BLK_PELS,
-                    count + BLK_PELS, accumulator + (BLK_PELS << 1),
-                    count + (BLK_PELS << 1));
-              }
-            } else {
-              if (num_planes <= 1) {
-                // Single plane case
-                av1_temporal_filter_apply_c(
-                    f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
-                    BH, strength, blk_fw, use_32x32, accumulator, count);
-              } else {
-                // Process 3 planes together.
-                av1_apply_temporal_filter(
-                    f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
-                    f->u_buffer + mb_uv_src_offset,
-                    f->v_buffer + mb_uv_src_offset, f->uv_stride,
-                    predictor + BLK_PELS, predictor + (BLK_PELS << 1),
-                    mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
-                    mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32,
-                    accumulator, count, accumulator + BLK_PELS,
-                    count + BLK_PELS, accumulator + (BLK_PELS << 1),
-                    count + (BLK_PELS << 1));
-              }
-            }
-          }
+        // Perform weighted averaging.
+        if (frame == filter_frame_idx) {  // Frame to be filtered.
+          av1_apply_temporal_filter_self(mbd, block_size, num_planes,
+                                         subblock_filter_weights[0], pred,
+                                         accum, count);
+        } else {  // Other reference frames.
+          const FRAME_TYPE frame_type =
+              (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME
+                                                           : KEY_FRAME;
+          const int q_factor =
+              (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[frame_type],
+                                           cpi->common.seq_params.bit_depth);
+          av1_apply_temporal_filter_others(
+              frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+              strength, use_subblock, subblock_filter_weights, noise_levels,
+              block_mse, subblock_mses, q_factor, pred, accum, count);
         }
       }
 
-      // Normalize filter output to produce AltRef frame
-      if (is_hbd) {
-        uint16_t *dst1_16;
-        uint16_t *dst2_16;
-        dst1 = cpi->alt_ref_buffer.y_buffer;
-        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
-        stride = cpi->alt_ref_buffer.y_stride;
-        byte = mb_y_offset;
-        for (i = 0, k = 0; i < BH; i++) {
-          for (j = 0; j < BW; j++, k++) {
-            dst1_16[byte] =
-                (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+      tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
+                                  accum, count, &cpi->alt_ref_buffer);
 
-            // move to next pixel
-            byte++;
-          }
-
-          byte += stride - BW;
-        }
-        if (num_planes > 1) {
-          dst1 = cpi->alt_ref_buffer.u_buffer;
-          dst2 = cpi->alt_ref_buffer.v_buffer;
-          dst1_16 = CONVERT_TO_SHORTPTR(dst1);
-          dst2_16 = CONVERT_TO_SHORTPTR(dst2);
-          stride = cpi->alt_ref_buffer.uv_stride;
-          byte = mb_uv_offset;
-          for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
-            for (j = 0; j < mb_uv_width; j++, k++) {
-              int m = k + BLK_PELS;
-              // U
-              dst1_16[byte] =
-                  (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
-              // V
-              dst2_16[byte] =
-                  (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
-              // move to next pixel
-              byte++;
-            }
-            byte += stride - mb_uv_width;
-          }
-        }
-      } else {
-        dst1 = cpi->alt_ref_buffer.y_buffer;
-        stride = cpi->alt_ref_buffer.y_stride;
-        byte = mb_y_offset;
-        for (i = 0, k = 0; i < BH; i++) {
-          for (j = 0; j < BW; j++, k++) {
-            dst1[byte] =
-                (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
-
-            // move to next pixel
-            byte++;
-          }
-          byte += stride - BW;
-        }
-        if (num_planes > 1) {
-          dst1 = cpi->alt_ref_buffer.u_buffer;
-          dst2 = cpi->alt_ref_buffer.v_buffer;
-          stride = cpi->alt_ref_buffer.uv_stride;
-          byte = mb_uv_offset;
-          for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
-            for (j = 0; j < mb_uv_width; j++, k++) {
-              int m = k + BLK_PELS;
-              // U
-              dst1[byte] =
-                  (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
-              // V
-              dst2[byte] =
-                  (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
-              // move to next pixel
-              byte++;
-            }
-            byte += stride - mb_uv_width;
-          }
-        }
+      if (!is_key_frame && cpi->sf.hl_sf.adaptive_overlay_encoding) {
+        // Track the Y-plane SSE between the filtered and the original block,
+        // used later to decide the overlay encoding strategy.
+        const int y_height = mb_height >> mbd->plane[0].subsampling_y;
+        const int y_width = mb_width >> mbd->plane[0].subsampling_x;
+        const int source_y_stride = frame_to_filter->y_stride;
+        const int filter_y_stride = cpi->alt_ref_buffer.y_stride;
+        const int source_offset =
+            mb_row * y_height * source_y_stride + mb_col * y_width;
+        const int filter_offset =
+            mb_row * y_height * filter_y_stride + mb_col * y_width;
+        unsigned int sse = 0;
+        cpi->fn_ptr[block_size].vf(frame_to_filter->y_buffer + source_offset,
+                                   source_y_stride,
+                                   cpi->alt_ref_buffer.y_buffer + filter_offset,
+                                   filter_y_stride, &sse);
+        diff.sum += sse;
+        // Widen to 64-bit before multiplying: sse is unsigned int, so the
+        // plain product sse * sse would wrap in 32-bit arithmetic for large
+        // per-block errors before reaching the int64_t accumulator.
+        diff.sse += sse * (int64_t)sse;
       }
-      mb_y_offset += BW;
-      mb_y_src_offset += BW;
-      mb_uv_offset += mb_uv_width;
-      mb_uv_src_offset += mb_uv_width;
     }
-    mb_y_offset += BH * cpi->alt_ref_buffer.y_stride - BW * mb_cols;
-    mb_y_src_offset += BH * f->y_stride - BW * mb_cols;
-    mb_uv_src_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
-    mb_uv_offset +=
-        mb_uv_height * cpi->alt_ref_buffer.uv_stride - mb_uv_width * mb_cols;
   }
 
   // Restore input state
-  for (i = 0; i < num_planes; i++) mbd->plane[i].pre[0].buf = input_buffer[i];
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+
+  free(tmp_mb_mode_info);
+  aom_free(pred8);
+  aom_free(pred16);
+  aom_free(accum);
+  aom_free(count);
+
+  return diff;
 }
 
-// This is an adaptation of the mehtod in the following paper:
-// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
-// estimation using Laplacian operator and adaptive edge detection,"
-// Proc. 3rd International Symposium on Communications, Control and
-// Signal Processing, 2008, St Julians, Malta.
-//
-// Return noise estimate, or -1.0 if there was a failure
-static double estimate_noise(const uint8_t *src, int width, int height,
-                             int stride, int edge_thresh) {
-  int64_t sum = 0;
-  int64_t num = 0;
+// A constant number, sqrt(pi / 2), used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
+// Estimates the noise level of one plane of `frame`, adapting the method in:
+// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise estimation
+// using Laplacian operator and adaptive edge detection," Proc. 3rd
+// International Symposium on Communications, Control and Signal Processing,
+// 2008. Only pixels whose Sobel gradient magnitude falls below the edge
+// threshold (smooth pixels) contribute a Laplacian response; both quantities
+// are scaled back to an 8-bit range using `bit_depth`.
+// Returns the estimated noise sigma, or -1.0 if the estimate is unreliable.
+double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
+                                            const int plane,
+                                            const int bit_depth) {
+  const int is_y_plane = (plane == 0);
+  // Both chroma planes share the dimensions/stride stored at index 1.
+  const int height = frame->crop_heights[is_y_plane ? 0 : 1];
+  const int width = frame->crop_widths[is_y_plane ? 0 : 1];
+  const int stride = frame->strides[is_y_plane ? 0 : 1];
+  const uint8_t *src = frame->buffers[plane];
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+
+  int64_t accum = 0;
+  int count = 0;
   for (int i = 1; i < height - 1; ++i) {
     for (int j = 1; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      // Sobel gradients
-      const int Gx = (src[k - stride - 1] - src[k - stride + 1]) +
-                     (src[k + stride - 1] - src[k + stride + 1]) +
-                     2 * (src[k - 1] - src[k + 1]);
-      const int Gy = (src[k - stride - 1] - src[k + stride - 1]) +
-                     (src[k - stride + 1] - src[k + stride + 1]) +
-                     2 * (src[k - stride] - src[k + stride]);
-      const int Ga = abs(Gx) + abs(Gy);
-      if (Ga < edge_thresh) {  // Smooth pixels
-        // Find Laplacian
-        const int v =
-            4 * src[k] -
-            2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
-            (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
-             src[k + stride + 1]);
-        sum += abs(v);
-        ++num;
+      // Setup a small 3x3 matrix.
+      const int center_idx = i * stride + j;
+      int mat[3][3];
+      for (int ii = -1; ii <= 1; ++ii) {
+        for (int jj = -1; jj <= 1; ++jj) {
+          const int idx = center_idx + ii * stride + jj;
+          mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
+        }
+      }
+      // Compute sobel gradients.
+      const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
+      // Accumulate Laplacian.
+      if (Ga < NOISE_ESTIMATION_EDGE_THRESHOLD) {  // Only count smooth pixels.
+        const int v = 4 * mat[1][1] -
+                      2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                      (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+        accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
+        ++count;
       }
     }
   }
-  // If very few smooth pels, return -1 since the estimate is unreliable
-  if (num < 16) return -1.0;
 
-  const double sigma = (double)sum / (6 * num) * SQRT_PI_BY_2;
-  return sigma;
+  // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+  return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
 }
 
-// Return noise estimate, or -1.0 if there was a failure
-static double highbd_estimate_noise(const uint8_t *src8, int width, int height,
-                                    int stride, int bd, int edge_thresh) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  int64_t sum = 0;
-  int64_t num = 0;
-  for (int i = 1; i < height - 1; ++i) {
-    for (int j = 1; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      // Sobel gradients
-      const int Gx = (src[k - stride - 1] - src[k - stride + 1]) +
-                     (src[k + stride - 1] - src[k + stride + 1]) +
-                     2 * (src[k - 1] - src[k + 1]);
-      const int Gy = (src[k - stride - 1] - src[k + stride - 1]) +
-                     (src[k - stride + 1] - src[k + stride + 1]) +
-                     2 * (src[k - stride] - src[k + stride]);
-      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bd - 8);
-      if (Ga < edge_thresh) {  // Smooth pixels
-        // Find Laplacian
-        const int v =
-            4 * src[k] -
-            2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
-            (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
-             src[k + stride + 1]);
-        sum += ROUND_POWER_OF_TWO(abs(v), bd - 8);
-        ++num;
-      }
+// Estimates the strength for filter weight adjustment, which is used in YUV
+// strategy. This estimation is based on the pre-estimated noise level of the
+// to-filter frame.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
+//   group_boost: Boost level for the current group of frames.
+// Returns:
+//   Estimated strength which will be used for filter weight adjustment.
+static int tf_estimate_strength(const AV1_COMP *cpi, const double noise_level,
+                                const int group_boost) {
+  int strength = cpi->oxcf.arnr_strength;
+
+  // Adjust the strength based on the estimated noise level.
+  if (noise_level > 0) {       // Adjust when the noise level is reliable.
+    if (noise_level < 0.75) {  // Noise level lies in range (0, 0.75).
+      strength = strength - 2;
+    } else if (noise_level < 1.75) {  // Noise level lies in range [0.75, 1.75).
+      strength = strength - 1;
+    } else if (noise_level < 4.0) {  // Noise level lies in range [1.75, 4.0).
+      strength = strength + 0;
+    } else {  // Noise level lies in range [4.0, +inf).
+      strength = strength + 1;
     }
   }
-  // If very few smooth pels, return -1 since the estimate is unreliable
-  if (num < 16) return -1.0;
-
-  const double sigma = (double)sum / (6 * num) * SQRT_PI_BY_2;
-  return sigma;
-}
-
-// Apply buffer limits and context specific adjustments to arnr filter.
-static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
-                               int *arnr_frames, int *arnr_strength) {
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int frames_after_arf =
-      av1_lookahead_depth(cpi->lookahead) - distance - 1;
-  int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
-  int frames_bwd;
-  int q, frames, strength;
-
-  // Define the forward and backwards filter limits for this arnr group.
-  if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
-  if (frames_fwd > distance) frames_fwd = distance;
-
-  frames_bwd = frames_fwd;
-
-  // For even length filter there is one more frame backward
-  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
-  if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
-
-  // Set the baseline active filter size.
-  frames = frames_bwd + 1 + frames_fwd;
 
   // Adjust the strength based on active max q.
-  if (cpi->common.current_frame.frame_number > 1)
-    q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
-                                      cpi->common.seq_params.bit_depth));
-  else
-    q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME],
-                                      cpi->common.seq_params.bit_depth));
-  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
-  struct lookahead_entry *buf = av1_lookahead_peek(cpi->lookahead, distance);
-  double noiselevel;
-  if (is_cur_buf_hbd(mbd)) {
-    noiselevel = highbd_estimate_noise(
-        buf->img.y_buffer, buf->img.y_crop_width, buf->img.y_crop_height,
-        buf->img.y_stride, mbd->bd, EDGE_THRESHOLD);
-  } else {
-    noiselevel = estimate_noise(buf->img.y_buffer, buf->img.y_crop_width,
-                                buf->img.y_crop_height, buf->img.y_stride,
-                                EDGE_THRESHOLD);
-  }
-  int adj_strength = oxcf->arnr_strength;
-  if (noiselevel > 0) {
-    // Get 4 integer adjustment levels in [-2, 1]
-    int noiselevel_adj;
-    if (noiselevel < 0.75)
-      noiselevel_adj = -2;
-    else if (noiselevel < 1.75)
-      noiselevel_adj = -1;
-    else if (noiselevel < 4.0)
-      noiselevel_adj = 0;
-    else
-      noiselevel_adj = 1;
-    adj_strength += noiselevel_adj;
-  }
-  // printf("[noise level: %g, strength = %d]\n", noiselevel, adj_strength);
+  const FRAME_TYPE frame_type =
+      (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+  const int q = (int)av1_convert_qindex_to_q(
+      cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
+  strength = strength - AOMMAX(0, (16 - q) / 2);
 
-  if (q > 16) {
-    strength = adj_strength;
-  } else {
-    strength = adj_strength - ((16 - q) / 2);
-    if (strength < 0) strength = 0;
-  }
-
-  // Adjust number of frames in filter and strength based on gf boost level.
-  if (frames > group_boost / 150) {
-    frames = group_boost / 150;
-    frames += !(frames & 1);
-  }
-
-  if (strength > group_boost / 300) {
-    strength = group_boost / 300;
-  }
-
-  *arnr_frames = frames;
-  *arnr_strength = strength;
+  return CLIP(strength, 0, group_boost / 300);
 }
 
-void av1_temporal_filter(AV1_COMP *cpi, int distance) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  int frame;
-  int frames_to_blur;
-  int start_frame;
-  int strength;
-  int frames_to_blur_backward;
-  int frames_to_blur_forward;
-  struct scale_factors sf;
+// Setups the frame buffer for temporal filtering. Basically, this fuction
+// determines how many frames will be used for temporal filtering and then
+// groups them into a buffer.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   filter_frame_lookahead_idx: The index of the to-filter frame in the
+//                               lookahead buffer `cpi->lookahead`.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  will affect the number of frames used for filtering.
+//   frames: Pointer to the frame buffer to setup.
+//   num_frames_for_filtering: Number of frames used for filtering.
+//   filter_frame_idx: Index of the to-filter frame in the setup frame buffer.
+// Returns:
+//   Nothing will be returned. But the frame buffer `frames`, number of frames
+//   in the buffer `num_frames_for_filtering`, and the index of the to-filter
+//   frame in the buffer `filter_frame_idx` will be updated in this function.
+static void tf_setup_filtering_buffer(const AV1_COMP *cpi,
+                                      const int filter_frame_lookahead_idx,
+                                      const int is_second_arf,
+                                      YV12_BUFFER_CONFIG **frames,
+                                      int *num_frames_for_filtering,
+                                      int *filter_frame_idx) {
+  int num_frames = 0;          // Number of frames used for filtering.
+  int num_frames_before = -1;  // Number of frames before the to-filter frame.
+  int filter_frame_offset;
 
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-  int rdmult = 0;
-
-  // Apply context specific adjustments to the arnr filter parameters.
-  if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
-    // TODO(weitinglin): Currently, we enforce the filtering strength on
-    // internal ARFs to be zeros. We should investigate in which case it is more
-    // beneficial to use non-zero strength filtering.
-    strength = 0;
-    frames_to_blur = 1;
+  if (filter_frame_lookahead_idx == -1) {  // Key frame.
+    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
+    num_frames_before = 0;
+    filter_frame_offset = filter_frame_lookahead_idx;
+  } else if (filter_frame_lookahead_idx < -1) {  // Key frame in one-pass mode.
+    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
+    num_frames_before = num_frames - 1;
+    filter_frame_offset = -filter_frame_lookahead_idx;
   } else {
-    adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur,
-                       &strength);
+    num_frames = cpi->oxcf.arnr_max_frames;
+    if (is_second_arf) {  // Only use 2 neighbours for the second ARF.
+      num_frames = AOMMIN(num_frames, 3);
+    }
+    if (num_frames > cpi->rc.gfu_boost / 150) {
+      num_frames = cpi->rc.gfu_boost / 150;
+      num_frames += !(num_frames & 1);
+    }
+    num_frames_before = AOMMIN(num_frames >> 1, filter_frame_lookahead_idx + 1);
+    const int lookahead_depth =
+        av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
+    const int num_frames_after =
+        AOMMIN((num_frames - 1) >> 1,
+               lookahead_depth - filter_frame_lookahead_idx - 1);
+    num_frames = num_frames_before + 1 + num_frames_after;
+    filter_frame_offset = filter_frame_lookahead_idx;
+  }
+  *num_frames_for_filtering = num_frames;
+  *filter_frame_idx = num_frames_before;
+
+  // Setup the frame buffer.
+  for (int frame = 0; frame < num_frames; ++frame) {
+    const int lookahead_idx = frame - num_frames_before + filter_frame_offset;
+    struct lookahead_entry *buf = av1_lookahead_peek(
+        cpi->lookahead, lookahead_idx, cpi->compressor_stage);
+    frames[frame] = (buf == NULL) ? NULL : &buf->img;
+  }
+}
+
+int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                        int *show_existing_arf) {
+  // Basic informaton of the current frame.
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const uint8_t group_idx = gf_group->index;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[group_idx];
+  // Filter one more ARF if the lookahead index is leq 7 (w.r.t. 9-th frame).
+  // This frame is ALWAYS a show existing frame.
+  const int is_second_arf = (update_type == INTNL_ARF_UPDATE) &&
+                            (filter_frame_lookahead_idx >= 7) &&
+                            cpi->sf.hl_sf.second_alt_ref_filtering;
+  // TODO(anyone): Currently, we enforce the filtering strength on internal
+  // ARFs except the second ARF to be zero. We should investigate in which case
+  // it is more beneficial to use non-zero strength filtering.
+  if (update_type == INTNL_ARF_UPDATE && !is_second_arf) {
+    return 0;
   }
 
-  int which_arf = gf_group->arf_update_idx[gf_group->index];
+  // TODO(yunqing): For INTNL_ARF_UPDATE type, the following me initialization
+  // is used somewhere unexpectedly. Should be resolved later.
+  // Initialize errorperbit, sadperbit16 and sadperbit4.
+  const int rdmult = av1_compute_rd_mult_based_on_qindex(cpi, TF_QINDEX);
+  set_error_per_bit(&cpi->td.mb, rdmult);
+  av1_initialize_me_consts(cpi, &cpi->td.mb, TF_QINDEX);
+  av1_fill_mv_costs(cpi->common.fc,
+                    cpi->common.features.cur_frame_force_integer_mv,
+                    cpi->common.features.allow_high_precision_mv, &cpi->td.mb);
 
-  // Set the temporal filtering status for the corresponding OVERLAY frame
-  if (strength == 0 && frames_to_blur == 1)
-    cpi->is_arf_filter_off[which_arf] = 1;
-  else
-    cpi->is_arf_filter_off[which_arf] = 0;
-  cpi->common.showable_frame = cpi->is_arf_filter_off[which_arf];
+  // Setup frame buffer for filtering.
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+  int num_frames_for_filtering = 0;
+  int filter_frame_idx = -1;
+  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
+                            frames, &num_frames_for_filtering,
+                            &filter_frame_idx);
 
-  frames_to_blur_backward = (frames_to_blur / 2);
-  frames_to_blur_forward = ((frames_to_blur - 1) / 2);
-  start_frame = distance + frames_to_blur_forward;
-
-  // Setup frame pointers, NULL indicates frame not included in filter.
-  for (frame = 0; frame < frames_to_blur; ++frame) {
-    const int which_buffer = start_frame - frame;
-    struct lookahead_entry *buf =
-        av1_lookahead_peek(cpi->lookahead, which_buffer);
-    frames[frames_to_blur - 1 - frame] = &buf->img;
+  // Estimate noise and strength.
+  const int bit_depth = cpi->common.seq_params.bit_depth;
+  const int num_planes = av1_num_planes(&cpi->common);
+  double noise_levels[MAX_MB_PLANE] = { 0 };
+  for (int plane = 0; plane < num_planes; ++plane) {
+    noise_levels[plane] = av1_estimate_noise_from_single_plane(
+        frames[filter_frame_idx], plane, bit_depth);
+  }
+  const int strength =
+      tf_estimate_strength(cpi, noise_levels[0], cpi->rc.gfu_boost);
+  if (filter_frame_lookahead_idx >= 0) {
+    cpi->common.showable_frame =
+        (strength == 0 && num_frames_for_filtering == 1) || is_second_arf ||
+        (cpi->oxcf.enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames);
   }
 
-  if (frames_to_blur > 0) {
+  // Do filtering.
+  const int is_key_frame = (filter_frame_lookahead_idx < 0);
+  FRAME_DIFF diff = { 0, 0 };
+  if (num_frames_for_filtering > 0 && frames[0] != NULL) {
     // Setup scaling factors. Scaling on each of the arnr frames is not
     // supported.
     // ARF is produced at the native frame size and resized when coded.
+    struct scale_factors sf;
     av1_setup_scale_factors_for_frame(
         &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
         frames[0]->y_crop_width, frames[0]->y_crop_height);
+    diff = tf_do_filtering(cpi, frames, num_frames_for_filtering,
+                           filter_frame_idx, is_key_frame, is_second_arf,
+                           TF_BLOCK_SIZE, &sf, strength, noise_levels);
   }
 
-  // Initialize errorperbit, sadperbit16 and sadperbit4.
-  rdmult = av1_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX);
-  set_error_per_bit(&cpi->td.mb, rdmult);
-  av1_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX);
-  av1_initialize_cost_tables(&cpi->common, &cpi->td.mb);
+  if (is_key_frame) {  // Key frame should always be filtered.
+    return 1;
+  }
 
-  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
-                            frames_to_blur_backward, strength, &sf);
+  if ((show_existing_arf != NULL && cpi->sf.hl_sf.adaptive_overlay_encoding) ||
+      is_second_arf) {
+    const int frame_height = frames[filter_frame_idx]->y_crop_height;
+    const int frame_width = frames[filter_frame_idx]->y_crop_width;
+    const int block_height = block_size_high[TF_BLOCK_SIZE];
+    const int block_width = block_size_wide[TF_BLOCK_SIZE];
+    const int mb_rows = get_num_blocks(frame_height, block_height);
+    const int mb_cols = get_num_blocks(frame_width, block_width);
+    const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
+    const float mean = (float)diff.sum / num_mbs;
+    const float std = (float)sqrt((float)diff.sse / num_mbs - mean * mean);
+
+    aom_clear_system_state();
+    // TODO(yunqing): This can be combined with TPL q calculation later.
+    cpi->rc.base_frame_target = gf_group->bit_allocation[group_idx];
+    av1_set_target_rate(cpi, cpi->common.width, cpi->common.height);
+    int top_index = 0;
+    int bottom_index = 0;
+    const int q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cpi->oxcf.width,
+                                           cpi->oxcf.height, group_idx,
+                                           &bottom_index, &top_index);
+    const int ac_q = av1_ac_quant_QTX(q, 0, bit_depth);
+    const float threshold = 0.7f * ac_q * ac_q;
+
+    if (!is_second_arf) {
+      *show_existing_arf = 0;
+      if (mean < threshold && std < mean * 1.2) {
+        *show_existing_arf = 1;
+      }
+      cpi->common.showable_frame |= *show_existing_arf;
+    } else {
+      // Use source frame if the filtered frame becomes very different.
+      if (!(mean < threshold && std < mean * 1.2)) {
+        return 0;
+      }
+    }
+  }
+
+  return 1;
 }
diff --git a/libaom/av1/encoder/temporal_filter.h b/libaom/av1/encoder/temporal_filter.h
index bb26c36..5a6bde2 100644
--- a/libaom/av1/encoder/temporal_filter.h
+++ b/libaom/av1/encoder/temporal_filter.h
@@ -16,21 +16,69 @@
 extern "C" {
 #endif
 
-#define ARNR_FILT_QINDEX 128
-
-// Block size used in temporal filtering
-#define TF_BLOCK BLOCK_32X32
+// TODO(any): These two variables are only used in avx2, sse2, sse4
+// implementations, where the block size is still hard coded. This should be
+// fixed to align with the c implementation.
 #define BH 32
-#define BH_LOG2 5
 #define BW 32
-#define BW_LOG2 5
-#define BLK_PELS 1024  // Pixels in the block
-#define THR_SHIFT 2
-#define TF_SUB_BLOCK BLOCK_16X16
-#define SUB_BH 16
-#define SUB_BW 16
 
-void av1_temporal_filter(AV1_COMP *cpi, int distance);
+// Block size used in temporal filtering.
+#define TF_BLOCK_SIZE BLOCK_32X32
+
+// Window size for YUV temporal filtering.
+// This is particually used for function `av1_apply_temporal_filter_yuv()`.
+#define TF_YUV_FILTER_WINDOW_LENGTH 3
+// A scale factor used in YUV temporal filtering for weight adjustment.
+#define TF_YUV_FILTER_WEIGHT_SCALE 3
+
+#define TF_ENABLE_PLANEWISE_STRATEGY 1
+// Window size for plane-wise temporal filtering.
+// This is particually used for function `av1_apply_temporal_filter_planewise()`
+#define TF_PLANEWISE_FILTER_WINDOW_LENGTH 5
+// A scale factor used in plane-wise temporal filtering to raise the filter
+// weight from `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_PLANEWISE_FILTER_WEIGHT_SCALE 1000
+
+#define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+// Estimates noise level from a given frame using a single plane (Y, U, or V).
+// This is an adaptation of the mehtod in the following paper:
+// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
+// estimation using Laplacian operator and adaptive edge detection",
+// Proc. 3rd International Symposium on Communications, Control and
+// Signal Processing, 2008, St Julians, Malta.
+// Inputs:
+//   frame: Pointer to the frame to estimate noise level from.
+//   plane: Index of the plane used for noise estimation. Commonly, 0 for
+//          Y-plane, 1 for U-plane, and 2 for V-plane.
+//   bit_depth: Actual bit-depth instead of the encoding bit-depth of the frame.
+// Returns:
+//   The estimated noise, or -1.0 if there are too few smooth pixels.
+double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
+                                            const int plane,
+                                            const int bit_depth);
+
+#define TF_QINDEX 128  // Q-index used in temporal filtering.
+#define TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME 7
+// Performs temporal filtering if needed.
+// NOTE: In this function, the lookahead index is different from the 0-based
+// real index. For example, if we want to filter the first frame in the
+// pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+// of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+// second frame in the pre-fetched buffer. Another example: if we want to filter
+// the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16.
+// Futhermore, negative number is used for key frame in one-pass mode, where key
+// frame is filtered with the frames before it instead of after it. For example,
+// -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   filter_frame_lookahead_idx: The index of the to-filter frame in the
+//                               lookahead buffer `cpi->lookahead`.
+//   show_existing_arf: Whether to show existing ARF. This field will be updated
+//                      in this function.
+// Returns:
+//   Whether temporal filtering is successfully done.
+int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                        int *show_existing_arf);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/tokenize.c b/libaom/av1/encoder/tokenize.c
index ce1a212..e674153 100644
--- a/libaom/av1/encoder/tokenize.c
+++ b/libaom/av1/encoder/tokenize.c
@@ -130,9 +130,9 @@
                         counts, map_pb_cdf);
 }
 
-static void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
-                           TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
-                           int blk_col, int block, int plane, void *arg) {
+static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
+                           BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+                           int block, int plane, void *arg) {
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -151,16 +151,9 @@
   if (tx_size == plane_tx_size || plane) {
     plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
                                        pd->subsampling_y);
-    if (!dry_run) {
-      av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
-                                        plane_bsize, tx_size, arg);
-    } else if (dry_run == DRY_RUN_NORMAL) {
-      av1_update_txb_context_b(plane, block, blk_row, blk_col, plane_bsize,
-                               tx_size, arg);
-    } else {
-      printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
-      assert(0);
-    }
+    av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+                                      plane_bsize, tx_size, arg);
+
   } else {
     // Half the block size in transform block unit.
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
@@ -177,72 +170,68 @@
 
         if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
-        tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc,
-                       block, plane, arg);
+        tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane,
+                       arg);
         block += step;
       }
     }
   }
 }
 
-void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                           RUN_TYPE dry_run, int mi_row, int mi_col,
-                           BLOCK_SIZE bsize, int *rate,
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td,
+                           RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
                            uint8_t allow_update_cdf) {
+  assert(bsize < BLOCK_SIZES_ALL);
   const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+    return;
+
+  const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  (void)t;
-  struct tokenize_b_args arg = { cpi, td, t, 0, allow_update_cdf };
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
 
   if (mbmi->skip) {
-    av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+    av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
 
   for (int plane = 0; plane < num_planes; ++plane) {
-    if (!is_chroma_reference(mi_row, mi_col, bsize,
-                             xd->plane[plane].subsampling_x,
-                             xd->plane[plane].subsampling_y)) {
-      continue;
-    }
+    if (plane && !xd->is_chroma_ref) break;
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE bsizec =
-        scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
-    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+    const int ss_x = pd->subsampling_x;
+    const int ss_y = pd->subsampling_y;
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    const int mi_width = mi_size_wide[plane_bsize];
+    const int mi_height = mi_size_high[plane_bsize];
     const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
     const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
-    int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
-    int bh = block_size_high[txb_size] >> tx_size_high_log2[0];
-    int idx, idy;
+    const int bw = mi_size_wide[txb_size];
+    const int bh = mi_size_high[txb_size];
     int block = 0;
-    int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+    const int step =
+        tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
 
     const BLOCK_SIZE max_unit_bsize =
-        get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
-    int mu_blocks_wide =
-        block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-    int mu_blocks_high =
-        block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+        get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+    int mu_blocks_high = mi_size_high[max_unit_bsize];
 
     mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
     mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
 
-    for (idy = 0; idy < mi_height; idy += mu_blocks_high) {
-      for (idx = 0; idx < mi_width; idx += mu_blocks_wide) {
-        int blk_row, blk_col;
+    for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+      for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
         const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
         const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
-        for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
-          for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
-            tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, blk_row,
-                           blk_col, block, plane, &arg);
+        for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+          for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+            tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col,
+                           block, plane, &arg);
             block += step;
           }
         }
diff --git a/libaom/av1/encoder/tokenize.h b/libaom/av1/encoder/tokenize.h
index c80af7b..52caacb 100644
--- a/libaom/av1/encoder/tokenize.h
+++ b/libaom/av1/encoder/tokenize.h
@@ -22,7 +22,6 @@
 
 typedef struct {
   aom_cdf_prob *color_map_cdf;
-  // TODO(yaowu: use packed enum type if appropriate)
   uint8_t token;
 } TOKENEXTRA;
 
@@ -30,26 +29,25 @@
 struct ThreadData;
 struct FRAME_COUNTS;
 
-struct tokenize_b_args {
-  const struct AV1_COMP *cpi;
-  struct ThreadData *td;
-  TOKENEXTRA **tp;
-  int this_rate;
-  uint8_t allow_update_cdf;
-};
-
 enum {
   OUTPUT_ENABLED = 0,
   DRY_RUN_NORMAL,
   DRY_RUN_COSTCOEFFS,
 } UENUM1BYTE(RUN_TYPE);
 
+struct tokenize_b_args {
+  const struct AV1_COMP *cpi;
+  struct ThreadData *td;
+  int this_rate;
+  uint8_t allow_update_cdf;
+  RUN_TYPE dry_run;
+};
+
 // Note in all the tokenize functions rate if non NULL is incremented
 // with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS,
 // otherwise rate is not incremented.
 void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
-                           TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
-                           int mi_col, BLOCK_SIZE bsize, int *rate,
+                           RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
                            uint8_t allow_update_cdf);
 
 int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
diff --git a/libaom/av1/encoder/tpl_model.c b/libaom/av1/encoder/tpl_model.c
index 79afb6d..79b94f3 100644
--- a/libaom/av1/encoder/tpl_model.c
+++ b/libaom/av1/encoder/tpl_model.c
@@ -10,37 +10,41 @@
  */
 
 #include <stdint.h>
+#include <float.h>
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_codec.h"
+#include "aom_ports/system_state.h"
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
 #include "av1/common/reconintra.h"
 
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
 
-typedef struct GF_PICTURE {
-  YV12_BUFFER_CONFIG *frame;
-  int ref_frame[7];
-} GF_PICTURE;
-
-static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
-                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                               TX_SIZE tx_size, int64_t *recon_error,
-                               int64_t *sse) {
+static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
+                                          const tran_low_t *coeff,
+                                          tran_low_t *qcoeff,
+                                          tran_low_t *dqcoeff, TX_SIZE tx_size,
+                                          uint16_t *eob, int64_t *recon_error,
+                                          int64_t *sse) {
   const struct macroblock_plane *const p = &x->plane[plane];
   const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
-  uint16_t eob;
   int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
   const int shift = tx_size == TX_32X32 ? 0 : 2;
 
-  av1_quantize_fp_32x32(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX,
-                        p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff,
-                        p->dequant_QTX, &eob, scan_order->scan,
-                        scan_order->iscan);
+  av1_quantize_fp(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+                  p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
+                  scan_order->scan, scan_order->iscan);
 
   *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
   *recon_error = AOMMAX(*recon_error, 1);
@@ -49,196 +53,413 @@
   *sse = AOMMAX(*sse, 1);
 }
 
-static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
-                         TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_8X8: aom_hadamard_8x8(src_diff, bw, coeff); break;
-    case TX_16X16: aom_hadamard_16x16(src_diff, bw, coeff); break;
-    case TX_32X32: aom_hadamard_32x32(src_diff, bw, coeff); break;
-    default: assert(0);
-  }
+static AOM_INLINE void tpl_fwd_txfm(const int16_t *src_diff, int bw,
+                                    tran_low_t *coeff, TX_SIZE tx_size,
+                                    int bit_depth, int is_hbd) {
+  TxfmParam txfm_param;
+  txfm_param.tx_type = DCT_DCT;
+  txfm_param.tx_size = tx_size;
+  txfm_param.lossless = 0;
+  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+  txfm_param.bd = bit_depth;
+  txfm_param.is_hbd = is_hbd;
+  av1_fwd_txfm(src_diff, coeff, bw, &txfm_param);
 }
 
-static uint32_t motion_compensated_prediction(AV1_COMP *cpi, ThreadData *td,
-                                              uint8_t *cur_frame_buf,
-                                              uint8_t *ref_frame_buf,
-                                              int stride, BLOCK_SIZE bsize,
-                                              int mi_row, int mi_col) {
+static AOM_INLINE int64_t tpl_get_satd_cost(const MACROBLOCK *x,
+                                            int16_t *src_diff, int diff_stride,
+                                            const uint8_t *src, int src_stride,
+                                            const uint8_t *dst, int dst_stride,
+                                            tran_low_t *coeff, int bw, int bh,
+                                            TX_SIZE tx_size) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int pix_num = bw * bh;
+
+  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
+                     dst_stride);
+  tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd));
+  return aom_satd(coeff, pix_num);
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+
+  assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+
+  int rate_cost = 1;
+
+  for (int idx = 0; idx < eob; ++idx) {
+    int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+    rate_cost += (int)(log(abs_level + 1.0) / log(2.0)) + 1;
+  }
+
+  return (rate_cost << AV1_PROB_COST_SHIFT);
+}
+
+static AOM_INLINE void txfm_quant_rdcost(
+    const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src,
+    int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
+    int *rate_cost, int64_t *recon_error, int64_t *sse) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  uint16_t eob;
+  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
+                     dst_stride);
+  tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd,
+               is_cur_buf_hbd(xd));
+
+  get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error,
+                     sse);
+
+  *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+
+  av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst, dst_stride,
+                              eob, 0);
+}
+
+static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+                                  uint8_t *cur_frame_buf,
+                                  uint8_t *ref_frame_buf, int stride,
+                                  int stride_ref, BLOCK_SIZE bsize,
+                                  MV center_mv, int_mv *best_mv) {
   AV1_COMMON *cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const SEARCH_METHODS search_method = NSTEP;
+  TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
   int step_param;
-  int sadpb = x->sadperbit16;
   uint32_t bestsme = UINT_MAX;
   int distortion;
   uint32_t sse;
   int cost_list[5];
-  const MvLimits tmp_mv_limits = x->mv_limits;
-
-  MV best_ref_mv1 = { 0, 0 };
-  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
-  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
-  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+  FULLPEL_MV start_mv = get_fullmv_from_mv(&center_mv);
 
   // Setup frame pointers
   x->plane[0].src.buf = cur_frame_buf;
   x->plane[0].src.stride = stride;
   xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride;
+  xd->plane[0].pre[0].stride = stride_ref;
 
-  step_param = mv_sf->reduce_first_step_size;
+  step_param = tpl_sf->reduce_first_step_size;
   step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
 
-  av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+  search_site_config *ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+  if (ss_cfg->stride != stride_ref)
+    ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
 
-  av1_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
-                        search_method, 0, sadpb, cond_cost_list(cpi, cost_list),
-                        &best_ref_mv1, INT_MAX, 0, (MI_SIZE * mi_col),
-                        (MI_SIZE * mi_row), 0, &cpi->ss_cfg[SS_CFG_SRC]);
+  assert(ss_cfg->stride == stride_ref);
 
-  /* restore UMV window */
-  x->mv_limits = tmp_mv_limits;
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
+                                     ss_cfg);
 
-  const int pw = block_size_wide[bsize];
-  const int ph = block_size_high[bsize];
-  bestsme = cpi->find_fractional_mv_step(
-      x, cm, mi_row, mi_col, &best_ref_mv1, cpi->common.allow_high_precision_mv,
-      x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_iters_per_step,
-      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
-      0, 0, pw, ph, 1, 1);
+  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), &best_mv->as_fullmv,
+                        NULL);
+
+  SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+  av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &center_mv,
+                                    cost_list);
+  ms_params.forced_stop = tpl_sf->subpel_force_stop;
+  ms_params.var_params.subpel_search_type = USE_2_TAPS;
+  ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+  MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+  bestsme = cpi->mv_search_params.find_fractional_mv_step(
+      xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &distortion, &sse,
+      NULL);
 
   return bestsme;
 }
 
-static void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
-                            struct scale_factors *sf, GF_PICTURE *gf_picture,
-                            int frame_idx, int16_t *src_diff, tran_low_t *coeff,
-                            tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
-                            int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
-                            YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
-                            TplDepStats *tpl_stats) {
+static int is_alike_mv(int_mv candidate_mv, int_mv *center_mvs,
+                       int center_mvs_count, int skip_alike_starting_mv) {
+  // MV difference threshold is in 1/8 precision.
+  const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) };
+  int thr = mv_diff_thr[skip_alike_starting_mv];
+  int i;
+
+  for (i = 0; i < center_mvs_count; i++) {
+    if (abs(center_mvs[i].as_mv.col - candidate_mv.as_mv.col) < thr &&
+        abs(center_mvs[i].as_mv.row - candidate_mv.as_mv.row) < thr)
+      return 1;
+  }
+
+  return 0;
+}
+
+static AOM_INLINE void mode_estimation(
+    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf,
+    int frame_idx, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+    const YV12_BUFFER_CONFIG *ref_frame[],
+    const YV12_BUFFER_CONFIG *src_ref_frame[], TplDepStats *tpl_stats) {
   AV1_COMMON *cm = &cpi->common;
-  ThreadData *td = &cpi->td;
+  const GF_GROUP *gf_group = &cpi->gf_group;
+
+  (void)gf_group;
+
+  TplParams *tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
 
   const int bw = 4 << mi_size_wide_log2[bsize];
   const int bh = 4 << mi_size_high_log2[bsize];
-  const int pix_num = bw * bh;
-  int best_rf_idx = -1;
-  int_mv best_mv;
-  int64_t best_inter_cost = INT64_MAX;
-  int64_t inter_cost;
-  int rf_idx;
-  const InterpFilters kernel =
-      av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR);
+  const int_interpfilters kernel =
+      av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
 
   int64_t best_intra_cost = INT64_MAX;
   int64_t intra_cost;
-  PREDICTION_MODE mode;
+  PREDICTION_MODE best_mode = DC_PRED;
+
   int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-  MB_MODE_INFO mi_above, mi_left;
+  uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
+  const int src_stride = xd->cur_buf->y_stride;
+
+  const int dst_mb_offset =
+      mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
+  uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
+  const int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
+
+  // Temporary buffers
+  DECLARE_ALIGNED(32, uint8_t, predictor8[MC_FLOW_NUM_PELS * 2]);
+  DECLARE_ALIGNED(32, int16_t, src_diff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, qcoeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, best_coeff[MC_FLOW_NUM_PELS]);
+  uint8_t *predictor =
+      is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+  int64_t recon_error = 1, sse = 1;
 
   memset(tpl_stats, 0, sizeof(*tpl_stats));
 
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
-  xd->above_mbmi = (mi_row > 0) ? &mi_above : NULL;
-  xd->left_mbmi = (mi_col > 0) ? &mi_left : NULL;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
+  set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+  set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+               av1_num_planes(cm));
+  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
 
   // Intra prediction search
-  for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) {
-    uint8_t *src, *dst;
-    int src_stride, dst_stride;
+  xd->mi[0]->ref_frame[0] = INTRA_FRAME;
 
-    src = xd->cur_buf->y_buffer + mb_y_offset;
-    src_stride = xd->cur_buf->y_stride;
-
-    dst = &predictor[0];
-    dst_stride = bw;
-
-    xd->mi[0]->sb_type = bsize;
-    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-
-    av1_predict_intra_block(
-        cm, xd, block_size_wide[bsize], block_size_high[bsize], tx_size, mode,
-        0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, 0, 0, 0);
-
+  // Pre-load the bottom left line.
+  if (xd->left_available &&
+      mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
+#if CONFIG_AV1_HIGHBITDEPTH
     if (is_cur_buf_hbd(xd)) {
-      aom_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                                dst_stride, xd->bd);
+      uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
+      for (int i = 0; i < bw; ++i)
+        dst[(bw + i) * dst_buffer_stride - 1] =
+            dst[(bw - 1) * dst_buffer_stride - 1];
     } else {
-      aom_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                         dst_stride);
+      for (int i = 0; i < bw; ++i)
+        dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+            dst_buffer[(bw - 1) * dst_buffer_stride - 1];
     }
+#else
+    for (int i = 0; i < bw; ++i)
+      dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+          dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+#endif
+  }
 
-    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+  // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
+  // H_PRED, and V_PRED
+  const PREDICTION_MODE last_intra_mode =
+      cpi->sf.tpl_sf.prune_intra_modes ? D45_PRED : INTRA_MODE_END;
+  for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode;
+       ++mode) {
+    av1_predict_intra_block(cm, xd, block_size_wide[bsize],
+                            block_size_high[bsize], tx_size, mode, 0, 0,
+                            FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
+                            predictor, bw, 0, 0, 0);
 
-    intra_cost = aom_satd(coeff, pix_num);
+    intra_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride,
+                                   predictor, bw, coeff, bw, bh, tx_size);
 
-    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+    if (intra_cost < best_intra_cost) {
+      best_intra_cost = intra_cost;
+      best_mode = mode;
+    }
   }
 
   // Motion compensated prediction
-  best_mv.as_int = 0;
+  xd->mi[0]->ref_frame[0] = INTRA_FRAME;
 
-  (void)mb_y_offset;
-  // Motion estimation column boundary
-  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
-  x->mv_limits.col_max =
-      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND);
+  int best_rf_idx = -1;
+  int_mv best_mv;
+  int64_t inter_cost;
+  int64_t best_inter_cost = INT64_MAX;
+  int rf_idx;
 
-  for (rf_idx = 0; rf_idx < 7; ++rf_idx) {
-    if (ref_frame[rf_idx] == NULL) continue;
+  best_mv.as_int = INVALID_MV;
 
-    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
-                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                                  xd->cur_buf->y_stride, bsize, mi_row, mi_col);
-
-    // TODO(jingning): Not yet support high bit-depth in the next three
-    // steps.
-    ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
-    WarpTypesAllowed warp_types;
-    memset(&warp_types, 0, sizeof(WarpTypesAllowed));
-
-    av1_build_inter_predictor(
-        ref_frame[rf_idx]->y_buffer + mb_y_offset, ref_frame[rf_idx]->y_stride,
-        &predictor[0], bw, &x->best_mv.as_mv, sf, bw, bh, &conv_params, kernel,
-        &warp_types, mi_col * MI_SIZE, mi_row * MI_SIZE, 0, 0, MV_PRECISION_Q3,
-        mi_col * MI_SIZE, mi_row * MI_SIZE, xd, 0);
-    if (is_cur_buf_hbd(xd)) {
-      aom_highbd_subtract_block(
-          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
-          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
-    } else {
-      aom_subtract_block(bh, bw, src_diff, bw,
-                         xd->cur_buf->y_buffer + mb_y_offset,
-                         xd->cur_buf->y_stride, &predictor[0], bw);
+  for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) {
+    if (ref_frame[rf_idx] == NULL || src_ref_frame[rf_idx] == NULL) {
+      tpl_stats->mv[rf_idx].as_int = INVALID_MV;
+      continue;
     }
-    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
 
-    inter_cost = aom_satd(coeff, pix_num);
+    const YV12_BUFFER_CONFIG *ref_frame_ptr = src_ref_frame[rf_idx];
+    int ref_mb_offset =
+        mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+    uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+    int ref_stride = ref_frame_ptr->y_stride;
+
+    int_mv best_rfidx_mv = { 0 };
+    uint32_t bestsme = UINT32_MAX;
+
+    int_mv center_mvs[4] = { { 0 } };
+    int refmv_count = 1;
+
+    if (xd->up_available) {
+      TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+          mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)];
+      if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+                       cpi->sf.tpl_sf.skip_alike_starting_mv)) {
+        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        ++refmv_count;
+      }
+    }
+
+    if (xd->left_available) {
+      TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+          mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)];
+      if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+                       cpi->sf.tpl_sf.skip_alike_starting_mv)) {
+        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        ++refmv_count;
+      }
+    }
+
+    if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) {
+      TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+          mi_row - mi_height, mi_col + mi_width, tpl_frame->stride,
+          block_mis_log2)];
+      if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+                       cpi->sf.tpl_sf.skip_alike_starting_mv)) {
+        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        ++refmv_count;
+      }
+    }
+
+    for (int idx = 0; idx < refmv_count; ++idx) {
+      int_mv this_mv;
+      uint32_t thissme =
+          motion_estimation(cpi, x, src_mb_buffer, ref_mb, src_stride,
+                            ref_stride, bsize, center_mvs[idx].as_mv, &this_mv);
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        best_rfidx_mv = this_mv;
+      }
+    }
+
+    tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int;
+
+    struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
+                              ref_frame_ptr->y_width, ref_frame_ptr->y_height,
+                              ref_frame_ptr->y_stride };
+    InterPredParams inter_pred_params;
+    av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+                          sf, &ref_buf, kernel);
+    inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+    av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv,
+                                      &inter_pred_params);
+
+    inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride,
+                                   predictor, bw, coeff, bw, bh, tx_size);
+    // Store inter cost for each ref frame
+    tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
+
     if (inter_cost < best_inter_cost) {
-      int64_t recon_error, sse;
-
+      memcpy(best_coeff, coeff, sizeof(best_coeff));
       best_rf_idx = rf_idx;
+
       best_inter_cost = inter_cost;
-      best_mv.as_int = x->best_mv.as_int;
-      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &recon_error,
-                         &sse);
+      best_mv.as_int = best_rfidx_mv.as_int;
+      if (best_inter_cost < best_intra_cost) {
+        best_mode = NEWMV;
+        xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME;
+        xd->mi[0]->mv[0].as_int = best_mv.as_int;
+      }
     }
   }
+
+  if (best_inter_cost < INT64_MAX) {
+    uint16_t eob;
+    get_quantize_error(x, 0, best_coeff, qcoeff, dqcoeff, tx_size, &eob,
+                       &recon_error, &sse);
+
+    const int rate_cost = rate_estimator(qcoeff, eob, tx_size);
+    tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+  }
+
   best_intra_cost = AOMMAX(best_intra_cost, 1);
-  best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
+  if (frame_idx == 0) {
+    best_inter_cost = 0;
+  } else {
+    best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
+  }
   tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
   tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
-  tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow;
 
-  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
-  tpl_stats->mv.as_int = best_mv.as_int;
+  tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
+
+  // Final encode
+  if (is_inter_mode(best_mode)) {
+    const YV12_BUFFER_CONFIG *ref_frame_ptr = ref_frame[best_rf_idx];
+
+    InterPredParams inter_pred_params;
+    struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
+                              ref_frame_ptr->y_width, ref_frame_ptr->y_height,
+                              ref_frame_ptr->y_stride };
+    av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+                          sf, &ref_buf, kernel);
+    inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+    av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride,
+                                      &best_mv.as_mv, &inter_pred_params);
+  } else {
+    av1_predict_intra_block(cm, xd, block_size_wide[bsize],
+                            block_size_high[bsize], tx_size, best_mode, 0, 0,
+                            FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
+                            dst_buffer, dst_buffer_stride, 0, 0, 0);
+  }
+
+  int rate_cost;
+  txfm_quant_rdcost(x, src_diff, bw, src_mb_buffer, src_stride, dst_buffer,
+                    dst_buffer_stride, coeff, qcoeff, dqcoeff, bw, bh, tx_size,
+                    &rate_cost, &recon_error, &sse);
+
+  tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
+  tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+  if (!is_inter_mode(best_mode)) {
+    tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
+    tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+  }
+  tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist);
+  tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate);
+
+  if (best_rf_idx >= 0) {
+    tpl_stats->mv[best_rf_idx].as_int = best_mv.as_int;
+    tpl_stats->ref_frame_index = best_rf_idx;
+  }
+
+  for (int idy = 0; idy < mi_height; ++idy) {
+    for (int idx = 0; idx < mi_width; ++idx) {
+      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx &&
+          (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) {
+        xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0];
+      }
+    }
+  }
 }
 
 static int round_floor(int ref_pos, int bsize_pix) {
@@ -280,16 +501,61 @@
   return width * height;
 }
 
-static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                               int mi_row, int mi_col, const BLOCK_SIZE bsize) {
-  TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
-  TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
-  MV mv = tpl_stats->mv.as_mv;
-  int mv_row = mv.row >> 3;
-  int mv_col = mv.col >> 3;
+int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) {
+  return (mi_row >> right_shift) * stride + (mi_col >> right_shift);
+}
 
-  int ref_pos_row = mi_row * MI_SIZE + mv_row;
-  int ref_pos_col = mi_col * MI_SIZE + mv_col;
+static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+                               int64_t srcrf_dist, int pix_num) {
+  double beta = (double)srcrf_dist / recrf_dist;
+  int64_t rate_cost = delta_rate;
+
+  if (srcrf_dist <= 128) return rate_cost;
+
+  double dr =
+      (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) /
+      pix_num;
+
+  double log_den = log(beta) / log(2.0) + 2.0 * dr;
+
+  if (log_den > log(10.0) / log(2.0)) {
+    rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0);
+    rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT);
+    return rate_cost;
+  }
+
+  double num = pow(2.0, log_den);
+  double den = num * beta + (1 - beta) * beta;
+
+  rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0);
+
+  rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT);
+
+  return rate_cost;
+}
+
+static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
+                                          int mi_col, const BLOCK_SIZE bsize,
+                                          int frame_idx) {
+  TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx];
+  TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr;
+  TplDepFrame *tpl_frame = tpl_data->tpl_frame;
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos(
+      mi_row, mi_col, tpl_frame->stride, block_mis_log2)];
+
+  if (tpl_stats_ptr->ref_frame_index < 0) return;
+  const int ref_frame_index = tpl_stats_ptr->ref_frame_index;
+  TplDepFrame *ref_tpl_frame =
+      &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]];
+  TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr;
+
+  if (tpl_frame[frame_idx].ref_map_index[ref_frame_index] < 0) return;
+
+  const FULLPEL_MV full_mv =
+      get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv);
+  const int ref_pos_row = mi_row * MI_SIZE + full_mv.row;
+  const int ref_pos_col = mi_col * MI_SIZE + full_mv.col;
 
   const int bw = 4 << mi_size_wide_log2[bsize];
   const int bh = 4 << mi_size_high_log2[bsize];
@@ -302,6 +568,16 @@
   int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
   int block;
 
+  int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist;
+  int64_t mc_dep_dist = (int64_t)(
+      tpl_stats_ptr->mc_dep_dist *
+      ((double)(tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist) /
+       tpl_stats_ptr->recrf_dist));
+  int64_t delta_rate = tpl_stats_ptr->recrf_rate - tpl_stats_ptr->srcrf_rate;
+  int64_t mc_dep_rate =
+      delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
+                      tpl_stats_ptr->srcrf_dist, pix_num);
+
   for (block = 0; block < 4; ++block) {
     int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
     int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
@@ -312,20 +588,18 @@
           grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
       int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
       int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+      const int step = 1 << block_mis_log2;
 
-      int64_t mc_flow = tpl_stats->mc_dep_cost -
-                        (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
-                            tpl_stats->intra_cost;
+      for (int idy = 0; idy < mi_height; idy += step) {
+        for (int idx = 0; idx < mi_width; idx += step) {
+          TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos(
+              ref_mi_row + idy, ref_mi_col + idx, ref_tpl_frame->stride,
+              block_mis_log2)];
+          des_stats->mc_dep_dist +=
+              ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num;
+          des_stats->mc_dep_rate +=
+              ((delta_rate + mc_dep_rate) * overlap_area) / pix_num;
 
-      int idx, idy;
-
-      for (idy = 0; idy < mi_height; ++idy) {
-        for (idx = 0; idx < mi_width; ++idx) {
-          TplDepStats *des_stats =
-              &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
-                         (ref_mi_col + idx)];
-
-          des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
           assert(overlap_area >= 0);
         }
       }
@@ -333,75 +607,89 @@
   }
 }
 
-static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                             int mi_row, int mi_col, const BLOCK_SIZE bsize) {
-  int idx, idy;
+static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row,
+                                        int mi_col, const BLOCK_SIZE bsize,
+                                        int frame_idx) {
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  const BLOCK_SIZE tpl_block_size =
+      convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2);
 
-  for (idy = 0; idy < mi_height; ++idy) {
-    for (idx = 0; idx < mi_width; ++idx) {
-      TplDepStats *tpl_ptr =
-          &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
-      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
-                         BLOCK_4X4);
+  for (int idy = 0; idy < mi_height; idy += step) {
+    for (int idx = 0; idx < mi_width; idx += step) {
+      tpl_model_update_b(tpl_data, mi_row + idy, mi_col + idx, tpl_block_size,
+                         frame_idx);
     }
   }
 }
 
-static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize, int stride,
-                            const TplDepStats *src_stats) {
+static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row,
+                                       int mi_col, BLOCK_SIZE bsize, int stride,
+                                       const TplDepStats *src_stats,
+                                       uint8_t block_mis_log2) {
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
-  int idx, idy;
+  const int step = 1 << block_mis_log2;
 
   int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
   int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
-
-  TplDepStats *tpl_ptr;
+  int64_t srcrf_dist = src_stats->srcrf_dist / (mi_height * mi_width);
+  int64_t recrf_dist = src_stats->recrf_dist / (mi_height * mi_width);
+  int64_t srcrf_rate = src_stats->srcrf_rate / (mi_height * mi_width);
+  int64_t recrf_rate = src_stats->recrf_rate / (mi_height * mi_width);
 
   intra_cost = AOMMAX(1, intra_cost);
   inter_cost = AOMMAX(1, inter_cost);
+  srcrf_dist = AOMMAX(1, srcrf_dist);
+  recrf_dist = AOMMAX(1, recrf_dist);
+  srcrf_rate = AOMMAX(1, srcrf_rate);
+  recrf_rate = AOMMAX(1, recrf_rate);
 
-  for (idy = 0; idy < mi_height; ++idy) {
-    tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col];
-    for (idx = 0; idx < mi_width; ++idx) {
+  for (int idy = 0; idy < mi_height; idy += step) {
+    TplDepStats *tpl_ptr = &tpl_stats_ptr[av1_tpl_ptr_pos(
+        mi_row + idy, mi_col, stride, block_mis_log2)];
+    for (int idx = 0; idx < mi_width; idx += step) {
       tpl_ptr->intra_cost = intra_cost;
       tpl_ptr->inter_cost = inter_cost;
-      tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+      tpl_ptr->srcrf_dist = srcrf_dist;
+      tpl_ptr->recrf_dist = recrf_dist;
+      tpl_ptr->srcrf_rate = srcrf_rate;
+      tpl_ptr->recrf_rate = recrf_rate;
+      memcpy(tpl_ptr->mv, src_stats->mv, sizeof(tpl_ptr->mv));
+      memcpy(tpl_ptr->pred_error, src_stats->pred_error,
+             sizeof(tpl_ptr->pred_error));
       tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
-      tpl_ptr->mv.as_int = src_stats->mv.as_int;
       ++tpl_ptr;
     }
   }
 }
 
-static void mc_flow_dispenser(AV1_COMP *cpi, GF_PICTURE *gf_picture,
-                              int frame_idx) {
-  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-  YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
-  YV12_BUFFER_CONFIG *ref_frame[7] = {
-    NULL, NULL, NULL, NULL, NULL, NULL, NULL
-  };
+static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
+                                         int pframe_qindex) {
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  if (frame_idx == gf_group->size) return;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture;
+  const YV12_BUFFER_CONFIG *ref_frame[7] = { NULL, NULL, NULL, NULL,
+                                             NULL, NULL, NULL };
+  const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
+  int ref_frame_flags;
+  const YV12_BUFFER_CONFIG *src_frame[7] = { NULL, NULL, NULL, NULL,
+                                             NULL, NULL, NULL };
 
   AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   struct scale_factors sf;
   int rdmult, idx;
   ThreadData *td = &cpi->td;
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
   int mi_row, mi_col;
+  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  av1_tile_init(&xd->tile, cm, 0, 0);
 
-  DECLARE_ALIGNED(32, uint16_t, predictor16[32 * 32 * 3]);
-  DECLARE_ALIGNED(32, uint8_t, predictor8[32 * 32 * 3]);
-  uint8_t *predictor;
-  DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
-  DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
-  DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
-  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
-
-  const BLOCK_SIZE bsize = BLOCK_32X32;
   const TX_SIZE tx_size = max_txsize_lookup[bsize];
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
@@ -411,182 +699,491 @@
       &sf, this_frame->y_crop_width, this_frame->y_crop_height,
       this_frame->y_crop_width, this_frame->y_crop_height);
 
-  if (is_cur_buf_hbd(xd))
-    predictor = CONVERT_TO_BYTEPTR(predictor16);
-  else
-    predictor = predictor8;
-
-  // Prepare reference frame pointers. If any reference frame slot is
-  // unavailable, the pointer will be set to Null.
-  for (idx = 0; idx < 7; ++idx) {
-    int rf_idx = gf_picture[frame_idx].ref_frame[idx];
-    if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
-  }
-
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
   xd->cur_buf = this_frame;
 
+  for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+    ref_frame[idx] =
+        tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].rec_picture;
+    src_frame[idx] =
+        tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].gf_picture;
+  }
+
+  // Store the reference frames based on priority order
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    ref_frames_ordered[i] = ref_frame[ref_frame_priority_order[i] - 1];
+  }
+
+  // Work out which reference frame slots may be used.
+  ref_frame_flags = get_ref_frame_flags(&cpi->sf, ref_frames_ordered,
+                                        cpi->ext_flags.ref_frame_flags);
+
+  enforce_max_ref_frames(cpi, &ref_frame_flags);
+
+  // Prune reference frames
+  for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+    if ((ref_frame_flags & (1 << idx)) == 0) {
+      ref_frame[idx] = NULL;
+    }
+  }
+
+  // Make a temporary mbmi for tpl model
+  MB_MODE_INFO mbmi;
+  memset(&mbmi, 0, sizeof(mbmi));
+  MB_MODE_INFO *mbmi_ptr = &mbmi;
+  xd->mi = &mbmi_ptr;
+
+  xd->block_ref_scale_factors[0] = &sf;
+
+  const int base_qindex = pframe_qindex;
   // Get rd multiplier set up.
-  rdmult = (int)av1_compute_rd_mult(cpi, tpl_frame->base_qindex);
+  rdmult = (int)av1_compute_rd_mult(cpi, base_qindex);
   if (rdmult < 1) rdmult = 1;
   set_error_per_bit(x, rdmult);
-  av1_initialize_me_consts(cpi, x, tpl_frame->base_qindex);
+  av1_initialize_me_consts(cpi, x, base_qindex);
 
   tpl_frame->is_valid = 1;
 
-  cm->base_qindex = tpl_frame->base_qindex;
+  cm->quant_params.base_qindex = base_qindex;
   av1_frame_init_quantizer(cpi);
 
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+  tpl_frame->base_rdmult =
+      av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6;
+
+  for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
     // Motion estimation row boundary
-    x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
-    x->mv_limits.row_max =
-        (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * AOM_INTERP_EXTEND);
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+    av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
+                          cpi->oxcf.border_in_pixels);
+    xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+    xd->mb_to_bottom_edge =
+        GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+    for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += mi_width) {
       TplDepStats tpl_stats;
-      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
-                      qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
-                      ref_frame, predictor, &tpl_stats);
+
+      // Motion estimation column boundary
+      av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
+                            cpi->oxcf.border_in_pixels);
+      xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+      xd->mb_to_right_edge =
+          GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
+      mode_estimation(cpi, x, xd, &sf, frame_idx, mi_row, mi_col, bsize,
+                      tx_size, ref_frame, src_frame, &tpl_stats);
 
       // Motion flow dependency dispenser.
       tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
-                      tpl_frame->stride, &tpl_stats);
-
-      tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
-                       bsize);
+                      tpl_frame->stride, &tpl_stats,
+                      tpl_data->tpl_stats_block_mis_log2);
     }
   }
 }
 
-static void init_gop_frames(AV1_COMP *cpi, GF_PICTURE *gf_picture,
-                            const GF_GROUP *gf_group, int *tpl_group_frames,
-                            const EncodeFrameInput *const frame_input) {
+static void mc_flow_synthesizer(AV1_COMP *cpi, int frame_idx) {
   AV1_COMMON *cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  int frame_idx = 0;
-  int i;
-  int gld_index = -1;
-  int alt_index = -1;
-  int lst_index = -1;
-  int extend_frame_count = 0;
-  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
 
-  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
-  int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1,
-                                                      -1, -1, -1, -1 };
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  if (frame_idx == gf_group->size) return;
 
-  // TODO(jingning): To be used later for gf frame type parsing.
-  (void)gf_group;
+  TplParams *const tpl_data = &cpi->tpl_data;
 
-  for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) {
-    if (frame_bufs[i].ref_count == 0) {
-      alloc_frame_mvs(cm, &frame_bufs[i]);
-      if (aom_realloc_frame_buffer(
-              &frame_bufs[i].buf, cm->width, cm->height,
-              seq_params->subsampling_x, seq_params->subsampling_y,
-              seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
-              cm->byte_alignment, NULL, NULL, NULL))
-        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                           "Failed to allocate frame buffer");
+  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
 
-      recon_frame_index[frame_idx] = i;
-      ++frame_idx;
+  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += mi_height) {
+    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += mi_width) {
+      if (frame_idx) {
+        tpl_model_update(tpl_data, mi_row, mi_col, bsize, frame_idx);
+      }
     }
   }
+}
 
-  for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) {
-    assert(recon_frame_index[i] >= 0);
-    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+static AOM_INLINE void init_gop_frames_for_tpl(
+    AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params,
+    GF_GROUP *gf_group, int gop_eval, int *tpl_group_frames,
+    const EncodeFrameInput *const frame_input, int *pframe_qindex) {
+  AV1_COMMON *cm = &cpi->common;
+  int cur_frame_idx = gf_group->index;
+  *pframe_qindex = 0;
+
+  RefBufferStack ref_buffer_stack = cpi->ref_buffer_stack;
+  EncodeFrameParams frame_params = *init_frame_params;
+  TplParams *const tpl_data = &cpi->tpl_data;
+
+  int ref_picture_map[REF_FRAMES];
+
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    if (frame_params.frame_type == KEY_FRAME || gop_eval) {
+      tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
+      tpl_data->tpl_frame[-1 - 1].rec_picture = NULL;
+      tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
+    } else {
+      tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
+      tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf;
+      tpl_data->tpl_frame[-i - 1].frame_display_index =
+          cm->ref_frame_map[i]->display_order_hint;
+    }
+
+    ref_picture_map[i] = -i - 1;
   }
 
-  *tpl_group_frames = 0;
+  *tpl_group_frames = cur_frame_idx;
 
-  // Initialize Golden reference frame.
-  gf_picture[0].frame = NULL;
-  RefCntBuffer *ref_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
-  if (ref_buf) gf_picture[0].frame = &ref_buf->buf;
-  for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1;
-  gld_index = 0;
-  ++*tpl_group_frames;
+  int gf_index;
+  int use_arf = gf_group->update_type[1] == ARF_UPDATE;
+  int anc_frame_offset = gf_group->cur_frame_idx[cur_frame_idx] + 1;
+  int process_frame_count = 0;
+  const int gop_length =
+      AOMMIN(gf_group->size - 1 + use_arf, MAX_LENGTH_TPL_FRAME_STATS - 1);
+  for (gf_index = cur_frame_idx; gf_index <= gop_length; ++gf_index) {
+    TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+    FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
 
-  // Initialize ARF frame
-  gf_picture[1].frame = frame_input->source;
-  gf_picture[1].ref_frame[0] = gld_index;
-  gf_picture[1].ref_frame[1] = lst_index;
-  gf_picture[1].ref_frame[2] = alt_index;
-  // TODO(yuec) Need o  figure out full AV1 reference model
-  for (i = 3; i < 7; ++i) gf_picture[1].ref_frame[i] = -1;
-  alt_index = 1;
-  ++*tpl_group_frames;
+    frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+                              frame_update_type != INTNL_ARF_UPDATE;
+    frame_params.show_existing_frame =
+        frame_update_type == INTNL_OVERLAY_UPDATE ||
+        frame_update_type == OVERLAY_UPDATE;
+    frame_params.frame_type =
+        frame_update_type == KF_UPDATE ? KEY_FRAME : INTER_FRAME;
 
-  // Initialize P frames
-  for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
-    struct lookahead_entry *buf =
-        av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
+    if (frame_update_type == LF_UPDATE)
+      *pframe_qindex = gf_group->q_val[gf_index];
 
-    if (buf == NULL) break;
+    if (gf_index == cur_frame_idx) {
+      tpl_frame->gf_picture = frame_input->source;
+      // frame display index = frame offset within the gf group + start frame of
+      // the gf group
+      tpl_frame->frame_display_index =
+          gf_group->frame_disp_idx[gf_index] +
+          cpi->common.current_frame.display_order_hint;
+    } else {
+      int frame_display_index = gf_index == gf_group->size
+                                    ? cpi->rc.baseline_gf_interval
+                                    : gf_group->frame_disp_idx[gf_index];
+      struct lookahead_entry *buf = av1_lookahead_peek(
+          cpi->lookahead, frame_display_index - anc_frame_offset,
+          cpi->compressor_stage);
+      if (buf == NULL) break;
+      tpl_frame->gf_picture = &buf->img;
+      // frame display index = frame offset within the gf group + start frame of
+      // the gf group
+      tpl_frame->frame_display_index =
+          frame_display_index + cpi->common.current_frame.display_order_hint;
+    }
 
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = alt_index;
-    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
+    if (frame_update_type != OVERLAY_UPDATE &&
+        frame_update_type != INTNL_OVERLAY_UPDATE) {
+      tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
+      tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
+      ++process_frame_count;
+    }
+
+    av1_get_ref_frames(cpi, &ref_buffer_stack);
+    int refresh_mask = av1_get_refresh_frame_flags(
+        cpi, &frame_params, frame_update_type, &ref_buffer_stack);
+
+    int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
+    av1_update_ref_frame_map(cpi, frame_update_type,
+                             frame_params.show_existing_frame,
+                             refresh_frame_map_index, &ref_buffer_stack);
+
+    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+      tpl_frame->ref_map_index[i - LAST_FRAME] =
+          ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]];
+
+    if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
 
     ++*tpl_group_frames;
-    lst_index = frame_idx;
-
-    if (frame_idx == cpi->rc.baseline_gf_interval + 1) break;
   }
 
-  gld_index = frame_idx;
-  lst_index = AOMMAX(0, frame_idx - 1);
-  alt_index = -1;
-  ++frame_idx;
+  if (cur_frame_idx == 0) return;
 
-  // Extend two frames outside the current gf group.
-  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
-    struct lookahead_entry *buf =
-        av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
+  int extend_frame_count = 0;
+  int extend_frame_length =
+      AOMMIN(cpi->rc.baseline_gf_interval,
+             cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval);
+  int frame_display_index = cpi->rc.baseline_gf_interval + 1;
+
+  for (; gf_index < MAX_LENGTH_TPL_FRAME_STATS &&
+         extend_frame_count < extend_frame_length;
+       ++gf_index) {
+    TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+    FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE;
+    frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+                              frame_update_type != INTNL_ARF_UPDATE;
+    frame_params.show_existing_frame =
+        frame_update_type == INTNL_OVERLAY_UPDATE;
+    frame_params.frame_type = INTER_FRAME;
+
+    struct lookahead_entry *buf = av1_lookahead_peek(
+        cpi->lookahead, frame_display_index - anc_frame_offset,
+        cpi->compressor_stage);
 
     if (buf == NULL) break;
 
-    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+    tpl_frame->gf_picture = &buf->img;
+    tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
+    tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
+    ++process_frame_count;
 
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = alt_index;
-    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
-    lst_index = frame_idx;
+    // frame display index = frame offset within the gf group + start frame of
+    // the gf group
+    tpl_frame->frame_display_index =
+        frame_display_index + cpi->common.current_frame.display_order_hint;
+
+    gf_group->update_type[gf_index] = LF_UPDATE;
+    gf_group->q_val[gf_index] = *pframe_qindex;
+
+    av1_get_ref_frames(cpi, &ref_buffer_stack);
+    int refresh_mask = av1_get_refresh_frame_flags(
+        cpi, &frame_params, frame_update_type, &ref_buffer_stack);
+    int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
+    av1_update_ref_frame_map(cpi, frame_update_type,
+                             frame_params.show_existing_frame,
+                             refresh_frame_map_index, &ref_buffer_stack);
+
+    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+      tpl_frame->ref_map_index[i - LAST_FRAME] =
+          ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]];
+
+    tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1;
+    tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1;
+    tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1;
+    tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1;
+
+    if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
+
     ++*tpl_group_frames;
     ++extend_frame_count;
+    ++frame_display_index;
   }
+
+  av1_get_ref_frames(cpi, &cpi->ref_buffer_stack);
 }
 
-static void init_tpl_stats(AV1_COMP *cpi) {
-  int frame_idx;
-  for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
-    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-    memset(tpl_frame->tpl_stats_ptr, 0,
+static AOM_INLINE void init_tpl_stats(TplParams *const tpl_data) {
+  for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+    TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
+    memset(tpl_data->tpl_stats_pool[frame_idx], 0,
            tpl_frame->height * tpl_frame->width *
                sizeof(*tpl_frame->tpl_stats_ptr));
     tpl_frame->is_valid = 0;
   }
 }
 
-void av1_tpl_setup_stats(AV1_COMP *cpi,
-                         const EncodeFrameInput *const frame_input) {
-  GF_PICTURE gf_picture[MAX_LAG_BUFFERS];
-  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  int tpl_group_frames = 0;
-  int frame_idx;
+int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
+                        const EncodeFrameParams *const frame_params,
+                        const EncodeFrameInput *const frame_input) {
+  AV1_COMMON *cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->gf_group;
+  int bottom_index, top_index;
+  EncodeFrameParams this_frame_params = *frame_params;
+  TplParams *const tpl_data = &cpi->tpl_data;
 
-  init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames, frame_input);
+  if (cpi->superres_mode != SUPERRES_NONE) return 0;
 
-  init_tpl_stats(cpi);
+  cm->current_frame.frame_type = frame_params->frame_type;
+  for (int gf_index = gf_group->index; gf_index < gf_group->size; ++gf_index) {
+    av1_configure_buffer_updates(cpi, &this_frame_params,
+                                 gf_group->update_type[gf_index], 0);
+
+    cpi->refresh_golden_frame = this_frame_params.refresh_golden_frame;
+    cpi->refresh_bwd_ref_frame = this_frame_params.refresh_bwd_ref_frame;
+    cpi->refresh_alt_ref_frame = this_frame_params.refresh_alt_ref_frame;
+
+    cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE &&
+                     gf_group->update_type[gf_index] != INTNL_ARF_UPDATE;
+
+    gf_group->q_val[gf_index] =
+        av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, gf_index,
+                                 &bottom_index, &top_index);
+
+    cm->current_frame.frame_type = INTER_FRAME;
+  }
+
+  int pframe_qindex;
+  int tpl_gf_group_frames;
+  init_gop_frames_for_tpl(cpi, frame_params, gf_group, gop_eval,
+                          &tpl_gf_group_frames, frame_input, &pframe_qindex);
+
+  cpi->rc.base_layer_qp = pframe_qindex;
+
+  init_tpl_stats(tpl_data);
 
   // Backward propagation from tpl_group_frames to 1.
-  for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx)
-    mc_flow_dispenser(cpi, gf_picture, frame_idx);
+  for (int frame_idx = gf_group->index; frame_idx < tpl_gf_group_frames;
+       ++frame_idx) {
+    if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
+        gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
+      continue;
+
+    mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
+
+    aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture,
+                             av1_num_planes(cm));
+  }
+
+  for (int frame_idx = tpl_gf_group_frames - 1; frame_idx >= gf_group->index;
+       --frame_idx) {
+    if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
+        gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
+      continue;
+
+    mc_flow_synthesizer(cpi, frame_idx);
+  }
+
+  av1_configure_buffer_updates(cpi, &this_frame_params,
+                               gf_group->update_type[gf_group->index], 0);
+  cm->current_frame.frame_type = frame_params->frame_type;
+  cm->show_frame = frame_params->show_frame;
+
+  if (cpi->common.tiles.large_scale) return 0;
+  if (gf_group->max_layer_depth_allowed == 0) return 1;
+
+  double beta[2] = { 0.0 };
+  for (int frame_idx = 1; frame_idx <= AOMMIN(tpl_gf_group_frames - 1, 2);
+       ++frame_idx) {
+    TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+    TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+    int tpl_stride = tpl_frame->stride;
+    int64_t intra_cost_base = 0;
+    int64_t mc_dep_cost_base = 0;
+    const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+    const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+    for (int row = 0; row < cm->mi_params.mi_rows; row += step) {
+      for (int col = 0; col < mi_cols_sr; col += step) {
+        TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+            row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+        int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS);
+        mc_dep_cost_base +=
+            (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+      }
+    }
+    beta[frame_idx - 1] = (double)mc_dep_cost_base / intra_cost_base;
+  }
+
+  // Allow larger GOP size if the base layer ARF has higher dependency factor
+  // than the intermediate ARF and both ARFs have reasonably high dependency
+  // factors.
+  return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0;
+}
+
+void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const int tpl_idx = gf_group->index;
+
+  assert(IMPLIES(gf_group->size > 0, tpl_idx < gf_group->size));
+
+  TplParams *const tpl_data = &cpi->tpl_data;
+  const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+
+  if (!tpl_frame->is_valid) return;
+  if (cpi->superres_mode != SUPERRES_NONE) return;
+
+  const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr;
+  const int tpl_stride = tpl_frame->stride;
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+  const int block_size = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const double c = 1.2;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+  aom_clear_system_state();
+
+  // Loop through each 'block_size' X 'block_size' block.
+  for (int row = 0; row < num_rows; row++) {
+    for (int col = 0; col < num_cols; col++) {
+      double intra_cost = 0.0, mc_dep_cost = 0.0;
+      // Loop through each mi block.
+      for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h;
+           mi_row += step) {
+        for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w;
+             mi_col += step) {
+          if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue;
+          const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+              mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+          int64_t mc_dep_delta =
+              RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                     this_stats->mc_dep_dist);
+          intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS);
+          mc_dep_cost +=
+              (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+        }
+      }
+      const double rk = intra_cost / mc_dep_cost;
+      const int index = row * num_cols + col;
+      cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c;
+    }
+  }
+  aom_clear_system_state();
+}
+
+void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
+                             BLOCK_SIZE sb_size, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int tpl_idx = cpi->gf_group.index;
+  TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
+
+  if (tpl_frame->is_valid == 0) return;
+  if (!is_frame_tpl_eligible(cpi)) return;
+  if (tpl_idx >= MAX_LAG_BUFFERS) return;
+  if (cpi->superres_mode != SUPERRES_NONE) return;
+  if (cpi->oxcf.aq_mode != NO_AQ) return;
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[sb_size] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h;
+  int row, col;
+
+  double base_block_count = 0.0;
+  double log_sum = 0.0;
+
+  aom_clear_system_state();
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col / num_mi_h;
+         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      log_sum += log(cpi->tpl_rdmult_scaling_factors[index]);
+      base_block_count += 1.0;
+    }
+  }
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+  const int orig_rdmult = av1_compute_rd_mult(
+      cpi, quant_params->base_qindex + quant_params->y_dc_delta_q);
+  const int new_rdmult =
+      av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex +
+                                   quant_params->y_dc_delta_q);
+  const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
+
+  double scale_adj = log(scaling_factor) - log_sum / base_block_count;
+  scale_adj = exp(scale_adj);
+
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col / num_mi_h;
+         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->tpl_sb_rdmult_scaling_factors[index] =
+          scale_adj * cpi->tpl_rdmult_scaling_factors[index];
+    }
+  }
+  aom_clear_system_state();
 }
diff --git a/libaom/av1/encoder/tpl_model.h b/libaom/av1/encoder/tpl_model.h
index f6b33b0..11a61b6 100644
--- a/libaom/av1/encoder/tpl_model.h
+++ b/libaom/av1/encoder/tpl_model.h
@@ -16,8 +16,29 @@
 extern "C" {
 #endif
 
-void av1_tpl_setup_stats(AV1_COMP *cpi,
-                         const EncodeFrameInput *const frame_input);
+static INLINE BLOCK_SIZE convert_length_to_bsize(int length) {
+  switch (length) {
+    case 64: return BLOCK_64X64;
+    case 32: return BLOCK_32X32;
+    case 16: return BLOCK_16X16;
+    case 8: return BLOCK_8X8;
+    case 4: return BLOCK_4X4;
+    default:
+      assert(0 && "Invalid block size for tpl model");
+      return BLOCK_16X16;
+  }
+}
+
+int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
+                        const EncodeFrameParams *const frame_params,
+                        const EncodeFrameInput *const frame_input);
+
+int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift);
+
+void av1_tpl_rdmult_setup(AV1_COMP *cpi);
+
+void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
+                             BLOCK_SIZE sb_size, int mi_row, int mi_col);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/tune_vmaf.c b/libaom/av1/encoder/tune_vmaf.c
new file mode 100644
index 0000000..997f78e
--- /dev/null
+++ b/libaom/av1/encoder/tune_vmaf.c
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/tune_vmaf.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/vmaf.h"
+#include "aom_ports/system_state.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/rdopt.h"
+
+static const double kBaselineVmaf = 97.42773;
+
+// TODO(sdeng): Add the SIMD implementation.
+// Applies an unsharp mask to a high-bitdepth rectangle:
+//   dst = source + amount * (source - blurred)
+// rounded to nearest and clamped to [0, 2^bit_depth - 1]. Reads and writes
+// the same pixel index each iteration, so dst may alias source.
+static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
+                                           int source_stride,
+                                           const uint16_t *blurred,
+                                           int blurred_stride, uint16_t *dst,
+                                           int dst_stride, int w, int h,
+                                           double amount, int bit_depth) {
+  const int max_value = (1 << bit_depth) - 1;
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      // +0.5 implements round-to-nearest for non-negative values.
+      dst[j] = (uint16_t)clamp((int)(val + 0.5), 0, max_value);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+// 8-bit variant of highbd_unsharp_rect:
+//   dst = source + amount * (source - blurred), rounded and clamped to
+// [0, 255]. dst may alias source (per-pixel read precedes the write).
+static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
+                                    const uint8_t *blurred, int blurred_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    double amount) {
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+// Applies an unsharp mask of strength `amount` to the luma (Y) plane only,
+// dispatching to the high-bitdepth or 8-bit rectangle helper based on the
+// encoder's bit depth. dst may be the same buffer as source.
+static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
+                               const YV12_BUFFER_CONFIG *source,
+                               const YV12_BUFFER_CONFIG *blurred,
+                               const YV12_BUFFER_CONFIG *dst, double amount) {
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  if (bit_depth > 8) {
+    highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+                        CONVERT_TO_SHORTPTR(blurred->y_buffer),
+                        blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
+                        dst->y_stride, source->y_width, source->y_height,
+                        amount, bit_depth);
+  } else {
+    unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
+                 blurred->y_stride, dst->y_buffer, dst->y_stride,
+                 source->y_width, source->y_height, amount);
+  }
+}
+
+// 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128,
+// all co-efficients must be even.
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0,  8, 30, 52,
+                                                               30, 8, 0,  0 };
+// Gaussian-blurs the luma plane of `source` into `dst`, processing the frame
+// in 128x128 tiles with the separable gauss_filter applied in both directions
+// via the standard 2D convolve path.
+// NOTE(review): edge tiles are convolved at the full 128x128 size even when
+// the frame is not a multiple of 128, so this appears to rely on the caller
+// having extended the frame borders (av1_copy_and_extend_frame) — confirm.
+static AOM_INLINE void gaussian_blur(const int bit_depth,
+                                     const YV12_BUFFER_CONFIG *source,
+                                     const YV12_BUFFER_CONFIG *dst) {
+  const int block_size = BLOCK_128X128;
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  const int num_cols = (source->y_width + block_w - 1) / block_w;
+  const int num_rows = (source->y_height + block_h - 1) / block_h;
+  int row, col;
+
+  ConvolveParams conv_params = get_conv_params(0, 0, bit_depth);
+  InterpFilterParams filter = { .filter_ptr = gauss_filter,
+                                .taps = 8,
+                                .subpel_shifts = 0,
+                                .interp_filter = EIGHTTAP_REGULAR };
+
+  for (row = 0; row < num_rows; ++row) {
+    for (col = 0; col < num_cols; ++col) {
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
+
+      uint8_t *src_buf =
+          source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+      uint8_t *dst_buf =
+          dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y;
+
+      // Same filter is used horizontally and vertically (separable Gaussian).
+      if (bit_depth > 8) {
+        av1_highbd_convolve_2d_sr(
+            CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
+            CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
+            &filter, &filter, 0, 0, &conv_params, bit_depth);
+      } else {
+        av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride,
+                           block_w, block_h, &filter, &filter, 0, 0,
+                           &conv_params);
+      }
+    }
+  }
+}
+
+// Returns the average per-64x64-block luma variance of `frame`. Edge blocks
+// that do not fully fit are skipped (the loop bounds use floor division of
+// the frame dimensions by the block size).
+static double frame_average_variance(const AV1_COMP *const cpi,
+                                     const YV12_BUFFER_CONFIG *const frame) {
+  const uint8_t *const y_buffer = frame->y_buffer;
+  const int y_stride = frame->y_stride;
+  const BLOCK_SIZE block_size = BLOCK_64X64;
+
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  int row, col;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  double var = 0.0, var_count = 0.0;
+
+  // Loop through each block.
+  for (row = 0; row < frame->y_height / block_h; ++row) {
+    for (col = 0; col < frame->y_width / block_w; ++col) {
+      struct buf_2d buf;
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
+
+      buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
+      buf.stride = y_stride;
+
+      if (bit_depth > 8) {
+        var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size,
+                                                  bit_depth);
+      } else {
+        var += av1_get_sby_perpixel_variance(cpi, &buf, block_size);
+      }
+      var_count += 1.0;
+    }
+  }
+  var /= var_count;
+  return var;
+}
+
+// Computes an approximate VMAF-based score for a sharpened candidate:
+// the VMAF gain over kBaselineVmaf, weighted by the ratio of the original
+// source variance to the sharpened frame's variance (penalizing candidates
+// that inflate variance). Higher is better.
+static double cal_approx_vmaf(const AV1_COMP *const cpi, double source_variance,
+                              YV12_BUFFER_CONFIG *const source,
+                              YV12_BUFFER_CONFIG *const sharpened) {
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  double new_vmaf;
+  aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, sharpened, bit_depth,
+                &new_vmaf);
+  const double sharpened_var = frame_average_variance(cpi, sharpened);
+  return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf);
+}
+
+// Hill-climbs the unsharp amount from unsharp_amount_start in increments of
+// step_size (which may be negative to search downward), stopping when the
+// approximate VMAF score stops improving, the amount leaves [0, max_amount],
+// or max_loop_count iterations have run. Returns the best amount found,
+// clamped to [0, max_amount]. `sharpened` is caller-provided scratch.
+static double find_best_frame_unsharp_amount_loop(
+    const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source,
+    YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened,
+    double best_vmaf, const double baseline_variance,
+    const double unsharp_amount_start, const double step_size,
+    const int max_loop_count, const double max_amount) {
+  const double min_amount = 0.0;
+  int loop_count = 0;
+  double approx_vmaf = best_vmaf;
+  double unsharp_amount = unsharp_amount_start;
+  do {
+    best_vmaf = approx_vmaf;
+    unsharp_amount += step_size;
+    if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+    unsharp(cpi, source, blurred, sharpened, unsharp_amount);
+    approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened);
+
+    loop_count++;
+  } while (approx_vmaf > best_vmaf && loop_count < max_loop_count);
+  // The loop exits one step past the optimum (or on a boundary break); back
+  // off one step unless the final step itself was an improvement.
+  unsharp_amount =
+      approx_vmaf > best_vmaf ? unsharp_amount : unsharp_amount - step_size;
+  return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+// Finds the unsharp amount that maximizes the approximate VMAF score for
+// `source` against its blurred version. When a previous estimate is given
+// (unsharp_amount_start > step_size), the scores at one step below and at the
+// estimate decide whether to search downward, upward, or keep the lower
+// point; otherwise a fresh upward search starts from zero. Allocates and
+// frees a scratch frame internally.
+static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi,
+                                             YV12_BUFFER_CONFIG *const source,
+                                             YV12_BUFFER_CONFIG *const blurred,
+                                             const double unsharp_amount_start,
+                                             const double step_size,
+                                             const int max_loop_count,
+                                             const double max_filter_amount) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int width = source->y_width;
+  const int height = source->y_height;
+
+  YV12_BUFFER_CONFIG sharpened;
+  memset(&sharpened, 0, sizeof(sharpened));
+  aom_alloc_frame_buffer(
+      &sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+
+  const double baseline_variance = frame_average_variance(cpi, source);
+  double unsharp_amount;
+  if (unsharp_amount_start <= step_size) {
+    // No usable previous estimate: climb upward from zero.
+    unsharp_amount = find_best_frame_unsharp_amount_loop(
+        cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0,
+        step_size, max_loop_count, max_filter_amount);
+  } else {
+    double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start;
+    double v0, v1;
+    unsharp(cpi, source, blurred, &sharpened, a0);
+    v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+    unsharp(cpi, source, blurred, &sharpened, a1);
+    v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+    if (fabs(v0 - v1) < 0.01) {
+      // Flat region: keep the smaller amount.
+      unsharp_amount = a0;
+    } else if (v0 > v1) {
+      // Score decreasing: search downward from a0.
+      unsharp_amount = find_best_frame_unsharp_amount_loop(
+          cpi, source, blurred, &sharpened, v0, baseline_variance, a0,
+          -step_size, max_loop_count, max_filter_amount);
+    } else {
+      // Score increasing: search upward from a1.
+      unsharp_amount = find_best_frame_unsharp_amount_loop(
+          cpi, source, blurred, &sharpened, v1, baseline_variance, a1,
+          step_size, max_loop_count, max_filter_amount);
+    }
+  }
+
+  aom_free_frame_buffer(&sharpened);
+  return unsharp_amount;
+}
+
+// Frame-level VMAF-tuned preprocessing: blurs a border-extended copy of the
+// source, searches for the unsharp amount that maximizes approximate VMAF
+// (seeded from the previous frame's amount), then applies that amount to the
+// source luma plane in place. Also caches the amount for the next frame.
+void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
+                                  YV12_BUFFER_CONFIG *const source) {
+  aom_clear_system_state();
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int width = source->y_width;
+  const int height = source->y_height;
+
+  YV12_BUFFER_CONFIG source_extended, blurred;
+  memset(&source_extended, 0, sizeof(source_extended));
+  memset(&blurred, 0, sizeof(blurred));
+  aom_alloc_frame_buffer(
+      &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+
+  // Blur an extended copy so the 128x128 convolution tiles can read past the
+  // visible frame edges safely.
+  av1_copy_and_extend_frame(source, &source_extended);
+  gaussian_blur(bit_depth, &source_extended, &blurred);
+  aom_free_frame_buffer(&source_extended);
+
+  const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+      cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01);
+  cpi->last_frame_unsharp_amount = best_frame_unsharp_amount;
+
+  // In-place sharpening: unsharp() reads each pixel before writing it.
+  unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+  aom_free_frame_buffer(&blurred);
+  aom_clear_system_state();
+}
+
+// Block-level VMAF-tuned preprocessing: finds a frame-level unsharp amount
+// first (as in av1_vmaf_frame_preprocessing), then refines it per 64x64 block
+// with a short local search, and finally applies the per-block amounts to the
+// source luma plane in place.
+void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
+                                YV12_BUFFER_CONFIG *const source) {
+  aom_clear_system_state();
+  const AV1_COMMON *const cm = &cpi->common;
+  const int width = source->y_width;
+  const int height = source->y_height;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+
+  YV12_BUFFER_CONFIG source_extended, blurred;
+  memset(&blurred, 0, sizeof(blurred));
+  memset(&source_extended, 0, sizeof(source_extended));
+  aom_alloc_frame_buffer(
+      &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+
+  av1_copy_and_extend_frame(source, &source_extended);
+  gaussian_blur(bit_depth, &source_extended, &blurred);
+  aom_free_frame_buffer(&source_extended);
+
+  // Frame-level amount seeds the per-block refinement below.
+  const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+      cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01);
+  cpi->last_frame_unsharp_amount = best_frame_unsharp_amount;
+
+  const int block_size = BLOCK_64X64;
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  const int num_cols = (source->y_width + block_w - 1) / block_w;
+  const int num_rows = (source->y_height + block_h - 1) / block_h;
+  // NOTE(review): aom_malloc result is used without a NULL check — confirm
+  // the allocator's OOM policy before relying on this.
+  double *best_unsharp_amounts =
+      aom_malloc(sizeof(*best_unsharp_amounts) * num_cols * num_rows);
+  memset(best_unsharp_amounts, 0,
+         sizeof(*best_unsharp_amounts) * num_cols * num_rows);
+
+  YV12_BUFFER_CONFIG source_block, blurred_block;
+  memset(&source_block, 0, sizeof(source_block));
+  memset(&blurred_block, 0, sizeof(blurred_block));
+  aom_alloc_frame_buffer(
+      &source_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &blurred_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
+      const int block_width = AOMMIN(width - col_offset_y, block_w);
+      const int block_height = AOMMIN(height - row_offset_y, block_h);
+      const int index = col + row * num_cols;
+
+      if (bit_depth > 8) {
+        uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+                                  row_offset_y * source->y_stride +
+                                  col_offset_y;
+        uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+                                      row_offset_y * blurred.y_stride +
+                                      col_offset_y;
+        uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer);
+        uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer);
+
+        // Copy block from source frame.
+        // Pixels outside the visible frame area are zero-padded.
+        for (int i = 0; i < block_h; ++i) {
+          for (int j = 0; j < block_w; ++j) {
+            if (i >= block_height || j >= block_width) {
+              src_dst[j] = 0;
+              blurred_dst[j] = 0;
+            } else {
+              src_dst[j] = frame_src_buf[j];
+              blurred_dst[j] = frame_blurred_buf[j];
+            }
+          }
+          frame_src_buf += source->y_stride;
+          frame_blurred_buf += blurred.y_stride;
+          src_dst += source_block.y_stride;
+          blurred_dst += blurred_block.y_stride;
+        }
+      } else {
+        uint8_t *frame_src_buf =
+            source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+        uint8_t *frame_blurred_buf =
+            blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+        uint8_t *blurred_dst = blurred_block.y_buffer;
+        uint8_t *src_dst = source_block.y_buffer;
+
+        // Copy block from source frame.
+        // Pixels outside the visible frame area are zero-padded.
+        for (int i = 0; i < block_h; ++i) {
+          for (int j = 0; j < block_w; ++j) {
+            if (i >= block_height || j >= block_width) {
+              src_dst[j] = 0;
+              blurred_dst[j] = 0;
+            } else {
+              src_dst[j] = frame_src_buf[j];
+              blurred_dst[j] = frame_blurred_buf[j];
+            }
+          }
+          frame_src_buf += source->y_stride;
+          frame_blurred_buf += blurred.y_stride;
+          src_dst += source_block.y_stride;
+          blurred_dst += blurred_block.y_stride;
+        }
+      }
+
+      // Short local search around the frame-level amount
+      // (step 0.1, at most 3 steps, capped at 1.5).
+      best_unsharp_amounts[index] = find_best_frame_unsharp_amount(
+          cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3,
+          1.5);
+    }
+  }
+
+  // Apply best blur amounts
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
+      const int block_width = AOMMIN(source->y_width - col_offset_y, block_w);
+      const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
+      const int index = col + row * num_cols;
+
+      if (bit_depth > 8) {
+        uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+                            row_offset_y * source->y_stride + col_offset_y;
+        uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+                                row_offset_y * blurred.y_stride + col_offset_y;
+        highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf,
+                            blurred.y_stride, src_buf, source->y_stride,
+                            block_width, block_height,
+                            best_unsharp_amounts[index], bit_depth);
+      } else {
+        uint8_t *src_buf =
+            source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+        uint8_t *blurred_buf =
+            blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+        unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
+                     src_buf, source->y_stride, block_width, block_height,
+                     best_unsharp_amounts[index]);
+      }
+    }
+  }
+
+  aom_free_frame_buffer(&source_block);
+  aom_free_frame_buffer(&blurred_block);
+  aom_free_frame_buffer(&blurred);
+  aom_free(best_unsharp_amounts);
+  aom_clear_system_state();
+}
+
+// State threaded through the update_frame() VMAF callback: the frames to
+// compare, the block grid geometry, and the (row, col) cursor of the next
+// block to be evaluated.
+typedef struct FrameData {
+  const YV12_BUFFER_CONFIG *source, *blurred;
+  int block_w, block_h, num_rows, num_cols, row, col, bit_depth;
+} FrameData;
+
+// A callback function used to pass data to VMAF.
+// Returns 0 after reading a frame.
+// Returns 2 when there is no more frame to read.
+// For each call, both ref and main are filled with the source luma; then the
+// current block (at the cursor in user_data) of main is overwritten with the
+// blurred luma, so each "frame" measures the VMAF impact of blurring exactly
+// one block. Pixel values above 8 bits are scaled down to 8-bit range.
+static int update_frame(float *ref_data, float *main_data, float *temp_data,
+                        int stride, void *user_data) {
+  FrameData *frames = (FrameData *)user_data;
+  const int width = frames->source->y_width;
+  const int height = frames->source->y_height;
+  const int row = frames->row;
+  const int col = frames->col;
+  const int num_rows = frames->num_rows;
+  const int num_cols = frames->num_cols;
+  const int block_w = frames->block_w;
+  const int block_h = frames->block_h;
+  const YV12_BUFFER_CONFIG *source = frames->source;
+  const YV12_BUFFER_CONFIG *blurred = frames->blurred;
+  const int bit_depth = frames->bit_depth;
+  const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
+  (void)temp_data;
+  // Incoming stride is in bytes; convert to a float-element stride.
+  stride /= (int)sizeof(*ref_data);
+
+  for (int i = 0; i < height; ++i) {
+    float *ref, *main;
+    ref = ref_data + i * stride;
+    main = main_data + i * stride;
+    if (bit_depth == 8) {
+      uint8_t *src;
+      src = source->y_buffer + i * source->y_stride;
+      for (int j = 0; j < width; ++j) {
+        ref[j] = main[j] = (float)src[j];
+      }
+    } else {
+      uint16_t *src;
+      src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride;
+      for (int j = 0; j < width; ++j) {
+        ref[j] = main[j] = scale_factor * (float)src[j];
+      }
+    }
+  }
+  if (row < num_rows && col < num_cols) {
+    // Set current block
+    const int row_offset = row * block_h;
+    const int col_offset = col * block_w;
+    const int block_width = AOMMIN(width - col_offset, block_w);
+    const int block_height = AOMMIN(height - row_offset, block_h);
+
+    float *main_buf = main_data + col_offset + row_offset * stride;
+    if (bit_depth == 8) {
+      uint8_t *blurred_buf =
+          blurred->y_buffer + row_offset * blurred->y_stride + col_offset;
+      for (int i = 0; i < block_height; ++i) {
+        for (int j = 0; j < block_width; ++j) {
+          main_buf[j] = (float)blurred_buf[j];
+        }
+        main_buf += stride;
+        blurred_buf += blurred->y_stride;
+      }
+    } else {
+      uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) +
+                              row_offset * blurred->y_stride + col_offset;
+      for (int i = 0; i < block_height; ++i) {
+        for (int j = 0; j < block_width; ++j) {
+          main_buf[j] = scale_factor * (float)blurred_buf[j];
+        }
+        main_buf += stride;
+        blurred_buf += blurred->y_stride;
+      }
+    }
+
+    // Advance the cursor in row-major order.
+    frames->col++;
+    if (frames->col >= num_cols) {
+      frames->col = 0;
+      frames->row++;
+    }
+    return 0;
+  } else {
+    return 2;
+  }
+}
+
+// Computes per-block rdmult scaling factors for VMAF tuning. The source is
+// downscaled 2x, blurred, and a per-32x32-block (64x64 in full resolution)
+// VMAF drop is measured via the update_frame callback; each block's weight is
+// derived from its MSE-to-dVMAF ratio through a data-fitted model and stored
+// in cpi->vmaf_rdmult_scaling_factors.
+void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int y_width = cpi->source->y_width;
+  const int y_height = cpi->source->y_height;
+  const int resized_block_size = BLOCK_32X32;
+  const int resize_factor = 2;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+
+  aom_clear_system_state();
+  YV12_BUFFER_CONFIG resized_source;
+  memset(&resized_source, 0, sizeof(resized_source));
+  aom_alloc_frame_buffer(
+      &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1,
+      cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment);
+  av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth,
+                              av1_num_planes(cm));
+
+  const int resized_y_width = resized_source.y_width;
+  const int resized_y_height = resized_source.y_height;
+  const int resized_block_w = mi_size_wide[resized_block_size] * 4;
+  const int resized_block_h = mi_size_high[resized_block_size] * 4;
+  const int num_cols =
+      (resized_y_width + resized_block_w - 1) / resized_block_w;
+  const int num_rows =
+      (resized_y_height + resized_block_h - 1) / resized_block_h;
+
+  YV12_BUFFER_CONFIG blurred;
+  memset(&blurred, 0, sizeof(blurred));
+  aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1,
+                         cm->seq_params.use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  gaussian_blur(bit_depth, &resized_source, &blurred);
+
+  // NOTE(review): aom_malloc result is used without a NULL check — confirm
+  // the allocator's OOM policy before relying on this.
+  double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols));
+  memset(scores, 0, sizeof(*scores) * (num_rows * num_cols));
+  FrameData frame_data;
+  frame_data.source = &resized_source;
+  frame_data.blurred = &blurred;
+  frame_data.block_w = resized_block_w;
+  frame_data.block_h = resized_block_h;
+  frame_data.num_rows = num_rows;
+  frame_data.num_cols = num_cols;
+  frame_data.row = 0;
+  frame_data.col = 0;
+  frame_data.bit_depth = bit_depth;
+  // One VMAF score per block: update_frame() blurs one block per "frame".
+  aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.vmaf_model_path,
+                            update_frame, resized_y_width, resized_y_height,
+                            bit_depth, scores);
+
+  // Loop through each 'block_size' block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      const int row_offset_y = row * resized_block_h;
+      const int col_offset_y = col * resized_block_w;
+
+      uint8_t *const orig_buf = resized_source.y_buffer +
+                                row_offset_y * resized_source.y_stride +
+                                col_offset_y;
+      uint8_t *const blurred_buf =
+          blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+
+      const double vmaf = scores[index];
+      const double dvmaf = kBaselineVmaf - vmaf;
+      unsigned int sse;
+      cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
+                                         blurred_buf, blurred.y_stride, &sse);
+
+      const double mse =
+          (double)sse / (double)(resized_y_width * resized_y_height);
+      double weight;
+      const double eps = 0.01 / (num_rows * num_cols);
+      if (dvmaf < eps || mse < eps) {
+        // Degenerate block (no measurable VMAF drop or distortion): neutral.
+        weight = 1.0;
+      } else {
+        weight = mse / dvmaf;
+      }
+
+      // Normalize it with a data fitted model.
+      weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8;
+      cpi->vmaf_rdmult_scaling_factors[index] = weight;
+    }
+  }
+
+  aom_free_frame_buffer(&resized_source);
+  aom_free_frame_buffer(&blurred);
+  aom_free(scores);
+  aom_clear_system_state();
+}
+
+// Scales *rdmult for the block at (mi_row, mi_col) of size bsize by the
+// geometric mean of the precomputed per-64x64 VMAF scaling factors it
+// overlaps, then refreshes the error-per-bit derived from the new rdmult.
+void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                         const BLOCK_SIZE bsize, const int mi_row,
+                         const int mi_col, int *const rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int bsize_base = BLOCK_64X64;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  int row, col;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 0.0;
+
+  aom_clear_system_state();
+  // NOTE(review): row is divided by num_mi_w and col by num_mi_h; both equal
+  // 16 for BLOCK_64X64 so the result is unaffected, but the pairing looks
+  // swapped — confirm against upstream before changing bsize_base.
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col / num_mi_h;
+         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->vmaf_rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+  *rdmult = AOMMAX(*rdmult, 0);
+  set_error_per_bit(x, *rdmult);
+  aom_clear_system_state();
+}
+
+// TODO(sdeng): replace them with the SIMD versions.
+// Mean absolute difference between two high-bitdepth w x h images,
+// accumulated in double precision and normalized by the pixel count.
+static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int w, int h) {
+  double accum = 0.0;
+  int i, j;
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      double img1px = src[i * src_stride + j];
+      double img2px = ref[i * ref_stride + j];
+
+      accum += fabs(img1px - img2px);
+    }
+  }
+
+  return accum / (double)(h * w);
+}
+
+// 8-bit variant of highbd_image_sad_c: mean absolute difference between two
+// w x h images, normalized by the pixel count.
+static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride, int w,
+                                     int h) {
+  double accum = 0.0;
+  int i, j;
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      double img1px = src[i * src_stride + j];
+      double img2px = ref[i * ref_stride + j];
+
+      accum += fabs(img1px - img2px);
+    }
+  }
+
+  return accum / (double)(h * w);
+}
+
+// Computes a VMAF-style motion score for `cur`: the blurred-luma mean
+// absolute difference against the previous frame (motion1) and, when a next
+// frame is available, against the next frame (motion2); the smaller of the
+// two is returned. motion2 starts at a large sentinel so that with no next
+// frame the result is motion1. `next` may be NULL; `cur` and `last` may not.
+static AOM_INLINE double calc_vmaf_motion_score(
+    const AV1_COMP *const cpi, const AV1_COMMON *const cm,
+    const YV12_BUFFER_CONFIG *const cur, const YV12_BUFFER_CONFIG *const last,
+    const YV12_BUFFER_CONFIG *const next) {
+  const int y_width = cur->y_width;
+  const int y_height = cur->y_height;
+  YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+
+  memset(&blurred_cur, 0, sizeof(blurred_cur));
+  memset(&blurred_last, 0, sizeof(blurred_last));
+  memset(&blurred_next, 0, sizeof(blurred_next));
+
+  aom_alloc_frame_buffer(
+      &blurred_cur, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &blurred_last, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &blurred_next, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+
+  gaussian_blur(bit_depth, cur, &blurred_cur);
+  gaussian_blur(bit_depth, last, &blurred_last);
+  if (next) gaussian_blur(bit_depth, next, &blurred_next);
+
+  double motion1, motion2 = 65536.0;
+  if (bit_depth > 8) {
+    // Scale high-bitdepth SADs down to the 8-bit range for comparability.
+    const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
+    motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+                                 blurred_cur.y_stride,
+                                 CONVERT_TO_SHORTPTR(blurred_last.y_buffer),
+                                 blurred_last.y_stride, y_width, y_height) *
+              scale_factor;
+    if (next) {
+      motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+                                   blurred_cur.y_stride,
+                                   CONVERT_TO_SHORTPTR(blurred_next.y_buffer),
+                                   blurred_next.y_stride, y_width, y_height) *
+                scale_factor;
+    }
+  } else {
+    motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
+                          blurred_last.y_buffer, blurred_last.y_stride, y_width,
+                          y_height);
+    if (next) {
+      motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
+                            blurred_next.y_buffer, blurred_next.y_stride,
+                            y_width, y_height);
+    }
+  }
+
+  aom_free_frame_buffer(&blurred_cur);
+  aom_free_frame_buffer(&blurred_last);
+  aom_free_frame_buffer(&blurred_next);
+
+  return AOMMIN(motion1, motion2);
+}
+
+// Calculates the new qindex from the VMAF motion score. This is based on the
+// observation: when the motion score becomes higher, the VMAF score of the
+// same source and distorted frames would become higher.
+// Returns current_qindex unchanged for the first frame, in first-pass
+// encoding, or when the previous frame's SSE / VMAF drop are too small to
+// produce a stable ratio; otherwise adds a delta-q offset derived from the
+// motion-predicted distortion and clamps to [MINQ, MAXQ].
+int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) {
+    return current_qindex;
+  }
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  // Scale last frame's Y SSE down to 8-bit units (squared-error scaling).
+  const double approx_sse =
+      cpi->last_frame_ysse /
+      (double)((1 << (bit_depth - 8)) * (1 << (bit_depth - 8)));
+  const double approx_dvmaf = kBaselineVmaf - cpi->last_frame_vmaf;
+  const double sse_threshold =
+      0.01 * cpi->source->y_width * cpi->source->y_height;
+  const double vmaf_threshold = 0.01;
+  if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) {
+    return current_qindex;
+  }
+  aom_clear_system_state();
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  YV12_BUFFER_CONFIG *cur_buf = cpi->source;
+  int src_index = 0;
+  if (cm->show_frame == 0) {
+    // Non-shown (ARF) frame: the actual source is ahead in the lookahead.
+    src_index = gf_group->arf_src_offset[gf_group->index];
+    struct lookahead_entry *cur_entry =
+        av1_lookahead_peek(cpi->lookahead, src_index, cpi->compressor_stage);
+    cur_buf = &cur_entry->img;
+  }
+  assert(cur_buf);
+
+  const struct lookahead_entry *last_entry =
+      av1_lookahead_peek(cpi->lookahead, src_index - 1, cpi->compressor_stage);
+  const struct lookahead_entry *next_entry =
+      av1_lookahead_peek(cpi->lookahead, src_index + 1, cpi->compressor_stage);
+  // NOTE(review): next_entry is not NULL-checked before &next_entry->img;
+  // at the end of the lookahead this would pass a bogus pointer into
+  // calc_vmaf_motion_score — confirm av1_lookahead_peek cannot return NULL
+  // here, or guard the dereference.
+  const YV12_BUFFER_CONFIG *next_buf = &next_entry->img;
+  const YV12_BUFFER_CONFIG *last_buf =
+      cm->show_frame ? cpi->last_source : &last_entry->img;
+
+  assert(last_buf);
+
+  const double motion =
+      calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf);
+
+  // Get dVMAF through a data fitted model.
+  const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion));
+  const double dsse = dvmaf * approx_sse / approx_dvmaf;
+
+  // beta < 1 when extra distortion is predicted, which raises the qindex.
+  const double beta = approx_sse / (dsse + approx_sse);
+  const int offset = av1_get_deltaq_offset(cpi, current_qindex, beta);
+  int qindex = current_qindex + offset;
+
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+
+  aom_clear_system_state();
+  return qindex;
+}
+
+// Records the current frame's VMAF score and luma SSE (source vs. recon) in
+// the encoder state for use by av1_get_vmaf_base_qindex on the next frame.
+void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source,
+                           YV12_BUFFER_CONFIG *recon) {
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, recon, bit_depth,
+                &cpi->last_frame_vmaf);
+  if (bit_depth > 8) {
+    cpi->last_frame_ysse = (double)aom_highbd_get_y_sse(source, recon);
+  } else {
+    cpi->last_frame_ysse = (double)aom_get_y_sse(source, recon);
+  }
+}
diff --git a/libaom/av1/encoder/tune_vmaf.h b/libaom/av1/encoder/tune_vmaf.h
new file mode 100644
index 0000000..c4cf072
--- /dev/null
+++ b/libaom/av1/encoder/tune_vmaf.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_
+#define AOM_AV1_ENCODER_TUNE_VMAF_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/encoder.h"
+
+void av1_vmaf_blk_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_frame_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
+
+void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi);
+
+void av1_set_vmaf_rdmult(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                         int mi_row, int mi_col, int *rdmult);
+
+int av1_get_vmaf_base_qindex(const AV1_COMP *cpi, int current_qindex);
+
+void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source,
+                           YV12_BUFFER_CONFIG *recon);
+
+#endif  // AOM_AV1_ENCODER_TUNE_VMAF_H_
diff --git a/libaom/av1/encoder/tx_prune_model_weights.h b/libaom/av1/encoder/tx_prune_model_weights.h
index 405bc9e..76efe93 100644
--- a/libaom/av1/encoder/tx_prune_model_weights.h
+++ b/libaom/av1/encoder/tx_prune_model_weights.h
@@ -18,6 +18,1381 @@
 
 #include "av1/encoder/ml.h"
 
+/***************************CONFIG_NN_V2 (New)********************************/
+#if CONFIG_NN_V2
+// Tx type model for 4x4 block.
+static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = {
+  -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+  0.17996f,  1.20000f,  -0.27654f, 0.77396f,  1.21684f,  -1.75909f, -0.51272f,
+  -1.25923f, 0.35005f,  -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+  2.73144f,  -0.16875f, -0.23482f, 0.02194f,  -0.26427f, 0.28049f,  0.21260f,
+  1.35792f,  0.27733f,  0.88660f,  -0.68304f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = {
+  1.38742f, 0.59540f,  -1.37622f, 1.92114f,
+  0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = {
+  1.65254f,  1.00915f,  -0.89318f, -2.05142f, -0.23235f, 0.96781f,  -0.37145f,
+  -0.21056f, 1.13891f,  0.38675f,  0.87739f,  -1.42697f, 0.48015f,  0.61883f,
+  -0.03979f, 0.11487f,  0.48042f,  0.45200f,  -0.23242f, 0.75166f,  0.55458f,
+  0.39452f,  -0.35285f, 1.59120f,  -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+  -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = {
+  4.07177f,
+  3.26961f,
+  0.58083f,
+  1.21199f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          4,                                      // num_inputs
+          8,                                      // num_outputs
+          av1_tx_type_nn_4x4_hor_layer0_weights,  // weights
+          av1_tx_type_nn_4x4_hor_layer0_bias,     // bias
+          RELU,                                   // activation
+          av1_tx_type_nn_4x4_hor_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          8,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_4x4_hor_layer1_weights,
+          av1_tx_type_nn_4x4_hor_layer1_bias,
+          NONE,
+          av1_tx_type_nn_4x4_hor_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                  // num_outputs
+  av1_tx_type_nn_4x4_hor_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = {
+  -0.02032f, 2.61610f,  0.02098f,  -0.30217f, 0.12637f,  0.11017f,  -3.01996f,
+  0.35144f,  1.93776f,  -0.20463f, 1.64102f,  -1.41986f, -3.66717f, -0.51655f,
+  0.43910f,  0.37778f,  -1.02634f, 0.85337f,  -0.69753f, 1.00206f,  2.11784f,
+  1.89427f,  1.92919f,  0.43201f,  -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+  -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = {
+  -0.33685f, 0.22025f,  0.28140f, 0.56138f,
+  0.93489f,  -1.77048f, 1.34989f, -0.93747f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = {
+  -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f,  -2.39850f, -1.26457f,
+  0.75328f,  -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f,  -2.37739f,
+  -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+  -0.99165f, -1.91366f, 0.16785f,  0.34776f,  0.58154f,  -0.18217f, -0.29257f,
+  -0.86315f, -0.53336f, 0.30320f,  -1.32331f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = {
+  -1.31519f,
+  -3.26321f,
+  1.71794f,
+  -1.90778f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          4,                                      // num_inputs
+          8,                                      // num_outputs
+          av1_tx_type_nn_4x4_ver_layer0_weights,  // weights
+          av1_tx_type_nn_4x4_ver_layer0_bias,     // bias
+          RELU,                                   // activation
+          av1_tx_type_nn_4x4_ver_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          8,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_4x4_ver_layer1_weights,
+          av1_tx_type_nn_4x4_ver_layer1_bias,
+          NONE,
+          av1_tx_type_nn_4x4_ver_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                  // num_outputs
+  av1_tx_type_nn_4x4_ver_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 4x8 block.
+static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = {
+  0.00218f,  -0.41880f, -0.61215f, -0.92588f, 0.54291f,  -0.10898f, 0.70691f,
+  0.46819f,  -1.61598f, -0.08834f, -0.96839f, 1.18489f,  -0.45171f, -0.65445f,
+  -0.32179f, -0.10399f, 1.04379f,  0.91895f,  0.85589f,  0.08267f,  1.35388f,
+  -2.03096f, 0.08168f,  -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+  -1.35896f, -1.17121f, 1.68866f,  0.10357f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = {
+  2.93391f,  0.66831f, -0.21419f, 0.00000f,
+  -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = {
+  -1.52077f, -1.06243f, 0.35319f,  -0.49207f, 0.54524f,  0.44271f, 1.37117f,
+  -0.38957f, -1.28889f, -0.57133f, 0.04658f,  0.62278f,  0.37984f, 0.33247f,
+  1.65547f,  -0.56806f, -1.38645f, -0.76258f, 0.67926f,  0.08783f, -0.01443f,
+  0.34950f,  1.45812f,  -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+  -0.50191f, 0.18219f,  1.83664f,  -0.75276f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = {
+  -1.17455f,
+  -2.26089f,
+  -1.79863f,
+  -2.26333f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          4,                                      // num_inputs
+          8,                                      // num_outputs
+          av1_tx_type_nn_4x8_hor_layer0_weights,  // weights
+          av1_tx_type_nn_4x8_hor_layer0_bias,     // bias
+          RELU,                                   // activation
+          av1_tx_type_nn_4x8_hor_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          8,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_4x8_hor_layer1_weights,
+          av1_tx_type_nn_4x8_hor_layer1_bias,
+          NONE,
+          av1_tx_type_nn_4x8_hor_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                  // num_outputs
+  av1_tx_type_nn_4x8_hor_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = {
+  -0.00952f, -0.98858f, -0.93181f, 1.39594f,  0.96559f,  0.18162f,  -0.76064f,
+  -0.06066f, 0.07907f,  -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+  -0.10982f, 0.18559f,  1.17049f,  1.11387f,  1.12697f,  1.05804f,  1.12764f,
+  1.06318f,  1.12052f,  0.17406f,  1.83157f,  0.19362f,  0.46910f,  0.39608f,
+  0.33342f,  0.40083f,  0.27645f,  1.06864f,  -4.06645f, -0.38775f, -0.11070f,
+  0.03781f,  -0.09141f, 0.06185f,  -0.04852f, 0.20163f,  0.16784f,  0.16641f,
+  -0.50941f, -0.61087f, 2.07008f,  -0.82381f, -0.85558f, 0.05528f,  -0.10535f,
+  -2.81150f, 0.67038f,  0.43643f,  0.49062f,  -0.04465f, 0.90438f,  0.00977f,
+  0.46272f,  1.59751f,  0.95234f,  0.35086f,  0.85624f,  0.73149f,  1.67779f,
+  -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+  -1.24956f, 0.73797f,  1.23275f,  -0.60064f, -0.07851f, 0.14397f,  0.22110f,
+  -0.04422f, 0.14350f,  0.75926f,  0.35032f,  0.48104f,  2.81408f,  0.34662f,
+  0.42090f,  0.35521f,  -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+  0.32299f,  0.23916f,  0.06032f,  -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+  -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f,  -0.04973f,
+  -0.09273f, 1.04249f,  0.79235f,  1.13229f,  0.99617f,  0.03851f,  0.56334f,
+  0.90795f,  1.08296f,  0.58519f,  1.74765f,  0.63971f,  1.35951f,  0.07803f,
+  -0.05127f, 0.26514f,  -0.84629f, -0.66343f, -2.10630f, 0.11017f,  2.18528f,
+  -0.21958f, 0.05970f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = {
+  0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f,  0.01143f,
+  0.00235f, 4.26772f, 0.44364f,  -0.33199f, -0.39076f, -0.35129f,
+  0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = {
+  -0.38193f, -0.12095f, 1.57802f,  0.34932f,  -0.47333f, -0.12304f, -0.01736f,
+  -2.52445f, 0.18983f,  -0.64707f, -0.60889f, -0.53750f, 0.91666f,  -0.62823f,
+  -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f,  1.48589f,  -1.03238f,
+  -0.33459f, -0.35108f, -2.42417f, 0.60229f,  0.06824f,  -0.75495f, 0.26902f,
+  0.65311f,  -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f,  -0.59589f,
+  0.49738f,  -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f,  -0.66490f,
+  -0.76312f, 0.28256f,  1.06311f,  -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+  -1.04403f, -0.46531f, 0.34084f,  -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+  -0.68736f, -0.37904f, -1.32371f, 0.47288f,  1.51904f,  0.78372f,  -1.01830f,
+  -1.01848f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = {
+  -1.45955f,
+  -2.08949f,
+  -1.24813f,
+  -1.55368f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                      // num_inputs
+          16,                                     // num_outputs
+          av1_tx_type_nn_4x8_ver_layer0_weights,  // weights
+          av1_tx_type_nn_4x8_ver_layer0_bias,     // bias
+          RELU,                                   // activation
+          av1_tx_type_nn_4x8_ver_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_4x8_ver_layer1_weights,
+          av1_tx_type_nn_4x8_ver_layer1_bias,
+          NONE,
+          av1_tx_type_nn_4x8_ver_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                  // num_outputs
+  av1_tx_type_nn_4x8_ver_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = {
+  -0.22492f, 0.13341f,  -4.03243f, -0.64015f, 0.02783f,  0.60466f,  -0.13335f,
+  0.16828f,  0.12336f,  0.52904f,  1.18455f,  -0.32425f, 0.13052f,  0.93810f,
+  -3.71165f, 0.02990f,  -4.63558f, 0.05666f,  0.03524f,  -0.07449f, -0.44006f,
+  -0.33215f, -0.33713f, 0.08097f,  0.60873f,  0.29582f,  0.21696f,  -0.78729f,
+  -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f,  1.58463f,  1.48536f,
+  1.54374f,  1.60069f,  1.46125f,  1.53932f,  0.05974f,  -1.82192f, 0.47043f,
+  0.38090f,  0.20833f,  -0.05637f, 0.05183f,  0.01323f,  -0.25662f, 0.78634f,
+  -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+  -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+  0.75079f,  2.32551f,  0.05878f,  0.80438f,  0.88584f,  0.69153f,  0.89060f,
+  0.73660f,  0.87259f,  -0.00745f, -1.30044f, -0.59430f, 2.07270f,  1.03307f,
+  -0.84697f, -1.19393f, 0.17549f,  -0.24978f, -3.67234f, 0.20781f,  -0.53946f,
+  -0.05068f, 0.88274f,  1.30371f,  0.10288f,  0.07585f,  0.12259f,  -0.30815f,
+  0.25437f,  -2.82096f, -2.69482f, 0.02370f,  0.12500f,  -0.21019f, -0.49220f,
+  0.03638f,  -0.29795f, 0.28645f,  -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+  0.32437f,  0.32528f,  -0.19437f, 0.30383f,  -0.31879f, 0.26359f,  -0.12164f,
+  -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+  -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+  -1.85523f, 0.92532f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = {
+  0.36631f,  0.02901f,  0.64305f,  1.53074f, -1.40229f, 0.03852f,
+  -0.05043f, 0.89632f,  -1.23312f, 0.07036f, 0.17070f,  0.56250f,
+  -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = {
+  -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f,  -0.73172f,
+  -0.69337f, 0.88807f,  -0.49242f, -0.44717f, -0.11436f, 0.09978f,  0.15393f,
+  0.17083f,  1.44850f,  -0.20582f, -0.04906f, 0.42990f,  -0.61939f, -1.09692f,
+  -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+  0.15383f,  -0.04193f, -0.54858f, 1.82676f,  -0.22411f, 0.05264f,  -0.45848f,
+  -0.72985f, 0.87553f,  0.04116f,  -1.29774f, -2.63018f, 1.09089f,  -0.36048f,
+  -0.16725f, 0.11627f,  0.49918f,  0.07539f,  0.00763f,  0.73706f,  0.87800f,
+  0.57049f,  0.60969f,  1.02779f,  1.53339f,  -0.35915f, 0.06410f,  1.44582f,
+  0.09698f,  0.71888f,  0.60594f,  0.84103f,  -0.50440f, -0.38825f, 0.15626f,
+  -1.10654f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = {
+  -0.92861f,
+  -1.45151f,
+  -1.33588f,
+  -4.33853f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                      // num_inputs
+          16,                                     // num_outputs
+          av1_tx_type_nn_8x4_hor_layer0_weights,  // weights
+          av1_tx_type_nn_8x4_hor_layer0_bias,     // bias
+          RELU,                                   // activation
+          av1_tx_type_nn_8x4_hor_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_8x4_hor_layer1_weights,
+          av1_tx_type_nn_8x4_hor_layer1_bias,
+          NONE,
+          av1_tx_type_nn_8x4_hor_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                  // num_outputs
+  av1_tx_type_nn_8x4_hor_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = {
+  -1.10946f, 1.86574f,  -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+  -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+  -1.36831f, 1.00374f,  2.59312f,  0.50291f, -0.71042f, -0.12238f, -0.15901f,
+  -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f,  2.28687f,
+  1.66212f,  1.70826f,  1.55182f,  0.12230f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = {
+  0.10943f,  2.09789f, 2.16578f, 0.15766f,
+  -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = {
+  1.20426f,  -1.23237f, 2.41053f, -0.72488f, 1.25249f,  0.18018f,  -0.09586f,
+  2.17901f,  0.15364f,  1.21535f, -0.38263f, -0.74309f, 0.50551f,  -0.54208f,
+  0.59139f,  1.16095f,  0.55919f, -0.60183f, 1.18949f,  1.60787f,  0.54002f,
+  -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f,  -2.83483f, -0.27086f,
+  -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = {
+  1.81013f,
+  1.10517f,
+  2.90059f,
+  0.95391f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          4,                                      // num_inputs
+          8,                                      // num_outputs
+          av1_tx_type_nn_8x4_ver_layer0_weights,  // weights
+          av1_tx_type_nn_8x4_ver_layer0_bias,     // bias
+          RELU,                                   // activation
+          av1_tx_type_nn_8x4_ver_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          8,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_8x4_ver_layer1_weights,
+          av1_tx_type_nn_8x4_ver_layer1_bias,
+          NONE,
+          av1_tx_type_nn_8x4_ver_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                  // num_outputs
+  av1_tx_type_nn_8x4_ver_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = {
+  -0.85529f, 0.37619f,  0.12754f,  0.08622f,  0.45278f,  0.54929f,  1.60651f,
+  -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f,  0.31695f,  -0.05616f,
+  0.20483f,  -0.36448f, 2.27203f,  -0.33087f, 0.47679f,  0.86888f,  0.39370f,
+  0.46239f,  0.01113f,  1.50327f,  -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+  -1.37753f, -1.22681f, -1.70576f, 0.51329f,  -1.65662f, 1.74197f,  -0.13579f,
+  -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f,  -0.56491f,
+  -0.83432f, 0.13492f,  1.32147f,  2.85285f,  0.13819f,  0.03792f,  -1.30792f,
+  0.04155f,  -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+  0.29540f,  0.01137f,  -0.25335f, -0.16856f, 0.12028f,  0.05207f,  0.39357f,
+  -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+  2.08283f,  0.19291f,  -4.81426f, -0.65044f, -0.24598f, 0.06371f,  -0.10272f,
+  -0.14502f, -0.06821f, 0.45202f,  0.21091f,  -0.80864f, 0.39255f,  1.79189f,
+  1.80453f,  1.10484f,  1.17608f,  0.96901f,  -0.35871f, -0.94311f, 0.63147f,
+  2.95157f,  0.45917f,  -0.42849f, -0.55643f, -0.06097f, 3.49299f,  -0.50972f,
+  0.11075f,  -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f,  -1.61074f,
+  1.82998f,  0.37623f,  -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+  1.38846f,  1.42085f,  1.42568f,  1.36152f,  1.46910f,  1.27473f,  1.34752f,
+  0.12753f,  -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+  -0.99892f, 1.09823f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = {
+  -0.49232f, -0.29685f, -1.44020f, 1.10940f,  1.16452f, -0.34862f,
+  -0.38761f, -0.36243f, 0.21776f,  0.28234f,  2.34269f, -0.04104f,
+  -0.26319f, 2.65579f,  -1.30137f, -0.01487f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = {
+  -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f,  -1.42322f,
+  -0.29106f, 0.07228f,  0.04391f,  1.61388f,  -0.03055f, 0.81637f,  2.06045f,
+  0.27119f,  -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+  0.60958f,  -1.30523f, 0.25143f,  0.11398f,  0.37860f,  1.54829f,  0.02309f,
+  0.67288f,  2.11447f,  0.44845f,  -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+  -1.22646f, -1.54571f, 0.60552f,  -1.52565f, 0.11469f,  0.17344f,  0.08622f,
+  1.57906f,  -0.00909f, 0.81634f,  2.04909f,  1.26466f,  -1.45741f, -0.75229f,
+  0.06200f,  -1.05835f, -0.66257f, -1.73766f, 0.99923f,  -1.87082f, 0.14580f,
+  0.49525f,  0.46839f,  1.32203f,  0.33923f,  0.97001f,  2.38584f,  1.58811f,
+  0.06161f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = {
+  1.70385f,
+  1.82373f,
+  1.78496f,
+  1.80826f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                      // num_inputs
+          16,                                     // num_outputs
+          av1_tx_type_nn_8x8_hor_layer0_weights,  // weights
+          av1_tx_type_nn_8x8_hor_layer0_bias,     // bias
+          RELU,                                   // activation
+          av1_tx_type_nn_8x8_hor_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_8x8_hor_layer1_weights,
+          av1_tx_type_nn_8x8_hor_layer1_bias,
+          NONE,
+          av1_tx_type_nn_8x8_hor_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                  // num_outputs
+  av1_tx_type_nn_8x8_hor_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = {
+  -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+  2.09681f,  -0.05081f, -0.61030f, 2.02541f,  0.60222f,  0.99936f,  2.02114f,
+  -0.53893f, -0.23757f, 0.73566f,  0.25443f,  0.00132f,  -0.74036f, -0.75351f,
+  -0.76964f, -1.71007f, -0.15770f, 1.60982f,  2.17638f,  0.90681f,  0.64973f,
+  0.85914f,  0.58786f,  -1.46228f, 0.05187f,  1.18804f,  0.30850f,  0.29512f,
+  0.40526f,  0.37635f,  0.32311f,  0.37471f,  1.12346f,  3.41856f,  -0.36653f,
+  0.42537f,  -0.19240f, 0.00155f,  0.30826f,  -0.02116f, -0.53435f, -0.34829f,
+  -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+  -0.75257f, 0.10057f,  1.43474f,  0.89450f,  0.75900f,  1.11147f,  1.00558f,
+  0.25886f,  2.22095f,  -0.17926f, 0.57161f,  0.39546f,  0.47846f,  0.40452f,
+  0.54298f,  0.45814f,  -3.62788f, -3.02374f, 0.03716f,  -0.13937f, -0.09415f,
+  -0.12463f, 0.05682f,  0.03672f,  1.20746f,  1.25003f,  1.27071f,  1.31883f,
+  1.27473f,  1.34943f,  1.23158f,  0.09039f,  0.19388f,  0.63420f,  2.79612f,
+  0.93803f,  -0.11323f, -0.02027f, 0.41286f,  -0.05979f, -3.80705f, -0.52451f,
+  -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f,  0.05346f,
+  0.61403f,  0.32140f,  -2.39831f, -1.42355f, 1.30541f,  1.02361f,  0.12930f,
+  -1.61469f, -0.77036f, -0.59144f, 1.27769f,  1.52068f,  0.82137f,  1.83159f,
+  -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+  -1.29848f, 0.39308f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = {
+  -0.14868f, -0.48343f, 3.94416f,  -0.78037f, -1.33789f, -0.60611f,
+  0.51793f,  0.44030f,  -0.71563f, 0.22561f,  -1.19083f, -0.46149f,
+  0.83015f,  0.06024f,  1.17180f,  0.65122f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = {
+  -1.42711f, -0.21683f, 2.12061f,  0.20489f,  -0.50228f, -0.24770f, 0.23391f,
+  1.03470f,  -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+  1.43322f,  0.00280f,  -1.53057f, -0.18912f, 1.95333f,  0.31151f,  -2.07601f,
+  0.06776f,  0.25529f,  0.94800f,  -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+  0.17650f,  -0.07955f, 1.43734f,  -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+  0.30351f,  0.27594f,  -0.36245f, 0.19539f,  0.91045f,  -0.24068f, -0.37616f,
+  0.88792f,  0.02947f,  -0.16903f, -0.04932f, 1.51293f,  -0.95967f, -1.62903f,
+  0.05326f,  2.30703f,  0.64445f,  -1.09464f, -0.16623f, 1.00240f,  0.07548f,
+  -0.50406f, 0.63854f,  1.02340f,  0.49833f,  0.13671f,  0.26722f,  2.09516f,
+  -0.41305f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = {
+  2.14067f,
+  2.76699f,
+  2.04233f,
+  1.34803f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                      // num_inputs
+          16,                                     // num_outputs
+          av1_tx_type_nn_8x8_ver_layer0_weights,  // weights
+          av1_tx_type_nn_8x8_ver_layer0_bias,     // bias
+          RELU,                                   // activation
+          av1_tx_type_nn_8x8_ver_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_8x8_ver_layer1_weights,
+          av1_tx_type_nn_8x8_ver_layer1_bias,
+          NONE,
+          av1_tx_type_nn_8x8_ver_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                  // num_outputs
+  av1_tx_type_nn_8x8_ver_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = {
+  -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+  0.73431f,  1.10135f,  0.47054f,  0.43230f,  -0.43009f, -0.09135f, -0.07289f,
+  -0.38785f, 1.23775f,  -0.35312f, 0.73789f,  0.88864f,  0.75957f,  0.62579f,
+  0.46974f,  0.21851f,  1.63821f,  -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+  -0.91320f, -0.63055f, -1.03296f, 0.55778f,  -0.00071f, 1.27539f,  1.60068f,
+  1.40975f,  0.97372f,  0.92843f,  1.90853f,  0.12626f,  1.71953f,  1.41978f,
+  -0.12234f, -1.27058f, 0.76207f,  0.02495f,  -0.67038f, -0.05255f, 1.72923f,
+  1.47630f,  1.47058f,  1.47614f,  1.49354f,  1.66131f,  1.50801f,  0.17145f,
+  -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f,  1.26572f,  0.97865f,
+  -0.65466f, 1.31129f,  0.26916f,  0.12139f,  -0.12761f, -0.39143f, -0.28134f,
+  0.06584f,  2.24418f,  0.22516f,  0.05011f,  -0.01671f, -0.29476f, -0.40326f,
+  0.21138f,  -0.11573f, -0.31154f, -0.36828f, 0.03694f,  -0.07172f, -0.63419f,
+  -3.14351f, -1.23125f, 0.65311f,  -0.11406f, 1.97287f,  -0.10422f, 0.83896f,
+  0.85033f,  0.49724f,  0.80482f,  0.51454f,  1.06447f,  0.76693f,  0.72599f,
+  -0.78573f, -0.53950f, 0.40894f,  0.00086f,  0.10784f,  -0.70498f, 1.16395f,
+  1.14597f,  1.13496f,  1.12177f,  1.02100f,  -1.37574f, -2.97144f, 0.33899f,
+  0.42013f,  0.86327f,  2.31983f,  2.04008f,  0.95503f,  0.15081f,  0.11530f,
+  -0.02574f, -4.77119f, 0.13257f,  -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+  -0.28136f, 0.42556f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = {
+  0.93617f,  -0.24000f, -1.26821f, 0.78780f,  0.13690f, -0.21948f,
+  -1.45162f, 0.44584f,  -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+  1.81560f,  -1.02643f, -0.81690f, 0.08302f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = {
+  0.06696f,  -0.11538f, -1.42029f, 0.32965f,  0.81046f,  0.01146f,  1.20945f,
+  -0.16899f, 0.53224f,  -0.40232f, 0.01786f,  -0.73242f, 1.29750f,  1.95185f,
+  0.70143f,  1.43287f,  0.76220f,  0.79937f,  -1.79011f, -1.15178f, 0.42526f,
+  -0.67519f, 0.77267f,  -0.30697f, 2.46004f,  -0.49828f, 0.02875f,  1.09972f,
+  1.47662f,  0.61719f,  0.61417f,  -0.12363f, 2.53048f,  0.00418f,  -1.38964f,
+  0.88117f,  0.39239f,  -0.19347f, -2.58600f, -0.33715f, 1.09323f,  -0.32127f,
+  0.02456f,  -0.19125f, 1.12728f,  0.66502f,  0.34296f,  1.14897f,  0.29967f,
+  1.19209f,  0.22108f,  -0.11975f, 1.49776f,  -1.34624f, -2.58478f, -1.34632f,
+  1.53207f,  0.45634f,  -1.48476f, 0.17489f,  0.71790f,  -2.12086f, -1.21778f,
+  -1.31243f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = {
+  0.83359f,
+  1.06875f,
+  1.77645f,
+  1.49570f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                       // num_inputs
+          16,                                      // num_outputs
+          av1_tx_type_nn_8x16_hor_layer0_weights,  // weights
+          av1_tx_type_nn_8x16_hor_layer0_bias,     // bias
+          RELU,                                    // activation
+          av1_tx_type_nn_8x16_hor_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_8x16_hor_layer1_weights,
+          av1_tx_type_nn_8x16_hor_layer1_bias,
+          NONE,
+          av1_tx_type_nn_8x16_hor_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                   // num_outputs
+  av1_tx_type_nn_8x16_hor_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = {
+  0.32858f,  -1.28887f, 0.25632f,  -0.05262f, 2.69203f,  -0.07004f, 1.37337f,
+  -0.05725f, -0.05659f, 0.05592f,  0.01039f,  -0.29343f, 1.58628f,  -0.30003f,
+  -3.43118f, 0.00272f,  1.70928f,  -0.76348f, 0.05889f,  -0.03263f, -0.07724f,
+  0.03523f,  -0.19890f, 1.18005f,  -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+  -0.05368f, -0.17650f, -0.15317f, 0.06499f,  0.56705f,  1.04341f,  0.62890f,
+  0.73451f,  -0.22199f, 0.86659f,  0.78443f,  -0.61664f, -0.50606f, 0.30247f,
+  0.14455f,  0.39276f,  0.49203f,  0.65019f,  0.12269f,  1.64080f,  1.68289f,
+  1.42694f,  1.60825f,  1.58501f,  1.47252f,  1.62589f,  1.48218f,  0.17726f,
+  -0.04884f, 0.35376f,  -0.04796f, 0.32589f,  0.35087f,  0.35258f,  -0.46103f,
+  -0.31176f, -0.05203f, 0.07247f,  -0.26756f, 0.22019f,  0.03412f,  0.33773f,
+  0.29811f,  -0.11140f, 0.12831f,  -0.44673f, -0.09858f, 0.07889f,  0.15137f,
+  0.00347f,  -0.23394f, 0.08886f,  -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+  -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+  -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f,  -0.30681f, 0.04494f,
+  -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f,  -0.41224f,
+  -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+  -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+  0.36381f,  -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+  -0.12236f, 0.16075f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = {
+  -0.35385f, 0.30491f,  -0.90011f, 0.42941f,  1.20928f, -0.88331f,
+  -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+  0.57598f,  0.99819f,  0.75175f,  0.17044f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = {
+  -0.62913f, -0.34304f, 0.42963f,  -0.17440f, -1.44092f, 0.69142f,  -1.36067f,
+  0.52211f,  0.44658f,  -0.26501f, -0.41657f, 0.34428f,  -0.34390f, -0.58567f,
+  -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+  0.58755f,  -1.30559f, 0.39551f,  0.41743f,  -0.09940f, -0.33230f, 0.14458f,
+  -0.25139f, -0.54517f, 0.13469f,  -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+  -0.08395f, -0.92187f, 0.56724f,  1.44381f,  0.53226f,  -0.22356f, 0.12285f,
+  -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+  0.62641f,  -0.11823f, 1.00395f,  1.64794f,  -0.64535f, 2.29322f,  -0.23397f,
+  0.17251f,  -0.35927f, 0.65631f,  -0.26812f, 0.80128f,  0.85748f,  0.47404f,
+  2.20547f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = {
+  -0.44080f,
+  -1.67455f,
+  -1.46332f,
+  -6.13206f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                       // num_inputs
+          16,                                      // num_outputs
+          av1_tx_type_nn_8x16_ver_layer0_weights,  // weights
+          av1_tx_type_nn_8x16_ver_layer0_bias,     // bias
+          RELU,                                    // activation
+          av1_tx_type_nn_8x16_ver_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_8x16_ver_layer1_weights,
+          av1_tx_type_nn_8x16_ver_layer1_bias,
+          NONE,
+          av1_tx_type_nn_8x16_ver_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                   // num_outputs
+  av1_tx_type_nn_8x16_ver_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x8 block.
+static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = {
+  0.02600f,  0.09786f,  -1.05107f, -0.35594f, -0.15658f, 2.99828f,  -0.07106f,
+  -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f,  1.91727f,  -0.00956f,
+  -0.90640f, 0.09174f,  1.58895f,  1.38945f,  1.49431f,  1.51381f,  1.44803f,
+  1.53544f,  1.44694f,  0.17753f,  1.69735f,  -0.78652f, 0.31092f,  -0.23736f,
+  0.02231f,  -0.09884f, -0.00493f, 1.21189f,  -1.94382f, -0.34629f, -0.58309f,
+  0.72291f,  -0.30056f, 0.90660f,  -0.57495f, 3.07809f,  0.73644f,  1.43050f,
+  1.34356f,  -0.66554f, 0.50102f,  -0.64305f, 0.42044f,  -1.66165f, -0.05733f,
+  -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f,  -0.07290f,
+  -0.26290f, -0.68941f, 1.81156f,  0.66125f,  -2.09974f, 0.17032f,  -0.67461f,
+  -0.00876f, -1.50154f, 1.17153f,  1.00377f,  0.33022f,  0.74689f,  0.42878f,
+  0.61725f,  -0.83967f, 0.09467f,  -0.39892f, 0.33863f,  0.10656f,  -0.09249f,
+  -0.39757f, 0.48481f,  -0.35162f, 1.47014f,  1.67827f,  -1.84051f, 0.16291f,
+  -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f,  -0.14743f, -0.02763f,
+  -0.28003f, -0.01364f, 0.21014f,  -0.29026f, -0.20198f, 1.38782f,  0.56731f,
+  0.27489f,  0.43227f,  0.41326f,  0.42721f,  0.87720f,  -1.90067f, -5.04951f,
+  -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+  -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+  0.28689f,  0.32394f,  0.52128f,  0.01013f,  -0.28948f, -0.26293f, -0.44331f,
+  -0.36570f, -0.50757f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = {
+  -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+  0.45260f,  0.16229f,  4.01393f,  -0.21748f, 0.36411f,  -0.08764f,
+  -0.12329f, 0.08986f,  1.08117f,  -0.00220f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = {
+  0.55824f,  -0.14648f, 0.81947f,  -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+  0.15153f,  1.75625f,  -0.25760f, 0.72015f,  -0.30059f, -0.57975f, 0.07609f,
+  -0.02036f, 0.07912f,  0.57080f,  -0.13792f, 0.74184f,  -0.87669f, -1.87572f,
+  -0.27270f, 0.39751f,  0.19652f,  2.03514f,  -0.32944f, 0.76251f,  0.04399f,
+  -0.63175f, 0.37420f,  0.08309f,  0.04466f,  0.60255f,  -0.12820f, 1.66065f,
+  -0.59496f, -1.94794f, -0.14847f, 0.39424f,  0.16273f,  1.80587f,  0.41197f,
+  0.74691f,  -0.21217f, -0.63173f, 0.09510f,  -0.35538f, -0.04407f, 0.92847f,
+  0.20141f,  1.68680f,  -0.56528f, -2.26960f, 0.12978f,  0.73748f,  0.42438f,
+  2.00673f,  -0.40189f, 0.95423f,  0.23234f,  -0.80953f, 0.65814f,  0.49444f,
+  -0.23347f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = {
+  3.57175f,
+  2.42612f,
+  3.31259f,
+  2.08287f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                       // num_inputs
+          16,                                      // num_outputs
+          av1_tx_type_nn_16x8_hor_layer0_weights,  // weights
+          av1_tx_type_nn_16x8_hor_layer0_bias,     // bias
+          RELU,                                    // activation
+          av1_tx_type_nn_16x8_hor_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_16x8_hor_layer1_weights,
+          av1_tx_type_nn_16x8_hor_layer1_bias,
+          NONE,
+          av1_tx_type_nn_16x8_hor_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                   // num_outputs
+  av1_tx_type_nn_16x8_hor_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = {
+  0.46633f,  1.55328f,  -0.11230f, -0.29571f, 0.18814f,  -1.52430f, -2.34660f,
+  0.08644f,  -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+  -1.57129f, 0.96021f,  1.34192f,  1.28623f,  1.21655f,  1.28758f,  1.25482f,
+  1.30195f,  1.19190f,  0.09310f,  0.52072f,  0.91487f,  1.24100f,  1.61236f,
+  1.72166f,  2.20750f,  1.62379f,  -1.43936f, 0.50665f,  0.40213f,  0.66502f,
+  -1.66699f, -3.07618f, 0.05877f,  0.60987f,  -0.09995f, -0.10916f, 0.48049f,
+  0.23812f,  0.39847f,  -0.21682f, -0.63455f, 0.33453f,  -0.67939f, -4.14355f,
+  -0.62756f, -0.22502f, -0.17215f, 0.01062f,  0.27049f,  -0.10748f, 0.30945f,
+  2.72445f,  -0.89181f, -0.06800f, 0.20595f,  -0.73385f, 0.04071f,  -1.30294f,
+  1.83507f,  0.92570f,  0.69609f,  0.76285f,  0.69892f,  0.76409f,  0.63104f,
+  0.73397f,  1.09575f,  -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+  -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+  -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f,  -0.75580f, -0.65263f,
+  -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f,  -0.53611f,
+  0.19752f,  -0.16842f, -0.24828f, 0.21857f,  0.08222f,  -2.55894f, -1.75702f,
+  0.11394f,  1.03083f,  0.79972f,  -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+  -0.39616f, -0.00995f, -0.12809f, 0.01188f,  -0.25117f, 0.09202f,  0.09336f,
+  -0.05614f, -0.30039f, 0.25834f,  1.19944f,  1.22533f,  0.92330f,  0.75967f,
+  -0.81945f, -0.41647f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = {
+  0.17841f,  0.67315f,  -1.24450f, 3.13859f,  0.16203f, -0.14992f,
+  0.29553f,  -1.15567f, -0.71421f, 1.15977f,  1.14585f, 3.02460f,
+  -0.04510f, 0.48000f,  -0.09354f, -0.42422f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = {
+  0.29912f,  -0.10009f, -1.11478f, 1.76812f,  -0.27719f, 0.52148f,  0.17622f,
+  -1.17116f, 0.73397f,  -0.69279f, -0.11080f, 1.53751f,  -1.42003f, 0.14731f,
+  0.13592f,  -0.04883f, 0.39186f,  -0.13655f, -0.43994f, 1.82759f,  -0.25601f,
+  -0.15018f, 0.51920f,  -1.56070f, 0.31683f,  -0.79367f, -0.02904f, 1.28637f,
+  -1.15203f, 0.26627f,  0.42828f,  -0.24258f, 0.38647f,  -0.83352f, 0.32553f,
+  2.09522f,  -0.26822f, -0.42191f, 0.32825f,  -1.30748f, 1.50551f,  -0.52669f,
+  0.20045f,  1.69318f,  -1.47839f, 0.30802f,  -0.07290f, -0.28106f, 0.68192f,
+  -0.15522f, 1.12579f,  2.21921f,  0.09720f,  -0.50265f, 0.83165f,  -1.31721f,
+  0.72422f,  -1.24952f, 0.61653f,  2.04117f,  -1.42406f, 0.52568f,  -0.46180f,
+  -0.00873f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = {
+  3.34981f,
+  3.74710f,
+  1.38339f,
+  0.45176f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                       // num_inputs
+          16,                                      // num_outputs
+          av1_tx_type_nn_16x8_ver_layer0_weights,  // weights
+          av1_tx_type_nn_16x8_ver_layer0_bias,     // bias
+          RELU,                                    // activation
+          av1_tx_type_nn_16x8_ver_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_16x8_ver_layer1_weights,
+          av1_tx_type_nn_16x8_ver_layer1_bias,
+          NONE,
+          av1_tx_type_nn_16x8_ver_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                   // num_outputs
+  av1_tx_type_nn_16x8_ver_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static float av1_tx_type_nn_16x16_layer0_weights[128] = {
+  1.26592f,  1.36313f,  1.30956f,  1.29926f,  1.48816f,  1.68851f,  1.32000f,
+  0.13321f,  -0.22477f, -0.88906f, -0.19622f, 1.69605f,  1.22180f,  -1.57771f,
+  -1.15765f, 0.05710f,  -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+  -0.77952f, -1.15723f, 1.17809f,  1.35602f,  -0.05243f, -0.37596f, 0.26108f,
+  0.17611f,  -0.10323f, 0.77279f,  -0.48911f, -0.79308f, 0.55112f,  0.43918f,
+  0.27872f,  0.28714f,  0.45830f,  1.05689f,  0.03705f,  -2.49975f, -0.01940f,
+  0.05709f,  0.07942f,  -0.13290f, -0.10359f, 0.00143f,  0.37303f,  0.96470f,
+  0.53293f,  1.14459f,  0.89185f,  0.43378f,  0.47764f,  0.90924f,  0.15279f,
+  -0.15361f, 0.02949f,  0.42240f,  0.68143f,  0.89588f,  0.73754f,  0.10974f,
+  1.57755f,  -0.39870f, -0.32914f, 0.35638f,  0.34991f,  -0.00003f, -0.23373f,
+  0.29630f,  -0.76699f, -0.01356f, 0.04234f,  0.84253f,  1.92078f,  0.93160f,
+  0.71993f,  0.71604f,  0.76455f,  -1.59782f, 0.32332f,  1.11628f,  0.33062f,
+  -0.03728f, -0.05710f, 0.80447f,  -0.14719f, 1.34658f,  -0.05718f, 0.64015f,
+  0.21926f,  0.41653f,  0.12720f,  0.54092f,  1.39411f,  1.81819f,  -0.24513f,
+  0.00955f,  0.38011f,  -0.57787f, -0.41759f, 0.68834f,  -0.31783f, -0.40607f,
+  -0.10107f, -0.79374f, 0.75599f,  -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+  -0.13793f, -0.22293f, 0.18305f,  0.12445f,  0.56830f,  0.24567f,  0.09278f,
+  0.70803f,  0.35803f,  -1.52676f, -0.89624f, 0.77665f,  0.19877f,  0.77175f,
+  0.50355f,  0.08592f,
+};
+
+static float av1_tx_type_nn_16x16_layer0_bias[16] = {
+  -1.31834f, 0.14346f,  -0.10062f, 0.84489f,  0.95617f,  -0.06720f,
+  -0.68502f, -0.91442f, -0.31932f, 0.25276f,  -0.15138f, -1.57661f,
+  -0.14062f, -0.42120f, 0.94573f,  -0.09287f,
+};
+
+static float av1_tx_type_nn_16x16_layer1_weights[64] = {
+  -1.80333f, -1.06353f, 0.55139f,  0.74644f,  0.13747f, -0.93018f, -0.10286f,
+  0.67133f,  0.24460f,  1.44583f,  0.02173f,  0.26037f, -0.73687f, 0.19566f,
+  0.61846f,  -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+  0.96224f,  -0.59139f, 0.03813f,  0.05403f,  1.33427f, -0.54375f, -1.92181f,
+  0.54704f,  0.13608f,  0.22151f,  -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+  1.00894f,  0.62318f,  -0.15296f, 1.27600f,  0.22822f, 0.12751f,  0.93910f,
+  -0.28502f, 0.53912f,  -0.96889f, 0.10182f,  0.81508f, -0.43028f, 2.67386f,
+  0.52204f,  0.49820f,  -0.41711f, 1.05038f,  1.12192f, 0.74349f,  -0.75417f,
+  -0.03718f, -0.35769f, 0.89651f,  0.63236f,  0.54215f, -0.07894f, 0.48274f,
+  1.08829f,
+};
+
+static float av1_tx_type_nn_16x16_layer1_bias[4] = {
+  0.81986f,
+  1.26865f,
+  0.11118f,
+  2.48404f,
+};
+
+static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                    // num_inputs
+          16,                                   // num_outputs
+          av1_tx_type_nn_16x16_layer0_weights,  // weights
+          av1_tx_type_nn_16x16_layer0_bias,     // bias
+          RELU,                                 // activation
+          av1_tx_type_nn_16x16_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_16x16_layer1_weights,
+          av1_tx_type_nn_16x16_layer1_bias,
+          NONE,
+          av1_tx_type_nn_16x16_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                // num_outputs
+  av1_tx_type_nn_16x16_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = {
+  0.36539f,  0.25667f,  0.01491f,  -0.21959f, 2.55105f,  0.17615f, 1.79884f,
+  1.65936f,  -0.44363f, 0.00706f,  -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+  1.47682f,  0.09650f,  -3.59244f, -0.35004f, 0.93295f,  0.25806f, -0.08154f,
+  0.79332f,  0.79535f,  1.09467f,  1.57855f,  -0.51359f, 0.90553f, -1.67744f,
+  -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = {
+  -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+  0.62806f,  -0.20675f, 4.91940f,  -0.56079f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = {
+  -0.57191f, -1.46418f, 0.67331f,  -1.15027f, 0.46288f,  0.81251f,  2.51768f,
+  -0.27147f, 0.00761f,  -2.15214f, -0.69650f, -0.50808f, 0.92832f,  0.45668f,
+  2.34201f,  -0.52941f, 0.51008f,  -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+  0.88043f,  2.64862f,  -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+  1.28413f,  -0.30326f, 2.45329f,  -0.83335f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = {
+  2.33198f,
+  3.36245f,
+  1.62603f,
+  2.91056f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          4,                                       // num_inputs
+          8,                                       // num_outputs
+          av1_tx_type_nn_4x16_hor_layer0_weights,  // weights
+          av1_tx_type_nn_4x16_hor_layer0_bias,     // bias
+          RELU,                                    // activation
+          av1_tx_type_nn_4x16_hor_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          8,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_4x16_hor_layer1_weights,
+          av1_tx_type_nn_4x16_hor_layer1_bias,
+          NONE,
+          av1_tx_type_nn_4x16_hor_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                   // num_outputs
+  av1_tx_type_nn_4x16_hor_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = {
+  1.61392f,  1.41239f,  1.47646f,  1.47325f,  1.46110f,  1.49208f,  1.49414f,
+  0.12835f,  -0.76986f, 0.07087f,  -0.24572f, -0.93168f, 3.07935f,  -0.18183f,
+  -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f,  -0.38711f,
+  -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+  -0.03305f, -4.07069f, -2.76643f, 0.04413f,  -1.03176f, -0.19217f, -0.44980f,
+  -2.48615f, -2.58112f, -0.87695f, 0.16187f,  -0.04891f, -0.06854f, 1.08104f,
+  0.75245f,  1.49302f,  0.63363f,  1.45715f,  0.92574f,  1.72029f,  0.33326f,
+  3.86646f,  0.04422f,  0.41019f,  0.36212f,  0.56600f,  -1.01552f, 0.05128f,
+  0.40454f,  -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+  -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f,  0.41010f,  0.31223f,
+  -0.43382f, -0.74715f, 2.03366f,  -0.30419f, 0.45747f,  0.09526f,  0.31678f,
+  0.22915f,  0.21832f,  1.26385f,  -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+  0.10936f,  2.97396f,  -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+  0.05173f,  -0.44274f, -0.15738f, 0.11311f,  0.43872f,  0.16837f,  -0.52849f,
+  2.90050f,  -0.54735f, -0.29591f, 1.24030f,  0.21696f,  -0.04443f, -1.60877f,
+  -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+  0.42820f,  -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f,  2.23477f,
+  0.01370f,  -0.20426f, -1.51411f, -0.72293f, 0.64516f,  0.97638f,  0.32616f,
+  -0.27975f, -0.01149f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = {
+  -1.37863f, -0.05763f, -0.07041f, 0.15306f,  0.96026f,  -1.42105f,
+  -0.55822f, 1.04845f,  -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+  -0.32530f, 0.73483f,  0.08322f,  -0.23890f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = {
+  0.27194f,  0.50607f,  0.49229f,  -0.48192f, 0.15667f,  -1.38891f, 0.38102f,
+  -0.58825f, -0.07337f, -0.52909f, 0.36975f,  0.28710f,  0.34992f,  -0.73630f,
+  0.30386f,  -0.58822f, 0.36127f,  0.57950f,  0.55878f,  -0.42796f, 0.19967f,
+  -1.45517f, 0.42529f,  -0.54630f, -0.38169f, -0.84899f, 0.41622f,  0.46935f,
+  0.39077f,  -0.75448f, 0.31698f,  -0.76187f, 0.97765f,  0.57052f,  0.55825f,
+  -0.54273f, 0.20466f,  -1.46347f, 0.41813f,  -0.55019f, -0.19948f, -0.57982f,
+  0.41206f,  0.32373f,  0.38537f,  -1.11657f, 0.32887f,  -0.76911f, 1.12259f,
+  0.72163f,  0.82603f,  0.37786f,  0.34976f,  -1.86642f, 0.59961f,  -0.16329f,
+  -0.36631f, -0.56814f, 0.60410f,  0.53158f,  0.56389f,  -0.70508f, 0.51009f,
+  -0.56513f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = {
+  4.60896f,
+  4.53551f,
+  4.53124f,
+  4.27435f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                       // num_inputs
+          16,                                      // num_outputs
+          av1_tx_type_nn_4x16_ver_layer0_weights,  // weights
+          av1_tx_type_nn_4x16_ver_layer0_bias,     // bias
+          RELU,                                    // activation
+          av1_tx_type_nn_4x16_ver_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_4x16_ver_layer1_weights,
+          av1_tx_type_nn_4x16_ver_layer1_bias,
+          NONE,
+          av1_tx_type_nn_4x16_ver_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                   // num_outputs
+  av1_tx_type_nn_4x16_ver_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = {
+  1.45347f,  -0.15743f, 0.44236f,  0.25808f,  0.33944f,  0.38678f,  0.24428f,
+  1.67287f,  0.09539f,  -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+  -0.49183f, 0.09333f,  -0.99026f, -0.22157f, 0.53701f,  0.60447f,  0.15686f,
+  -0.04646f, 0.26341f,  2.12361f,  0.27090f,  -1.14716f, -0.64146f, -0.91604f,
+  -0.75335f, -0.60056f, -1.25084f, 1.68473f,  -3.24075f, -4.03867f, -2.07877f,
+  -0.02347f, 0.00333f,  -0.01259f, -0.00465f, 0.02526f,  0.36286f,  -0.10324f,
+  2.12780f,  -0.74584f, -1.05052f, 1.78467f,  -0.55065f, -0.03326f, 2.46781f,
+  1.18349f,  0.96015f,  1.01696f,  1.10584f,  1.07263f,  1.11531f,  -1.06413f,
+  0.32389f,  -1.87360f, -0.14435f, 1.77926f,  1.09966f,  -0.12680f, -0.61386f,
+  -0.09724f, -0.33095f, 1.12122f,  1.00791f,  1.52416f,  1.35004f,  1.32657f,
+  0.60950f,  -1.13538f, -0.38654f, 0.06473f,  2.10669f,  0.27734f,  -0.38359f,
+  -1.91455f, -1.22676f, 0.05786f,  0.97432f,  2.19967f,  0.50457f,  0.78976f,
+  0.95183f,  -0.32414f, 0.49437f,  -0.04506f, 0.18993f,  -0.07971f, 0.23889f,
+  -0.09872f, -0.66036f, 0.05377f,  2.69638f,  -0.08259f, -0.69210f, -1.08296f,
+  -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+  -0.30732f, -0.12043f, 0.11126f,  0.10771f,  -0.14956f, -0.02218f, 0.41016f,
+  1.16599f,  1.14629f,  1.12881f,  1.18676f,  1.24677f,  1.28695f,  1.11270f,
+  0.08233f,  1.75440f,  0.49228f,  -0.34858f, -0.17032f, 0.29288f,  0.47175f,
+  0.19055f,  -1.56413f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = {
+  -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f,  -0.21451f,
+  2.75281f,  0.04318f, 2.03965f,  0.14618f,  -0.70483f, -0.24517f,
+  1.14048f,  0.33308f, -1.10886f, 0.41184f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = {
+  -1.17079f, 0.19096f,  -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+  0.05972f,  1.44759f,  -0.04068f, -0.26331f, 0.31400f,  0.96923f,  0.33443f,
+  -0.77215f, -0.91316f, -1.78928f, 0.21483f,  -1.24008f, -0.46190f, -0.12127f,
+  -0.62144f, 1.37593f,  0.08373f,  1.56215f,  0.00279f,  -0.14556f, 0.38710f,
+  0.96228f,  0.66433f,  -0.51798f, -0.80738f, -0.18539f, 0.19377f,  -1.03090f,
+  -1.51044f, -0.59485f, -0.62589f, 1.90742f,  0.09078f,  1.49113f,  0.00205f,
+  -0.15918f, 0.40827f,  1.08553f,  0.43431f,  0.33519f,  -1.12669f, -1.10274f,
+  0.80004f,  -1.83599f, -0.53134f, 2.00515f,  -0.32670f, 1.37124f,  0.51136f,
+  1.62563f,  0.24787f,  0.31757f,  0.81751f,  1.57262f,  0.83214f,  1.04661f,
+  -0.43819f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = {
+  2.32575f,
+  2.75703f,
+  1.12304f,
+  2.15567f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          8,                                       // num_inputs
+          16,                                      // num_outputs
+          av1_tx_type_nn_16x4_hor_layer0_weights,  // weights
+          av1_tx_type_nn_16x4_hor_layer0_bias,     // bias
+          RELU,                                    // activation
+          av1_tx_type_nn_16x4_hor_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          16,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_16x4_hor_layer1_weights,
+          av1_tx_type_nn_16x4_hor_layer1_bias,
+          NONE,
+          av1_tx_type_nn_16x4_hor_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                   // num_outputs
+  av1_tx_type_nn_16x4_hor_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = {
+  0.26047f,  0.99930f,  1.16484f,  -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+  0.46375f,  1.47951f,  1.13735f,  1.12356f,  0.27385f,  0.50978f,  2.09967f,
+  -1.47386f, 0.01950f,  -0.06362f, 0.26014f,  1.04544f,  -0.03099f, 0.07478f,
+  -0.39701f, 0.05545f,  2.73633f,  -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+  -0.17967f, -0.96622f, 0.42635f,  -1.04784f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = {
+  -0.52088f, 0.52844f,  -1.03655f, -0.30974f,
+  2.59952f,  -1.93604f, 0.00000f,  2.51787f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = {
+  0.10916f,  -0.21219f, -0.51340f, 0.69161f,  1.45988f,  -1.36942f, -0.40899f,
+  1.05136f,  -0.08486f, 0.10008f,  -0.55304f, 0.88012f,  1.61177f,  -1.64507f,
+  0.63428f,  1.15130f,  -0.17287f, -0.18592f, -0.01143f, 0.88293f,  1.73326f,
+  -1.63624f, 0.09359f,  1.18393f,  0.26531f,  0.22378f,  0.15170f,  1.06965f,
+  1.26814f,  -1.93873f, -0.00768f, 1.58309f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = {
+  2.34713f,
+  1.68667f,
+  1.25488f,
+  1.69812f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = {
+  1,  // num_hidden_layers
+  {
+      // fc layer setting
+      {
+          // layer 0
+          4,                                       // num_inputs
+          8,                                       // num_outputs
+          av1_tx_type_nn_16x4_ver_layer0_weights,  // weights
+          av1_tx_type_nn_16x4_ver_layer0_bias,     // bias
+          RELU,                                    // activation
+          av1_tx_type_nn_16x4_ver_layer0_out,      // output
+          NULL,
+          NULL,
+          NULL,
+      },
+      {
+          8,  // num_inputs (!!same as num_outputs of last layer)
+          4,
+          av1_tx_type_nn_16x4_ver_layer1_weights,
+          av1_tx_type_nn_16x4_ver_layer1_bias,
+          NONE,
+          av1_tx_type_nn_16x4_ver_layer1_out,
+          NULL,
+          NULL,
+          NULL,
+      },
+  },
+  4,                                   // num_outputs
+  av1_tx_type_nn_16x4_ver_layer1_out,  // logits (!!same as last layer output)
+  SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx type prediction.
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = {
+  &av1_tx_type_nnconfig_4x4_hor,   // 4x4 transform
+  &av1_tx_type_nnconfig_8x8_hor,   // 8x8 transform
+  &av1_tx_type_nnconfig_16x16,     // 16x16 transform
+  NULL,                            // 32x32 transform
+  NULL,                            // 64x64 transform
+  &av1_tx_type_nnconfig_4x8_hor,   // 4x8 transform
+  &av1_tx_type_nnconfig_8x4_hor,   // 8x4 transform
+  &av1_tx_type_nnconfig_8x16_hor,  // 8x16 transform
+  &av1_tx_type_nnconfig_16x8_hor,  // 16x8 transform
+  NULL,                            // 16x32 transform
+  NULL,                            // 32x16 transform
+  NULL,                            // 32x64 transform
+  NULL,                            // 64x32 transform
+  &av1_tx_type_nnconfig_4x16_hor,  // 4x16 transform
+  &av1_tx_type_nnconfig_16x4_hor,  // 16x4 transform
+  NULL,                            // 8x32 transform
+  NULL,                            // 32x8 transform
+  NULL,                            // 16x64 transform
+  NULL,                            // 64x16 transform
+};
+
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = {
+  &av1_tx_type_nnconfig_4x4_ver,   // 4x4 transform
+  &av1_tx_type_nnconfig_8x8_ver,   // 8x8 transform
+  &av1_tx_type_nnconfig_16x16,     // 16x16 transform
+  NULL,                            // 32x32 transform
+  NULL,                            // 64x64 transform
+  &av1_tx_type_nnconfig_4x8_ver,   // 4x8 transform
+  &av1_tx_type_nnconfig_8x4_ver,   // 8x4 transform
+  &av1_tx_type_nnconfig_8x16_ver,  // 8x16 transform
+  &av1_tx_type_nnconfig_16x8_ver,  // 16x8 transform
+  NULL,                            // 16x32 transform
+  NULL,                            // 32x16 transform
+  NULL,                            // 32x64 transform
+  NULL,                            // 64x32 transform
+  &av1_tx_type_nnconfig_4x16_ver,  // 4x16 transform
+  &av1_tx_type_nnconfig_16x4_ver,  // 16x4 transform
+  NULL,                            // 8x32 transform
+  NULL,                            // 32x8 transform
+  NULL,                            // 16x64 transform
+  NULL,                            // 64x16 transform
+};
+#else
+/******************************CONFIG_NN***************************************/
 // Tx type model for 4x4 block.
 static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
   -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
@@ -985,6 +2360,7 @@
   NULL,                            // 16x64 transform
   NULL,                            // 64x16 transform
 };
+#endif  // CONFIG_NN_V2
 
 // Tx split model for 4x8 block.
 static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = {
diff --git a/libaom/av1/encoder/tx_search.c b/libaom/av1/encoder/tx_search.c
new file mode 100644
index 0000000..65b9a24
--- /dev/null
+++ b/libaom/av1/encoder/tx_search.c
@@ -0,0 +1,3602 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/common/idct.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/tx_prune_model_weights.h"
+#include "av1/encoder/tx_search.h"
+
+struct rdcost_block_args {
+  const AV1_COMP *cpi;
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT t_left[MAX_MIB_SIZE];
+  RD_STATS rd_stats;
+  int64_t current_rd;
+  int64_t best_rd;
+  int exit_early;
+  int incomplete_exit;
+  int use_fast_coef_costing;
+  FAST_TX_SEARCH_MODE ftxs_mode;
+  int skip_trellis;
+};
+
+typedef struct {
+  int64_t rd;
+  int txb_entropy_ctx;
+  TX_TYPE tx_type;
+} TxCandidateInfo;
+
+typedef struct {
+  int leaf;
+  int8_t children[4];
+} RD_RECORD_IDX_NODE;
+
+// origin_threshold * 128 / 100
+static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
+  {
+      64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+      68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+  },
+  {
+      88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+      68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+  },
+  {
+      90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+      74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+  },
+};
+
+// lookup table for predict_skip_flag
+// int max_tx_size = max_txsize_rect_lookup[bsize];
+// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+//   max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
+  TX_4X4,   TX_4X8,   TX_8X4,   TX_8X8,   TX_8X16,  TX_16X8,
+  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16,  TX_16X4,
+  TX_8X8,   TX_8X8,   TX_16X16, TX_16X16,
+};
+
+static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
+                                const uint32_t hash) {
+  // Linear search through the circular buffer to find matching hash.
+  for (int i = cur_record->index_start - 1; i >= 0; i--) {
+    if (cur_record->hash_vals[i] == hash) return i;
+  }
+  for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
+    if (cur_record->hash_vals[i] == hash) return i;
+  }
+  int index;
+  // If not found - add new RD info into the buffer and return its index
+  if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
+    index = (cur_record->index_start + cur_record->num) %
+            TX_SIZE_RD_RECORD_BUFFER_LEN;
+    cur_record->num++;
+  } else {
+    index = cur_record->index_start;
+    cur_record->index_start =
+        (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
+  }
+
+  cur_record->hash_vals[index] = hash;
+  av1_zero(cur_record->tx_rd_info[index]);
+  return index;
+}
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
+  { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 1, { 0, 0, 0, 0 } },
+  { 1, { 0, 0, 0, 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 1, { 0 } },
+  { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
+  { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 0, { 3, 4, 5, 6 } },
+  { 0, { 7, 8, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 0, { 3, 4, 7, 8 } },
+  { 0, { 5, 6, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
+  { 0, { 1, 2, 3, 4 } },     { 0, { 5, 6, 9, 10 } },    { 0, { 7, 8, 11, 12 } },
+  { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
+  { 0, { 2, 3, 4, 5 } },     { 0, { 6, 7, 8, 9 } },
+  { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
+  { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
+  { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
+  { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
+  { 0, { 2, 3, 6, 7 } },     { 0, { 4, 5, 8, 9 } },
+  { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
+  { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
+  { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
+  { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
+  { 0, { 4, 5, 8, 9 } },     { 0, { 6, 7, 10, 11 } },
+  { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
+  { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
+  { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
+  { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
+  { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
+  { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
+  { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
+  { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
+  { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
+  { 0, { 1, -1, 2, -1 } },
+  { 0, { 3, 4, -1, -1 } },
+  { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 0, { 3, 4, -1, -1 } },
+  { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
+  NULL,                    // BLOCK_4X4
+  NULL,                    // BLOCK_4X8
+  NULL,                    // BLOCK_8X4
+  rd_record_tree_8x8,      // BLOCK_8X8
+  rd_record_tree_8x16,     // BLOCK_8X16
+  rd_record_tree_16x8,     // BLOCK_16X8
+  rd_record_tree_16x16,    // BLOCK_16X16
+  rd_record_tree_1_2,      // BLOCK_16X32
+  rd_record_tree_2_1,      // BLOCK_32X16
+  rd_record_tree_sqr,      // BLOCK_32X32
+  rd_record_tree_1_2,      // BLOCK_32X64
+  rd_record_tree_2_1,      // BLOCK_64X32
+  rd_record_tree_sqr,      // BLOCK_64X64
+  rd_record_tree_64x128,   // BLOCK_64X128
+  rd_record_tree_128x64,   // BLOCK_128X64
+  rd_record_tree_128x128,  // BLOCK_128X128
+  NULL,                    // BLOCK_4X16
+  NULL,                    // BLOCK_16X4
+  rd_record_tree_1_4,      // BLOCK_8X32
+  rd_record_tree_4_1,      // BLOCK_32X8
+  rd_record_tree_1_4,      // BLOCK_16X64
+  rd_record_tree_4_1,      // BLOCK_64X16
+};
+
+static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
+  0,                                                            // BLOCK_4X4
+  0,                                                            // BLOCK_4X8
+  0,                                                            // BLOCK_8X4
+  sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X8
+  sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_8X16
+  sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_16X8
+  sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE),    // BLOCK_16X16
+  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X32
+  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X16
+  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X32
+  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X64
+  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X32
+  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X64
+  sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_64X128
+  sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_128X64
+  sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE),  // BLOCK_128X128
+  0,                                                            // BLOCK_4X16
+  0,                                                            // BLOCK_16X4
+  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X32
+  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X8
+  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X64
+  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X16
+};
+
+static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
+                                       BLOCK_SIZE bsize) {
+  const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
+  const int size = rd_record_tree_size[bsize];
+  for (int i = 0; i < size; ++i) {
+    if (rd_record[i].leaf) {
+      av1_zero(tree[i].children);
+    } else {
+      for (int j = 0; j < 4; ++j) {
+        const int8_t idx = rd_record[i].children[j];
+        tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
+      }
+    }
+  }
+}
+
+// Go through all TX blocks that could be used in TX size search, compute
+// residual hash values for them and find matching RD info that stores previous
+// RD search results for these TX blocks. The idea is to prevent repeated
+// rate/distortion computations that happen because of the combination of
+// partition and TX size search. The resulting RD info records are returned in
+// the form of a quadtree for easier access in actual TX size search.
+static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize,
+                                   TXB_RD_INFO_NODE *dst_rd_info) {
+  TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8,
+                                         x->txb_rd_record_16X16,
+                                         x->txb_rd_record_32X32,
+                                         x->txb_rd_record_64X64 };
+  const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+
+  // Hashing is performed only for square TX sizes larger than TX_4X4
+  if (max_square_tx_size < TX_8X8) return 0;
+  const int diff_stride = bw;
+  const struct macroblock_plane *const p = &x->plane[0];
+  const int16_t *diff = &p->src_diff[0];
+  init_rd_record_tree(dst_rd_info, bsize);
+  // Coordinates of the top-left corner of current block within the superblock
+  // measured in pixels:
+  const int mi_row = x->e_mbd.mi_row;
+  const int mi_col = x->e_mbd.mi_col;
+  const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
+  const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
+  int cur_rd_info_idx = 0;
+  int cur_tx_depth = 0;
+  TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
+  while (cur_tx_depth <= MAX_VARTX_DEPTH) {
+    const int cur_tx_bw = tx_size_wide[cur_tx_size];
+    const int cur_tx_bh = tx_size_high[cur_tx_size];
+    if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
+    const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
+    const int tx_size_idx = cur_tx_size - TX_8X8;
+    for (int row = 0; row < bh; row += cur_tx_bh) {
+      for (int col = 0; col < bw; col += cur_tx_bw) {
+        if (cur_tx_bw != cur_tx_bh) {
+          // Use dummy nodes for all rectangular transforms within the
+          // TX size search tree.
+          dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
+        } else {
+          // Get spatial location of this TX block within the superblock
+          // (measured in cur_tx_bsize units).
+          const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
+          const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
+
+          int16_t hash_data[MAX_SB_SQUARE];
+          int16_t *cur_hash_row = hash_data;
+          const int16_t *cur_diff_row = diff + row * diff_stride + col;
+          for (int i = 0; i < cur_tx_bh; i++) {
+            memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw);
+            cur_hash_row += cur_tx_bw;
+            cur_diff_row += diff_stride;
+          }
+          const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+                                                (uint8_t *)hash_data,
+                                                2 * cur_tx_bw * cur_tx_bh);
+          // Find corresponding RD info based on the hash value.
+          const int record_idx =
+              row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
+          TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
+          int idx = find_tx_size_rd_info(records, hash);
+          dst_rd_info[cur_rd_info_idx].rd_info_array =
+              &records->tx_rd_info[idx];
+        }
+        ++cur_rd_info_idx;
+      }
+    }
+    cur_tx_size = next_tx_size;
+    ++cur_tx_depth;
+  }
+  return 1;
+}
+
+static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
+  const int16_t *diff = x->plane[0].src_diff;
+  const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+                                             (uint8_t *)diff, 2 * rows * cols);
+  return (hash << 5) + bsize;
+}
+
+static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
+                                      const int64_t ref_best_rd,
+                                      const uint32_t hash) {
+  int32_t match_index = -1;
+  if (ref_best_rd != INT64_MAX) {
+    for (int i = 0; i < mb_rd_record->num; ++i) {
+      const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+      // If there is a match in the tx_rd_record, fetch the RD decision and
+      // terminate early.
+      if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
+        match_index = index;
+        break;
+      }
+    }
+  }
+  return match_index;
+}
+
+static AOM_INLINE void fetch_tx_rd_info(int n4,
+                                        const MB_RD_INFO *const tx_rd_info,
+                                        RD_STATS *const rd_stats,
+                                        MACROBLOCK *const x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  mbmi->tx_size = tx_rd_info->tx_size;
+  memcpy(x->blk_skip, tx_rd_info->blk_skip,
+         sizeof(tx_rd_info->blk_skip[0]) * n4);
+  av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
+  av1_copy_array(xd->tx_type_map, tx_rd_info->tx_type_map, n4);
+  *rd_stats = tx_rd_info->rd_stats;
+}
+
+// Compute the pixel domain distortion from diff on all visible 4x4s in the
+// transform block.
+static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
+                                      int blk_row, int blk_col,
+                                      const BLOCK_SIZE plane_bsize,
+                                      const BLOCK_SIZE tx_bsize,
+                                      unsigned int *block_mse_q8) {
+  int visible_rows, visible_cols;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+                     NULL, &visible_cols, &visible_rows);
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *diff = x->plane[plane].src_diff;
+
+  diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+  uint64_t sse =
+      aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
+  if (block_mse_q8 != NULL) {
+    if (visible_cols > 0 && visible_rows > 0)
+      *block_mse_q8 =
+          (unsigned int)((256 * sse) / (visible_cols * visible_rows));
+    else
+      *block_mse_q8 = UINT_MAX;
+  }
+  return sse;
+}
+
+// Uses simple features on top of DCT coefficients to quickly predict
+// whether optimal RD decision is to skip encoding the residual.
+// The sse value is stored in dist.
+static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+                             int reduced_tx_set) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+
+  const int64_t mse = *dist / bw / bh;
+  // Normalized quantizer takes the transform upscaling factor (8 for tx size
+  // smaller than 32) into account.
+  const int16_t normalized_dc_q = dc_q >> 3;
+  const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
+  // For faster early skip decision, use dist to compare against threshold so
+  // that quality risk is less for the skip=1 decision. Otherwise, use mse
+  // since the fwd_txfm coeff checks will take care of quality
+  // TODO(any): Use dist to return 0 when predict_skip_level is 1
+  int64_t pred_err = (x->predict_skip_level >= 2) ? *dist : mse;
+  // Predict not to skip when error is larger than threshold.
+  if (pred_err > mse_thresh) return 0;
+  // Return as skip otherwise for aggressive early skip
+  else if (x->predict_skip_level >= 2)
+    return 1;
+
+  const int max_tx_size = max_predict_sf_tx_size[bsize];
+  const int tx_h = tx_size_high[max_tx_size];
+  const int tx_w = tx_size_wide[max_tx_size];
+  DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+  TxfmParam param;
+  param.tx_type = DCT_DCT;
+  param.tx_size = max_tx_size;
+  param.bd = xd->bd;
+  param.is_hbd = is_cur_buf_hbd(xd);
+  param.lossless = 0;
+  param.tx_set_type = av1_get_ext_tx_set_type(
+      param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+  const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+  const int16_t *src_diff = x->plane[0].src_diff;
+  const int n_coeff = tx_w * tx_h;
+  const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+  const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+  const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+  for (int row = 0; row < bh; row += tx_h) {
+    for (int col = 0; col < bw; col += tx_w) {
+      av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+      // Operating on TX domain, not pixels; we want the QTX quantizers
+      const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+      if (dc_coef >= dc_thresh) return 0;
+      for (int i = 1; i < n_coeff; ++i) {
+        const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+        if (ac_coef >= ac_thresh) return 0;
+      }
+    }
+    src_diff += tx_h * bw;
+  }
+  return 1;
+}
+
+// Used to set proper context for early termination with skip = 1.
+static AOM_INLINE void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats,
+                                     int bsize, int64_t dist) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int n4 = bsize_to_num_blk(bsize);
+  const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+  memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4);
+  memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+  mbmi->tx_size = tx_size;
+  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
+  rd_stats->skip = 1;
+  if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+  rd_stats->dist = rd_stats->sse = (dist << 4);
+  // Though decision is to make the block as skip based on luma stats,
+  // it is possible that the block becomes non-skip after the chroma rd. In
+  // intermediate non skip costs calculated by caller function will be
+  // incorrect, if rate is set as zero (i.e., if zero_blk_rate is not
+  // accounted). Hence intermediate rate is populated to code the luma tx blks
+  // as skip, the caller function based on final rd decision (i.e., skip vs
+  // non-skip) sets the final rate accordingly. Here the rate populated
+  // corresponds to coding all the tx blocks with zero_blk_rate (based on max tx
+  // size possible) in the current block. Eg: For 128*128 block, rate would be
+  // 4 * zero_blk_rate where zero_blk_rate corresponds to coding of one 64x64 tx
+  // block as 'all zeros'
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
+  ENTROPY_CONTEXT *ta = ctxa;
+  ENTROPY_CONTEXT *tl = ctxl;
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  TXB_CTX txb_ctx;
+  get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
+  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+  rd_stats->rate = zero_blk_rate *
+                   (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
+                   (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
+}
+
+static AOM_INLINE void save_tx_rd_info(int n4, uint32_t hash,
+                                       const MACROBLOCK *const x,
+                                       const RD_STATS *const rd_stats,
+                                       MB_RD_RECORD *tx_rd_record) {
+  int index;
+  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+    index =
+        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+    ++tx_rd_record->num;
+  } else {
+    index = tx_rd_record->index_start;
+    tx_rd_record->index_start =
+        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+  }
+  MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  tx_rd_info->hash_value = hash;
+  tx_rd_info->tx_size = mbmi->tx_size;
+  memcpy(tx_rd_info->blk_skip, x->blk_skip,
+         sizeof(tx_rd_info->blk_skip[0]) * n4);
+  av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
+  av1_copy_array(tx_rd_info->tx_type_map, xd->tx_type_map, n4);
+  tx_rd_info->rd_stats = *rd_stats;
+}
+
+static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+                                 const SPEED_FEATURES *sf,
+                                 int tx_size_search_method) {
+  if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+  if (sf->tx_sf.tx_size_search_lgr_block) {
+    if (mi_width > mi_size_wide[BLOCK_64X64] ||
+        mi_height > mi_size_high[BLOCK_64X64])
+      return MAX_VARTX_DEPTH;
+  }
+
+  if (is_inter) {
+    return (mi_height != mi_width)
+               ? sf->tx_sf.inter_tx_size_search_init_depth_rect
+               : sf->tx_sf.inter_tx_size_search_init_depth_sqr;
+  } else {
+    return (mi_height != mi_width)
+               ? sf->tx_sf.intra_tx_size_search_init_depth_rect
+               : sf->tx_sf.intra_tx_size_search_init_depth_sqr;
+  }
+}
+
+static AOM_INLINE void select_tx_block(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+    ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+    RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
+    TXB_RD_INFO_NODE *rd_info_node);
+
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
+#if CONFIG_COLLECT_RD_STATS
+
+static AOM_INLINE void get_energy_distribution_fine(
+    const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride,
+    const uint8_t *dst, int dst_stride, int need_4th, double *hordist,
+    double *verdist) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
+    // Special cases: calculate 'esq' values manually, as we don't have 'vf'
+    // functions for the 16 (very small) sub-blocks of this block.
+    const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;
+    const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;
+    assert(bw <= 32);
+    assert(bh <= 32);
+    assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
+    if (cpi->common.seq_params.use_highbitdepth) {
+      const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+      const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+      for (int i = 0; i < bh; ++i)
+        for (int j = 0; j < bw; ++j) {
+          const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+          esq[index] +=
+              (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+              (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+        }
+    } else {
+      for (int i = 0; i < bh; ++i)
+        for (int j = 0; j < bw; ++j) {
+          const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+          esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+                        (src[j + i * src_stride] - dst[j + i * dst_stride]);
+        }
+    }
+  } else {  // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
+    const int f_index =
+        (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
+    assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
+    const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
+    assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
+    assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
+    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+                            &esq[1]);
+    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+                            &esq[2]);
+    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                            dst_stride, &esq[3]);
+    src += bh / 4 * src_stride;
+    dst += bh / 4 * dst_stride;
+
+    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+                            &esq[5]);
+    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+                            &esq[6]);
+    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                            dst_stride, &esq[7]);
+    src += bh / 4 * src_stride;
+    dst += bh / 4 * dst_stride;
+
+    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+                            &esq[9]);
+    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+                            &esq[10]);
+    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                            dst_stride, &esq[11]);
+    src += bh / 4 * src_stride;
+    dst += bh / 4 * dst_stride;
+
+    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+                            &esq[13]);
+    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+                            &esq[14]);
+    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                            dst_stride, &esq[15]);
+  }
+
+  double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
+                 esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
+                 esq[12] + esq[13] + esq[14] + esq[15];
+  if (total > 0) {
+    const double e_recip = 1.0 / total;
+    hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
+    hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
+    hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+    if (need_4th) {
+      hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip;
+    }
+    verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
+    verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
+    verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+    if (need_4th) {
+      verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip;
+    }
+  } else {
+    hordist[0] = verdist[0] = 0.25;
+    hordist[1] = verdist[1] = 0.25;
+    hordist[2] = verdist[2] = 0.25;
+    if (need_4th) {
+      hordist[3] = verdist[3] = 0.25;
+    }
+  }
+}
+
+static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      const int err = diff[j * stride + i];
+      sum += err * err;
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
+static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      sum += abs(diff[j * stride + i]);
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
+static AOM_INLINE void get_2x2_normalized_sses_and_sads(
+    const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src,
+    int src_stride, const uint8_t *const dst, int dst_stride,
+    const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr,
+    double *const sad_norm_arr) {
+  const BLOCK_SIZE tx_bsize_half =
+      get_partition_subsize(tx_bsize, PARTITION_SPLIT);
+  if (tx_bsize_half == BLOCK_INVALID) {  // manually calculate stats
+    const int half_width = block_size_wide[tx_bsize] / 2;
+    const int half_height = block_size_high[tx_bsize] / 2;
+    for (int row = 0; row < 2; ++row) {
+      for (int col = 0; col < 2; ++col) {
+        const int16_t *const this_src_diff =
+            src_diff + row * half_height * diff_stride + col * half_width;
+        if (sse_norm_arr) {
+          sse_norm_arr[row * 2 + col] =
+              get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
+        }
+        if (sad_norm_arr) {
+          sad_norm_arr[row * 2 + col] =
+              get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+        }
+      }
+    }
+  } else {  // use function pointers to calculate stats
+    const int half_width = block_size_wide[tx_bsize_half];
+    const int half_height = block_size_high[tx_bsize_half];
+    const int num_samples_half = half_width * half_height;
+    for (int row = 0; row < 2; ++row) {
+      for (int col = 0; col < 2; ++col) {
+        const uint8_t *const this_src =
+            src + row * half_height * src_stride + col * half_width;
+        const uint8_t *const this_dst =
+            dst + row * half_height * dst_stride + col * half_width;
+
+        if (sse_norm_arr) {
+          unsigned int this_sse;
+          cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+                                        dst_stride, &this_sse);
+          sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+        }
+
+        if (sad_norm_arr) {
+          const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
+              this_src, src_stride, this_dst, dst_stride);
+          sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+        }
+      }
+    }
+  }
+}
+
+#if CONFIG_COLLECT_RD_STATS == 1
+// Returns the arithmetic mean of a w x h block of 16-bit residual samples
+// read from `diff` with row stride `stride`.
+static double get_mean(const int16_t *diff, int stride, int w, int h) {
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      sum += diff[j * stride + i];
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+// Appends one line of per-transform-unit statistics (luma only) to
+// "tu_stats.txt": normalized rate/dist, SSE/SAD for the whole block and its
+// 2x2 sub-blocks, quantizer step, TX geometry, curve-fit model estimates and
+// residual features. NOTE(review): presumably consumed offline for model
+// fitting — confirm.
+static AOM_INLINE void PrintTransformUnitStats(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats,
+    int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+    TX_TYPE tx_type, int64_t rd) {
+  if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+  // Generate small sample to restrict output size.
+  static unsigned int seed = 21743;
+  // Keep roughly 1 in 256 transform units.
+  if (lcg_rand16(&seed) % 256 > 0) return;
+
+  const char output_file[] = "tu_stats.txt";
+  FILE *fout = fopen(output_file, "a");
+  if (!fout) return;
+
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int plane = 0;  // Stats are collected for luma only.
+  struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int txw = tx_size_wide[tx_size];
+  const int txh = tx_size_high[tx_size];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int q_step = p->dequant_QTX[1] >> dequant_shift;
+  const int num_samples = txw * txh;
+
+  // All stats are normalized by the pixel count so different TX sizes are
+  // comparable.
+  const double rate_norm = (double)rd_stats->rate / num_samples;
+  const double dist_norm = (double)rd_stats->dist / num_samples;
+
+  fprintf(fout, "%g %g", rate_norm, dist_norm);
+
+  const int src_stride = p->src.stride;
+  const uint8_t *const src =
+      &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+  const int dst_stride = pd->dst.stride;
+  const uint8_t *const dst =
+      &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+  unsigned int sse;
+  cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+  const double sse_norm = (double)sse / num_samples;
+
+  const unsigned int sad =
+      cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+  const double sad_norm = (double)sad / num_samples;
+
+  fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *const src_diff =
+      &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
+
+  // Per-quadrant SSE/SAD of the transform block.
+  double sse_norm_arr[4], sad_norm_arr[4];
+  get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst,
+                                   dst_stride, src_diff, diff_stride,
+                                   sse_norm_arr, sad_norm_arr);
+  for (int i = 0; i < 4; ++i) {
+    fprintf(fout, " %g", sse_norm_arr[i]);
+  }
+  for (int i = 0; i < 4; ++i) {
+    fprintf(fout, " %g", sad_norm_arr[i]);
+  }
+
+  const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+  const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+
+  fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size],
+          tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col);
+
+  // Rate/dist predicted by the curve-fit model, for comparison with actuals.
+  int model_rate;
+  int64_t model_dist;
+  model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+                                   &model_rate, &model_dist);
+  const double model_rate_norm = (double)model_rate / num_samples;
+  const double model_dist_norm = (double)model_dist / num_samples;
+  fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+
+  const double mean = get_mean(src_diff, diff_stride, txw, txh);
+  float hor_corr, vert_corr;
+  av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr,
+                                  &vert_corr);
+  fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+  double hdist[4] = { 0 }, vdist[4] = { 0 };
+  get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride,
+                               1, hdist, vdist);
+  fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+          hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+  fprintf(fout, " %d %" PRId64, x->rdmult, rd);
+
+  fprintf(fout, "\n");
+  fclose(fout);
+}
+#endif  // CONFIG_COLLECT_RD_STATS == 1
+
+#if CONFIG_COLLECT_RD_STATS >= 2
+// Sum of source-vs-reconstruction SSE over all planes of the current block
+// (chroma planes are skipped when x->skip_chroma_rd is set). The result is
+// scaled by 16 to match the distortion scale used elsewhere in this file
+// (cf. the 16x scaling in dist_block_px_domain).
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  int64_t total_sse = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+                                               pd->subsampling_y);
+    unsigned int sse;
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                       &sse);
+    total_sse += sse;
+  }
+  total_sse <<= 4;
+  return total_sse;
+}
+
+// Estimates residue rate and distortion for a block of size `bsize` from the
+// tile's fitted linear inter-mode RD model: est_ld = a * sse + b, with the
+// rate derived as (sse - dist_mean) / est_ld. Returns 1 when the model is
+// ready and the outputs were written, 0 otherwise.
+static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+                             int64_t sse, int *est_residue_cost,
+                             int64_t *est_dist) {
+  aom_clear_system_state();
+  const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+  if (md->ready) {
+    // Below the model's mean distortion the residue is assumed free.
+    if (sse < md->dist_mean) {
+      *est_residue_cost = 0;
+      *est_dist = sse;
+    } else {
+      *est_dist = (int64_t)round(md->dist_mean);
+      const double est_ld = md->a * sse + md->b;
+      // Clamp estimated rate cost by INT_MAX / 2.
+      // TODO(angiebird@google.com): find better solution than clamping.
+      if (fabs(est_ld) < 1e-2) {
+        // Near-zero slope would blow up the division below.
+        *est_residue_cost = INT_MAX / 2;
+      } else {
+        double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld);
+        if (est_residue_cost_dbl < 0) {
+          *est_residue_cost = 0;
+        } else {
+          *est_residue_cost =
+              (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2);
+        }
+      }
+      if (*est_residue_cost <= 0) {
+        *est_residue_cost = 0;
+        *est_dist = sse;
+      }
+    }
+    return 1;
+  }
+  return 0;
+}
+
+// Mean of (src - dst) over a w x h region for high-bit-depth buffers
+// (arguments are CONVERT_TO_SHORTPTR'd to 16-bit samples).
+static double get_highbd_diff_mean(const uint8_t *src8, int src_stride,
+                                   const uint8_t *dst8, int dst_stride, int w,
+                                   int h) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+      sum += diff;
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
+// Mean of (src - dst) over a w x h region for 8-bit buffers.
+static double get_diff_mean(const uint8_t *src, int src_stride,
+                            const uint8_t *dst, int dst_stride, int w, int h) {
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+      sum += diff;
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
+// Appends one line of per-prediction-unit statistics (luma only) to
+// "pu_stats.txt": normalized rate/dist/RD cost, SSE/SAD, curve-fit model
+// estimates, residual features, and — when inter_mode_rd_model_estimation
+// is enabled — the estimated rate/dist from the tile's inter-mode RD model.
+static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi,
+                                                const TileDataEnc *tile_data,
+                                                MACROBLOCK *x,
+                                                const RD_STATS *const rd_stats,
+                                                BLOCK_SIZE plane_bsize) {
+  if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+  // Model-estimation mode 1 needs a ready model for this block size.
+  if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+      (tile_data == NULL ||
+       !tile_data->inter_mode_rd_models[plane_bsize].ready))
+    return;
+  (void)tile_data;
+  // Generate small sample to restrict output size.
+  static unsigned int seed = 95014;
+
+  // Larger blocks are sampled more often (fewer of them per frame).
+  if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) !=
+      1)
+    return;
+
+  const char output_file[] = "pu_stats.txt";
+  FILE *fout = fopen(output_file, "a");
+  if (!fout) return;
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int plane = 0;  // Stats are collected for luma only.
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  const int diff_stride = block_size_wide[plane_bsize];
+  int bw, bh;
+  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+                     &bh);
+  const int num_samples = bw * bh;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int q_step = p->dequant_QTX[1] >> dequant_shift;
+  const int shift = (xd->bd - 8);
+
+  // Per-pixel normalization so different block sizes are comparable.
+  const double rate_norm = (double)rd_stats->rate / num_samples;
+  const double dist_norm = (double)rd_stats->dist / num_samples;
+  const double rdcost_norm =
+      (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
+
+  fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
+
+  const int src_stride = p->src.stride;
+  const uint8_t *const src = p->src.buf;
+  const int dst_stride = pd->dst.stride;
+  const uint8_t *const dst = pd->dst.buf;
+  const int16_t *const src_diff = p->src_diff;
+
+  int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+  const double sse_norm = (double)sse / num_samples;
+
+  const unsigned int sad =
+      cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+  const double sad_norm =
+      (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
+
+  fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+  // Per-quadrant SSE/SAD of the prediction block.
+  double sse_norm_arr[4], sad_norm_arr[4];
+  get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+                                   dst_stride, src_diff, diff_stride,
+                                   sse_norm_arr, sad_norm_arr);
+  // Scale high-bitdepth SSE/SAD down to an 8-bit range.
+  if (shift) {
+    for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+    for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
+  }
+  for (int i = 0; i < 4; ++i) {
+    fprintf(fout, " %g", sse_norm_arr[i]);
+  }
+  for (int i = 0; i < 4; ++i) {
+    fprintf(fout, " %g", sad_norm_arr[i]);
+  }
+
+  fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
+
+  // Rate/dist predicted by the curve-fit model, for comparison with actuals.
+  int model_rate;
+  int64_t model_dist;
+  model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+                                   &model_rate, &model_dist);
+  const double model_rdcost_norm =
+      (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
+  const double model_rate_norm = (double)model_rate / num_samples;
+  const double model_dist_norm = (double)model_dist / num_samples;
+  fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+          model_rdcost_norm);
+
+  double mean;
+  if (is_cur_buf_hbd(xd)) {
+    mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
+                                pd->dst.stride, bw, bh);
+  } else {
+    mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                         bw, bh);
+  }
+  mean /= (1 << shift);
+  float hor_corr, vert_corr;
+  av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
+                                  &vert_corr);
+  fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+  double hdist[4] = { 0 }, vdist[4] = { 0 };
+  get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
+                               dst_stride, 1, hdist, vdist);
+  fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+          hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+  if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+    assert(tile_data->inter_mode_rd_models[plane_bsize].ready);
+    const int64_t overall_sse = get_sse(cpi, x);
+    int est_residue_cost = 0;
+    int64_t est_dist = 0;
+    get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost,
+                      &est_dist);
+    const double est_residue_cost_norm = (double)est_residue_cost / num_samples;
+    const double est_dist_norm = (double)est_dist / num_samples;
+    const double est_rdcost_norm =
+        (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples;
+    fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm,
+            est_rdcost_norm);
+  }
+
+  fprintf(fout, "\n");
+  fclose(fout);
+}
+#endif  // CONFIG_COLLECT_RD_STATS >= 2
+#endif  // CONFIG_COLLECT_RD_STATS
+
+// Applies the inverse transform of this block's dequantized coefficients
+// directly onto the destination buffer. No-op when eob == 0 (no non-zero
+// coefficients to add).
+static AOM_INLINE void inverse_transform_block_facade(MACROBLOCKD *xd,
+                                                      int plane, int block,
+                                                      int blk_row, int blk_col,
+                                                      int eob,
+                                                      int reduced_tx_set) {
+  if (!eob) return;
+
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+  const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
+                                          tx_size, reduced_tx_set);
+  const int dst_stride = pd->dst.stride;
+  // Scale the mi-unit block position to pixels.
+  uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+                              dst_stride, eob, reduced_tx_set);
+}
+
+// For intra blocks, reconstructs the current transform block into the
+// destination buffer whenever further TX blocks follow to the right or below
+// within the same partition (their intra prediction reads these pixels).
+// When do_quant is set, forward transform + quantization (and optional
+// trellis optimization) are re-run first to refresh the dqcoeff buffer.
+static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                               int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               const TXB_CTX *const txb_ctx, int skip_trellis,
+                               TX_TYPE best_tx_type, int do_quant,
+                               int *rate_cost, uint16_t best_eob) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  // Only needed when this is not the bottom-right TX block of the partition.
+  if (!is_inter && best_eob &&
+      (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] ||
+       blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) {
+    // if the quantized coefficients are stored in the dqcoeff buffer, we don't
+    // need to do transform and quantization again.
+    if (do_quant) {
+      TxfmParam txfm_param_intra;
+      QUANT_PARAM quant_param_intra;
+      av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra);
+      av1_setup_quant(tx_size, !skip_trellis,
+                      skip_trellis
+                          ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+                                                    : AV1_XFORM_QUANT_FP)
+                          : AV1_XFORM_QUANT_FP,
+                      cpi->oxcf.quant_b_adapt, &quant_param_intra);
+      av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type,
+                        &quant_param_intra);
+      av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                      &txfm_param_intra, &quant_param_intra);
+      if (quant_param_intra.use_optimize_b) {
+        av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
+                       cpi->sf.rd_sf.trellis_eob_fast, rate_cost);
+      }
+    }
+
+    inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+                                   x->plane[plane].eobs[block],
+                                   cm->features.reduced_tx_set_used);
+
+    // This may happen because of hash collision. The eob stored in the hash
+    // table is non-zero, but the real eob is zero. We need to make sure tx_type
+    // is DCT_DCT in this case.
+    if (plane == 0 && x->plane[plane].eobs[block] == 0 &&
+        best_tx_type != DCT_DCT) {
+      update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+    }
+  }
+}
+
+// SSE between src and dst restricted to the visible part of a TX block.
+// Uses the fast per-block-size variance function when the whole block is
+// visible; otherwise falls back to the odd-size SSE helpers for the partial
+// (visible_cols x visible_rows) region.
+static unsigned pixel_dist_visible_only(
+    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+    const int src_stride, const uint8_t *dst, const int dst_stride,
+    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+    int visible_cols) {
+  unsigned sse;
+
+  if (txb_rows == visible_rows && txb_cols == visible_cols) {
+    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+    return sse;
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  const MACROBLOCKD *xd = &x->e_mbd;
+  if (is_cur_buf_hbd(xd)) {
+    // Round the high-bitdepth SSE back to the 8-bit scale.
+    uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+                                             visible_cols, visible_rows);
+    return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
+  }
+#else
+  (void)x;
+#endif
+  sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
+                         visible_rows);
+  return sse;
+}
+
+// Compute the pixel domain distortion from src and dst on all visible 4x4s
+// in the transform block.
+static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
+                           int plane, const uint8_t *src, const int src_stride,
+                           const uint8_t *dst, const int dst_stride,
+                           int blk_row, int blk_col,
+                           const BLOCK_SIZE plane_bsize,
+                           const BLOCK_SIZE tx_bsize) {
+  int txb_rows, txb_cols, visible_rows, visible_cols;
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  // Clip the TX block dimensions to the part that is actually visible.
+  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+                     &txb_cols, &txb_rows, &visible_cols, &visible_rows);
+  assert(visible_rows > 0);
+  assert(visible_cols > 0);
+
+  unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
+                                         dst_stride, tx_bsize, txb_rows,
+                                         txb_cols, visible_rows, visible_cols);
+
+  return sse;
+}
+
+// Pixel-domain distortion for one TX block: copies the current prediction
+// into a scratch buffer, adds the inverse transform of the dequantized
+// coefficients on top of it, then returns 16 * SSE(source, reconstruction)
+// over the visible region.
+static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
+                                           int plane, BLOCK_SIZE plane_bsize,
+                                           int block, int blk_row, int blk_col,
+                                           TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const uint16_t eob = p->eobs[block];
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  const int bsw = block_size_wide[tx_bsize];
+  const int bsh = block_size_high[tx_bsize];
+  const int src_stride = x->plane[plane].src.stride;
+  const int dst_stride = xd->plane[plane].dst.stride;
+  // Scale the transform block index to pixel unit.
+  const int src_idx = (blk_row * src_stride + blk_col) << MI_SIZE_LOG2;
+  const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2;
+  const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+  const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+  const tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+
+  assert(cpi != NULL);
+  assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+  // Scratch reconstruction buffer; 16-bit backing so it also serves the
+  // high-bitdepth path.
+  uint8_t *recon;
+  DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    recon = CONVERT_TO_BYTEPTR(recon16);
+    av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
+                                   CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
+                                   bsh, NULL, NULL, 0, 0, NULL, xd->bd);
+  } else {
+    recon = (uint8_t *)recon16;
+    av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
+                            NULL, 0, 0, NULL);
+  }
+#else
+  recon = (uint8_t *)recon16;
+  av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
+                          NULL, 0, 0, NULL);
+#endif
+
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                                    cpi->common.features.reduced_tx_set_used);
+  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
+                              MAX_TX_SIZE, eob,
+                              cpi->common.features.reduced_tx_set_used);
+
+  // 16x scaling keeps this consistent with the transform-domain distortion.
+  return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
+                         blk_row, blk_col, plane_bsize, tx_bsize);
+}
+
+// CRC32C hash of this TX block's residual samples, with tx_size folded into
+// the low 5 bits of the result. When the block is narrower than the residual
+// stride, rows are first packed contiguously so the hash covers only the
+// block's own samples.
+static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
+                                   int blk_col, BLOCK_SIZE plane_bsize,
+                                   TX_SIZE tx_size) {
+  int16_t tmp_data[64 * 64];
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *diff = x->plane[plane].src_diff;
+  const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col;
+  const int txb_w = tx_size_wide[tx_size];
+  const int txb_h = tx_size_high[tx_size];
+  uint8_t *hash_data = (uint8_t *)cur_diff_row;
+  if (txb_w != diff_stride) {
+    int16_t *cur_hash_row = tmp_data;
+    for (int i = 0; i < txb_h; i++) {
+      memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w);
+      cur_hash_row += txb_w;
+      cur_diff_row += diff_stride;
+    }
+    hash_data = (uint8_t *)tmp_data;
+  }
+  CRC32C *crc = &x->mb_rd_record.crc_calculator;
+  // 2 bytes per int16 residual sample.
+  const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
+  return (hash << 5) + tx_size;
+}
+
+// pruning thresholds for prune_txk_type and prune_txk_type_separ
+// NOTE(review): index presumably selects a pruning-aggressiveness level set
+// by a speed feature — confirm at call sites.
+static const int prune_factors[5] = { 200, 200, 120, 80, 40 };  // scale 1000
+static const int mul_factors[5] = { 80, 80, 70, 50, 30 };       // scale 100
+
+// Looks up the intra TXB RD cache by residual hash. Always writes
+// *intra_txb_rd_info (the cache slot) and *cur_joint_ctx (packed entropy
+// context). Returns 1 only when the slot is valid, its entropy context
+// matches, and the cached tx_type — written back into tx_type_map — is still
+// reachable under the current reduced-tx-set state.
+static INLINE int is_intra_hash_match(const AV1_COMP *cpi, MACROBLOCK *x,
+                                      int plane, int blk_row, int blk_col,
+                                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                      const TXB_CTX *const txb_ctx,
+                                      TXB_RD_INFO **intra_txb_rd_info,
+                                      const int tx_type_map_idx,
+                                      uint16_t *cur_joint_ctx) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  assert(cpi->sf.tx_sf.use_intra_txb_hash &&
+         frame_is_intra_only(&cpi->common) && !is_inter_block(xd->mi[0]) &&
+         plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]);
+  const uint32_t intra_hash =
+      get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
+  const int intra_hash_idx =
+      find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
+  *intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
+  // Pack both entropy contexts into one 16-bit key for the comparison below.
+  *cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+  if ((*intra_txb_rd_info)->entropy_context == *cur_joint_ctx &&
+      x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
+    xd->tx_type_map[tx_type_map_idx] = (*intra_txb_rd_info)->tx_type;
+    const TX_TYPE ref_tx_type =
+        av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
+                        cpi->common.features.reduced_tx_set_used);
+    return (ref_tx_type == (*intra_txb_rd_info)->tx_type);
+  }
+  return 0;
+}
+
+// R-D costs are sorted in ascending order.
+// In-place insertion sort of rds[0..len) with txk[] permuted in lockstep so
+// txk[i] stays associated with rds[i].
+static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
+  int i, j, k;
+
+  for (i = 1; i <= len - 1; ++i) {
+    for (j = 0; j < i; ++j) {
+      if (rds[j] > rds[i]) {
+        int64_t temprd;
+        int tempi;
+
+        temprd = rds[i];
+        tempi = txk[i];
+
+        // Shift elements j..i-1 up by one to make room at position j.
+        for (k = i; k > j; k--) {
+          rds[k] = rds[k - 1];
+          txk[k] = txk[k - 1];
+        }
+
+        rds[j] = temprd;
+        txk[j] = tempi;
+        break;
+      }
+    }
+  }
+}
+
+// Transform-domain distortion (quantization error between coeff and dqcoeff)
+// and SSE for one TX block, both shifted down to match the pixel-domain
+// distortion scale.
+static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
+                                        TX_SIZE tx_size, int64_t *out_dist,
+                                        int64_t *out_sse) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Transform domain distortion computation is more efficient as it does
+  // not involve an inverse transform, but it is less accurate.
+  const int buffer_length = av1_get_max_eob(tx_size);
+  int64_t this_sse;
+  // TX-domain results need to shift down to Q2/D10 to match pixel
+  // domain distortion values which are in Q2^2
+  int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd))
+    *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
+                                       xd->bd);
+  else
+    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+#else
+  *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+#endif
+  *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
+  *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
+}
+
+// Prunes 2D TX types via separable RD estimates: the four 1-D horizontal
+// types are first evaluated with a DCT vertical pass, then the four vertical
+// types with the best horizontal type. Combined per-axis costs rank all 16
+// candidates into txk_map (best first). Returns a 16-bit mask in which a
+// set bit means that TX type is pruned; 0xFFFF prunes everything (best
+// candidate already exceeds ref_best_rd).
+uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                              int block, TX_SIZE tx_size, int blk_row,
+                              int blk_col, BLOCK_SIZE plane_bsize, int *txk_map,
+                              int16_t allowed_tx_mask, int prune_factor,
+                              const TXB_CTX *const txb_ctx,
+                              int reduced_tx_set_used, int64_t ref_best_rd,
+                              int num_sel) {
+  const AV1_COMMON *cm = &cpi->common;
+
+  int idx;
+
+  int64_t rds_v[4];
+  int64_t rds_h[4];
+  int idx_v[4] = { 0, 1, 2, 3 };
+  int idx_h[4] = { 0, 1, 2, 3 };
+  int skip_v[4] = { 0 };
+  int skip_h[4] = { 0 };
+  // 4x4 grid: rows share the vertical 1-D type, columns the horizontal one.
+  const int idx_map[16] = {
+    DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
+    ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
+    FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+    H_DCT,        H_ADST,        H_FLIPADST,        IDTX
+  };
+
+  // Interleaved visit order over the (vertical, horizontal) rank pairs.
+  const int sel_pattern_v[16] = {
+    0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3
+  };
+  const int sel_pattern_h[16] = {
+    0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3
+  };
+
+  QUANT_PARAM quant_param;
+  TxfmParam txfm_param;
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+                  &quant_param);
+  int tx_type;
+  // to ensure we can try ones even outside of ext_tx_set of current block
+  // this function should only be called for size < 16
+  assert(txsize_sqr_up_map[tx_size] <= TX_16X16);
+  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+  int rate_cost = 0;
+  int64_t dist = 0, sse = 0;
+  // evaluate horizontal with vertical DCT
+  for (idx = 0; idx < 4; ++idx) {
+    tx_type = idx_map[idx];
+    txfm_param.tx_type = tx_type;
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+
+    rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+                                              txb_ctx, reduced_tx_set_used, 0);
+
+    rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist);
+
+    // Skip types whose cost exceeds ref_best_rd even with a 25% discount.
+    if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) {
+      skip_h[idx] = 1;
+    }
+  }
+  sort_rd(rds_h, idx_h, 4);
+  // Skip horizontal types more than 20% worse than the best one.
+  for (idx = 1; idx < 4; idx++) {
+    if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1;
+  }
+
+  if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF;
+
+  // evaluate vertical with the best horizontal chosen
+  rds_v[0] = rds_h[0];
+  int start_v = 1, end_v = 4;
+  const int *idx_map_v = idx_map + idx_h[0];
+
+  for (idx = start_v; idx < end_v; ++idx) {
+    tx_type = idx_map_v[idx_v[idx] * 4];
+    txfm_param.tx_type = tx_type;
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+
+    rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+                                              txb_ctx, reduced_tx_set_used, 0);
+
+    rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist);
+
+    if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) {
+      skip_v[idx] = 1;
+    }
+  }
+  sort_rd(rds_v, idx_v, 4);
+  for (idx = 1; idx < 4; idx++) {
+    if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1;
+  }
+
+  // combine rd_h and rd_v to prune tx candidates
+  int i_v, i_h;
+  int64_t rds[16];
+  int num_cand = 0, last = TX_TYPES - 1;
+
+  // Disallowed/skipped types go to the back of txk_map; survivors are ranked.
+  for (int i = 0; i < 16; i++) {
+    i_v = sel_pattern_v[i];
+    i_h = sel_pattern_h[i];
+    tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]];
+    if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] ||
+        skip_v[idx_v[i_v]]) {
+      txk_map[last] = tx_type;
+      last--;
+    } else {
+      txk_map[num_cand] = tx_type;
+      rds[num_cand] = rds_v[i_v] + rds_h[i_h];
+      if (rds[num_cand] == 0) rds[num_cand] = 1;
+      num_cand++;
+    }
+  }
+  sort_rd(rds, txk_map, num_cand);
+
+  // Start with only the best candidate kept; un-prune close runners-up.
+  uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+  num_sel = AOMMIN(num_sel, num_cand);
+
+  for (int i = 1; i < num_sel; i++) {
+    int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]);
+    if (factor < (int64_t)prune_factor)
+      prune &= ~(1 << txk_map[i]);
+    else
+      break;
+  }
+  return prune;
+}
+
+// Evaluates every allowed TX type with fast quantization (AV1_XFORM_QUANT_B)
+// and a Laplacian rate estimate, ranks candidates into txk_map (best first),
+// and returns a 16-bit mask of pruned TX types (set bit = pruned). Types
+// within prune_factor per-mille of the best RD cost survive.
+uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                        int block, TX_SIZE tx_size, int blk_row, int blk_col,
+                        BLOCK_SIZE plane_bsize, int *txk_map,
+                        uint16_t allowed_tx_mask, int prune_factor,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+  const AV1_COMMON *cm = &cpi->common;
+  int tx_type;
+
+  int64_t rds[TX_TYPES];
+
+  int num_cand = 0;
+  int last = TX_TYPES - 1;
+
+  TxfmParam txfm_param;
+  QUANT_PARAM quant_param;
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+                  &quant_param);
+
+  for (int idx = 0; idx < TX_TYPES; idx++) {
+    tx_type = idx;
+    int rate_cost = 0;
+    int64_t dist = 0, sse = 0;
+    // Disallowed types are placed at the back of txk_map, unranked.
+    if (!(allowed_tx_mask & (1 << tx_type))) {
+      txk_map[last] = tx_type;
+      last--;
+      continue;
+    }
+    txfm_param.tx_type = tx_type;
+
+    // do txfm and quantization
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+    // estimate rate cost
+    rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+                                              txb_ctx, reduced_tx_set_used, 0);
+    // tx domain dist
+    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+
+    txk_map[num_cand] = tx_type;
+    rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist);
+    if (rds[num_cand] == 0) rds[num_cand] = 1;
+    num_cand++;
+  }
+
+  // No allowed type at all: prune everything.
+  if (num_cand == 0) return (uint16_t)0xFFFF;
+
+  sort_rd(rds, txk_map, num_cand);
+  // Start with only the best candidate kept; un-prune close runners-up.
+  uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+
+  // 0 < prune_factor <= 1000 controls aggressiveness
+  int64_t factor = 0;
+  for (int idx = 1; idx < num_cand; idx++) {
+    factor = 1000 * (rds[idx] - rds[0]) / rds[0];
+    if (factor < (int64_t)prune_factor)
+      prune &= ~(1 << txk_map[idx]);
+    else
+      break;
+  }
+  return prune;
+}
+
+// These thresholds were calibrated to provide a certain number of TX types
+// pruned by the model on average, i.e. selecting a threshold with index i
+// will lead to pruning i+1 TX types on average
+// NOTE(review): NULL entries mean no thresholds exist for that TX size;
+// presumably model-based pruning is skipped there — confirm at use site.
+static const float *prune_2D_adaptive_thresholds[] = {
+  // TX_4X4
+  (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+             0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+             0.09778f, 0.11780f },
+  // TX_8X8
+  (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+             0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+             0.10803f, 0.14124f },
+  // TX_16X16
+  (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+             0.06897f, 0.07629f, 0.08875f, 0.11169f },
+  // TX_32X32
+  NULL,
+  // TX_64X64
+  NULL,
+  // TX_4X8
+  (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+             0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+             0.10168f, 0.12585f },
+  // TX_8X4
+  (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+             0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+             0.10583f, 0.13123f },
+  // TX_8X16
+  (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+             0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+             0.10730f, 0.14221f },
+  // TX_16X8
+  (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+             0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+             0.10339f, 0.13464f },
+  // TX_16X32
+  NULL,
+  // TX_32X16
+  NULL,
+  // TX_32X64
+  NULL,
+  // TX_64X32
+  NULL,
+  // TX_4X16
+  (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+             0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+             0.10242f, 0.12878f },
+  // TX_16X4
+  (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+             0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+             0.10217f, 0.12610f },
+  // TX_8X32
+  NULL,
+  // TX_32X8
+  NULL,
+  // TX_16X64
+  NULL,
+  // TX_64X16
+  NULL,
+};
+
+// Probabilities are sorted in descending order.
+// In-place insertion sort of prob[0..len-1] that applies the identical
+// permutation to the parallel txk[] array, so score/tx-type pairs stay
+// aligned after sorting.
+static INLINE void sort_probability(float prob[], int txk[], int len) {
+  int i, j, k;
+
+  for (i = 1; i <= len - 1; ++i) {
+    for (j = 0; j < i; ++j) {
+      if (prob[j] < prob[i]) {
+        float temp;
+        int tempi;
+
+        // Save the element being inserted.
+        temp = prob[i];
+        tempi = txk[i];
+
+        // Shift elements [j, i) one slot right to make room at j.
+        for (k = i; k > j; k--) {
+          prob[k] = prob[k - 1];
+          txk[k] = txk[k - 1];
+        }
+
+        prob[j] = temp;
+        txk[j] = tempi;
+        break;
+      }
+    }
+  }
+}
+
+// Returns the 2D-pruning score threshold for the given transform size and
+// allowed transform-set type. The pruning aggressiveness (an index into the
+// calibrated threshold table above) grows with prune_mode; the full 16-type
+// set (EXT_TX_SET_ALL16) uses a higher aggressiveness than the reduced
+// EXT_TX_SET_DTT9_IDTX_1DDCT set.
+// NOTE(review): prune_2D_adaptive_thresholds[tx_size] is NULL for sizes
+// without a model, so this indexes through a null pointer unless the caller
+// guarantees a model exists for tx_size (prune_tx_2D checks the nn-config
+// maps first) — confirm the NULL rows and the nn-config maps always agree.
+static INLINE float get_adaptive_thresholds(TX_SIZE tx_size,
+                                            TxSetType tx_set_type,
+                                            TX_TYPE_PRUNE_MODE prune_mode) {
+  const int prune_aggr_table[4][2] = { { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 } };
+  int pruning_aggressiveness = 0;
+  if (tx_set_type == EXT_TX_SET_ALL16)
+    pruning_aggressiveness =
+        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
+  else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
+    pruning_aggressiveness =
+        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
+
+  return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness];
+}
+
+// Computes normalized 1D energy projections of a bw x bh residual block:
+// hordist[] receives esq_w - 1 horizontal values and verdist[] receives
+// esq_h - 1 vertical values (the last column/row of the downscaled grid is
+// summed into the projections but not emitted as its own output entry).
+// `diff` is the residual with row stride `stride`.
+static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff,
+                                                     int stride, int bw, int bh,
+                                                     float *hordist,
+                                                     float *verdist) {
+  // First compute downscaled block energy values (esq); downscale factors
+  // are defined by w_shift and h_shift.
+  unsigned int esq[256];
+  const int w_shift = bw <= 8 ? 0 : 1;
+  const int h_shift = bh <= 8 ? 0 : 1;
+  const int esq_w = bw >> w_shift;
+  const int esq_h = bh >> h_shift;
+  const int esq_sz = esq_w * esq_h;
+  int i, j;
+  memset(esq, 0, esq_sz * sizeof(esq[0]));
+  if (w_shift) {
+    // Wide block: accumulate energy of horizontal pixel pairs per esq cell.
+    for (i = 0; i < bh; i++) {
+      unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+      const int16_t *cur_diff_row = diff + i * stride;
+      for (j = 0; j < bw; j += 2) {
+        cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] +
+                                cur_diff_row[j + 1] * cur_diff_row[j + 1]);
+      }
+    }
+  } else {
+    for (i = 0; i < bh; i++) {
+      unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+      const int16_t *cur_diff_row = diff + i * stride;
+      for (j = 0; j < bw; j++) {
+        cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j];
+      }
+    }
+  }
+
+  uint64_t total = 0;
+  for (i = 0; i < esq_sz; i++) total += esq[i];
+
+  // Output hordist and verdist arrays are normalized 1D projections of esq
+  if (total == 0) {
+    // All-zero residual: fall back to a uniform distribution.
+    float hor_val = 1.0f / esq_w;
+    for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val;
+    float ver_val = 1.0f / esq_h;
+    for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val;
+    return;
+  }
+
+  const float e_recip = 1.0f / (float)total;
+  memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0]));
+  memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0]));
+  const unsigned int *cur_esq_row;
+  // Accumulate: each output row sum includes its last column; the last grid
+  // row feeds only the column (hordist) sums.
+  for (i = 0; i < esq_h - 1; i++) {
+    cur_esq_row = esq + i * esq_w;
+    for (j = 0; j < esq_w - 1; j++) {
+      hordist[j] += (float)cur_esq_row[j];
+      verdist[i] += (float)cur_esq_row[j];
+    }
+    verdist[i] += (float)cur_esq_row[j];
+  }
+  cur_esq_row = esq + i * esq_w;
+  for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j];
+
+  // Normalize by the total energy.
+  for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip;
+  for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
+}
+
+// Neural-net based pruning of 2D transform types. Predicts 4 horizontal and
+// 4 vertical 1D transform scores from residual energy/correlation features,
+// combines them into 16 2D scores, softmaxes them, and clears bits in
+// *allowed_tx_mask for types scoring below a calibrated threshold. The type
+// with the highest score is always kept. txk_map receives the tx types in
+// descending score order. Only EXT_TX_SET_ALL16 and
+// EXT_TX_SET_DTT9_IDTX_1DDCT set types are handled; otherwise no-op.
+static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                        int blk_row, int blk_col, TxSetType tx_set_type,
+                        TX_TYPE_PRUNE_MODE prune_mode, int *txk_map,
+                        uint16_t *allowed_tx_mask) {
+  // Maps score index (row = vertical 1D score, column = horizontal 1D
+  // score) to the corresponding 2D TX_TYPE.
+  int tx_type_table_2D[16] = {
+    DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
+    ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
+    FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+    H_DCT,        H_ADST,        H_FLIPADST,        IDTX
+  };
+  if (tx_set_type != EXT_TX_SET_ALL16 &&
+      tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT)
+    return;
+#if CONFIG_NN_V2
+  NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+  NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+#else
+  const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+  const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+#endif
+  if (!nn_config_hor || !nn_config_ver) return;  // Model not established yet.
+
+  aom_clear_system_state();
+  float hfeatures[16], vfeatures[16];
+  float hscores[4], vscores[4];
+  float scores_2D_raw[16];
+  float scores_2D[16];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  const int hfeatures_num = bw <= 8 ? bw : bw / 2;
+  const int vfeatures_num = bh <= 8 ? bh : bh / 2;
+  assert(hfeatures_num <= 16);
+  assert(vfeatures_num <= 16);
+
+  // Build input features: energy projections plus, in the last slot of each
+  // feature vector, the horizontal/vertical residual correlation.
+  const struct macroblock_plane *const p = &x->plane[0];
+  const int diff_stride = block_size_wide[bsize];
+  const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+  get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
+                                vfeatures);
+  av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
+                                  &hfeatures[hfeatures_num - 1],
+                                  &vfeatures[vfeatures_num - 1]);
+  aom_clear_system_state();
+#if CONFIG_NN_V2
+  av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores);
+  av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores);
+#else
+  av1_nn_predict(hfeatures, nn_config_hor, 1, hscores);
+  av1_nn_predict(vfeatures, nn_config_ver, 1, vscores);
+#endif
+  aom_clear_system_state();
+
+  // Outer product of vertical and horizontal 1D scores gives the 16 raw 2D
+  // scores.
+  for (int i = 0; i < 4; i++) {
+    float *cur_scores_2D = scores_2D_raw + i * 4;
+    cur_scores_2D[0] = vscores[i] * hscores[0];
+    cur_scores_2D[1] = vscores[i] * hscores[1];
+    cur_scores_2D[2] = vscores[i] * hscores[2];
+    cur_scores_2D[3] = vscores[i] * hscores[3];
+  }
+
+  av1_nn_softmax(scores_2D_raw, scores_2D, 16);
+
+  const float score_thresh =
+      get_adaptive_thresholds(tx_size, tx_set_type, prune_mode);
+
+  // Always keep the TX type with the highest score, prune all others with
+  // score below score_thresh.
+  int max_score_i = 0;
+  float max_score = 0.0f;
+  uint16_t allow_bitmask = 0;
+  float sum_score = 0.0;
+  // Calculate sum of allowed tx type score and Populate allow bit mask based
+  // on score_thresh and allowed_tx_mask
+  for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
+    int allow_tx_type = *allowed_tx_mask & (1 << tx_type_table_2D[tx_idx]);
+    if (scores_2D[tx_idx] > max_score && allow_tx_type) {
+      max_score = scores_2D[tx_idx];
+      max_score_i = tx_idx;
+    }
+    if (scores_2D[tx_idx] >= score_thresh && allow_tx_type) {
+      // Set allow mask based on score_thresh
+      allow_bitmask |= (1 << tx_type_table_2D[tx_idx]);
+
+      // Accumulate score of allowed tx type
+      sum_score += scores_2D[tx_idx];
+    }
+  }
+  // NOTE(review): the bit tested here is max_score_i (a score-table index),
+  // while bits in allow_bitmask are set at position
+  // tx_type_table_2D[tx_idx]; those positions differ for most entries, so
+  // this looks like it should test (allow_bitmask >>
+  // tx_type_table_2D[max_score_i]). Verify against upstream libaom.
+  if (!((allow_bitmask >> max_score_i) & 0x01)) {
+    // Set allow mask based on tx type with max score
+    allow_bitmask |= (1 << tx_type_table_2D[max_score_i]);
+    sum_score += scores_2D[max_score_i];
+  }
+  // Sort tx type probability of all types
+  sort_probability(scores_2D, tx_type_table_2D, TX_TYPES);
+
+  // Enable more pruning based on tx type probability and number of allowed tx
+  // types
+  if (prune_mode == PRUNE_2D_AGGRESSIVE) {
+    float temp_score = 0.0;
+    float score_ratio = 0.0;
+    int tx_idx, tx_count = 0;
+    const float inv_sum_score = 100 / sum_score;
+    // Get allowed tx types based on sorted probability score and tx count
+    for (tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
+      // Skip the tx type which has more than 30% of cumulative
+      // probability and allowed tx type count is more than 2
+      if (score_ratio > 30.0 && tx_count >= 2) break;
+
+      // Calculate cumulative probability of allowed tx types
+      if (allow_bitmask & (1 << tx_type_table_2D[tx_idx])) {
+        // Calculate cumulative probability
+        temp_score += scores_2D[tx_idx];
+
+        // Calculate percentage of cumulative probability of allowed tx type
+        score_ratio = temp_score * inv_sum_score;
+        tx_count++;
+      }
+    }
+    // Set remaining tx types as pruned
+    for (; tx_idx < TX_TYPES; tx_idx++)
+      allow_bitmask &= ~(1 << tx_type_table_2D[tx_idx]);
+  }
+  memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
+  *allowed_tx_mask = allow_bitmask;
+}
+
+// Standard deviation from a precomputed mean and raw sum of squares over
+// `num` samples: sqrt(E[x^2] - mean^2). The difference is clamped at 0 to
+// guard against small negative values from floating-point rounding.
+static float get_dev(float mean, double x2_sum, int num) {
+  const float e_x2 = (float)(x2_sum / num);
+  const float diff = e_x2 - mean * mean;
+  const float dev = (diff > 0) ? sqrtf(diff) : 0;
+  return dev;
+}
+
+// Feature used by the model to predict tx split: the mean and standard
+// deviation values of the block and sub-blocks.
+// Output layout: feature[0] = whole-block mean, feature[1] = whole-block
+// deviation, then a (mean, dev) pair per sub-block in raster order. When
+// there is more than one sub-block, two extra features are appended: the
+// deviation of the sub-block means and the mean of the sub-block
+// deviations. Sub-blocks are formed by halving the longer dimension.
+static AOM_INLINE void get_mean_dev_features(const int16_t *data, int stride,
+                                             int bw, int bh, float *feature) {
+  const int16_t *const data_ptr = &data[0];
+  const int subh = (bh >= bw) ? (bh >> 1) : bh;
+  const int subw = (bw >= bh) ? (bw >> 1) : bw;
+  const int num = bw * bh;
+  const int sub_num = subw * subh;
+  // Slots 0 and 1 are reserved for the whole-block stats filled in below.
+  int feature_idx = 2;
+  int total_x_sum = 0;
+  int64_t total_x2_sum = 0;
+  int blk_idx = 0;
+  double mean2_sum = 0.0f;
+  float dev_sum = 0.0f;
+
+  for (int row = 0; row < bh; row += subh) {
+    for (int col = 0; col < bw; col += subw) {
+      int x_sum;
+      int64_t x2_sum;
+      // TODO(any): Write a SIMD version. Clear registers.
+      aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+                          &x_sum, &x2_sum);
+      total_x_sum += x_sum;
+      total_x2_sum += x2_sum;
+
+      aom_clear_system_state();
+      const float mean = (float)x_sum / sub_num;
+      const float dev = get_dev(mean, (double)x2_sum, sub_num);
+      feature[feature_idx++] = mean;
+      feature[feature_idx++] = dev;
+      mean2_sum += (double)(mean * mean);
+      dev_sum += dev;
+      blk_idx++;
+    }
+  }
+
+  const float lvl0_mean = (float)total_x_sum / num;
+  feature[0] = lvl0_mean;
+  feature[1] = get_dev(lvl0_mean, (double)total_x2_sum, num);
+
+  if (blk_idx > 1) {
+    // Deviation of means.
+    feature[feature_idx++] = get_dev(lvl0_mean, mean2_sum, blk_idx);
+    // Mean of deviations.
+    feature[feature_idx++] = dev_sum / blk_idx;
+  }
+}
+
+// Scores whether the luma transform block at (blk_row, blk_col) should be
+// split, using a tx_size-specific neural net over mean/deviation features
+// of the residual. Returns the net's score scaled by 10000 and clamped to
+// [-80000, 80000], or -1 when no model exists for tx_size.
+static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
+                               int blk_col, TX_SIZE tx_size) {
+  const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+  if (!nn_config) return -1;
+
+  // Locate this block's residual within the plane-wide src_diff buffer
+  // (block coordinates are in units of 4 pixels).
+  const int diff_stride = block_size_wide[bsize];
+  const int16_t *diff =
+      x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  aom_clear_system_state();
+
+  float features[64] = { 0.0f };
+  get_mean_dev_features(diff, diff_stride, bw, bh, features);
+
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, 1, &score);
+  aom_clear_system_state();
+
+  int int_score = (int)(score * 10000);
+  return clamp(int_score, -80000, 80000);
+}
+
+// Determines the set of transform types to evaluate for one transform
+// block. Returns a 16-bit mask (bit t set => TX_TYPE t is allowed) and
+// writes *allowed_txk_types: the single forced type when exactly one is
+// allowed, otherwise TX_TYPES. txk_map may be reordered into a preferred
+// search order by the pruning helpers.
+static INLINE uint16_t
+get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
+            int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+            const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode,
+            int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
+  // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed <
+  // TX_TYPES, only that specific tx type is allowed.
+  TX_TYPE txk_allowed = TX_TYPES;
+
+  // Speed features may force a single default type, or DCT-only for luma
+  // under the low-complexity RD model.
+  if ((!is_inter && x->use_default_intra_tx_type) ||
+      (is_inter && x->use_default_inter_tx_type)) {
+    txk_allowed =
+        get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type);
+  } else if (x->rd_model == LOW_TXFM_RD) {
+    if (plane == 0) txk_allowed = DCT_DCT;
+  }
+
+  const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+      tx_size, is_inter, cm->features.reduced_tx_set_used);
+
+  TX_TYPE uv_tx_type = DCT_DCT;
+  if (plane) {
+    // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
+    uv_tx_type = txk_allowed =
+        av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
+                        cm->features.reduced_tx_set_used);
+  }
+  PREDICTION_MODE intra_dir =
+      mbmi->filter_intra_mode_info.use_filter_intra
+          ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]
+          : mbmi->mode;
+  uint16_t ext_tx_used_flag =
+      cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset &&
+              tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT
+          ? av1_reduced_intra_tx_used_flag[intra_dir]
+          : av1_ext_tx_used_flag[tx_set_type];
+  // Force DCT_DCT for lossless segments, transforms larger than 32x32,
+  // DCT-only transform sets, and the DCT-only encoder configurations.
+  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+      ext_tx_used_flag == 0x0001 ||
+      (is_inter && cpi->oxcf.use_inter_dct_only) ||
+      (!is_inter && cpi->oxcf.use_intra_dct_only)) {
+    txk_allowed = DCT_DCT;
+  }
+
+  if (cpi->oxcf.enable_flip_idtx == 0) ext_tx_used_flag &= DCT_ADST_TX_MASK;
+
+  uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
+  if (txk_allowed < TX_TYPES) {
+    allowed_tx_mask = 1 << txk_allowed;
+    allowed_tx_mask &= ext_tx_used_flag;
+  } else if (fast_tx_search) {
+    allowed_tx_mask = 0x0c01;  // V_DCT, H_DCT, DCT_DCT
+    allowed_tx_mask &= ext_tx_used_flag;
+  } else {
+    assert(plane == 0);
+    allowed_tx_mask = ext_tx_used_flag;
+    int num_allowed = 0;
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const int *tx_type_probs =
+        cpi->frame_probs.tx_type_probs[update_type][tx_size];
+    int i;
+
+    // Prune types whose frame-level usage probability falls below a
+    // per-update-type threshold, but never prune the most probable allowed
+    // type.
+    if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+      static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 },
+                                            { 10, 17, 17, 10, 17, 17, 17 } };
+      const int thresh =
+          thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1]
+                    [update_type];
+      uint16_t prune = 0;
+      int max_prob = -1;
+      int max_idx = 0;
+      for (i = 0; i < TX_TYPES; i++) {
+        if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) {
+          max_prob = tx_type_probs[i];
+          max_idx = i;
+        }
+        if (tx_type_probs[i] < thresh) prune |= (1 << i);
+      }
+      if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx);
+      allowed_tx_mask &= (~prune);
+    }
+    for (i = 0; i < TX_TYPES; i++) {
+      if (allowed_tx_mask & (1 << i)) num_allowed++;
+    }
+    assert(num_allowed > 0);
+
+    // Further pruning: either estimated-RD based (cheap txfm + rate/dist
+    // estimate per type) or neural-net based (prune_tx_2D).
+    if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) {
+      int pf = prune_factors[x->prune_mode];
+      int mf = mul_factors[x->prune_mode];
+      if (num_allowed <= 7) {
+        const uint16_t prune =
+            prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col,
+                           plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx,
+                           cm->features.reduced_tx_set_used);
+        allowed_tx_mask &= (~prune);
+      } else {
+        // Many candidates: use the separable (horizontal/vertical) variant
+        // and keep only a fraction (mf%) of them.
+        const int num_sel = (num_allowed * mf + 50) / 100;
+        const uint16_t prune = prune_txk_type_separ(
+            cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize,
+            txk_map, allowed_tx_mask, pf, txb_ctx,
+            cm->features.reduced_tx_set_used, ref_best_rd, num_sel);
+
+        allowed_tx_mask &= (~prune);
+      }
+    } else {
+      assert(num_allowed > 0);
+      int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5;
+      // !fast_tx_search && txk_end != txk_start && plane == 0
+      if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter &&
+          num_allowed > allowed_tx_count) {
+        prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+                    x->prune_mode, txk_map, &allowed_tx_mask);
+      }
+    }
+  }
+
+  // Need to have at least one transform type allowed.
+  if (allowed_tx_mask == 0) {
+    txk_allowed = (plane ? uv_tx_type : DCT_DCT);
+    allowed_tx_mask = (1 << txk_allowed);
+  }
+
+  assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed));
+  *allowed_txk_types = txk_allowed;
+  return allowed_tx_mask;
+}
+
+#if CONFIG_RD_DEBUG
+// Debug-only (CONFIG_RD_DEBUG): accumulates one transform block's
+// coefficient cost into rd_stats — both the per-plane running total and a
+// 2D map at the block's position. All map cells covered by the block are
+// zeroed first, then the origin cell receives the full cost.
+static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+                                         TX_SIZE tx_size, int blk_row,
+                                         int blk_col, int txb_coeff_cost) {
+  (void)blk_row;
+  (void)blk_col;
+  (void)tx_size;
+  rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+
+  {
+    const int txb_h = tx_size_high_unit[tx_size];
+    const int txb_w = tx_size_wide_unit[tx_size];
+    int idx, idy;
+    for (idy = 0; idy < txb_h; ++idy)
+      for (idx = 0; idx < txb_w; ++idx)
+        rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0;
+
+    rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost;
+  }
+  // NOTE(review): these bounds asserts execute after the map writes above,
+  // so if they can ever fire, out-of-bounds writes have already occurred —
+  // consider asserting before touching txb_coeff_cost_map.
+  assert(blk_row < TXB_COEFF_COST_MAP_SIZE);
+  assert(blk_col < TXB_COEFF_COST_MAP_SIZE);
+}
+#endif
+
+// Thin wrapper around av1_cost_coeffs_txb() that returns the entropy-coding
+// cost (in rate units) of the block's quantized coefficients, optionally
+// timing the call when TXCOEFF_COST_TIMER is enabled.
+// use_fast_coef_costing is accepted for interface compatibility but unused.
+static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block,
+                              TX_SIZE tx_size, const TX_TYPE tx_type,
+                              const TXB_CTX *const txb_ctx,
+                              int use_fast_coef_costing,
+                              int reduced_tx_set_used) {
+#if TXCOEFF_COST_TIMER
+  struct aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+#endif
+  (void)use_fast_coef_costing;
+  const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type,
+                                       txb_ctx, reduced_tx_set_used);
+#if TXCOEFF_COST_TIMER
+  // NOTE(review): `cpi` is not a parameter of this function, so this timer
+  // block only compiles if a `cpi` happens to be visible in the enclosing
+  // scope — verify before enabling TXCOEFF_COST_TIMER.
+  AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
+  aom_usec_timer_mark(&timer);
+  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+  tmp_cm->txcoeff_cost_timer += elapsed_time;
+  ++tmp_cm->txcoeff_cost_count;
+#endif
+  return cost;
+}
+
+// Search for the best transform type for a given transform block.
+// This function can be used for both inter and intra, both luma and chroma.
+static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                           int block, int blk_row, int blk_col,
+                           BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                           const TXB_CTX *const txb_ctx,
+                           FAST_TX_SEARCH_MODE ftxs_mode,
+                           int use_fast_coef_costing, int skip_trellis,
+                           int64_t ref_best_rd, RD_STATS *best_rd_stats) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  int64_t best_rd = INT64_MAX;
+  uint16_t best_eob = 0;
+  TX_TYPE best_tx_type = DCT_DCT;
+  int rate_cost = 0;
+  // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
+  // of the best tx_type
+  DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
+  tran_low_t *orig_dqcoeff = pd->dqcoeff;
+  tran_low_t *best_dqcoeff = this_dqcoeff;
+  const int tx_type_map_idx =
+      plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
+  av1_invalid_rd_stats(best_rd_stats);
+
+  skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id],
+                                   DRY_RUN_NORMAL);
+
+  // Hashing based speed feature for intra block. If the hash of the residue
+  // is found in the hash table, use the previous RD search results stored in
+  // the table and terminate early.
+  TXB_RD_INFO *intra_txb_rd_info = NULL;
+  uint16_t cur_joint_ctx = 0;
+  const int is_inter = is_inter_block(mbmi);
+  const int use_intra_txb_hash =
+      cpi->sf.tx_sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
+      !is_inter && plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size];
+  if (use_intra_txb_hash) {
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    const int within_border =
+        mi_row >= xd->tile.mi_row_start &&
+        (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
+        mi_col >= xd->tile.mi_col_start &&
+        (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+    if (within_border &&
+        is_intra_hash_match(cpi, x, plane, blk_row, blk_col, plane_bsize,
+                            tx_size, txb_ctx, &intra_txb_rd_info,
+                            tx_type_map_idx, &cur_joint_ctx)) {
+      best_rd_stats->rate = intra_txb_rd_info->rate;
+      best_rd_stats->dist = intra_txb_rd_info->dist;
+      best_rd_stats->sse = intra_txb_rd_info->sse;
+      best_rd_stats->skip = intra_txb_rd_info->eob == 0;
+      x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
+      x->plane[plane].txb_entropy_ctx[block] =
+          intra_txb_rd_info->txb_entropy_ctx;
+      best_eob = intra_txb_rd_info->eob;
+      best_tx_type = intra_txb_rd_info->tx_type;
+      skip_trellis |= !intra_txb_rd_info->perform_block_coeff_opt;
+      update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
+      recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  txb_ctx, skip_trellis, best_tx_type, 1, &rate_cost, best_eob);
+      pd->dqcoeff = orig_dqcoeff;
+      return;
+    }
+  }
+
+  uint8_t best_txb_ctx = 0;
+  // txk_allowed = TX_TYPES: >1 tx types are allowed
+  // txk_allowed < TX_TYPES: only that specific tx type is allowed.
+  TX_TYPE txk_allowed = TX_TYPES;
+  int txk_map[TX_TYPES] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  };
+  // Bit mask to indicate which transform types are allowed in the RD search.
+  const uint16_t allowed_tx_mask =
+      get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  txb_ctx, ftxs_mode, ref_best_rd, &txk_allowed, txk_map);
+
+  unsigned int block_mse_q8;
+  int64_t block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+                                      txsize_to_bsize[tx_size], &block_mse_q8);
+  assert(block_mse_q8 != UINT_MAX);
+  if (is_cur_buf_hbd(xd)) {
+    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+    block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+  }
+  block_sse *= 16;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+  // Use mse / qstep^2 based threshold logic to take decision of R-D
+  // optimization of coeffs. For smaller residuals, coeff optimization
+  // would be helpful. For larger residuals, R-D optimization may not be
+  // effective.
+  // TODO(any): Experiment with variance and mean based thresholds
+  const int perform_block_coeff_opt =
+      ((uint64_t)block_mse_q8 <=
+       (uint64_t)x->coeff_opt_dist_threshold * qstep * qstep);
+  skip_trellis |= !perform_block_coeff_opt;
+
+  // Flag to indicate if distortion should be calculated in transform domain or
+  // not during iterating through transform type candidates.
+  // Transform domain distortion is accurate for higher residuals.
+  // TODO(any): Experiment with variance and mean based thresholds
+  int use_transform_domain_distortion =
+      (x->use_transform_domain_distortion > 0) &&
+      (block_mse_q8 >= x->tx_domain_dist_threshold) &&
+      // Any 64-pt transforms only preserves half the coefficients.
+      // Therefore transform domain distortion is not valid for these
+      // transform sizes.
+      txsize_sqr_up_map[tx_size] != TX_64X64;
+  // Flag to indicate if an extra calculation of distortion in the pixel domain
+  // should be performed at the end, after the best transform type has been
+  // decided.
+  int calc_pixel_domain_distortion_final =
+      x->use_transform_domain_distortion == 1 &&
+      use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD;
+  if (calc_pixel_domain_distortion_final &&
+      (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
+    calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+  const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+  TxfmParam txfm_param;
+  QUANT_PARAM quant_param;
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, !skip_trellis,
+                  skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+                                                         : AV1_XFORM_QUANT_FP)
+                               : AV1_XFORM_QUANT_FP,
+                  cpi->oxcf.quant_b_adapt, &quant_param);
+
+  // Iterate through all transform type candidates.
+  for (int idx = 0; idx < TX_TYPES; ++idx) {
+    const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
+    if (!(allowed_tx_mask & (1 << tx_type))) continue;
+    txfm_param.tx_type = tx_type;
+    if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) {
+      av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                        &quant_param);
+    }
+    if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+    RD_STATS this_rd_stats;
+    av1_invalid_rd_stats(&this_rd_stats);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    // Calculate rate cost of quantized coefficients.
+    if (quant_param.use_optimize_b) {
+      if (cpi->sf.rd_sf.optimize_b_precheck && best_rd < INT64_MAX &&
+          eobs_ptr[block] >= 4) {
+        // Calculate distortion quickly in transform domain.
+        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                             &this_rd_stats.sse);
+
+        const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
+        const int64_t dist_cost_estimate =
+            RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
+        if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
+      }
+      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+                     cpi->sf.rd_sf.trellis_eob_fast, &rate_cost);
+    } else {
+      rate_cost =
+          cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
+                      use_fast_coef_costing, cm->features.reduced_tx_set_used);
+    }
+
+    // If rd cost based on coeff rate alone is already more than best_rd,
+    // terminate early.
+    if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue;
+
+    // Calculate distortion.
+    if (eobs_ptr[block] == 0) {
+      // When eob is 0, pixel domain distortion is more efficient and accurate.
+      this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (use_transform_domain_distortion) {
+      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                           &this_rd_stats.sse);
+    } else {
+      int64_t sse_diff = INT64_MAX;
+      // high_energy threshold assumes that every pixel within a txfm block
+      // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
+      // for 8 bit, then the threshold is scaled based on input bit depth.
+      const int64_t high_energy_thresh =
+          ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2);
+      const int is_high_energy = (block_sse >= high_energy_thresh);
+      if (tx_size == TX_64X64 || is_high_energy) {
+        // Because 3 out 4 quadrants of transform coefficients are forced to
+        // zero, the inverse transform has a tendency to overflow. sse_diff
+        // is effectively the energy of those 3 quadrants, here we use it
+        // to decide if we should do pixel domain distortion. If the energy
+        // is mostly in first quadrant, then it is unlikely that we have
+        // overflow issue in inverse transform.
+        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                             &this_rd_stats.sse);
+        sse_diff = block_sse - this_rd_stats.sse;
+      }
+      if (tx_size != TX_64X64 || !is_high_energy ||
+          (sse_diff * 2) < this_rd_stats.sse) {
+        const int64_t tx_domain_dist = this_rd_stats.dist;
+        this_rd_stats.dist = dist_block_px_domain(
+            cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+        // For high energy blocks, occasionally, the pixel domain distortion
+        // can be artificially low due to clamping at reconstruction stage
+        // even when inverse transform output is hugely different from the
+        // actual residue.
+        if (is_high_energy && this_rd_stats.dist < tx_domain_dist)
+          this_rd_stats.dist = tx_domain_dist;
+      } else {
+        assert(sse_diff < INT64_MAX);
+        this_rd_stats.dist += sse_diff;
+      }
+      this_rd_stats.sse = block_sse;
+    }
+
+    this_rd_stats.rate = rate_cost;
+
+    const int64_t rd =
+        RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+
+    if (rd < best_rd) {
+      best_rd = rd;
+      *best_rd_stats = this_rd_stats;
+      best_tx_type = tx_type;
+      best_txb_ctx = x->plane[plane].txb_entropy_ctx[block];
+      best_eob = x->plane[plane].eobs[block];
+      // Swap dqcoeff buffers
+      tran_low_t *const tmp_dqcoeff = best_dqcoeff;
+      best_dqcoeff = pd->dqcoeff;
+      pd->dqcoeff = tmp_dqcoeff;
+    }
+
+#if CONFIG_COLLECT_RD_STATS == 1
+    if (plane == 0) {
+      PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
+                              plane_bsize, tx_size, tx_type, rd);
+    }
+#endif  // CONFIG_COLLECT_RD_STATS == 1
+
+#if COLLECT_TX_SIZE_DATA
+    // Generate small sample to restrict output size.
+    static unsigned int seed = 21743;
+    if (lcg_rand16(&seed) % 200 == 0) {
+      FILE *fp = NULL;
+
+      if (within_border) {
+        fp = fopen(av1_tx_size_data_output_file, "a");
+      }
+
+      if (fp) {
+        // Transform info and RD
+        const int txb_w = tx_size_wide[tx_size];
+        const int txb_h = tx_size_high[tx_size];
+
+        // Residue signal.
+        const int diff_stride = block_size_wide[plane_bsize];
+        struct macroblock_plane *const p = &x->plane[plane];
+        const int16_t *src_diff =
+            &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
+
+        for (int r = 0; r < txb_h; ++r) {
+          for (int c = 0; c < txb_w; ++c) {
+            fprintf(fp, "%d,", src_diff[c]);
+          }
+          src_diff += diff_stride;
+        }
+
+        fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd);
+        fprintf(fp, "\n");
+        fclose(fp);
+      }
+    }
+#endif  // COLLECT_TX_SIZE_DATA
+
+    // If the current best RD cost is much worse than the reference RD cost,
+    // terminate early.
+    if (cpi->sf.tx_sf.adaptive_txb_search_level) {
+      if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) >
+          ref_best_rd) {
+        break;
+      }
+    }
+
+    // Terminate transform type search if the block has been quantized to
+    // all zero.
+    if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break;
+  }
+
+  assert(best_rd != INT64_MAX);
+
+  best_rd_stats->skip = best_eob == 0;
+  if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
+  x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
+  x->plane[plane].eobs[block] = best_eob;
+
+  // Point dqcoeff to the quantized coefficients corresponding to the best
+  // transform type, then we can skip transform and quantization, e.g. in the
+  // final pixel domain distortion calculation and recon_intra().
+  pd->dqcoeff = best_dqcoeff;
+
+  if (calc_pixel_domain_distortion_final && best_eob) {
+    best_rd_stats->dist = dist_block_px_domain(
+        cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+    best_rd_stats->sse = block_sse;
+  }
+
+  if (intra_txb_rd_info != NULL) {
+    intra_txb_rd_info->valid = 1;
+    intra_txb_rd_info->entropy_context = cur_joint_ctx;
+    intra_txb_rd_info->rate = best_rd_stats->rate;
+    intra_txb_rd_info->dist = best_rd_stats->dist;
+    intra_txb_rd_info->sse = best_rd_stats->sse;
+    intra_txb_rd_info->eob = best_eob;
+    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
+    intra_txb_rd_info->perform_block_coeff_opt = perform_block_coeff_opt;
+    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
+  }
+
+  // Intra mode needs decoded pixels such that the next transform block
+  // can use them for prediction.
+  recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+              txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
+  pd->dqcoeff = orig_dqcoeff;
+}
+
+// Pick transform type for a luma transform block of tx_size. Note this function
+// is used only for inter-predicted blocks.
+static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  TX_SIZE tx_size, int blk_row, int blk_col,
+                                  int block, int plane_bsize, TXB_CTX *txb_ctx,
+                                  RD_STATS *rd_stats,
+                                  FAST_TX_SEARCH_MODE ftxs_mode,
+                                  int64_t ref_rdcost,
+                                  TXB_RD_INFO *rd_info_array) {
+  const struct macroblock_plane *const p = &x->plane[0];
+  // Pack the two entropy contexts into one 16-bit key so a cached result can
+  // be validated with a single compare below.
+  const uint16_t cur_joint_ctx =
+      (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+  MACROBLOCKD *xd = &x->e_mbd;
+  assert(is_inter_block(xd->mi[0]));
+  const int tx_type_map_idx = blk_row * xd->tx_type_map_stride + blk_col;
+  // Look up RD and terminate early in case when we've already processed exactly
+  // the same residue with exactly the same entropy context.
+  if (rd_info_array != NULL && rd_info_array->valid &&
+      rd_info_array->entropy_context == cur_joint_ctx) {
+    xd->tx_type_map[tx_type_map_idx] = rd_info_array->tx_type;
+    // The cached result is only reusable if av1_get_tx_type() still yields the
+    // cached type under the current restrictions (e.g. reduced tx set).
+    const TX_TYPE ref_tx_type =
+        av1_get_tx_type(&x->e_mbd, get_plane_type(0), blk_row, blk_col, tx_size,
+                        cpi->common.features.reduced_tx_set_used);
+    if (ref_tx_type == rd_info_array->tx_type) {
+      rd_stats->rate += rd_info_array->rate;
+      rd_stats->dist += rd_info_array->dist;
+      rd_stats->sse += rd_info_array->sse;
+      rd_stats->skip &= rd_info_array->eob == 0;
+      p->eobs[block] = rd_info_array->eob;
+      p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
+      return;
+    }
+  }
+
+  RD_STATS this_rd_stats;
+  const int skip_trellis = 0;
+  search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
+                 txb_ctx, ftxs_mode, 0, skip_trellis, ref_rdcost,
+                 &this_rd_stats);
+
+  av1_merge_rd_stats(rd_stats, &this_rd_stats);
+
+  // Save RD results for possible reuse in future.
+  if (rd_info_array != NULL) {
+    rd_info_array->valid = 1;
+    rd_info_array->entropy_context = cur_joint_ctx;
+    rd_info_array->rate = this_rd_stats.rate;
+    rd_info_array->dist = this_rd_stats.dist;
+    rd_info_array->sse = this_rd_stats.sse;
+    rd_info_array->eob = p->eobs[block];
+    rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
+    rd_info_array->tx_type = xd->tx_type_map[tx_type_map_idx];
+  }
+}
+
+// Evaluate the RD cost of coding the current block as one transform block of
+// size tx_size (i.e. without splitting it further). The stats are accumulated
+// into rd_stats and the winning info (rd, entropy ctx, tx type) is written
+// into no_split.
+static AOM_INLINE void try_tx_block_no_split(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+    const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
+    int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
+    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+    TxCandidateInfo *no_split) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblock_plane *const p = &x->plane[0];
+  const int bw = mi_size_wide[plane_bsize];
+  const ENTROPY_CONTEXT *const pta = ta + blk_col;
+  const ENTROPY_CONTEXT *const ptl = tl + blk_row;
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
+  // Rate of signalling this transform block as all-zero.
+  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+  rd_stats->zero_rate = zero_blk_rate;
+  const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
+  mbmi->inter_tx_size[index] = tx_size;
+  tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
+             rd_stats, ftxs_mode, ref_best_rd,
+             rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+  assert(rd_stats->rate < INT_MAX);
+
+  // Prefer coding the block as all-zero when that is cheaper in RD terms than
+  // coding the quantized coefficients (never done for lossless segments).
+  const int pick_skip = !xd->lossless[mbmi->segment_id] &&
+                        (rd_stats->skip == 1 ||
+                         RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+                             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse));
+  if (pick_skip) {
+#if CONFIG_RD_DEBUG
+    update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
+                          zero_blk_rate - rd_stats->rate);
+#endif  // CONFIG_RD_DEBUG
+    rd_stats->rate = zero_blk_rate;
+    rd_stats->dist = rd_stats->sse;
+    p->eobs[block] = 0;
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+  }
+  rd_stats->skip = pick_skip;
+  set_blk_skip(x, 0, blk_row * bw + blk_col, pick_skip);
+
+  // Add the cost of signalling "no split" where a split was possible.
+  if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+    rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0];
+
+  no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+  no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
+  no_split->tx_type =
+      xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
+}
+
+// Evaluate the RD cost of splitting the current transform block into smaller
+// sub-blocks of the next transform size down, recursing into select_tx_block()
+// for each sub-block. On early termination split_rd_stats->rdcost is set to
+// INT64_MAX.
+static AOM_INLINE void try_tx_block_split(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+    ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+    int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
+    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+    RD_STATS *split_rd_stats) {
+  assert(tx_size < TX_SIZES_ALL);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+  const int txb_width = tx_size_wide_unit[tx_size];
+  const int txb_height = tx_size_high_unit[tx_size];
+  // Transform size after splitting current block.
+  const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+  const int sub_txb_width = tx_size_wide_unit[sub_txs];
+  const int sub_txb_height = tx_size_high_unit[sub_txs];
+  const int sub_step = sub_txb_width * sub_txb_height;
+  const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width);
+  assert(nblks > 0);
+  av1_init_rd_stats(split_rd_stats);
+  // Cost of signalling the "split" decision itself.
+  split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
+
+  for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) {
+    for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) {
+      assert(blk_idx < 4);
+      const int offsetr = blk_row + r;
+      const int offsetc = blk_col + c;
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+      RD_STATS this_rd_stats;
+      int this_cost_valid = 1;
+      // Each sub-block receives an equal share of the parent's no-split RD as
+      // its previous-level reference cost.
+      select_tx_block(
+          cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
+          tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks,
+          ref_best_rd - split_rd_stats->rdcost, &this_cost_valid, ftxs_mode,
+          (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
+      if (!this_cost_valid) {
+        split_rd_stats->rdcost = INT64_MAX;
+        return;
+      }
+      av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
+      split_rd_stats->rdcost =
+          RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+      if (split_rd_stats->rdcost > ref_best_rd) {
+        split_rd_stats->rdcost = INT64_MAX;
+        return;
+      }
+      block += sub_step;
+    }
+  }
+}
+
+// Search for the best transform partition(recursive)/type for a given
+// inter-predicted luma block. The obtained transform selection will be saved
+// in xd->mi[0], the corresponding RD stats will be saved in rd_stats.
+static AOM_INLINE void select_tx_block(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+    ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+    RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
+    TXB_RD_INFO_NODE *rd_info_node) {
+  assert(tx_size < TX_SIZES_ALL);
+  av1_init_rd_stats(rd_stats);
+  if (ref_best_rd < 0) {
+    *is_cost_valid = 0;
+    return;
+  }
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(blk_row < max_block_high(xd, plane_bsize, 0) &&
+         blk_col < max_block_wide(xd, plane_bsize, 0));
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+                                         mbmi->sb_type, tx_size);
+  struct macroblock_plane *const p = &x->plane[0];
+
+  // Coding the block whole is only allowed when 64-point transforms are
+  // enabled or the (square-upscaled) size is below 64x64.
+  const int try_no_split =
+      cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64;
+  int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
+  TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
+
+  // Try using current block as a single transform block without split.
+  if (try_no_split) {
+    try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+                          plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
+                          ftxs_mode, rd_info_node, &no_split);
+
+    // Speed features for early termination.
+    const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
+    if (search_level) {
+      if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) {
+        *is_cost_valid = 0;
+        return;
+      }
+      if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) {
+        try_split = 0;
+      }
+    }
+    if (cpi->sf.tx_sf.txb_split_cap) {
+      if (p->eobs[block] == 0) try_split = 0;
+    }
+  }
+
+  // ML based speed feature to skip searching for split transform blocks.
+  if (x->e_mbd.bd == 8 && try_split &&
+      !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) {
+    const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh;
+    if (threshold >= 0) {
+      const int split_score =
+          ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size);
+      if (split_score < -threshold) try_split = 0;
+    }
+  }
+
+  RD_STATS split_rd_stats;
+  split_rd_stats.rdcost = INT64_MAX;
+  // Try splitting current block into smaller transform blocks.
+  if (try_split) {
+    try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+                       plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
+                       AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
+                       rd_info_node, &split_rd_stats);
+  }
+
+  // Keep whichever of the no-split / split alternatives has the lower RD cost,
+  // and record the winning transform sizes/types in the mode info.
+  if (no_split.rd < split_rd_stats.rdcost) {
+    ENTROPY_CONTEXT *pta = ta + blk_col;
+    ENTROPY_CONTEXT *ptl = tl + blk_row;
+    p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
+    av1_set_txb_context(x, 0, block, tx_size, pta, ptl);
+    txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+                          tx_size);
+    for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+      for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+        const int index =
+            av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx);
+        mbmi->inter_tx_size[index] = tx_size;
+      }
+    }
+    mbmi->tx_size = tx_size;
+    update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type);
+    const int bw = mi_size_wide[plane_bsize];
+    set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
+  } else {
+    *rd_stats = split_rd_stats;
+    if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0;
+  }
+}
+
+// Set the transform size to the largest one allowed by the current tx mode
+// (capped at 32x32 when 64-point transforms are disabled) and evaluate the
+// Y-plane transform RD cost at that single size.
+static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi,
+                                              MACROBLOCK *x, RD_STATS *rd_stats,
+                                              int64_t ref_best_rd,
+                                              BLOCK_SIZE bs) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  mbmi->tx_size = tx_size_from_tx_mode(bs, x->tx_mode_search_type);
+
+  // If tx64 is not enabled, we need to go down to the next available size
+  if (!cpi->oxcf.enable_tx64) {
+    // Maps each transform size to the nearest allowed size with both
+    // dimensions capped at 32.
+    static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = {
+      TX_4X4,    // 4x4 transform
+      TX_8X8,    // 8x8 transform
+      TX_16X16,  // 16x16 transform
+      TX_32X32,  // 32x32 transform
+      TX_32X32,  // 64x64 transform
+      TX_4X8,    // 4x8 transform
+      TX_8X4,    // 8x4 transform
+      TX_8X16,   // 8x16 transform
+      TX_16X8,   // 16x8 transform
+      TX_16X32,  // 16x32 transform
+      TX_32X16,  // 32x16 transform
+      TX_32X32,  // 32x64 transform
+      TX_32X32,  // 64x32 transform
+      TX_4X16,   // 4x16 transform
+      TX_16X4,   // 16x4 transform
+      TX_8X32,   // 8x32 transform
+      TX_32X8,   // 32x8 transform
+      TX_16X32,  // 16x64 transform
+      TX_32X16,  // 64x16 transform
+    };
+
+    mbmi->tx_size = tx_size_max_32[mbmi->tx_size];
+  }
+
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
+  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
+  // Skip RDcost is used only for Inter blocks
+  const int64_t skip_rd =
+      is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX;
+  const int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_rate, 0);
+  const int skip_trellis = 0;
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+                       AOMMIN(no_skip_rd, skip_rd), AOM_PLANE_Y, bs,
+                       mbmi->tx_size, cpi->sf.rd_sf.use_fast_coef_costing,
+                       FTXS_NONE, skip_trellis);
+}
+
+// Force the smallest (4x4) transform size and evaluate the Y-plane RD cost.
+static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi,
+                                               MACROBLOCK *x,
+                                               RD_STATS *rd_stats,
+                                               int64_t ref_best_rd,
+                                               BLOCK_SIZE bs) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  mbmi->tx_size = TX_4X4;
+  // TODO(any) : Pass this_rd based on skip/non-skip cost
+  const int skip_trellis = 0;
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
+                       cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE,
+                       skip_trellis);
+}
+
+// Search for the best uniform transform size and type for current coding block.
+static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+                                                   MACROBLOCK *x,
+                                                   RD_STATS *rd_stats,
+                                                   int64_t ref_best_rd,
+                                                   BLOCK_SIZE bs) {
+  av1_invalid_rd_stats(rd_stats);
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
+  const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT;
+  int start_tx;
+  // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls
+  // how many times of splitting is allowed during the RD search.
+  int init_depth;
+
+  if (tx_select) {
+    start_tx = max_rect_tx_size;
+    init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+                                       is_inter_block(mbmi), &cpi->sf,
+                                       x->tx_size_search_method);
+  } else {
+    const TX_SIZE chosen_tx_size =
+        tx_size_from_tx_mode(bs, x->tx_mode_search_type);
+    start_tx = chosen_tx_size;
+    init_depth = MAX_TX_DEPTH;
+  }
+
+  const int skip_trellis = 0;
+  uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  TX_SIZE best_tx_size = max_rect_tx_size;
+  int64_t best_rd = INT64_MAX;
+  const int num_blks = bsize_to_num_blk(bs);
+  x->rd_model = FULL_TXFM_RD;
+  int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
+  // Evaluate candidate sizes from start_tx downwards, one split per depth.
+  for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH;
+       depth++, tx_size = sub_tx_size_map[tx_size]) {
+    if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
+      continue;
+    }
+
+    RD_STATS this_rd_stats;
+    rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs,
+                                     tx_size, FTXS_NONE, skip_trellis);
+    if (rd[depth] < best_rd) {
+      // Remember the coding state associated with the new best size.
+      av1_copy_array(best_blk_skip, x->blk_skip, num_blks);
+      av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks);
+      best_tx_size = tx_size;
+      best_rd = rd[depth];
+      *rd_stats = this_rd_stats;
+    }
+    if (tx_size == TX_4X4) break;
+    // If we are searching three depths, prune the smallest size depending
+    // on rd results for the first two depths for low contrast blocks.
+    if (depth > init_depth && depth != MAX_TX_DEPTH &&
+        x->source_variance < 256) {
+      if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
+    }
+  }
+
+  // Restore the coding state of the winning transform size.
+  if (rd_stats->rate != INT_MAX) {
+    mbmi->tx_size = best_tx_size;
+    av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks);
+    av1_copy_array(x->blk_skip, best_blk_skip, num_blks);
+  }
+}
+
+// Search for the best transform type for the given transform block in the
+// given plane/channel, and calculate the corresponding RD cost.
+static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row,
+                                     int blk_col, BLOCK_SIZE plane_bsize,
+                                     TX_SIZE tx_size, void *arg) {
+  struct rdcost_block_args *args = arg;
+  // Once the accumulated RD has exceeded the best, later blocks are skipped.
+  if (args->exit_early) {
+    args->incomplete_exit = 1;
+    return;
+  }
+
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int is_inter = is_inter_block(xd->mi[0]);
+  const AV1_COMP *cpi = args->cpi;
+  ENTROPY_CONTEXT *a = args->t_above + blk_col;
+  ENTROPY_CONTEXT *l = args->t_left + blk_row;
+  const AV1_COMMON *cm = &cpi->common;
+  RD_STATS this_rd_stats;
+  av1_init_rd_stats(&this_rd_stats);
+
+  // For intra, generate the prediction and residue for this txb before the
+  // transform type search.
+  if (!is_inter) {
+    av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+  }
+
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+  search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                 &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
+                 args->skip_trellis, args->best_rd - args->current_rd,
+                 &this_rd_stats);
+
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+    assert(!is_inter || plane_bsize < BLOCK_8X8);
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+  }
+
+#if CONFIG_RD_DEBUG
+  update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
+                        this_rd_stats.rate);
+#endif  // CONFIG_RD_DEBUG
+  av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+  const int blk_idx =
+      blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col;
+  if (plane == 0)
+    set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0);
+  else
+    set_blk_skip(x, plane, blk_idx, 0);
+
+  int64_t rd;
+  if (is_inter) {
+    // Inter: take the cheaper of coding the coefficients vs. skipping the txb.
+    const int64_t no_skip_rd =
+        RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+    const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+    rd = AOMMIN(no_skip_rd, skip_rd);
+    this_rd_stats.skip &= !x->plane[plane].eobs[block];
+  } else {
+    // Signal non-skip for Intra blocks
+    rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+    this_rd_stats.skip = 0;
+  }
+
+  av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
+
+  args->current_rd += rd;
+  if (args->current_rd > args->best_rd) args->exit_early = 1;
+}
+
+// Search for the best transform type and return the transform coefficients RD
+// cost of current luma coding block with the given uniform transform size.
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                             RD_STATS *rd_stats, int64_t ref_best_rd,
+                             BLOCK_SIZE bs, TX_SIZE tx_size,
+                             FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
+  assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT &&
+                        block_signals_txsize(mbmi->sb_type);
+  // Rate of signalling the transform size, only when the block signals one.
+  int tx_size_rate = 0;
+  if (tx_select) {
+    const int ctx = txfm_partition_context(
+        xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
+    tx_size_rate = is_inter ? x->txfm_partition_cost[ctx][0]
+                            : tx_size_cost(x, bs, tx_size);
+  }
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
+  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
+  // Skip RD is only meaningful for inter blocks; intra cannot signal skip.
+  const int64_t skip_rd =
+      is_inter ? RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX;
+  const int64_t no_this_rd =
+      RDCOST(x->rdmult, no_skip_flag_rate + tx_size_rate, 0);
+
+  mbmi->tx_size = tx_size;
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+                       AOMMIN(no_this_rd, skip_rd), AOM_PLANE_Y, bs, tx_size,
+                       cpi->sf.rd_sf.use_fast_coef_costing, ftxs_mode,
+                       skip_trellis);
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  int64_t rd;
+  // rdstats->rate should include all the rate except skip/non-skip cost as the
+  // same is accounted in the caller functions after rd evaluation of all
+  // planes. However the decisions should be done after considering the
+  // skip/non-skip header cost
+  if (rd_stats->skip && is_inter) {
+    rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
+  } else {
+    // Intra blocks are always signalled as non-skip
+    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate + tx_size_rate,
+                rd_stats->dist);
+    rd_stats->rate += tx_size_rate;
+  }
+  // Check if forcing the block to skip transform leads to smaller RD cost.
+  if (is_inter && !rd_stats->skip && !xd->lossless[mbmi->segment_id]) {
+    int64_t temp_skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
+    if (temp_skip_rd <= rd) {
+      rd = temp_skip_rd;
+      rd_stats->rate = 0;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip = 1;
+    }
+  }
+
+  return rd;
+}
+
+// Search for the best transform type for a luma inter-predicted block, given
+// the transform block partitions.
+// This function is used only when some speed features are enabled.
+static AOM_INLINE void tx_block_yrd(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth,
+    ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+    TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd,
+    RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) {
+  assert(tx_size < TX_SIZES_ALL);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(is_inter_block(mbmi));
+  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+      plane_bsize, blk_row, blk_col)];
+  const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+                                         mbmi->sb_type, tx_size);
+
+  av1_init_rd_stats(rd_stats);
+  // Leaf case: the current recursion level matches the selected transform
+  // size, so evaluate the transform type here.
+  if (tx_size == plane_tx_size) {
+    ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+    ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+    const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+    TXB_CTX txb_ctx;
+    get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
+
+    const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)]
+                                  .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+    rd_stats->zero_rate = zero_blk_rate;
+    tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
+               rd_stats, ftxs_mode, ref_best_rd, NULL);
+    const int mi_width = mi_size_wide[plane_bsize];
+    // Replace with an all-zero (skip) block if that is cheaper in RD terms.
+    if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+            RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+        rd_stats->skip == 1) {
+      rd_stats->rate = zero_blk_rate;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip = 1;
+      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
+      x->plane[0].eobs[block] = 0;
+      x->plane[0].txb_entropy_ctx[block] = 0;
+      update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+    } else {
+      rd_stats->skip = 0;
+      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
+    }
+    // Cost of signalling "no split" where a split would have been possible.
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+      rd_stats->rate += x->txfm_partition_cost[ctx][0];
+    av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+    txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+                          tx_size);
+  } else {
+    // Recurse into the sub-transform blocks selected for this region.
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int txb_width = tx_size_wide_unit[sub_txs];
+    const int txb_height = tx_size_high_unit[sub_txs];
+    const int step = txb_height * txb_width;
+    RD_STATS pn_rd_stats;
+    int64_t this_rd = 0;
+    assert(txb_width > 0 && txb_height > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += txb_height) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += txb_width) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        av1_init_rd_stats(&pn_rd_stats);
+        tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+                     depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+                     ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+        if (pn_rd_stats.rate == INT_MAX) {
+          av1_invalid_rd_stats(rd_stats);
+          return;
+        }
+        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+        this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+        block += step;
+      }
+    }
+
+    // Cost of signalling the "split" decision.
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+      rd_stats->rate += x->txfm_partition_cost[ctx][1];
+  }
+}
+
+// search for tx type with tx sizes already decided for a inter-predicted luma
+// partition block. It's used only when some speed features are enabled.
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+  if (ref_best_rd < 0) {
+    av1_invalid_rd_stats(rd_stats);
+    return 0;
+  }
+
+  av1_init_rd_stats(rd_stats);
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  const int step = bw * bh;
+  const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf,
+                                               x->tx_size_search_method);
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+  // Evaluate each max-size transform block in raster order, accumulating a
+  // lower-bound RD so the recursion can terminate early against the budget.
+  int64_t this_rd = 0;
+  for (int idy = 0, block = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      RD_STATS pn_rd_stats;
+      av1_init_rd_stats(&pn_rd_stats);
+      tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth,
+                   ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd,
+                   &pn_rd_stats, ftxs_mode);
+      if (pn_rd_stats.rate == INT_MAX) {
+        av1_invalid_rd_stats(rd_stats);
+        return 0;
+      }
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      this_rd +=
+          AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+                 RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+      block += step;
+    }
+  }
+
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
+  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
+  const int64_t skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
+  this_rd =
+      RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate, rd_stats->dist);
+  // If signalling the whole block as skip is cheaper, convert the stats.
+  if (skip_rd < this_rd) {
+    this_rd = skip_rd;
+    rd_stats->rate = 0;
+    rd_stats->dist = rd_stats->sse;
+    rd_stats->skip = 1;
+  }
+
+  // The RD cost is only valid when it stays within the reference budget.
+  // (The previous comparison `this_rd > ref_best_rd` was inverted: it marked
+  // over-budget results as valid, contradicting the documented return
+  // contract and the invalidation below.)
+  const int is_cost_valid = this_rd <= ref_best_rd;
+  if (!is_cost_valid) {
+    // reset cost value
+    av1_invalid_rd_stats(rd_stats);
+  }
+  return is_cost_valid;
+}
+
+// Search for the best transform size and type for current inter-predicted
+// luma block with recursive transform block partitioning. The obtained
+// transform selection will be saved in xd->mi[0], the corresponding RD stats
+// will be saved in rd_stats. The returned value is the corresponding RD cost,
+// or INT64_MAX if no candidate beat the pruning threshold.
+static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                       int64_t ref_best_rd,
+                                       TXB_RD_INFO_NODE *rd_info_tree) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(is_inter_block(xd->mi[0]));
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int fast_tx_search = x->tx_size_search_method > USE_FULL_RD;
+  int64_t rd_thresh = ref_best_rd;
+  // In fast search mode the RD estimate is coarse, so loosen the pruning
+  // threshold by 1/8; the inner check guards against int64 overflow.
+  if (fast_tx_search && rd_thresh < INT64_MAX) {
+    if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
+  }
+  assert(rd_thresh > 0);
+  const FAST_TX_SEARCH_MODE ftxs_mode =
+      fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  // Local copies of the entropy / txfm-partition contexts: the recursive
+  // search updates them speculatively without touching the state in xd.
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+  const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf,
+                                               x->tx_size_search_method);
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  const int step = bw * bh;
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int no_skip_flag_cost = x->skip_cost[skip_ctx][0];
+  const int skip_flag_cost = x->skip_cost[skip_ctx][1];
+  // Running RD costs of the skip / non-skip coding decisions so far; they
+  // shrink the remaining RD budget handed to each subsequent sub-block.
+  int64_t skip_rd = RDCOST(x->rdmult, skip_flag_cost, 0);
+  int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_cost, 0);
+  int block = 0;
+
+  av1_init_rd_stats(rd_stats);
+  for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) {
+    for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) {
+      const int64_t best_rd_sofar =
+          (rd_thresh == INT64_MAX)
+              ? INT64_MAX
+              : (rd_thresh - (AOMMIN(skip_rd, no_skip_rd)));
+      int is_cost_valid = 1;
+      RD_STATS pn_rd_stats;
+      // Search for the best transform block size and type for the sub-block.
+      select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
+                      ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
+                      best_rd_sofar, &is_cost_valid, ftxs_mode, rd_info_tree);
+      if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+        av1_invalid_rd_stats(rd_stats);
+        return INT64_MAX;
+      }
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      skip_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse);
+      no_skip_rd =
+          RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist);
+      block += step;
+      // rd_info_tree walks in lockstep with the sub-block scan when the
+      // TX-block hash records are available.
+      if (rd_info_tree != NULL) rd_info_tree += 1;
+    }
+  }
+
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  rd_stats->skip = (skip_rd <= no_skip_rd);
+
+  // If fast_tx_search is true, only DCT and 1D DCT were tested in the
+  // recursive search above (select_tx_block). Do a better search for tx type
+  // with tx sizes already decided.
+  if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) {
+    if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE))
+      return INT64_MAX;
+  }
+
+  int64_t final_rd;
+  if (rd_stats->skip) {
+    final_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse);
+  } else {
+    final_rd =
+        RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist);
+    // In lossy coding the block may still end up coded as skip; report the
+    // cheaper of the two costs. Lossless cannot drop residue, so no min there.
+    if (!xd->lossless[xd->mi[0]->segment_id]) {
+      final_rd =
+          AOMMIN(final_rd, RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse));
+    }
+  }
+
+  return final_rd;
+}
+
+// Return 1 to terminate transform search early. The decision is made based on
+// the comparison with the reference RD cost and the model-estimated RD cost.
+// Only called for speed-feature levels 1 and 2 (caller gates on level != 0).
+static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi,
+                                                  MACROBLOCK *x,
+                                                  BLOCK_SIZE bsize,
+                                                  int64_t ref_best_rd) {
+  const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level;
+  assert(level >= 0 && level <= 2);
+  int model_rate;
+  int64_t model_dist;
+  int model_skip;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+      cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL,
+      NULL, NULL, NULL);
+  // Never prune when the model predicts the block codes as skip — presumably
+  // the modeled RD is not a reliable upper bound then (NOTE(review): confirm).
+  if (model_skip) return 0;
+  const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
+  // TODO(debargha, urvang): Improve the model and make the check below
+  // tighter.
+  // Prune when even a scaled-down model RD (3/8 at level 1, 5/8 at level 2)
+  // exceeds the best RD found so far.
+  static const int prune_factor_by8[] = { 3, 5 };
+  const int factor = prune_factor_by8[level - 1];
+  return ((model_rd * factor) >> 3) > ref_best_rd;
+}
+
+// Search for best transform size and type for luma inter blocks. The transform
+// block partitioning can be recursive resulting in non-uniform transform sizes.
+// The best transform size and type, if found, will be saved in the MB_MODE_INFO
+// structure, and the corresponding RD stats will be saved in rd_stats.
+// On failure (every candidate pruned) rd_stats is left invalidated.
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                         RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                         int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(is_inter_block(xd->mi[0]));
+
+  av1_invalid_rd_stats(rd_stats);
+
+  // If modeled RD cost is a lot worse than the best so far, terminate early.
+  if (cpi->sf.tx_sf.model_based_prune_tx_search_level &&
+      ref_best_rd != INT64_MAX) {
+    if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return;
+  }
+
+  // Hashing based speed feature. If the hash of the prediction residue block is
+  // found in the hash table, use previous search results and terminate early.
+  uint32_t hash = 0;
+  MB_RD_RECORD *mb_rd_record = NULL;
+  const int mi_row = x->e_mbd.mi_row;
+  const int mi_col = x->e_mbd.mi_col;
+  // Hash reuse is restricted to blocks strictly inside the tile so that
+  // boundary clamping cannot make identical residues code differently.
+  const int within_border =
+      mi_row >= xd->tile.mi_row_start &&
+      (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+      mi_col >= xd->tile.mi_col_start &&
+      (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+  const int is_mb_rd_hash_enabled =
+      (within_border && cpi->sf.rd_sf.use_mb_rd_hash);
+  const int n4 = bsize_to_num_blk(bsize);
+  if (is_mb_rd_hash_enabled) {
+    hash = get_block_residue_hash(x, bsize);
+    mb_rd_record = &x->mb_rd_record;
+    const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+    if (match_index != -1) {
+      MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
+      fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+      return;
+    }
+  }
+
+  // If we predict that skip is the optimal RD decision - set the respective
+  // context and terminate early.
+  int64_t dist;
+  if (x->predict_skip_level &&
+      predict_skip_flag(x, bsize, &dist,
+                        cpi->common.features.reduced_tx_set_used)) {
+    set_skip_flag(x, rd_stats, bsize, dist);
+    // Save the RD search results into tx_rd_record.
+    if (is_mb_rd_hash_enabled)
+      save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+    return;
+  }
+#if CONFIG_SPEED_STATS
+  ++x->tx_search_count;
+#endif  // CONFIG_SPEED_STATS
+
+  // Pre-compute residue hashes (transform block level) and find existing or
+  // add new RD records to store and reuse rate and distortion values to speed
+  // up TX size/type search.
+  // 4 + 16 + 64 entries — presumably the worst-case TX block counts over the
+  // three recursion depths (NOTE(review): confirm against the TX tree depth).
+  TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
+  int found_rd_info = 0;
+  if (ref_best_rd != INT64_MAX && within_border &&
+      cpi->sf.tx_sf.use_inter_txb_hash) {
+    found_rd_info = find_tx_size_rd_records(x, bsize, matched_rd_info);
+  }
+
+  const int64_t rd =
+      select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd,
+                              found_rd_info ? matched_rd_info : NULL);
+
+  if (rd == INT64_MAX) {
+    // We should always find at least one candidate unless ref_best_rd is less
+    // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type
+    // might have failed to find something better)
+    assert(ref_best_rd != INT64_MAX);
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+
+  // Save the RD search results into tx_rd_record.
+  if (is_mb_rd_hash_enabled) {
+    assert(mb_rd_record != NULL);
+    save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+  }
+}
+
+// Search for the best transform size and type for current coding block, with
+// the assumption that all the transform blocks have a uniform size (VP9 style).
+// The selected transform size and type will be saved in the MB_MODE_INFO
+// structure; the corresponding RD stats will be saved in rd_stats.
+// This function may be used for both intra and inter predicted blocks.
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bs,
+                                       int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(bs == mbmi->sb_type);
+  const int is_inter = is_inter_block(mbmi);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  av1_init_rd_stats(rd_stats);
+
+  // Hashing based speed feature for inter blocks. If the hash of the residue
+  // block is found in the table, use previously saved search results and
+  // terminate early.
+  uint32_t hash = 0;
+  MB_RD_RECORD *mb_rd_record = NULL;
+  const int num_blks = bsize_to_num_blk(bs);
+  if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) {
+    // Only blocks strictly inside the tile can reuse hashed results; blocks
+    // touching the tile edge are subject to boundary handling.
+    const int within_border =
+        mi_row >= xd->tile.mi_row_start &&
+        (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) &&
+        mi_col >= xd->tile.mi_col_start &&
+        (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
+    if (within_border) {
+      hash = get_block_residue_hash(x, bs);
+      mb_rd_record = &x->mb_rd_record;
+      const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+      if (match_index != -1) {
+        MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
+        fetch_tx_rd_info(num_blks, tx_rd_info, rd_stats, x);
+        return;
+      }
+    }
+  }
+
+  // If we predict that skip is the optimal RD decision - set the respective
+  // context and terminate early.
+  int64_t dist;
+  if (x->predict_skip_level && is_inter && !xd->lossless[mbmi->segment_id] &&
+      predict_skip_flag(x, bs, &dist,
+                        cpi->common.features.reduced_tx_set_used)) {
+    // Populate rdstats as per skip decision
+    set_skip_flag(x, rd_stats, bs, dist);
+    // Save the RD search results into tx_rd_record.
+    if (mb_rd_record) {
+      save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+    }
+    return;
+  }
+
+  if (xd->lossless[mbmi->segment_id]) {
+    // Lossless mode can only pick the smallest (4x4) transform size.
+    choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+  } else if (x->tx_size_search_method == USE_LARGESTALL) {
+    // Speed feature: only evaluate the largest allowed transform size.
+    choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+  } else {
+    // Full RD-based selection over candidate uniform transform sizes.
+    choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+  }
+
+  // Save the RD search results into tx_rd_record for possible reuse in future.
+  if (mb_rd_record) {
+    save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+  }
+}
+
+// Calculate the transform coefficient RD cost for the given chroma coding block
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+// Blocks with no chroma reference (is_chroma_ref == 0) trivially succeed with
+// zeroed rd_stats.
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+                  BLOCK_SIZE bsize, int64_t ref_best_rd) {
+  av1_init_rd_stats(rd_stats);
+  if (ref_best_rd < 0) return 0;
+  if (!x->e_mbd.is_chroma_ref) return 1;
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
+  const int is_inter = is_inter_block(mbmi);
+  int64_t this_rd = 0, skip_rd = 0;
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+  // For inter blocks the prediction residue has not been computed for chroma
+  // yet; produce it here for both U and V.
+  if (is_inter) {
+    for (int plane = 1; plane < MAX_MB_PLANE; ++plane)
+      av1_subtract_plane(x, plane_bsize, plane);
+  }
+
+  const int skip_trellis = 0;
+  // The same transform size (derived from the U plane) is used for both
+  // chroma planes.
+  const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+  int is_cost_valid = 1;
+  for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    RD_STATS this_rd_stats;
+    int64_t chroma_ref_best_rd = ref_best_rd;
+    // For inter blocks, refined ref_best_rd is used for early exit
+    // For intra blocks, even though current rd crosses ref_best_rd, early
+    // exit is not recommended as current rd is used for gating subsequent
+    // modes as well (say, for angular modes)
+    // TODO(any): Extend the early exit mechanism for intra modes as well
+    if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter &&
+        chroma_ref_best_rd != INT64_MAX)
+      chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd);
+    av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane,
+                         plane_bsize, uv_tx_size,
+                         cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE,
+                         skip_trellis);
+    if (this_rd_stats.rate == INT_MAX) {
+      is_cost_valid = 0;
+      break;
+    }
+    av1_merge_rd_stats(rd_stats, &this_rd_stats);
+    this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+    skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+    // Give up when even the cheaper of coding / skipping already exceeds the
+    // reference budget.
+    if (AOMMIN(this_rd, skip_rd) > ref_best_rd) {
+      is_cost_valid = 0;
+      break;
+    }
+  }
+
+  if (!is_cost_valid) {
+    // reset cost value
+    av1_invalid_rd_stats(rd_stats);
+  }
+
+  return is_cost_valid;
+}
+
+// Search for the best transform type and calculate the transform coefficients
+// RD cost of the current coding block with the specified (uniform) transform
+// size and channel. The RD results will be saved in rd_stats; rd_stats is
+// invalidated when the search aborts (tx64 disabled, over budget, or the
+// per-block walk exited early).
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+                          RD_STATS *rd_stats, int64_t ref_best_rd,
+                          int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, int use_fast_coef_costing,
+                          FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
+  assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size));
+
+  // 64x64 transforms may be disabled via encoder config; reject any tx_size
+  // whose square upscale is TX_64X64.
+  if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+
+  if (current_rd > ref_best_rd) {
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct rdcost_block_args args;
+  av1_zero(args);
+  args.x = x;
+  args.cpi = cpi;
+  args.best_rd = ref_best_rd;
+  args.current_rd = current_rd;
+  args.use_fast_coef_costing = use_fast_coef_costing;
+  args.ftxs_mode = ftxs_mode;
+  args.skip_trellis = skip_trellis;
+  av1_init_rd_stats(&args.rd_stats);
+
+  // Visit every transform block in the plane, accumulating RD stats in args.
+  av1_get_entropy_contexts(plane_bsize, pd, args.t_above, args.t_left);
+  av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm,
+                                         &args);
+
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  // Inter blocks tolerate an early exit as long as the walk completed
+  // (incomplete_exit); intra blocks treat any early exit as invalid.
+  const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early;
+
+  if (invalid_rd) {
+    av1_invalid_rd_stats(rd_stats);
+  } else {
+    *rd_stats = args.rd_stats;
+  }
+}
+
+// This function combines y and uv planes' transform search processes together
+// for inter-predicted blocks (including IntraBC), when the prediction is
+// already generated. It first does subtraction to obtain the prediction error.
+// Then it calls
+// av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
+// av1_txfm_uvrd sequentially and handles the early terminations
+// happening in those functions. At the end, it computes the
+// rd_stats/_y/_uv accordingly.
+// Returns 1 on success; 0 on early termination (rd_stats_y may additionally
+// be invalidated to signal that subsequent motion modes should be pruned).
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                    RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                    RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0],
+                                  x->skip_cost[skip_ctx][1] };
+  const int64_t min_header_rate =
+      mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]);
+  // Account for minimum skip and non_skip rd.
+  // Eventually either one of them will be added to mode_rate
+  const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+  if (min_header_rd_possible > ref_best_rd) {
+    av1_invalid_rd_stats(rd_stats_y);
+    return 0;
+  }
+
+  const AV1_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+  // Residue coding must beat ref_best_rd minus the already-committed mode
+  // header cost.
+  const int64_t rd_thresh =
+      ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
+  av1_init_rd_stats(rd_stats);
+  av1_init_rd_stats(rd_stats_y);
+  rd_stats->rate = mode_rate;
+
+  // cost and distortion
+  av1_subtract_plane(x, bsize, 0);
+  if (x->tx_mode_search_type == TX_MODE_SELECT &&
+      !xd->lossless[mbmi->segment_id]) {
+    // Recursive (non-uniform) transform partitioning search.
+    av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+    PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 2
+  } else {
+    // Uniform transform size search; propagate the single chosen tx_size to
+    // the whole inter_tx_size array and the per-block skip flags.
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+    for (int i = 0; i < xd->height * xd->width; ++i)
+      set_blk_skip(x, 0, i, rd_stats_y->skip);
+  }
+
+  if (rd_stats_y->rate == INT_MAX) return 0;
+
+  av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+  const int64_t non_skip_rdcosty =
+      RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist);
+  const int64_t skip_rdcosty =
+      RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse);
+  const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+  if (min_rdcosty > ref_best_rd) {
+    const int64_t tokenonly_rdy =
+        AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
+               RDCOST(x->rdmult, 0, rd_stats_y->sse));
+    // Invalidate rd_stats_y to skip the rest of the motion modes search
+    if (tokenonly_rdy -
+            (tokenonly_rdy >> cpi->sf.inter_sf.prune_motion_mode_level) >
+        rd_thresh) {
+      av1_invalid_rd_stats(rd_stats_y);
+    }
+    return 0;
+  }
+
+  av1_init_rd_stats(rd_stats_uv);
+  const int num_planes = av1_num_planes(cm);
+  if (num_planes > 1) {
+    int64_t ref_best_chroma_rd = ref_best_rd;
+    // Calculate best rd cost possible for chroma
+    if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma &&
+        (ref_best_chroma_rd != INT64_MAX)) {
+      ref_best_chroma_rd =
+          (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty));
+    }
+    const int is_cost_valid_uv =
+        av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
+    if (!is_cost_valid_uv) return 0;
+    av1_merge_rd_stats(rd_stats, rd_stats_uv);
+  }
+
+  // Decide the final skip flag: either the search already chose skip, or
+  // (in lossy mode) coding the tokens is no cheaper than skipping.
+  int choose_skip = rd_stats->skip;
+  if (!choose_skip && !xd->lossless[mbmi->segment_id]) {
+    const int64_t rdcost_no_skip = RDCOST(
+        x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0],
+        rd_stats->dist);
+    const int64_t rdcost_skip =
+        RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse);
+    if (rdcost_no_skip >= rdcost_skip) choose_skip = 1;
+  }
+  if (choose_skip) {
+    // Skip coding: token rates drop to zero and distortion becomes the SSE
+    // of the (uncoded) prediction residue.
+    rd_stats_y->rate = 0;
+    rd_stats_uv->rate = 0;
+    rd_stats->rate = mode_rate + skip_flag_cost[1];
+    rd_stats->dist = rd_stats->sse;
+    rd_stats_y->dist = rd_stats_y->sse;
+    rd_stats_uv->dist = rd_stats_uv->sse;
+    mbmi->skip = 1;
+    if (rd_stats->skip) {
+      const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      if (tmprd > ref_best_rd) return 0;
+    }
+  } else {
+    rd_stats->rate += skip_flag_cost[0];
+    mbmi->skip = 0;
+  }
+
+  return 1;
+}
diff --git a/libaom/av1/encoder/tx_search.h b/libaom/av1/encoder/tx_search.h
new file mode 100644
index 0000000..82d5671
--- /dev/null
+++ b/libaom/av1/encoder/tx_search.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
+#define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set this macro as 1 to collect data about tx size selection.
+#define COLLECT_TX_SIZE_DATA 0
+
+#if COLLECT_TX_SIZE_DATA
+static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
+#endif
+
+// Fast transform-type search modes. Values are bit flags and may be OR-ed.
+enum {
+  FTXS_NONE = 0,                       // full transform type search
+  FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,   // restrict candidates to DCT / 1D DCT
+  FTXS_DISABLE_TRELLIS_OPT = 1 << 1,   // skip trellis coefficient optimization
+  FTXS_USE_TRANSFORM_DOMAIN = 1 << 2   // estimate distortion in tx domain
+} UENUM1BYTE(FAST_TX_SEARCH_MODE);
+
+// Returns the entropy cost of signaling tx_size for this block, or 0 when the
+// transform size is not coded in the bitstream (tx mode is not TX_MODE_SELECT,
+// or the block size does not signal its transform size).
+static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                   TX_SIZE tx_size) {
+  assert(bsize == x->e_mbd.mi[0]->sb_type);
+  if (x->tx_mode_search_type != TX_MODE_SELECT || !block_signals_txsize(bsize))
+    return 0;
+
+  // Cost is looked up by block-size category, neighborhood context, and the
+  // depth of tx_size within that category.
+  const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+  const int depth = tx_size_to_depth(tx_size, bsize);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int tx_size_ctx = get_tx_size_context(xd);
+  return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+}
+
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                             RD_STATS *rd_stats, int64_t ref_best_rd,
+                             BLOCK_SIZE bs, TX_SIZE tx_size,
+                             FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
+
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                         RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                         int64_t ref_best_rd);
+
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bs,
+                                       int64_t ref_best_rd);
+
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+                  BLOCK_SIZE bsize, int64_t ref_best_rd);
+
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+                          RD_STATS *rd_stats, int64_t ref_best_rd,
+                          int64_t this_rd, int plane, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, int use_fast_coef_costing,
+                          FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
+
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                    RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                    RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
diff --git a/libaom/av1/encoder/use_flat_gop_model_params.h b/libaom/av1/encoder/use_flat_gop_model_params.h
new file mode 100644
index 0000000..cf07766
--- /dev/null
+++ b/libaom/av1/encoder/use_flat_gop_model_params.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
+#define AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// A binary classifier that returns true (score > 0) if it is better to use a
+// flat GOP structure, rather than a GOP structure that uses ALT-REFs and
+// internal ARFs.
+
+#define NUM_FEATURES 21
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_HIDDEN_NODES_LAYER0 48
+#define NUM_LABELS 1
+
+static const float
+    av1_use_flat_gop_nn_weights_layer0[NUM_FEATURES *
+                                       NUM_HIDDEN_NODES_LAYER0] = {
+      0.3801f,  -2.1832f, 1.7469f,  2.0130f,  2.1264f,  -0.7293f, -0.2814f,
+      0.0692f,  -4.6589f, -1.4591f, 0.3023f,  -0.4310f, -0.1911f, -0.8284f,
+      -1.3322f, -0.4621f, -0.1148f, -0.3531f, -0.0794f, -0.3114f, -0.1664f,
+      -0.1615f, 0.2913f,  -0.0394f, -0.0620f, 0.1845f,  0.0204f,  -0.2124f,
+      -0.1233f, -0.1685f, 0.1215f,  -0.2372f, -0.2865f, -0.1976f, 0.2137f,
+      -0.1318f, -0.0324f, 0.0415f,  -0.1172f, 0.1077f,  -0.1135f, -0.2462f,
+      -0.0743f, -0.1584f, -0.3267f, -0.0566f, -0.1615f, -0.3931f, -0.5200f,
+      -0.1786f, -0.1811f, -0.2812f, -0.1986f, -0.4393f, -0.3941f, -0.2500f,
+      -0.2029f, -0.4605f, -0.4973f, -0.2238f, -0.2599f, -0.1951f, -0.2034f,
+      -0.3186f, -0.1368f, -0.5076f, -0.4718f, -0.1815f, -0.3338f, -0.0550f,
+      -0.3920f, -0.5328f, -0.1658f, -0.2194f, -0.2867f, -0.0916f, -0.1678f,
+      -0.1760f, -0.5055f, -0.2322f, -0.4668f, -0.0121f, -0.3903f, -0.2721f,
+      -0.1306f, 0.1199f,  0.2894f,  0.1098f,  -0.0155f, -0.0844f, 0.0421f,
+      -0.2364f, -0.1073f, -0.0878f, -0.2146f, -0.1713f, -0.2283f, 0.0342f,
+      0.0394f,  -0.2808f, -0.0048f, 0.2640f,  -0.1371f, 0.1709f,  0.0155f,
+      -0.3614f, -0.1843f, -0.3215f, -0.3121f, -0.2609f, -0.0254f, -0.2474f,
+      -0.4674f, -0.3674f, -0.2076f, 0.0149f,  -0.3304f, -0.2678f, -0.0465f,
+      -0.1326f, -0.4504f, -0.5101f, -0.1280f, -0.0416f, -0.4296f, -0.4568f,
+      -0.6762f, -2.8105f, 0.7249f,  1.4288f,  1.3731f,  0.3034f,  0.1841f,
+      -0.0912f, -0.1508f, 1.2637f,  -0.2009f, 0.3236f,  -0.2500f, -0.0736f,
+      0.8655f,  -0.2599f, 0.1150f,  -0.0368f, -0.1122f, -0.7650f, -0.2004f,
+      -0.0891f, -0.3832f, -0.2576f, -0.3532f, -0.1735f, -0.4018f, -0.0265f,
+      -0.2988f, 0.2555f,  -0.1041f, -0.3391f, -0.5316f, -0.0171f, -0.3232f,
+      -0.0565f, -0.3359f, -0.1842f, -0.0582f, 0.0073f,  -0.0278f, -0.5517f,
+      0.0892f,  -0.1354f, 0.0548f,  -0.0401f, -0.1697f, 0.0432f,  0.0832f,
+      -0.3538f, 0.2602f,  -0.0066f, -0.2130f, -0.3085f, 0.0025f,  0.2464f,
+      -0.0103f, -0.3082f, -0.1136f, -0.2359f, -0.3421f, 0.1335f,  -0.3016f,
+      -1.0355f, -1.0572f, -0.3316f, -0.1235f, -0.3730f, -0.1751f, -0.1921f,
+      0.0031f,  -0.6297f, -0.5179f, 0.1082f,  -0.3130f, -0.1120f, -0.5430f,
+      -0.1782f, 0.0534f,  -0.1052f, 0.1471f,  -0.7156f, -0.5453f, -0.5437f,
+      1.8709f,  1.9696f,  -1.0343f, -0.3150f, -0.8399f, -0.0052f, -0.1123f,
+      -0.1059f, 0.6755f,  1.2593f,  -0.2512f, -0.2053f, 0.0835f,  0.3261f,
+      -0.0172f, 0.1230f,  -0.3687f, 0.1993f,  0.9390f,  -0.0165f, 0.6856f,
+      -0.4372f, -0.4041f, -0.2869f, -0.3871f, -0.3587f, -0.2418f, 0.0518f,
+      0.0110f,  -1.4713f, -0.1307f, -0.3246f, -0.5091f, -0.4652f, -0.4288f,
+      -0.0763f, -0.1755f, 0.0662f,  -0.3026f, -0.4462f, -0.4123f, -0.2891f,
+      -0.2251f, -0.4925f, -0.3820f, -0.1840f, -0.2878f, -0.1973f, -0.1010f,
+      -0.1622f, -0.3108f, -0.5292f, -0.1017f, -0.0607f, -0.2426f, -0.6406f,
+      -0.3834f, -0.2313f, -0.2433f, -0.1773f, -0.1581f, -0.3295f, -0.3799f,
+      -0.4447f, -0.2389f, -0.4231f, -0.1498f, -0.0181f, -0.4429f, -0.3515f,
+      0.0425f,  -0.5280f, -0.3462f, -0.3659f, 0.0153f,  -0.1002f, -0.5057f,
+      -0.2134f, -0.2859f, -0.1988f, -0.4758f, 0.0967f,  -0.4784f, 0.1868f,
+      -0.4387f, -1.3376f, -0.4452f, 0.3837f,  0.1698f,  -0.7076f, -0.4320f,
+      0.0382f,  -1.8053f, -0.6589f, 0.1406f,  -0.4340f, 0.0641f,  -0.2558f,
+      -0.4496f, -0.5003f, -0.6241f, -0.2217f, -0.8312f, -0.6793f, -0.3563f,
+      0.5153f,  -0.7851f, 1.0570f,  0.9702f,  0.5238f,  -0.6932f, -0.4443f,
+      0.0407f,  -3.0961f, -0.8461f, 0.0562f,  -0.0642f, 0.2471f,  -0.5911f,
+      -0.7715f, -0.1574f, -0.0375f, -0.1951f, -0.3097f, -0.2040f, 0.0128f,
+      -0.0918f, -0.0698f, -0.0970f, -0.2946f, -0.1723f, -0.2569f, -0.4382f,
+      -0.5174f, -0.2058f, -0.2973f, -0.0858f, -0.2526f, -0.2648f, -0.2339f,
+      -0.3474f, 0.0607f,  0.0272f,  -0.3142f, -0.1306f, -0.4938f, -0.1894f,
+      -0.0551f, -0.1061f, -0.1613f, -0.1942f, 0.0590f,  -0.2009f, -0.1286f,
+      -0.2035f, -0.0393f, -0.0650f, -0.1110f, 0.0123f,  -0.1122f, -0.0246f,
+      -0.2042f, 0.0411f,  -0.2771f, -0.0189f, 0.0927f,  0.0286f,  -0.1559f,
+      -0.3217f, -0.1039f, 0.1471f,  0.2489f,  0.2085f,  -0.4199f, -0.2404f,
+      0.0358f,  -0.7567f, -0.2413f, -0.3437f, -0.2433f, -0.3687f, -0.1194f,
+      -0.4289f, -0.1138f, -0.0721f, -0.3461f, -0.0244f, -0.3530f, -0.2842f,
+      -0.3823f, -0.1238f, -0.5475f, -0.2688f, -0.0073f, 0.0491f,  -0.4500f,
+      0.0201f,  0.0303f,  -0.2160f, -0.4219f, -0.4831f, -0.4593f, -0.2304f,
+      -0.2082f, -0.0367f, -0.5226f, -0.0082f, -0.1867f, -0.1812f, -0.2753f,
+      2.6650f,  1.9698f,  -2.9425f, 1.2119f,  1.5000f,  0.3356f,  0.3905f,
+      -0.2006f, -1.4038f, -1.0917f, 0.1423f,  -0.3528f, 0.0888f,  0.5802f,
+      1.0977f,  0.1083f,  -0.0693f, -0.0784f, 0.4247f,  0.4108f,  0.4970f,
+      -0.7290f, -0.1659f, -0.0517f, 0.0776f,  -0.0550f, -0.2374f, -0.4245f,
+      -0.0165f, -0.6804f, -0.3211f, -0.3101f, -0.1883f, -0.0786f, -0.3971f,
+      -0.4130f, -0.0606f, 0.1432f,  -0.0518f, -0.4179f, -0.4949f, -0.3451f,
+      -0.7559f, -4.0792f, 1.5526f,  0.2824f,  0.6086f,  -0.2148f, 0.0959f,
+      0.0506f,  -5.5176f, -3.9702f, 0.1597f,  -0.1760f, -0.0627f, 0.1657f,
+      -1.2996f, -0.2899f, -0.0600f, -0.0531f, -1.5160f, -0.4837f, -1.6961f,
+      -0.1134f, -0.1838f, -0.3071f, -0.4215f, -0.4184f, 0.0192f,  -0.2128f,
+      -0.3094f, -0.2607f, -0.4855f, -0.1881f, 0.0258f,  -0.5085f, -0.3630f,
+      -0.4824f, -0.3762f, -0.3324f, -0.1134f, -0.3350f, 0.0217f,  -0.2803f,
+      -0.5669f, -0.5674f, -0.5441f, -0.5965f, -0.3062f, -0.4666f, -0.4079f,
+      -0.0065f, -0.7566f, -0.3437f, -0.2474f, -0.2360f, -0.5683f, -0.3853f,
+      -0.6670f, -0.4158f, -0.2831f, -0.3327f, -0.7419f, -0.6481f, -0.4004f,
+      -0.4025f, -0.6405f, -0.4265f, -0.0167f, 0.3195f,  -0.0822f, -0.4350f,
+      -0.0032f, -1.0448f, -0.4407f, 0.0488f,  0.0776f,  -0.3828f, -0.3380f,
+      -0.2983f, -0.2220f, -0.4105f, -0.2312f, -0.4166f, -0.3258f, -0.1424f,
+      -0.6588f, -0.9433f, 0.3402f,  0.5800f,  0.6368f,  -0.4298f, -0.5743f,
+      0.0822f,  -1.0843f, -0.1645f, -0.1990f, 0.0255f,  -0.1039f, -0.3673f,
+      0.4367f,  -0.5491f, -0.0932f, -0.0323f, -0.2405f, -0.2922f, -0.4019f,
+      -0.4936f, -1.2338f, 0.4681f,  0.7454f,  0.8181f,  -0.3680f, -0.1613f,
+      -0.0008f, -1.3326f, -0.0667f, 0.1569f,  -0.0978f, -0.3229f, -0.4222f,
+      0.0330f,  0.1064f,  -0.1325f, 0.0121f,  -0.3976f, -0.2254f, -0.3942f,
+      -0.4771f, -0.1887f, 0.1020f,  0.3331f,  0.3098f,  -0.1256f, -0.4736f,
+      0.0295f,  -0.3919f, -0.0931f, -0.2484f, -0.4629f, -0.2800f, -0.2851f,
+      -0.2243f, -0.3958f, -0.3053f, -0.6585f, -0.1159f, -0.2330f, -0.1989f,
+      0.2273f,  0.1963f,  0.0283f,  0.0198f,  -0.1298f, -0.0627f, -0.2753f,
+      -0.1552f, 0.2734f,  -0.0551f, -0.2927f, -0.3772f, -0.4522f, -0.0786f,
+      0.0079f,  0.1664f,  -0.0228f, -0.2908f, -0.1714f, 0.1223f,  -0.0680f,
+      -0.5048f, -0.0852f, -0.4653f, -0.5142f, -0.1818f, -0.1659f, 0.0678f,
+      -0.1296f, 0.0295f,  -0.3487f, -0.1224f, -0.2690f, -0.3217f, -0.1957f,
+      -0.3196f, -0.4530f, -0.1746f, -0.2307f, -0.0504f, -0.0131f, -0.4613f,
+      -0.1476f, -0.5596f, -0.3829f, -0.4302f, -0.2910f, -0.2182f, -0.0811f,
+      -0.3967f, -0.3912f, -0.0371f, -0.1109f, -0.0793f, -0.2063f, -0.0060f,
+      -0.0236f, -0.4098f, -0.0276f, -0.3352f, -0.1888f, -0.2439f, -0.3748f,
+      0.0371f,  0.8460f,  -0.5547f, -1.2680f, -1.1623f, -0.1740f, -0.4815f,
+      -0.0294f, 4.4764f,  0.3716f,  -0.2826f, -0.0549f, -0.2937f, 0.0632f,
+      0.0686f,  -0.4681f, -0.2555f, -0.2427f, -0.2261f, -0.1567f, -0.5199f,
+      -0.4079f, -0.0801f, -0.2075f, -0.3956f, -0.0307f, -0.3150f, -0.3490f,
+      -0.0379f, 0.3060f,  -0.1775f, -0.1651f, 0.0677f,  -0.1947f, 0.0032f,
+      -0.2014f, -0.1575f, -0.1289f, -0.0250f, -0.0762f, -0.2324f, -0.2895f,
+      -0.4531f, -0.4601f, -0.1718f, -0.3139f, -0.4350f, 0.0346f,  -0.0891f,
+      -0.1581f, 0.2123f,  -0.1074f, 0.0221f,  0.0951f,  0.1161f,  0.0245f,
+      -0.0701f, -0.1677f, -0.4170f, -0.2214f, -0.3419f, -0.4873f, -0.0701f,
+      -0.0613f, -0.1031f, 0.0141f,  -0.1299f, -0.3953f, -0.2182f, -0.2679f,
+      -0.0141f, 0.3392f,  -0.0722f, -0.2390f, 0.1638f,  -0.1596f, -0.1527f,
+      -0.3581f, -0.4037f, -0.0736f, 0.0397f,  -0.1288f, -0.1362f, -0.0249f,
+      -0.5099f, -0.4040f, -0.1893f, -0.0298f, -0.1332f, -0.1693f, -0.3301f,
+      -0.1058f, -0.1414f, -0.5737f, -0.2342f, -0.2560f, -0.3834f, -0.0917f,
+      -0.1334f, -0.5077f, -0.3666f, -0.2515f, -0.4824f, -0.4714f, -0.5723f,
+      -0.1361f, -0.5244f, -0.2468f, 0.0237f,  -0.1862f, -0.3124f, -0.0183f,
+      -0.4662f, -0.4444f, -0.5400f, -0.1730f, -0.0123f, -0.2134f, -0.1024f,
+      -0.0172f, -0.4430f, -0.1403f, -0.0751f, -0.2403f, -0.2100f, -0.0678f,
+      2.4232f,  1.9825f,  0.1260f,  1.9972f,  2.8061f,  0.3916f,  0.1842f,
+      -0.2603f, -1.6092f, -1.6037f, 0.1475f,  0.0516f,  -0.2593f, 0.0359f,
+      -0.1802f, 0.0159f,  -0.0529f, -0.0983f, 0.7638f,  0.5529f,  0.9662f,
+      -0.4049f, -0.6372f, 0.4907f,  0.7360f,  0.9271f,  -0.6879f, -0.1067f,
+      0.0323f,  -1.8447f, 0.2176f,  -0.1047f, -0.0048f, -0.1031f, -0.7931f,
+      -0.3059f, -0.4595f, -0.1287f, -0.4031f, 0.1441f,  -0.6651f, 0.2530f,
+      -0.4572f, -0.0614f, 0.0345f,  -0.0008f, 0.0333f,  -0.3431f, 0.0538f,
+      -0.2691f, 0.2930f,  -0.0820f, -0.0979f, -0.0307f, 0.1713f,  0.0783f,
+      -0.4337f, -0.2702f, -0.1677f, -0.1719f, -0.4669f, -0.2847f, -0.4495f,
+      -0.3692f, -0.2641f, -0.2833f, -0.1168f, -0.0523f, -0.2368f, -0.4922f,
+      -0.3453f, -0.4452f, -0.5212f, 0.0412f,  -0.3310f, -0.2656f, -0.4903f,
+      -0.3854f, -0.1009f, -0.1038f, -0.2350f, -0.4430f, -0.5097f, -0.1755f,
+      0.0110f,  -0.0712f, -0.0662f, -0.4493f, -0.2111f, -0.3402f, -0.3100f,
+      -0.2525f, -0.1856f, -0.2689f, -0.4288f, -0.3912f, -0.0754f, -0.5191f,
+      -0.0747f, -0.0626f, -0.4821f, -0.2014f, -0.3124f, -0.4858f, -0.1896f,
+      1.0673f,  -0.8529f, 13.7564f, 18.7299f, 19.0062f, -1.1047f, -0.8654f,
+      0.1089f,  -1.2958f, -0.7793f, 0.0780f,  -0.1679f, 0.0054f,  -1.2451f,
+      -0.1287f, 0.0082f,  -0.2960f, -0.0442f, 2.3817f,  0.4716f,  1.3862f,
+      -0.0782f, -0.1871f, -0.2596f, 0.0093f,  0.1451f,  -0.1124f, -0.2315f,
+      -0.2677f, -0.1086f, 0.2216f,  0.2928f,  0.0391f,  0.0372f,  -0.2551f,
+      0.0552f,  -0.1876f, -0.2361f, -0.1889f, -0.0279f, 0.1204f,  0.2016f,
+      -0.5787f, -0.5830f, 0.0530f,  -0.1452f, -0.4899f, -0.2937f, 0.1430f,
+      -0.2752f, -0.2320f, -0.1908f, -0.5538f, -0.0858f, -0.1378f, -0.1505f,
+      -0.3908f, -0.4732f, -0.3018f, 0.0244f,  -0.2392f, -0.2833f, -0.3997f,
+      -0.4495f, -0.2570f, -0.3189f, -0.1534f, -0.1040f, -0.5497f, -0.3524f,
+      -0.2053f, 0.2415f,  -0.5027f, 0.0288f,  -0.1904f, -0.2183f, -0.1062f,
+      -0.3560f, 0.0165f,  -0.4601f, -0.2144f, -0.0439f, -0.4913f, -0.3160f,
+      -0.1641f, 0.1010f,  -0.1044f, -0.4064f, -0.3580f, -0.4015f, 0.1010f,
+      -0.1973f, 0.6392f,  -0.5177f, -0.0472f, -0.1526f, 0.1533f,  -0.0819f,
+      -0.0252f, -0.0783f, 0.1301f,  0.0158f,  -0.2003f, -0.4700f, -0.2329f,
+    };
+
+static const float
+    av1_use_flat_gop_nn_biases_layer0[NUM_HIDDEN_NODES_LAYER0] = {
+      -1.113218f, 0.f,        -0.268537f, -0.268537f, 0.f,        -0.268534f,
+      -0.40681f,  -0.268537f, -0.061835f, -0.614956f, 0.984277f,  -0.280228f,
+      -0.354716f, -0.202312f, -0.772829f, -0.464005f, -0.230795f, 0.f,
+      -0.124187f, -0.265949f, 0.325168f,  -0.359008f, -2.455546f, -0.229222f,
+      -0.692233f, -0.29401f,  -0.632682f, -0.479061f, -0.166094f, 0.077291f,
+      -0.235293f, -0.268537f, 0.167899f,  -0.141991f, -0.210089f, -0.177294f,
+      -0.325401f, -0.268537f, 0.323627f,  -0.156593f, -0.218451f, -0.230792f,
+      -0.268537f, 0.833177f,  0.f,        -0.353177f, -0.260953f, -0.209537f,
+    };
+
+static const float
+    av1_use_flat_gop_nn_weights_layer1[NUM_HIDDEN_NODES_LAYER0 * NUM_LABELS] = {
+      -0.024695f, 0.146668f,  -0.02723f,  0.034577f,  -0.255426f, 0.22402f,
+      -0.112595f, -0.131262f, 0.091164f,  -0.045294f, 0.028304f,  -0.051683f,
+      0.310497f,  -0.077786f, -0.047873f, -0.057205f, -0.065119f, 0.227417f,
+      -0.051126f, -0.137241f, 0.035742f,  -0.058992f, -0.021466f, 0.107947f,
+      -0.077183f, -0.04144f,  0.003568f,  -0.027656f, 0.038196f,  0.19684f,
+      -0.128401f, 0.149629f,  0.024526f,  0.037376f,  0.090752f,  -0.061666f,
+      -0.15743f,  0.057773f,  -0.010582f, 0.120997f,  0.060368f,  0.210028f,
+      -0.192244f, -0.064764f, -0.237655f, 0.1852f,    -0.084281f, -0.010434f,
+    };
+
+static const float av1_use_flat_gop_nn_biases_layer1[NUM_LABELS] = {
+  -0.672434f,
+};
+
+static const NN_CONFIG av1_use_flat_gop_nn_config = {
+  NUM_FEATURES,
+  NUM_LABELS,
+  NUM_HIDDEN_LAYERS,
+  {
+      NUM_HIDDEN_NODES_LAYER0,
+  },
+  {
+      av1_use_flat_gop_nn_weights_layer0,
+      av1_use_flat_gop_nn_weights_layer1,
+  },
+  {
+      av1_use_flat_gop_nn_biases_layer0,
+      av1_use_flat_gop_nn_biases_layer1,
+  },
+};
+
+#undef NUM_FEATURES
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_HIDDEN_NODES_LAYER0
+#undef NUM_LABELS
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
diff --git a/libaom/av1/encoder/var_based_part.c b/libaom/av1/encoder/var_based_part.c
index 3cead91..504833e 100644
--- a/libaom/av1/encoder/var_based_part.c
+++ b/libaom/av1/encoder/var_based_part.c
@@ -34,100 +34,52 @@
 extern const uint8_t AV1_VAR_OFFS[];
 
 typedef struct {
-  // TODO(kyslov): consider changing to 64bit
-
-  // This struct is used for computing variance in choose_partitioning(), where
-  // the max number of samples within a superblock is 32x32 (with 4x4 avg).
-  // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32
-  // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit
-  uint32_t sum_square_error;
-  int32_t sum_error;
-  int log2_count;
-  int variance;
-} var;
-
-typedef struct {
-  var none;
-  var horz[2];
-  var vert[2];
-} partition_variance;
-
-typedef struct {
-  partition_variance part_variances;
-  var split[4];
-} v4x4;
-
-typedef struct {
-  partition_variance part_variances;
-  v4x4 split[4];
-} v8x8;
-
-typedef struct {
-  partition_variance part_variances;
-  v8x8 split[4];
-} v16x16;
-
-typedef struct {
-  partition_variance part_variances;
-  v16x16 split[4];
-} v32x32;
-
-typedef struct {
-  partition_variance part_variances;
-  v32x32 split[4];
-} v64x64;
-
-typedef struct {
-  partition_variance part_variances;
-  v64x64 split[4];
-} v128x128;
-
-typedef struct {
-  partition_variance *part_variances;
-  var *split[4];
+  VPVariance *part_variances;
+  VPartVar *split[4];
 } variance_node;
 
-static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
+static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize,
+                                    variance_node *node) {
   int i;
   node->part_variances = NULL;
   switch (bsize) {
     case BLOCK_128X128: {
-      v128x128 *vt = (v128x128 *)data;
+      VP128x128 *vt = (VP128x128 *)data;
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++)
         node->split[i] = &vt->split[i].part_variances.none;
       break;
     }
     case BLOCK_64X64: {
-      v64x64 *vt = (v64x64 *)data;
+      VP64x64 *vt = (VP64x64 *)data;
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++)
         node->split[i] = &vt->split[i].part_variances.none;
       break;
     }
     case BLOCK_32X32: {
-      v32x32 *vt = (v32x32 *)data;
+      VP32x32 *vt = (VP32x32 *)data;
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++)
         node->split[i] = &vt->split[i].part_variances.none;
       break;
     }
     case BLOCK_16X16: {
-      v16x16 *vt = (v16x16 *)data;
+      VP16x16 *vt = (VP16x16 *)data;
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++)
         node->split[i] = &vt->split[i].part_variances.none;
       break;
     }
     case BLOCK_8X8: {
-      v8x8 *vt = (v8x8 *)data;
+      VP8x8 *vt = (VP8x8 *)data;
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++)
         node->split[i] = &vt->split[i].part_variances.none;
       break;
     }
     default: {
-      v4x4 *vt = (v4x4 *)data;
+      VP4x4 *vt = (VP4x4 *)data;
       assert(bsize == BLOCK_4X4);
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
@@ -137,13 +89,14 @@
 }
 
 // Set variance values given sum square error, sum error, count.
-static void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c,
+                                     VPartVar *v) {
   v->sum_square_error = s2;
   v->sum_error = s;
   v->log2_count = c;
 }
 
-static void get_variance(var *v) {
+static AOM_INLINE void get_variance(VPartVar *v) {
   v->variance =
       (int)(256 * (v->sum_square_error -
                    (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
@@ -151,13 +104,14 @@
             v->log2_count);
 }
 
-static void sum_2_variances(const var *a, const var *b, var *r) {
+static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b,
+                                       VPartVar *r) {
   assert(a->log2_count == b->log2_count);
   fill_variance(a->sum_square_error + b->sum_square_error,
                 a->sum_error + b->sum_error, a->log2_count + 1, r);
 }
 
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
   variance_node node;
   memset(&node, 0, sizeof(node));
   tree_to_node(data, bsize, &node);
@@ -169,11 +123,13 @@
                   &node.part_variances->none);
 }
 
-static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
-                           MACROBLOCKD *const xd, int mi_row, int mi_col,
-                           BLOCK_SIZE bsize) {
-  if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
-    set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+static AOM_INLINE void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                      MACROBLOCKD *const xd, int mi_row,
+                                      int mi_col, BLOCK_SIZE bsize) {
+  if (cpi->common.mi_params.mi_cols > mi_col &&
+      cpi->common.mi_params.mi_rows > mi_row) {
+    set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                          mi_row, mi_col);
     xd->mi[0]->sb_type = bsize;
   }
 }
@@ -194,18 +150,14 @@
 
   if (force_split == 1) return 0;
 
-  if (mi_col + block_width > tile->mi_col_end ||
-      mi_row + block_height > tile->mi_row_end)
-    return 0;
-
   // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
   // variance is below threshold, otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
   if (bsize == bsize_min) {
     // Variance already computed to set the force_split.
     if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
+    if (mi_col + block_width <= tile->mi_col_end &&
+        mi_row + block_height <= tile->mi_row_end &&
         vt.part_variances->none.variance < threshold) {
       set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
       return 1;
@@ -221,15 +173,15 @@
       return 0;
     }
     // If variance is low, take the bsize (no split).
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
+    if (mi_col + block_width <= tile->mi_col_end &&
+        mi_row + block_height <= tile->mi_row_end &&
         vt.part_variances->none.variance < threshold) {
       set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
       return 1;
     }
-
     // Check vertical split.
-    if (mi_row + block_height / 2 < cm->mi_rows) {
+    if (mi_row + block_height <= tile->mi_row_end &&
+        mi_col + block_width / 2 <= tile->mi_col_end) {
       BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
       get_variance(&vt.part_variances->vert[0]);
       get_variance(&vt.part_variances->vert[1]);
@@ -243,7 +195,8 @@
       }
     }
     // Check horizontal split.
-    if (mi_col + block_width / 2 < cm->mi_cols) {
+    if (mi_col + block_width <= tile->mi_col_end &&
+        mi_row + block_height / 2 <= tile->mi_row_end) {
       BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
       get_variance(&vt.part_variances->horz[0]);
       get_variance(&vt.part_variances->horz[1]);
@@ -256,16 +209,20 @@
         return 1;
       }
     }
-
     return 0;
   }
   return 0;
 }
 
-static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
-                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
-                                 int pixels_wide, int pixels_high,
-                                 int is_key_frame) {
+static AOM_INLINE void fill_variance_8x8avg(const uint8_t *s, int sp,
+                                            const uint8_t *d, int dp,
+                                            int x16_idx, int y16_idx,
+                                            VP16x16 *vst,
+#if CONFIG_AV1_HIGHBITDEPTH
+                                            int highbd_flag,
+#endif
+                                            int pixels_wide, int pixels_high,
+                                            int is_key_frame) {
   int k;
   for (k = 0; k < 4; k++) {
     int x8_idx = x16_idx + ((k & 1) << 3);
@@ -275,9 +232,19 @@
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
       int s_avg;
       int d_avg = 128;
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = aom_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame)
+          d_avg = aom_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+      } else {
+        s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+      }
+#else
       s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
       if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-
+#endif
       sum = s_avg - d_avg;
       sse = sum * sum;
     }
@@ -286,8 +253,11 @@
 }
 
 static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
-                              int dp, int x16_idx, int y16_idx, int pixels_wide,
-                              int pixels_high) {
+                              int dp, int x16_idx, int y16_idx,
+#if CONFIG_AV1_HIGHBITDEPTH
+                              int highbd_flag,
+#endif
+                              int pixels_wide, int pixels_high) {
   int k;
   int minmax_max = 0;
   int minmax_min = 255;
@@ -298,8 +268,18 @@
     int min = 0;
     int max = 0;
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        aom_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                              d + y8_idx * dp + x8_idx, dp, &min, &max);
+      } else {
+        aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx,
+                       dp, &min, &max);
+      }
+#else
       aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp,
                      &min, &max);
+#endif
       if ((max - min) > minmax_max) minmax_max = (max - min);
       if ((max - min) < minmax_min) minmax_min = (max - min);
     }
@@ -307,10 +287,14 @@
   return (minmax_max - minmax_min);
 }
 
-static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
-                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
-                                 int pixels_wide, int pixels_high,
-                                 int is_key_frame) {
+static AOM_INLINE void fill_variance_4x4avg(const uint8_t *s, int sp,
+                                            const uint8_t *d, int dp,
+                                            int x8_idx, int y8_idx, VP8x8 *vst,
+#if CONFIG_AV1_HIGHBITDEPTH
+                                            int highbd_flag,
+#endif
+                                            int pixels_wide, int pixels_high,
+                                            int is_key_frame) {
   int k;
   for (k = 0; k < 4; k++) {
     int x4_idx = x8_idx + ((k & 1) << 2);
@@ -320,8 +304,20 @@
     if (x4_idx < pixels_wide && y4_idx < pixels_high) {
       int s_avg;
       int d_avg = 128;
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = aom_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        if (!is_key_frame)
+          d_avg = aom_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      } else {
+        s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      }
+#else
       s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp);
       if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+#endif
+
       sum = s_avg - d_avg;
       sse = sum * sum;
     }
@@ -329,22 +325,15 @@
   }
 }
 
+// TODO(kyslov) Bring back threshold adjustment based on content state
 static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
                                          int width, int height,
                                          int content_state) {
+  (void)width;
+  (void)height;
+  (void)content_state;
   if (speed >= 8) {
-    if (width <= 640 && height <= 480)
-      return (5 * threshold_base) >> 2;
-    else if ((content_state == kLowSadLowSumdiff) ||
-             (content_state == kHighSadLowSumdiff) ||
-             (content_state == kLowVarHighSumdiff))
-      return (5 * threshold_base) >> 2;
-  } else if (speed == 7) {
-    if ((content_state == kLowSadLowSumdiff) ||
-        (content_state == kHighSadLowSumdiff) ||
-        (content_state == kLowVarHighSumdiff)) {
-      return (5 * threshold_base) >> 2;
-    }
+    return (5 * threshold_base) >> 2;
   }
   return threshold_base;
 }
@@ -353,13 +342,14 @@
 // 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
 // 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is
 // currently only used on key frame.
-static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q,
-                               int content_state) {
+static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
+                                          int q, int content_state) {
   AV1_COMMON *const cm = &cpi->common;
   const int is_key_frame = frame_is_intra_only(cm);
   const int threshold_multiplier = is_key_frame ? 40 : 1;
   int64_t threshold_base =
-      (int64_t)(threshold_multiplier * cpi->dequants.y_dequant_QTX[q][1]);
+      (int64_t)(threshold_multiplier *
+                cpi->enc_quant_dequant_params.dequants.y_dequant_QTX[q][1]);
 
   if (is_key_frame) {
     thresholds[0] = threshold_base;
@@ -372,58 +362,239 @@
     threshold_base = scale_part_thresh_sumdiff(
         threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state);
 
+    thresholds[0] = threshold_base >> 1;
     thresholds[1] = threshold_base;
     thresholds[3] = threshold_base << cpi->oxcf.speed;
     if (cm->width >= 1280 && cm->height >= 720)
       thresholds[3] = thresholds[3] << 1;
-    if (cm->width <= 352 && cm->height <= 288) {
-      thresholds[1] = threshold_base >> 3;
-      thresholds[2] = threshold_base >> 1;
-      thresholds[3] = threshold_base << 3;
+    if (cm->width * cm->height <= 352 * 288) {
+      int last_qindex = cpi->rc.last_q[INTER_FRAME];
+      if (last_qindex >= QINDEX_HIGH_THR) {
+        threshold_base = (5 * threshold_base) >> 1;
+        thresholds[1] = threshold_base >> 3;
+        thresholds[2] = threshold_base << 2;
+        thresholds[3] = threshold_base << 5;
+      } else if (last_qindex < QINDEX_LOW_THR) {
+        thresholds[1] = threshold_base >> 3;
+        thresholds[2] = threshold_base >> 1;
+        thresholds[3] = threshold_base << 3;
+      } else {
+        int64_t qi_diff_low = last_qindex - QINDEX_LOW_THR;
+        int64_t qi_diff_high = QINDEX_HIGH_THR - last_qindex;
+        int64_t threshold_diff = QINDEX_HIGH_THR - QINDEX_LOW_THR;
+        int64_t threshold_base_high = (5 * threshold_base) >> 1;
+
+        threshold_diff = threshold_diff > 0 ? threshold_diff : 1;
+        threshold_base = (qi_diff_low * threshold_base_high +
+                          qi_diff_high * threshold_base) /
+                         threshold_diff;
+        thresholds[1] = threshold_base >> 3;
+        thresholds[2] = ((qi_diff_low * threshold_base) +
+                         qi_diff_high * (threshold_base >> 1)) /
+                        threshold_diff;
+        thresholds[3] = ((qi_diff_low * (threshold_base << 5)) +
+                         qi_diff_high * (threshold_base << 3)) /
+                        threshold_diff;
+      }
     } else if (cm->width < 1280 && cm->height < 720) {
       thresholds[2] = (5 * threshold_base) >> 2;
     } else if (cm->width < 1920 && cm->height < 1080) {
       thresholds[2] = threshold_base << 1;
-      thresholds[3] <<= 2;
     } else {
       thresholds[2] = (5 * threshold_base) >> 1;
     }
   }
 }
 
+// Set temporal variance low flag for superblock 64x64.
+// Only first 25 in the array are used in this case.
+static AOM_INLINE void set_low_temp_var_flag_64x64(
+    CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd,
+    VP64x64 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
+  if (xd->mi[0]->sb_type == BLOCK_64X64) {
+    if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
+      x->variance_low[0] = 1;
+  } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+    for (int i = 0; i < 2; i++) {
+      if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
+        x->variance_low[i + 1] = 1;
+    }
+  } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+    for (int i = 0; i < 2; i++) {
+      if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
+        x->variance_low[i + 3] = 1;
+    }
+  } else {
+    static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+    for (int i = 0; i < 4; i++) {
+      const int idx_str =
+          mi_params->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1];
+      MB_MODE_INFO **this_mi = mi_params->mi_grid_base + idx_str;
+
+      if (mi_params->mi_cols <= mi_col + idx[i][1] ||
+          mi_params->mi_rows <= mi_row + idx[i][0])
+        continue;
+
+      if (*this_mi == NULL) continue;
+
+      if ((*this_mi)->sb_type == BLOCK_32X32) {
+        int64_t threshold_32x32 = (5 * thresholds[1]) >> 3;
+        if (vt->split[i].part_variances.none.variance < threshold_32x32)
+          x->variance_low[i + 5] = 1;
+      } else {
+        // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+        // inside.
+        if ((*this_mi)->sb_type == BLOCK_16X16 ||
+            (*this_mi)->sb_type == BLOCK_32X16 ||
+            (*this_mi)->sb_type == BLOCK_16X32) {
+          for (int j = 0; j < 4; j++) {
+            if (vt->split[i].split[j].part_variances.none.variance <
+                (thresholds[2] >> 8))
+              x->variance_low[(i << 2) + j + 9] = 1;
+          }
+        }
+      }
+    }
+  }
+}
+
+static AOM_INLINE void set_low_temp_var_flag_128x128(
+    CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd,
+    VP128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
+  if (xd->mi[0]->sb_type == BLOCK_128X128) {
+    if (vt->part_variances.none.variance < (thresholds[0] >> 1))
+      x->variance_low[0] = 1;
+  } else if (xd->mi[0]->sb_type == BLOCK_128X64) {
+    for (int i = 0; i < 2; i++) {
+      if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
+        x->variance_low[i + 1] = 1;
+    }
+  } else if (xd->mi[0]->sb_type == BLOCK_64X128) {
+    for (int i = 0; i < 2; i++) {
+      if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
+        x->variance_low[i + 3] = 1;
+    }
+  } else {
+    static const int idx64[4][2] = {
+      { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 }
+    };
+    static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+    for (int i = 0; i < 4; i++) {
+      const int idx_str =
+          mi_params->mi_stride * (mi_row + idx64[i][0]) + mi_col + idx64[i][1];
+      MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str;
+      if (*mi_64 == NULL) continue;
+      if (mi_params->mi_cols <= mi_col + idx64[i][1] ||
+          mi_params->mi_rows <= mi_row + idx64[i][0])
+        continue;
+      const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
+      if ((*mi_64)->sb_type == BLOCK_64X64) {
+        if (vt->split[i].part_variances.none.variance < threshold_64x64)
+          x->variance_low[5 + i] = 1;
+      } else if ((*mi_64)->sb_type == BLOCK_64X32) {
+        for (int j = 0; j < 2; j++)
+          if (vt->split[i].part_variances.horz[j].variance <
+              (threshold_64x64 >> 1))
+            x->variance_low[9 + (i << 1) + j] = 1;
+      } else if ((*mi_64)->sb_type == BLOCK_32X64) {
+        for (int j = 0; j < 2; j++)
+          if (vt->split[i].part_variances.vert[j].variance <
+              (threshold_64x64 >> 1))
+            x->variance_low[17 + (i << 1) + j] = 1;
+      } else {
+        for (int k = 0; k < 4; k++) {
+          const int idx_str1 = mi_params->mi_stride * idx32[k][0] + idx32[k][1];
+          MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1;
+          if (*mi_32 == NULL) continue;
+
+          if (mi_params->mi_cols <= mi_col + idx64[i][1] + idx32[k][1] ||
+              mi_params->mi_rows <= mi_row + idx64[i][0] + idx32[k][0])
+            continue;
+          const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
+          if ((*mi_32)->sb_type == BLOCK_32X32) {
+            if (vt->split[i].split[k].part_variances.none.variance <
+                threshold_32x32)
+              x->variance_low[25 + (i << 2) + k] = 1;
+          } else {
+            // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+            // inside.
+            if ((*mi_32)->sb_type == BLOCK_16X16 ||
+                (*mi_32)->sb_type == BLOCK_32X16 ||
+                (*mi_32)->sb_type == BLOCK_16X32) {
+              for (int j = 0; j < 4; j++) {
+                if (vt->split[i]
+                        .split[k]
+                        .split[j]
+                        .part_variances.none.variance < (thresholds[3] >> 8))
+                  x->variance_low[41 + (i << 4) + (k << 2) + j] = 1;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static AOM_INLINE void set_low_temp_var_flag(
+    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, VP128x128 *vt,
+    int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col,
+    int mi_row) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mv_thr = cm->width > 640 ? 8 : 4;
+  // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and
+  // int_pro mv is small. If the temporal variance is small set the flag
+  // variance_low for the block. The variance threshold can be adjusted, the
+  // higher the more aggressive.
+  if (ref_frame_partition == LAST_FRAME &&
+      (cpi->sf.rt_sf.short_circuit_low_temp_var == 1 ||
+       (cpi->sf.rt_sf.estimate_motion_for_var_based_partition &&
+        xd->mi[0]->mv[0].as_mv.col < mv_thr &&
+        xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
+        xd->mi[0]->mv[0].as_mv.row < mv_thr &&
+        xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
+    const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+    if (is_small_sb)
+      set_low_temp_var_flag_64x64(&cm->mi_params, x, xd, &(vt->split[0]),
+                                  thresholds, mi_col, mi_row);
+    else
+      set_low_temp_var_flag_128x128(&cm->mi_params, x, xd, vt, thresholds,
+                                    mi_col, mi_row);
+  }
+}
+
 void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
                                            int content_state) {
-  AV1_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
-  const int is_key_frame = frame_is_intra_only(cm);
-  if (sf->partition_search_type != VAR_BASED_PARTITION) {
+  if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) {
     return;
   } else {
-    set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state);
-    // The thresholds below are not changed locally.
-    if (is_key_frame) {
-      cpi->vbp_threshold_sad = 0;
-      cpi->vbp_threshold_copy = 0;
-      cpi->vbp_bsize_min = BLOCK_8X8;
-    } else {
-      if (cm->width <= 352 && cm->height <= 288)
-        cpi->vbp_threshold_sad = 10;
-      else
-        cpi->vbp_threshold_sad = (cpi->dequants.y_dequant_QTX[q][1] << 1) > 1000
-                                     ? (cpi->dequants.y_dequant_QTX[q][1] << 1)
-                                     : 1000;
-      cpi->vbp_bsize_min = BLOCK_16X16;
-      if (cm->width <= 352 && cm->height <= 288)
-        cpi->vbp_threshold_copy = 4000;
-      else if (cm->width <= 640 && cm->height <= 360)
-        cpi->vbp_threshold_copy = 8000;
-      else
-        cpi->vbp_threshold_copy =
-            (cpi->dequants.y_dequant_QTX[q][1] << 3) > 8000
-                ? (cpi->dequants.y_dequant_QTX[q][1] << 3)
-                : 8000;
-    }
-    cpi->vbp_threshold_minmax = 15 + (q >> 3);
+    set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_state);
+    // The threshold below is not changed locally.
+    cpi->vbp_info.threshold_minmax = 15 + (q >> 3);
+  }
+}
+
+static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, unsigned int y_sad,
+                                    int is_key_frame) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  if (is_key_frame) return;
+
+  for (i = 1; i <= 2; ++i) {
+    unsigned int uv_sad = UINT_MAX;
+    struct macroblock_plane *p = &x->plane[i];
+    struct macroblockd_plane *pd = &xd->plane[i];
+    const BLOCK_SIZE bs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+    if (bs != BLOCK_INVALID)
+      uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
+                                   pd->dst.stride);
+
+    x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
   }
 }
 
@@ -432,31 +603,34 @@
 // TODO(kyslov): lot of things. Bring back noise estimation, brush up partition
 // selection and most of all - retune the thresholds
 int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
-                                      MACROBLOCK *x, int mi_row, int mi_col) {
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
+  const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
 
   int i, j, k, m;
-  v128x128 *vt;
-  v16x16 *vt2 = NULL;
+  VP128x128 *vt;
+  VP16x16 *vt2 = NULL;
   unsigned char force_split[85];
   int avg_32x32;
-  int max_var_32x32 = 0;
-  int min_var_32x32 = INT_MAX;
+  int max_var_32x32[4];
+  int min_var_32x32[4];
   int var_32x32;
   int var_64x64;
   int min_var_64x64 = INT_MAX;
   int max_var_64x64 = 0;
-  int avg_16x16[4];
-  int maxvar_16x16[4];
-  int minvar_16x16[4];
+  int avg_16x16[4][4];
+  int maxvar_16x16[4][4];
+  int minvar_16x16[4][4];
   int64_t threshold_4x4avg;
   int content_state = 0;
   uint8_t *s;
   const uint8_t *d;
   int sp;
   int dp;
-  int compute_minmax_variance = 1;
+  // TODO(kyslov) Bring back compute_minmax_variance with content type detection
+  int compute_minmax_variance = 0;
   int is_key_frame = frame_is_intra_only(cm);
   int pixels_wide = 128, pixels_high = 128;
   assert(cm->seq_params.sb_size == BLOCK_64X64 ||
@@ -464,11 +638,20 @@
   const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
   const int num_64x64_blocks = is_small_sb ? 1 : 4;
 
-  CHECK_MEM_ERROR(cm, vt, aom_calloc(1, sizeof(*vt)));
+  unsigned int y_sad = UINT_MAX;
+  unsigned int y_sad_g = UINT_MAX;
+  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
 
-  int64_t thresholds[5] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
-                            cpi->vbp_thresholds[2], cpi->vbp_thresholds[3],
-                            cpi->vbp_thresholds[4] };
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
+  CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
+
+  vt->split = td->vt64x64;
+
+  int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
+                            vbp_thresholds[2], vbp_thresholds[3],
+                            vbp_thresholds[4] };
 
   const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[64];
@@ -477,7 +660,15 @@
 
   segment_id = xd->mi[0]->segment_id;
 
-  set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state);
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+      cyclic_refresh_segment_id_boosted(segment_id) &&
+      cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+    set_vbp_thresholds(cpi, thresholds, q, content_state);
+  } else {
+    set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
+                       content_state);
+  }
 
   if (is_small_sb) {
     pixels_wide = 64;
@@ -496,14 +687,30 @@
   // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
   // 5-20 for the 16x16 blocks.
   force_split[0] = 0;
+  memset(x->variance_low, 0, sizeof(x->variance_low));
 
   if (!is_key_frame) {
     // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
     // is!!
     MB_MODE_INFO *mi = xd->mi[0];
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
-
     assert(yv12 != NULL);
+    const YV12_BUFFER_CONFIG *yv12_g = NULL;
+
+    // For non-SVC, GOLDEN is another temporal reference. Check whether it
+    // should be used as the reference for partitioning.
+    if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
+        cpi->sf.rt_sf.use_nonrd_pick_mode) {
+      yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+      if (yv12_g && yv12_g != yv12) {
+        av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                             get_ref_scale_factors(cm, GOLDEN_FRAME),
+                             num_planes);
+        y_sad_g = cpi->fn_ptr[bsize].sdf(
+            x->plane[0].src.buf, x->plane[0].src.stride,
+            xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+      }
+    }
 
     av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
                          get_ref_scale_factors(cm, LAST_FRAME), num_planes);
@@ -511,20 +718,36 @@
     mi->ref_frame[1] = NONE_FRAME;
     mi->sb_type = cm->seq_params.sb_size;
     mi->mv[0].as_int = 0;
-    mi->interp_filters = av1_make_interp_filters(BILINEAR, BILINEAR);
-    if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
-      const MV dummy_mv = { 0, 0 };
-      av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, mi_row,
-                                    mi_col, &dummy_mv);
+    mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+    if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
+      if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+        const MV dummy_mv = { 0, 0 };
+        y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size,
+                                              mi_row, mi_col, &dummy_mv);
+      }
+    }
+    if (y_sad == UINT_MAX) {
+      y_sad = cpi->fn_ptr[bsize].sdf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+          xd->plane[0].pre[0].stride);
     }
 
-// TODO(kyslov): bring the small SAD functionality back
-#if 0
-    y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
-                                   xd->plane[0].pre[0].buf,
-                                   xd->plane[0].pre[0].stride);
-#endif
-    x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+    // Pick the reference frame for partitioning; use the golden frame only
+    // if it has a lower SAD.
+    if (y_sad_g < 0.9 * y_sad) {
+      av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
+      mi->ref_frame[0] = GOLDEN_FRAME;
+      mi->mv[0].as_int = 0;
+      y_sad = y_sad_g;
+      ref_frame_partition = GOLDEN_FRAME;
+      x->nonrd_prune_ref_frame_search = 0;
+    } else {
+      x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+      ref_frame_partition = LAST_FRAME;
+      x->nonrd_prune_ref_frame_search =
+          cpi->sf.rt_sf.nonrd_prune_ref_frame_search;
+    }
 
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
@@ -533,27 +756,13 @@
 
     d = xd->plane[0].dst.buf;
     dp = xd->plane[0].dst.stride;
-
-    // If the y_sad is very small, take 64x64 as partition and exit.
-    // Don't check on boosted segment for now, as 64x64 is suppressed there.
-#if 0
-        if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad)
-       { const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64]; const
-       int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64]; if (mi_col +
-       block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows)
-       { set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_128X128);
-            x->variance_low[0] = 1;
-            return 0;
-          }
-        }
-#endif
   } else {
     d = AV1_VAR_OFFS;
     dp = 0;
   }
 
   if (low_res && threshold_4x4avg < INT64_MAX)
-    CHECK_MEM_ERROR(cm, vt2, aom_calloc(64, sizeof(*vt2)));
+    CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2)));
   // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
   // for splits.
   for (m = 0; m < num_64x64_blocks; m++) {
@@ -561,35 +770,40 @@
     const int y64_idx = ((m >> 1) << 6);
     const int m2 = m << 2;
     force_split[m + 1] = 0;
+    max_var_32x32[m] = 0;
+    min_var_32x32[m] = INT_MAX;
     for (i = 0; i < 4; i++) {
       const int x32_idx = x64_idx + ((i & 1) << 5);
       const int y32_idx = y64_idx + ((i >> 1) << 5);
       const int i2 = (m2 + i) << 2;
       force_split[5 + m2 + i] = 0;
-      avg_16x16[i] = 0;
-      maxvar_16x16[i] = 0;
-      minvar_16x16[i] = INT_MAX;
+      avg_16x16[m][i] = 0;
+      maxvar_16x16[m][i] = 0;
+      minvar_16x16[m][i] = INT_MAX;
       for (j = 0; j < 4; j++) {
         const int x16_idx = x32_idx + ((j & 1) << 4);
         const int y16_idx = y32_idx + ((j >> 1) << 4);
         const int split_index = 21 + i2 + j;
-        v16x16 *vst = &vt->split[m].split[i].split[j];
+        VP16x16 *vst = &vt->split[m].split[i].split[j];
         force_split[split_index] = 0;
         variance4x4downsample[i2 + j] = 0;
         if (!is_key_frame) {
-          fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst, pixels_wide,
-                               pixels_high, is_key_frame);
+          fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+#if CONFIG_AV1_HIGHBITDEPTH
+                               xd->cur_buf->flags,
+#endif
+                               pixels_wide, pixels_high, is_key_frame);
           fill_variance_tree(&vt->split[m].split[i].split[j], BLOCK_16X16);
           get_variance(&vt->split[m].split[i].split[j].part_variances.none);
-          avg_16x16[i] +=
+          avg_16x16[m][i] +=
               vt->split[m].split[i].split[j].part_variances.none.variance;
           if (vt->split[m].split[i].split[j].part_variances.none.variance <
-              minvar_16x16[i])
-            minvar_16x16[i] =
+              minvar_16x16[m][i])
+            minvar_16x16[m][i] =
                 vt->split[m].split[i].split[j].part_variances.none.variance;
           if (vt->split[m].split[i].split[j].part_variances.none.variance >
-              maxvar_16x16[i])
-            maxvar_16x16[i] =
+              maxvar_16x16[m][i])
+            maxvar_16x16[m][i] =
                 vt->split[m].split[i].split[j].part_variances.none.variance;
           if (vt->split[m].split[i].split[j].part_variances.none.variance >
               thresholds[3]) {
@@ -610,8 +824,11 @@
             // compute the minmax over the 8x8 sub-blocks, and if above
             // threshold, force split to 8x8 block for this 16x16 block.
             int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_AV1_HIGHBITDEPTH
+                                            xd->cur_buf->flags,
+#endif
                                             pixels_wide, pixels_high);
-            int thresh_minmax = (int)cpi->vbp_threshold_minmax;
+            int thresh_minmax = (int)cpi->vbp_info.threshold_minmax;
             if (minmax > thresh_minmax) {
               force_split[split_index] = 1;
               force_split[5 + m2 + i] = 1;
@@ -627,8 +844,11 @@
           for (k = 0; k < 4; k++) {
             int x8_idx = x16_idx + ((k & 1) << 3);
             int y8_idx = y16_idx + ((k >> 1) << 3);
-            v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
+            VP8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
             fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+#if CONFIG_AV1_HIGHBITDEPTH
+                                 xd->cur_buf->flags,
+#endif
                                  pixels_wide, pixels_high, is_key_frame);
           }
         }
@@ -645,7 +865,7 @@
       for (j = 0; j < 4; j++) {
         const int split_index = 21 + i2 + j;
         if (variance4x4downsample[i2 + j] == 1) {
-          v16x16 *vtemp =
+          VP16x16 *vtemp =
               (!is_key_frame) ? &vt2[i2 + j] : &vt->split[m].split[i].split[j];
           for (k = 0; k < 4; k++)
             fill_variance_tree(&vtemp->split[k], BLOCK_8X8);
@@ -669,21 +889,22 @@
       if (!force_split[5 + m2 + i]) {
         get_variance(&vt->split[m].split[i].part_variances.none);
         var_32x32 = vt->split[m].split[i].part_variances.none.variance;
-        max_var_32x32 = AOMMAX(var_32x32, max_var_32x32);
-        min_var_32x32 = AOMMIN(var_32x32, min_var_32x32);
+        max_var_32x32[m] = AOMMAX(var_32x32, max_var_32x32[m]);
+        min_var_32x32[m] = AOMMIN(var_32x32, min_var_32x32[m]);
         if (vt->split[m].split[i].part_variances.none.variance >
                 thresholds[2] ||
             (!is_key_frame &&
              vt->split[m].split[i].part_variances.none.variance >
                  (thresholds[2] >> 1) &&
              vt->split[m].split[i].part_variances.none.variance >
-                 (avg_16x16[i] >> 1))) {
+                 (avg_16x16[m][i] >> 1))) {
           force_split[5 + m2 + i] = 1;
           force_split[m + 1] = 1;
           force_split[0] = 1;
         } else if (!is_key_frame && cm->height <= 360 &&
-                   (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[2] >> 1) &&
-                   maxvar_16x16[i] > thresholds[2]) {
+                   (maxvar_16x16[m][i] - minvar_16x16[m][i]) >
+                       (thresholds[2] >> 1) &&
+                   maxvar_16x16[m][i] > thresholds[2]) {
           force_split[5 + m2 + i] = 1;
           force_split[m + 1] = 1;
           force_split[0] = 1;
@@ -702,8 +923,8 @@
       // split. Only checking this for noise level >= medium for now.
 
       if (!is_key_frame &&
-          (max_var_32x32 - min_var_32x32) > 3 * (thresholds[1] >> 3) &&
-          max_var_32x32 > thresholds[1] >> 1)
+          (max_var_32x32[m] - min_var_32x32[m]) > 3 * (thresholds[1] >> 3) &&
+          max_var_32x32[m] > thresholds[1] >> 1)
         force_split[1 + m] = 1;
     }
     if (is_small_sb) force_split[0] = 1;
@@ -718,7 +939,8 @@
       force_split[0] = 1;
   }
 
-  if (!set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
+  if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end ||
+      !set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
                            thresholds[0], BLOCK_16X16, force_split[0])) {
     for (m = 0; m < num_64x64_blocks; ++m) {
       const int x64_idx = ((m & 1) << 4);
@@ -746,7 +968,7 @@
               // For inter frames: if variance4x4downsample[] == 1 for this
               // 16x16 block, then the variance is based on 4x4 down-sampling,
               // so use vt2 in set_vt_partioning(), otherwise use vt.
-              v16x16 *vtemp =
+              VP16x16 *vtemp =
                   (!is_key_frame && variance4x4downsample[i2 + j] == 1)
                       ? &vt2[i2 + j]
                       : &vt->split[m].split[i].split[j];
@@ -772,6 +994,12 @@
     }
   }
 
+  if (cpi->sf.rt_sf.short_circuit_low_temp_var) {
+    set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition,
+                          mi_col, mi_row);
+  }
+  chroma_check(cpi, x, bsize, y_sad, is_key_frame);
+
   if (vt2) aom_free(vt2);
   if (vt) aom_free(vt);
   return 0;
diff --git a/libaom/av1/encoder/var_based_part.h b/libaom/av1/encoder/var_based_part.h
index c355224..a80e25c 100644
--- a/libaom/av1/encoder/var_based_part.h
+++ b/libaom/av1/encoder/var_based_part.h
@@ -24,11 +24,19 @@
 extern "C" {
 #endif
 
+#define QINDEX_LOW_THR \
+  200  // Use low qindex variance partition thresholds when qindex is below this
+       // threshold
+#define QINDEX_HIGH_THR \
+  220  // Use high qindex variance partition thresholds when qindex is above
+       // this threshold
+
 void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
                                            int content_state);
 
 int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
-                                      MACROBLOCK *x, int mi_row, int mi_col);
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/wedge_utils.c b/libaom/av1/encoder/wedge_utils.c
index e6edbb6..4067017 100644
--- a/libaom/av1/encoder/wedge_utils.c
+++ b/libaom/av1/encoder/wedge_utils.c
@@ -93,8 +93,8 @@
  *  Note that for efficiency, ds is stored on 16 bits. Real input residuals
  *  being small, this should not cause a noticeable issue.
  */
-int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N,
-                                    int64_t limit) {
+int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m,
+                                       int N, int64_t limit) {
   int64_t acc = 0;
 
   do {
diff --git a/libaom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/libaom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index 865ac31..62eaa30 100644
--- a/libaom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/libaom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -11,8 +11,8 @@
 
 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
 
-void av1_fdct32_new_sse4_1(__m128i *input, __m128i *output, int cos_bit,
-                           const int stride) {
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int stride) {
   __m128i buf0[32];
   __m128i buf1[32];
   const int32_t *cospi;
@@ -396,8 +396,8 @@
   output[endidx] = buf0[1];
 }
 
-void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range) {
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range) {
   const int txfm_size = 4;
   const int num_per_128 = 4;
   const int32_t *cospi;
@@ -459,8 +459,8 @@
   }
 }
 
-void av1_fdct64_new_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
-                           const int instride, const int outstride) {
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+                       const int instride, const int outstride) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
 
@@ -1408,8 +1408,8 @@
   output[endidx] = x10[1];
 }
 
-void av1_idtx32_new_sse4_1(__m128i *input, __m128i *output, int cos_bit,
-                           const int col_num) {
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int col_num) {
   (void)cos_bit;
   for (int i = 0; i < 32; i++) {
     output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2);
diff --git a/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index 9483063..634d50b 100644
--- a/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -113,8 +113,8 @@
   output[15] = x1[15];
 }
 
-static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output,
-                                      int8_t cos_bit) {
+static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
 
@@ -711,8 +711,8 @@
   output[63] = x1[63];
 }
 
-static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
-                                       int8_t cos_bit) {
+static INLINE void fdct32_avx2(const __m256i *input, __m256i *output,
+                               int8_t cos_bit) {
   __m256i x1[32];
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
@@ -865,8 +865,8 @@
   output[31] = x1[31];
 }
 
-static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
-                                       int8_t cos_bit) {
+static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output,
+                                   int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
 
@@ -1422,8 +1422,8 @@
   }
 }
 
-static INLINE void fidentity16x32_new_avx2(const __m256i *input,
-                                           __m256i *output, int8_t cos_bit) {
+static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output,
+                                       int8_t cos_bit) {
   (void)cos_bit;
   for (int i = 0; i < 32; ++i) {
     output[i] = _mm256_slli_epi16(input[i], 2);
@@ -1499,41 +1499,41 @@
                                   int8_t cos_bit);
 
 static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
-  fdct16x32_new_avx2,       // DCT_DCT
-  NULL,                     // ADST_DCT
-  NULL,                     // DCT_ADST
-  NULL,                     // ADST_ADST
-  NULL,                     // FLIPADST_DCT
-  NULL,                     // DCT_FLIPADST
-  NULL,                     // FLIPADST_FLIPADST
-  NULL,                     // ADST_FLIPADST
-  NULL,                     // FLIPADST_ADST
-  fidentity16x32_new_avx2,  // IDTX
-  fdct16x32_new_avx2,       // V_DCT
-  fidentity16x32_new_avx2,  // H_DCT
-  NULL,                     // V_ADST
-  NULL,                     // H_ADST
-  NULL,                     // V_FLIPADST
-  NULL                      // H_FLIPADST
+  fdct16x32_avx2,       // DCT_DCT
+  NULL,                 // ADST_DCT
+  NULL,                 // DCT_ADST
+  NULL,                 // ADST_ADST
+  NULL,                 // FLIPADST_DCT
+  NULL,                 // DCT_FLIPADST
+  NULL,                 // FLIPADST_FLIPADST
+  NULL,                 // ADST_FLIPADST
+  NULL,                 // FLIPADST_ADST
+  fidentity16x32_avx2,  // IDTX
+  fdct16x32_avx2,       // V_DCT
+  fidentity16x32_avx2,  // H_DCT
+  NULL,                 // V_ADST
+  NULL,                 // H_ADST
+  NULL,                 // V_FLIPADST
+  NULL                  // H_FLIPADST
 };
 
 static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
-  fdct16x32_new_avx2,       // DCT_DCT
-  NULL,                     // ADST_DCT
-  NULL,                     // DCT_ADST
-  NULL,                     // ADST_ADST
-  NULL,                     // FLIPADST_DCT
-  NULL,                     // DCT_FLIPADST
-  NULL,                     // FLIPADST_FLIPADST
-  NULL,                     // ADST_FLIPADST
-  NULL,                     // FLIPADST_ADST
-  fidentity16x32_new_avx2,  // IDTX
-  fidentity16x32_new_avx2,  // V_DCT
-  fdct16x32_new_avx2,       // H_DCT
-  NULL,                     // V_ADST
-  NULL,                     // H_ADST
-  NULL,                     // V_FLIPADST
-  NULL                      // H_FLIPADST
+  fdct16x32_avx2,       // DCT_DCT
+  NULL,                 // ADST_DCT
+  NULL,                 // DCT_ADST
+  NULL,                 // ADST_ADST
+  NULL,                 // FLIPADST_DCT
+  NULL,                 // DCT_FLIPADST
+  NULL,                 // FLIPADST_FLIPADST
+  NULL,                 // ADST_FLIPADST
+  NULL,                 // FLIPADST_ADST
+  fidentity16x32_avx2,  // IDTX
+  fidentity16x32_avx2,  // V_DCT
+  fdct16x32_avx2,       // H_DCT
+  NULL,                 // V_ADST
+  NULL,                 // H_ADST
+  NULL,                 // V_FLIPADST
+  NULL                  // H_FLIPADST
 };
 
 static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
@@ -1579,11 +1579,11 @@
   (void)bd;
   const TX_SIZE tx_size = TX_16X16;
   __m256i buf0[16], buf1[16];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
@@ -1620,11 +1620,11 @@
   (void)bd;
   const TX_SIZE tx_size = TX_32X32;
   __m256i buf0[32], buf1[128];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
@@ -1673,11 +1673,11 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_64X64;
   __m256i buf0[64], buf1[256];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
@@ -1702,8 +1702,8 @@
       bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
       bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
     }
-    av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
-    av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+    fdct64_new_avx2(bufA, bufA, cos_bit_row);
+    fdct64_new_avx2(bufB, bufB, cos_bit_row);
     av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
     av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
 
@@ -1721,11 +1721,11 @@
   (void)bd;
   const TX_SIZE tx_size = TX_16X32;
   __m256i buf0[32], buf1[32];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
@@ -1765,11 +1765,11 @@
                                         int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m256i buf0[32], buf1[64];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
   const int txw_idx = get_txw_idx(TX_32X16);
   const int txh_idx = get_txh_idx(TX_32X16);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 32;
   const int height = 16;
   const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
@@ -1812,11 +1812,11 @@
   (void)bd;
   const TX_SIZE tx_size = TX_64X32;
   __m256i buf0[64], buf1[256];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
@@ -1841,8 +1841,8 @@
       bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
       bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
     }
-    av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
-    av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+    fdct64_new_avx2(bufA, bufA, cos_bit_row);
+    fdct64_new_avx2(bufB, bufB, cos_bit_row);
     av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
     av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
 
@@ -1862,11 +1862,11 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_32X64;
   __m256i buf0[64], buf1[256];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
@@ -1891,8 +1891,8 @@
       bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
       bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
     }
-    av1_fdct32_new_avx2(bufA, bufA, cos_bit_row);
-    av1_fdct32_new_avx2(bufB, bufB, cos_bit_row);
+    fdct32_avx2(bufA, bufA, cos_bit_row);
+    fdct32_avx2(bufB, bufB, cos_bit_row);
     av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
     av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
 
@@ -1912,11 +1912,11 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_16X64;
   __m256i buf0[64], buf1[64];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
@@ -1956,11 +1956,11 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_64X16;
   __m256i buf0[64], buf1[64];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
@@ -2692,11 +2692,11 @@
   (void)bd;
   __m128i buf0[16], buf1[16];
   __m256i buf2[8];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
   const int txw_idx = get_txw_idx(TX_8X16);
   const int txh_idx = get_txh_idx(TX_8X16);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 8;
   const int height = 16;
   const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
@@ -2737,11 +2737,11 @@
   (void)bd;
   __m128i buf0[16], buf1[16];
   __m256i buf2[8];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
   const int txw_idx = get_txw_idx(TX_16X8);
   const int txh_idx = get_txh_idx(TX_16X8);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 16;
   const int height = 8;
   const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
diff --git a/libaom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/libaom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
index 193f9d1..0bc3fbc 100644
--- a/libaom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
+++ b/libaom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -32,15 +32,15 @@
 typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output,
                              const int8_t cos_bit, const int8_t *stage_range);
 
-static void fdct32_new_sse4_1(__m128i *input, __m128i *output,
-                              const int8_t cos_bit, const int8_t *stage_range) {
+static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit,
+                          const int8_t *stage_range) {
   const int txfm_size = 32;
   const int num_per_128 = 4;
   int col_num = txfm_size / num_per_128;
   int col;
   (void)stage_range;
   for (col = 0; col < col_num; col++) {
-    av1_fdct32_new_sse4_1((input + col), (output + col), cos_bit, col_num);
+    av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num);
   }
 }
 
@@ -51,8 +51,7 @@
   int col_num = txfm_size / num_per_128;
   (void)stage_range;
   for (int col = 0; col < col_num; col++) {
-    av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num,
-                          col_num);
+    av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num);
   }
 }
 static void idtx32x32_sse4_1(__m128i *input, __m128i *output,
@@ -60,13 +59,13 @@
   (void)stage_range;
 
   for (int i = 0; i < 8; i++) {
-    av1_idtx32_new_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1);
+    av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1);
   }
 }
 
 static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
-    case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
+    case TXFM_TYPE_DCT32: return fdct32_sse4_1; break;
     case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break;
     case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1; break;
     default: assert(0);
@@ -136,8 +135,8 @@
 
   /*row wise transform*/
   for (int col = 0; col < (col_num >> 1); col++) {
-    av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
-                          col_num, (col_num >> 1));
+    av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num,
+                      (col_num >> 1));
   }
 
   txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
@@ -193,14 +192,14 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_64X64;
   __m128i buf0[64], buf1[512];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
-  const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+  const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
   const int width_div8 = (width >> 3);
   const int height_div8 = (height >> 3);
 
@@ -221,8 +220,8 @@
       bufA[j] = _mm_cvtepi16_epi32(buf[j]);
       bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
     }
-    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
-    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+    av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+    av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
     av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
     av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
 
@@ -239,11 +238,11 @@
   (void)bd;
   const TX_SIZE tx_size = TX_64X32;
   __m128i buf0[64], buf1[256];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
@@ -268,8 +267,8 @@
       bufA[j] = _mm_cvtepi16_epi32(buf[j]);
       bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
     }
-    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
-    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+    av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+    av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
     av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
     av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
 
@@ -288,14 +287,14 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_32X64;
   __m128i buf0[64], buf1[256];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
-  const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+  const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
   const int width_div8 = (width >> 3);
   const int height_div8 = (height >> 3);
 
@@ -317,8 +316,8 @@
       bufA[j] = _mm_cvtepi16_epi32(buf[j]);
       bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
     }
-    av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row, 1);
-    av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row, 1);
+    av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1);
+    av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1);
     av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
     av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
 
diff --git a/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.c
index 6aae7ce..694e613 100644
--- a/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -359,7 +359,8 @@
   output[15] = x6[15];
 }
 
-void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
 
@@ -682,7 +683,8 @@
   output[31] = x8[31];
 }
 
-void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
 
@@ -2106,7 +2108,7 @@
 };
 
 static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = {
-  fdct8x32_new_sse2,       // DCT_DCT
+  av1_fdct8x32_new_sse2,   // DCT_DCT
   NULL,                    // ADST_DCT
   NULL,                    // DCT_ADST
   NULL,                    // ADST_ADST
@@ -2117,7 +2119,7 @@
   NULL,                    // FLIPADST_ADST
   fidentity8x32_new_sse2,  // IDTX
   fidentity8x32_new_sse2,  // V_DCT
-  fdct8x32_new_sse2,       // H_DCT
+  av1_fdct8x32_new_sse2,   // H_DCT
   NULL,                    // V_ADST
   NULL,                    // H_ADST
   NULL,                    // V_FLIPADST
@@ -2128,11 +2130,11 @@
                                    int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[4], buf1[4], *buf;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
   const int txw_idx = get_txw_idx(TX_4X4);
   const int txh_idx = get_txh_idx(TX_4X4);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 4;
   const int height = 4;
   const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type];
@@ -2167,11 +2169,11 @@
   (void)stride;
   (void)bd;
   __m128i buf0[8], buf1[8], *buf;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_4X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
   const int txw_idx = get_txw_idx(TX_4X8);
   const int txh_idx = get_txh_idx(TX_4X8);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 4;
   const int height = 8;
   const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type];
@@ -2205,11 +2207,11 @@
                                     int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[16], buf1[16];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_4X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
   const int txw_idx = get_txw_idx(TX_4X16);
   const int txh_idx = get_txh_idx(TX_4X16);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 4;
   const int height = 16;
   const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
@@ -2247,11 +2249,11 @@
                                    int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[8], buf1[8], *buf;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
   const int txw_idx = get_txw_idx(TX_8X4);
   const int txh_idx = get_txh_idx(TX_8X4);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 8;
   const int height = 4;
   const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
@@ -2284,11 +2286,11 @@
                                    int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[8], buf1[8], *buf;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
   const int txw_idx = get_txw_idx(TX_8X8);
   const int txh_idx = get_txh_idx(TX_8X8);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 8;
   const int height = 8;
   const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
@@ -2321,11 +2323,11 @@
                                     int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[16], buf1[16];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
   const int txw_idx = get_txw_idx(TX_8X16);
   const int txh_idx = get_txh_idx(TX_8X16);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 8;
   const int height = 16;
   const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
@@ -2363,11 +2365,11 @@
                                     int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[32], buf1[32];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
   const int txw_idx = get_txw_idx(TX_8X32);
   const int txh_idx = get_txh_idx(TX_8X32);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 8;
   const int height = 32;
   const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
@@ -2407,11 +2409,11 @@
                                     int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[16], buf1[16];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
   const int txw_idx = get_txw_idx(TX_16X4);
   const int txh_idx = get_txh_idx(TX_16X4);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 16;
   const int height = 4;
   const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
@@ -2450,11 +2452,11 @@
                                     int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[16], buf1[16];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
   const int txw_idx = get_txw_idx(TX_16X8);
   const int txh_idx = get_txh_idx(TX_16X8);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 16;
   const int height = 8;
   const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
@@ -2493,11 +2495,11 @@
                                      int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[16], buf1[32];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
   const int txw_idx = get_txw_idx(TX_16X16);
   const int txh_idx = get_txh_idx(TX_16X16);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 16;
   const int height = 16;
   const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
@@ -2540,11 +2542,11 @@
                                      int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[32], buf1[64];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
   const int txw_idx = get_txw_idx(TX_16X32);
   const int txh_idx = get_txh_idx(TX_16X32);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 16;
   const int height = 32;
   const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
@@ -2595,11 +2597,11 @@
                                     int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[32], buf1[32];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
   const int txw_idx = get_txw_idx(TX_32X8);
   const int txh_idx = get_txh_idx(TX_32X8);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 32;
   const int height = 8;
   const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
@@ -2653,11 +2655,11 @@
                                      int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[32], buf1[64];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
   const int txw_idx = get_txw_idx(TX_32X16);
   const int txh_idx = get_txh_idx(TX_32X16);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 32;
   const int height = 16;
   const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
@@ -2712,11 +2714,11 @@
                                      int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m128i buf0[32], buf1[128];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_32X32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
   const int txw_idx = get_txw_idx(TX_32X32);
   const int txh_idx = get_txh_idx(TX_32X32);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = 32;
   const int height = 32;
   const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
@@ -2775,15 +2777,15 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_64X16;
   __m128i buf0[64], buf1[128];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_sse2 col_txfm = fdct8x16_new_sse2;
-  const transform_1d_sse2 row_txfm = fdct8x64_new_sse2;
+  const transform_1d_sse2 row_txfm = av1_fdct8x64_new_sse2;
   const int width_div8 = (width >> 3);
   const int height_div8 = (height >> 3);
 
@@ -2817,14 +2819,14 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_16X64;
   __m128i buf0[64], buf1[128];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
-  const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+  const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
   const transform_1d_sse2 row_txfm = fdct8x16_new_sse2;
   const int width_div8 = (width >> 3);
   const int height_div8 = (height >> 3);
diff --git a/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.h
index 99a6b90..a0e32f5 100644
--- a/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.h
+++ b/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -24,8 +24,10 @@
 extern "C" {
 #endif
 
-void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
-void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
 
 static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
                                          __m128i *const output,
@@ -92,7 +94,7 @@
 }
 
 static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
-  fdct8x32_new_sse2,       // DCT_DCT
+  av1_fdct8x32_new_sse2,   // DCT_DCT
   NULL,                    // ADST_DCT
   NULL,                    // DCT_ADST
   NULL,                    // ADST_ADST
@@ -102,7 +104,7 @@
   NULL,                    // ADST_FLIPADST
   NULL,                    // FLIPADST_ADST
   fidentity8x32_new_sse2,  // IDTX
-  fdct8x32_new_sse2,       // V_DCT
+  av1_fdct8x32_new_sse2,   // V_DCT
   fidentity8x32_new_sse2,  // H_DCT
   NULL,                    // V_ADST
   NULL,                    // H_ADST
diff --git a/libaom/av1/encoder/x86/av1_quantize_avx2.c b/libaom/av1/encoder/x86/av1_quantize_avx2.c
index df22aab..f5f7ee1 100644
--- a/libaom/av1/encoder/x86/av1_quantize_avx2.c
+++ b/libaom/av1/encoder/x86/av1_quantize_avx2.c
@@ -132,6 +132,121 @@
   }
 }
 
+static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr,
+                                   __m256i *coeff256) {
+  const __m256i iscan = _mm256_loadu_si256(iscan_ptr);
+  const __m256i zero256 = _mm256_setzero_si256();
+  const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256);
+  const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256);
+  // Add one to convert from indices to counts
+  const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0);
+  return _mm256_and_si256(iscan_plus_one, nzero_coeff0);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+  __m128i eob_shuffled;
+  eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  return _mm_extract_epi16(eob, 1);
+}
+
+static INLINE void store_zero_tran_low(int16_t *a) {
+  const __m256i zero = _mm256_setzero_si256();
+  _mm256_storeu_si256((__m256i *)(a), zero);
+}
+
+void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan) {
+  __m128i eob;
+  __m256i round256, quant256, dequant256;
+  __m256i eob256, thr256;
+
+  coeff_ptr += n_coeffs;
+  scan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  {
+    __m256i coeff256;
+
+    // Setup global values
+    {
+      const __m128i round = _mm_load_si128((const __m128i *)round_ptr);
+      const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
+      const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+      round256 = _mm256_castsi128_si256(round);
+      round256 = _mm256_permute4x64_epi64(round256, 0x54);
+
+      quant256 = _mm256_castsi128_si256(quant);
+      quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
+
+      dequant256 = _mm256_castsi128_si256(dequant);
+      dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
+    }
+
+    {
+      __m256i qcoeff256;
+      __m256i qtmp256;
+      coeff256 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs));
+      qcoeff256 = _mm256_abs_epi16(coeff256);
+      qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
+      qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
+      qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
+      _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256);
+      coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
+      _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256);
+    }
+
+    eob256 = scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256);
+    n_coeffs += 8 * 2;
+  }
+
+  // remove dc constants
+  dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+  quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+  round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+
+  thr256 = _mm256_srai_epi16(dequant256, 1);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    __m256i coeff256 =
+        _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs));
+    __m256i qcoeff256 = _mm256_abs_epi16(coeff256);
+    int32_t nzflag =
+        _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
+
+    if (nzflag) {
+      __m256i qtmp256;
+      qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
+      qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
+      qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
+      _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256);
+      coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
+      _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256);
+      eob256 = _mm256_max_epi16(
+          eob256, scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256));
+    } else {
+      store_zero_tran_low(qcoeff_ptr + n_coeffs);
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+    }
+    n_coeffs += 8 * 2;
+  }
+
+  eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
+                      _mm256_extracti128_si256(eob256, 1));
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
 void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *zbin_ptr, const int16_t *round_ptr,
                           const int16_t *quant_ptr,
diff --git a/libaom/av1/encoder/x86/av1_quantize_sse2.c b/libaom/av1/encoder/x86/av1_quantize_sse2.c
index b07e771..5497c7e 100644
--- a/libaom/av1/encoder/x86/av1_quantize_sse2.c
+++ b/libaom/av1/encoder/x86/av1_quantize_sse2.c
@@ -91,7 +91,7 @@
                                      _mm_cmpeq_epi16(qcoeff0, *thr0));
   const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
                                      _mm_cmpeq_epi16(qcoeff1, *thr1));
-  const int16_t nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+  const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
 
   if (nzflag) {
     qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
diff --git a/libaom/av1/encoder/x86/av1_txfm1d_sse4.h b/libaom/av1/encoder/x86/av1_txfm1d_sse4.h
index b3d5b22..7a0f328 100644
--- a/libaom/av1/encoder/x86/av1_txfm1d_sse4.h
+++ b/libaom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -20,43 +20,43 @@
 extern "C" {
 #endif
 
-void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t cos_bit, const int8_t *stage_range);
-void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t cos_bit, const int8_t *stage_range);
-void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range);
-void av1_fdct32_new_sse4_1(__m128i *input, __m128i *output, int cos_bit,
-                           const int stride);
-void av1_fdct64_new_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
-                           const int instride, const int outstride);
-void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range);
-void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range);
-void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
-                            const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct4_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int stride);
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+                       const int instride, const int outstride);
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_sse4_1(const __m128i *input, __m128i *output,
+                        const int8_t cos_bit, const int8_t *stage_range);
 
-void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t cos_bit, const int8_t *stage_range);
-void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t cos_bit, const int8_t *stage_range);
-void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range);
-void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range);
-void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct4_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
 
-void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range);
-void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t cos_bit, const int8_t *stage_range);
-void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
-                            const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_sse4_1(const __m128i *input, __m128i *output,
+                        const int8_t cos_bit, const int8_t *stage_range);
 
-void av1_idtx32_new_sse4_1(__m128i *input, __m128i *output, int cos_bit,
-                           const int col_num);
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int col_num);
 
 static INLINE void transpose_32_4x4(int stride, const __m128i *input,
                                     __m128i *output) {
diff --git a/libaom/av1/encoder/x86/corner_match_avx2.c b/libaom/av1/encoder/x86/corner_match_avx2.c
index 7a3b999..8d7eb3f 100644
--- a/libaom/av1/encoder/x86/corner_match_avx2.c
+++ b/libaom/av1/encoder/x86/corner_match_avx2.c
@@ -15,11 +15,12 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
-DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = {
-  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0
-};
+DECLARE_ALIGNED(16, static const uint8_t,
+                byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+                                   255, 255, 255, 255, 255, 0,   0,   0 };
 #if MATCH_SZ != 13
 #error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
 #endif
@@ -28,9 +29,9 @@
 correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
 of each image, centered at (x1, y1) and (x2, y2) respectively.
 */
-double compute_cross_correlation_avx2(unsigned char *im1, int stride1, int x1,
-                                      int y1, unsigned char *im2, int stride2,
-                                      int x2, int y2) {
+double av1_compute_cross_correlation_avx2(unsigned char *im1, int stride1,
+                                          int x1, int y1, unsigned char *im2,
+                                          int stride2, int x2, int y2) {
   int i, stride1_i = 0, stride2_i = 0;
   __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
   const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
@@ -75,5 +76,6 @@
 
   int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
   int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
diff --git a/libaom/av1/encoder/x86/corner_match_sse4.c b/libaom/av1/encoder/x86/corner_match_sse4.c
index 93f37b7..5c9ca20 100644
--- a/libaom/av1/encoder/x86/corner_match_sse4.c
+++ b/libaom/av1/encoder/x86/corner_match_sse4.c
@@ -19,11 +19,12 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
-DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = {
-  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0
-};
+DECLARE_ALIGNED(16, static const uint8_t,
+                byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+                                   255, 255, 255, 255, 255, 0,   0,   0 };
 #if MATCH_SZ != 13
 #error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
 #endif
@@ -32,9 +33,9 @@
    correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
    of each image, centered at (x1, y1) and (x2, y2) respectively.
 */
-double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1,
-                                        int y1, unsigned char *im2, int stride2,
-                                        int x2, int y2) {
+double av1_compute_cross_correlation_sse4_1(unsigned char *im1, int stride1,
+                                            int x1, int y1, unsigned char *im2,
+                                            int stride2, int x2, int y2) {
   int i;
   // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0,
   // 2)
@@ -99,5 +100,6 @@
 
   int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
   int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
diff --git a/libaom/av1/encoder/x86/encodetxb_avx2.c b/libaom/av1/encoder/x86/encodetxb_avx2.c
index 2621301..30a4129 100644
--- a/libaom/av1/encoder/x86/encodetxb_avx2.c
+++ b/libaom/av1/encoder/x86/encodetxb_avx2.c
@@ -16,7 +16,7 @@
 
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/mem_sse2.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/txb_common.h"
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
diff --git a/libaom/av1/encoder/x86/encodetxb_sse2.c b/libaom/av1/encoder/x86/encodetxb_sse2.c
index dedb4d0..394befb 100644
--- a/libaom/av1/encoder/x86/encodetxb_sse2.c
+++ b/libaom/av1/encoder/x86/encodetxb_sse2.c
@@ -14,7 +14,7 @@
 
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/mem_sse2.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/txb_common.h"
 
 static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
diff --git a/libaom/av1/encoder/x86/encodetxb_sse4.c b/libaom/av1/encoder/x86/encodetxb_sse4.c
index 34c9e4f..aeb57f2 100644
--- a/libaom/av1/encoder/x86/encodetxb_sse4.c
+++ b/libaom/av1/encoder/x86/encodetxb_sse4.c
@@ -14,7 +14,7 @@
 #include <smmintrin.h>  /* SSE4.1 */
 
 #include "aom/aom_integer.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/txb_common.h"
 #include "aom_dsp/x86/synonyms.h"
 
diff --git a/libaom/av1/encoder/x86/error_intrin_avx2.c b/libaom/av1/encoder/x86/error_intrin_avx2.c
index 7d4f695..12dda3a 100644
--- a/libaom/av1/encoder/x86/error_intrin_avx2.c
+++ b/libaom/av1/encoder/x86/error_intrin_avx2.c
@@ -29,6 +29,59 @@
   }
 }
 
+int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t block_size) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i sse_256 = zero;
+  __m256i sse_hi;
+  __m128i sse_128;
+  int64_t sse;
+
+  if (block_size == 16) {
+    // Load 16 elements for coeff and dqcoeff.
+    const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+    // dqcoeff - coeff
+    const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+    // madd (dqcoeff - coeff)
+    const __m256i error_lo = _mm256_madd_epi16(diff, diff);
+    // Save the higher 64 bit of each 128 bit lane.
+    const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
+    // Add the higher 64 bit to the low 64 bit.
+    const __m256i error = _mm256_add_epi32(error_lo, error_hi);
+    // Expand each double word in the lower 64 bits to quad word.
+    sse_256 = _mm256_unpacklo_epi32(error, zero);
+  } else {
+    for (int i = 0; i < block_size; i += 16) {
+      // Load 16 elements for coeff and dqcoeff.
+      const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+      const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+      const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+      const __m256i error = _mm256_madd_epi16(diff, diff);
+      // Expand each double word of madd (dqcoeff - coeff) to quad word.
+      const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
+      const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
+      // Add each quad word of madd (dqcoeff - coeff).
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
+      coeff += 16;
+      dqcoeff += 16;
+    }
+  }
+  // Save the higher 64 bit of each 128 bit lane.
+  sse_hi = _mm256_srli_si256(sse_256, 8);
+  // Add the higher 64 bit to the low 64 bit.
+  sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+
+  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+  sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+                          _mm256_extractf128_si256(sse_256, 1));
+
+  // Store the results.
+  _mm_storel_epi64((__m128i *)&sse, sse_128);
+  return sse;
+}
+
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              intptr_t block_size, int64_t *ssz) {
   __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
diff --git a/libaom/av1/encoder/x86/error_sse2.asm b/libaom/av1/encoder/x86/error_sse2.asm
index 72e9e22..f4b4968 100644
--- a/libaom/av1/encoder/x86/error_sse2.asm
+++ b/libaom/av1/encoder/x86/error_sse2.asm
@@ -11,6 +11,21 @@
 
 ;
 
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+  lea %1, [%1 + %2 * 4]
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits.
+%macro LOAD_TRAN_LOW 3
+  mova     m%1, [%2 + (%3) * 4]
+  packssdw m%1, [%2 + (%3) * 4 + 16]
+%endmacro
+
 %define private_prefix av1
 
 %include "third_party/x86inc/x86inc.asm"
@@ -25,14 +40,14 @@
   pxor      m4, m4                 ; sse accumulator
   pxor      m6, m6                 ; ssz accumulator
   pxor      m5, m5                 ; dedicated zero register
-  lea     uqcq, [uqcq+sizeq*2]
-  lea     dqcq, [dqcq+sizeq*2]
-  neg    sizeq
 .loop:
-  mova      m2, [uqcq+sizeq*2]
-  mova      m0, [dqcq+sizeq*2]
-  mova      m3, [uqcq+sizeq*2+mmsize]
-  mova      m1, [dqcq+sizeq*2+mmsize]
+  LOAD_TRAN_LOW 2, uqcq, 0
+  LOAD_TRAN_LOW 0, dqcq, 0
+  LOAD_TRAN_LOW 3, uqcq, 8
+  LOAD_TRAN_LOW 1, dqcq, 8
+  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+  sub    sizeq, 16
   psubw     m0, m2
   psubw     m1, m3
   ; individual errors are max. 15bit+sign, so squares are 30bit, and
@@ -41,25 +56,19 @@
   pmaddwd   m1, m1
   pmaddwd   m2, m2
   pmaddwd   m3, m3
+  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+  paddd     m0, m1
+  paddd     m2, m3
   ; accumulate in 64bit
   punpckldq m7, m0, m5
   punpckhdq m0, m5
   paddq     m4, m7
-  punpckldq m7, m1, m5
-  paddq     m4, m0
-  punpckhdq m1, m5
-  paddq     m4, m7
   punpckldq m7, m2, m5
-  paddq     m4, m1
+  paddq     m4, m0
   punpckhdq m2, m5
   paddq     m6, m7
-  punpckldq m7, m3, m5
   paddq     m6, m2
-  punpckhdq m3, m5
-  paddq     m6, m7
-  paddq     m6, m3
-  add    sizeq, mmsize
-  jl .loop
+  jg .loop
 
   ; accumulate horizontally and store in return value
   movhlps   m5, m4
diff --git a/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
index 719734c..ee3714d 100644
--- a/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
+++ b/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -14,7 +14,8 @@
 #include "aom/aom_integer.h"
 #include "av1/common/common.h"
 
-int64_t av1_highbd_block_error_avx2(tran_low_t *coeff, tran_low_t *dqcoeff,
+int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
                                     intptr_t block_size, int64_t *ssz,
                                     int bps) {
   int i;
diff --git a/libaom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/libaom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
index 777304a..4579e4e 100644
--- a/libaom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
+++ b/libaom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -14,7 +14,8 @@
 
 #include "av1/common/common.h"
 
-int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
                                     intptr_t block_size, int64_t *ssz,
                                     int bps) {
   int i, j, test;
diff --git a/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
index 24c513f..a81378c 100644
--- a/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
+++ b/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -20,9 +20,9 @@
 #include "aom_dsp/x86/txfm_common_sse2.h"
 #include "aom_dsp/x86/txfm_common_avx2.h"
 
-static INLINE void av1_load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
-                                            int stride, int flipud, int fliplr,
-                                            int shift) {
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+                                        int stride, int flipud, int fliplr,
+                                        int shift) {
   __m128i out1[8];
   if (!flipud) {
     out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
@@ -94,9 +94,9 @@
   in[6] = _mm256_srai_epi32(in[6], shift);
   in[7] = _mm256_srai_epi32(in[7], shift);
 }
-static INLINE void av1_load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
-                                             int stride, int flipud, int fliplr,
-                                             int shift) {
+static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
+                                         int stride, int flipud, int fliplr,
+                                         int shift) {
   const int16_t *topL = input;
   const int16_t *botL = input + 8 * stride;
 
@@ -107,13 +107,12 @@
     topL = botL;
     botL = tmp;
   }
-  av1_load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift);
-  av1_load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift);
+  load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift);
+  load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift);
 }
-static INLINE void av1_load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
-                                             int stride, int height,
-                                             int outstride, int flipud,
-                                             int fliplr) {
+static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
+                                         int stride, int height, int outstride,
+                                         int flipud, int fliplr) {
   __m256i out1[64];
   if (!flipud) {
     for (int i = 0; i < height; i++) {
@@ -142,9 +141,9 @@
   }
 }
 
-static void av1_fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
-                                            const int instride,
-                                            const int outstride) {
+static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
+                                        const int instride,
+                                        const int outstride) {
   __m256i u0, u1, u2, u3, u4, u5, u6, u7;
   __m256i x0, x1;
 
@@ -180,8 +179,8 @@
   out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
   out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
 }
-static INLINE void av1_round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
-                                               int stride) {
+static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
+                                           int stride) {
   if (bit < 0) {
     bit = -bit;
     __m256i round = _mm256_set1_epi32(1 << (bit - 1));
@@ -195,19 +194,19 @@
     }
   }
 }
-static INLINE void av1_store_buffer_avx2(const __m256i *const in, int32_t *out,
-                                         const int stride, const int out_size) {
+static INLINE void store_buffer_avx2(const __m256i *const in, int32_t *out,
+                                     const int stride, const int out_size) {
   for (int i = 0; i < out_size; ++i) {
     _mm256_store_si256((__m256i *)(out), in[i]);
     out += stride;
   }
 }
-static INLINE void av1_fwd_txfm_transpose_16x16_avx2(const __m256i *in,
-                                                     __m256i *out) {
-  av1_fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2);
-  av1_fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2);
-  av1_fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2);
-  av1_fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2);
+static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in,
+                                                 __m256i *out) {
+  fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2);
+  fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2);
+  fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2);
+  fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2);
 }
 
 static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
@@ -229,11 +228,11 @@
     const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);     \
     const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);     \
     out0 = _mm256_add_epi32(in0_w0, in1_w1);                 \
-    av1_round_shift_32_8xn_avx2(&out0, 1, -bit, 1);          \
+    round_shift_32_8xn_avx2(&out0, 1, -bit, 1);              \
     const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);     \
     const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);     \
     out1 = _mm256_sub_epi32(in0_w1, in1_w0);                 \
-    av1_round_shift_32_8xn_avx2(&out1, 1, -bit, 1);          \
+    round_shift_32_8xn_avx2(&out1, 1, -bit, 1);              \
   } while (0)
 
 #define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
@@ -253,8 +252,8 @@
 typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out,
                                   const int8_t cos_bit, int instride,
                                   int outstride);
-static void av1_fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
-                           const int col_num, const int outstride) {
+static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                       const int col_num, const int outstride) {
   const int32_t *cospi = cospi_arr(bit);
   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
   const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
@@ -354,8 +353,8 @@
     out[6 * outstride + col] = u[3];  // buf0[3]
   }
 }
-static void av1_fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
-                            const int col_num, const int outstirde) {
+static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        const int col_num, const int outstirde) {
   (void)col_num;
   const int32_t *cospi = cospi_arr(bit);
   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
@@ -527,8 +526,8 @@
     out[7 * outstirde + col] = v0;
   }
 }
-static void av1_idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit,
-                           int col_num, int outstride) {
+static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
+                       int outstride) {
   (void)bit;
   (void)outstride;
   int num_iters = 8 * col_num;
@@ -547,7 +546,7 @@
                              TX_TYPE tx_type, int bd) {
   __m256i in[8], out[8];
   const TX_SIZE tx_size = TX_8X8;
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int width = tx_size_wide[tx_size];
@@ -555,180 +554,180 @@
 
   switch (tx_type) {
     case DCT_DCT:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                     width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case ADST_DCT:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                     width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case DCT_ADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case ADST_ADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case FLIPADST_DCT:
-      av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                     width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case DCT_FLIPADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
-      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case FLIPADST_FLIPADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case ADST_FLIPADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case FLIPADST_ADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case IDTX:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case V_DCT:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case H_DCT:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case V_ADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case H_ADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
-      av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case V_FLIPADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     case H_FLIPADST:
-      av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
-      av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                     width_div8);
+      load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+      idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
       col_txfm_8x8_rounding(out, -shift[1]);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
       break;
     default: assert(0);
   }
   (void)bd;
 }
 
-static void av1_fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
-                            const int col_num, const int outstride) {
+static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        const int col_num, const int outstride) {
   const int32_t *cospi = cospi_arr(bit);
   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
   const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
@@ -1012,8 +1011,8 @@
     out[15 * outstride + col] = v[15];
   }
 }
-static void av1_fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
-                             const int num_cols, const int outstride) {
+static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                         const int num_cols, const int outstride) {
   const int32_t *cospi = cospi_arr(bit);
   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
@@ -1264,8 +1263,8 @@
     out[15 * outstride + col] = v[0];
   }
 }
-static void av1_idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
-                            int col_num, const int outstride) {
+static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        int col_num, const int outstride) {
   (void)bit;
   (void)outstride;
   __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
@@ -1280,132 +1279,132 @@
   }
 }
 static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = {
-  av1_fdct16_avx2,   // DCT_DCT
-  av1_fadst16_avx2,  // ADST_DCT
-  av1_fdct16_avx2,   // DCT_ADST
-  av1_fadst16_avx2,  // ADST_ADST
-  av1_fadst16_avx2,  // FLIPADST_DCT
-  av1_fdct16_avx2,   // DCT_FLIPADST
-  av1_fadst16_avx2,  // FLIPADST_FLIPADST
-  av1_fadst16_avx2,  // ADST_FLIPADST
-  av1_fadst16_avx2,  // FLIPADST_ADST
-  av1_idtx16_avx2,   // IDTX
-  av1_fdct16_avx2,   // V_DCT
-  av1_idtx16_avx2,   // H_DCT
-  av1_fadst16_avx2,  // V_ADST
-  av1_idtx16_avx2,   // H_ADST
-  av1_fadst16_avx2,  // V_FLIPADST
-  av1_idtx16_avx2    // H_FLIPADST
+  fdct16_avx2,   // DCT_DCT
+  fadst16_avx2,  // ADST_DCT
+  fdct16_avx2,   // DCT_ADST
+  fadst16_avx2,  // ADST_ADST
+  fadst16_avx2,  // FLIPADST_DCT
+  fdct16_avx2,   // DCT_FLIPADST
+  fadst16_avx2,  // FLIPADST_FLIPADST
+  fadst16_avx2,  // ADST_FLIPADST
+  fadst16_avx2,  // FLIPADST_ADST
+  idtx16_avx2,   // IDTX
+  fdct16_avx2,   // V_DCT
+  idtx16_avx2,   // H_DCT
+  fadst16_avx2,  // V_ADST
+  idtx16_avx2,   // H_ADST
+  fadst16_avx2,  // V_FLIPADST
+  idtx16_avx2    // H_FLIPADST
 };
 static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = {
-  av1_fdct8_avx2,   // DCT_DCT
-  av1_fdct8_avx2,   // ADST_DCT
-  av1_fadst8_avx2,  // DCT_ADST
-  av1_fadst8_avx2,  // ADST_ADST
-  av1_fdct8_avx2,   // FLIPADST_DCT
-  av1_fadst8_avx2,  // DCT_FLIPADST
-  av1_fadst8_avx2,  // FLIPADST_FLIPADST
-  av1_fadst8_avx2,  // ADST_FLIPADST
-  av1_fadst8_avx2,  // FLIPADST_ADST
-  av1_idtx8_avx2,   // IDTX
-  av1_idtx8_avx2,   // V_DCT
-  av1_fdct8_avx2,   // H_DCT
-  av1_idtx8_avx2,   // V_ADST
-  av1_fadst8_avx2,  // H_ADST
-  av1_idtx8_avx2,   // V_FLIPADST
-  av1_fadst8_avx2   // H_FLIPADST
+  fdct8_avx2,   // DCT_DCT
+  fdct8_avx2,   // ADST_DCT
+  fadst8_avx2,  // DCT_ADST
+  fadst8_avx2,  // ADST_ADST
+  fdct8_avx2,   // FLIPADST_DCT
+  fadst8_avx2,  // DCT_FLIPADST
+  fadst8_avx2,  // FLIPADST_FLIPADST
+  fadst8_avx2,  // ADST_FLIPADST
+  fadst8_avx2,  // FLIPADST_ADST
+  idtx8_avx2,   // IDTX
+  idtx8_avx2,   // V_DCT
+  fdct8_avx2,   // H_DCT
+  idtx8_avx2,   // V_ADST
+  fadst8_avx2,  // H_ADST
+  idtx8_avx2,   // V_FLIPADST
+  fadst8_avx2   // H_FLIPADST
 };
 void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
   __m256i in[16], out[16];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
   const int txw_idx = get_txw_idx(TX_8X16);
   const int txh_idx = get_txh_idx(TX_8X16);
   const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type];
   const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type];
-  const int8_t bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
-  av1_load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]);
+  load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]);
   col_txfm(in, out, bit, 1, 1);
   col_txfm_8x8_rounding(out, -shift[1]);
   col_txfm_8x8_rounding(&out[8], -shift[1]);
-  av1_fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
-  av1_fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
+  fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
+  fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
   row_txfm(in, out, bit, 2, 2);
-  av1_fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
-  av1_fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
+  fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
+  fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
   av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
-  av1_store_buffer_avx2(in, coeff, 8, 16);
+  store_buffer_avx2(in, coeff, 8, 16);
   (void)bd;
 }
 static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = {
-  av1_fdct8_avx2,   // DCT_DCT
-  av1_fadst8_avx2,  // ADST_DCT
-  av1_fdct8_avx2,   // DCT_ADST
-  av1_fadst8_avx2,  // ADST_ADST
-  av1_fadst8_avx2,  // FLIPADST_DCT
-  av1_fdct8_avx2,   // DCT_FLIPADST
-  av1_fadst8_avx2,  // FLIPADST_FLIPADST
-  av1_fadst8_avx2,  // ADST_FLIPADST
-  av1_fadst8_avx2,  // FLIPADST_ADST
-  av1_idtx8_avx2,   // IDTX
-  av1_fdct8_avx2,   // V_DCT
-  av1_idtx8_avx2,   // H_DCT
-  av1_fadst8_avx2,  // V_ADST
-  av1_idtx8_avx2,   // H_ADST
-  av1_fadst8_avx2,  // V_FLIPADST
-  av1_idtx8_avx2    // H_FLIPADST
+  fdct8_avx2,   // DCT_DCT
+  fadst8_avx2,  // ADST_DCT
+  fdct8_avx2,   // DCT_ADST
+  fadst8_avx2,  // ADST_ADST
+  fadst8_avx2,  // FLIPADST_DCT
+  fdct8_avx2,   // DCT_FLIPADST
+  fadst8_avx2,  // FLIPADST_FLIPADST
+  fadst8_avx2,  // ADST_FLIPADST
+  fadst8_avx2,  // FLIPADST_ADST
+  idtx8_avx2,   // IDTX
+  fdct8_avx2,   // V_DCT
+  idtx8_avx2,   // H_DCT
+  fadst8_avx2,  // V_ADST
+  idtx8_avx2,   // H_ADST
+  fadst8_avx2,  // V_FLIPADST
+  idtx8_avx2    // H_FLIPADST
 };
 static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = {
-  av1_fdct16_avx2,   // DCT_DCT
-  av1_fdct16_avx2,   // ADST_DCT
-  av1_fadst16_avx2,  // DCT_ADST
-  av1_fadst16_avx2,  // ADST_ADST
-  av1_fdct16_avx2,   // FLIPADST_DCT
-  av1_fadst16_avx2,  // DCT_FLIPADST
-  av1_fadst16_avx2,  // FLIPADST_FLIPADST
-  av1_fadst16_avx2,  // ADST_FLIPADST
-  av1_fadst16_avx2,  // FLIPADST_ADST
-  av1_idtx16_avx2,   // IDTX
-  av1_idtx16_avx2,   // V_DCT
-  av1_fdct16_avx2,   // H_DCT
-  av1_idtx16_avx2,   // V_ADST
-  av1_fadst16_avx2,  // H_ADST
-  av1_idtx16_avx2,   // V_FLIPADST
-  av1_fadst16_avx2   // H_FLIPADST
+  fdct16_avx2,   // DCT_DCT
+  fdct16_avx2,   // ADST_DCT
+  fadst16_avx2,  // DCT_ADST
+  fadst16_avx2,  // ADST_ADST
+  fdct16_avx2,   // FLIPADST_DCT
+  fadst16_avx2,  // DCT_FLIPADST
+  fadst16_avx2,  // FLIPADST_FLIPADST
+  fadst16_avx2,  // ADST_FLIPADST
+  fadst16_avx2,  // FLIPADST_ADST
+  idtx16_avx2,   // IDTX
+  idtx16_avx2,   // V_DCT
+  fdct16_avx2,   // H_DCT
+  idtx16_avx2,   // V_ADST
+  fadst16_avx2,  // H_ADST
+  idtx16_avx2,   // V_FLIPADST
+  fadst16_avx2   // H_FLIPADST
 };
 void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
   __m256i in[16], out[16];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
   const int txw_idx = get_txw_idx(TX_16X8);
   const int txh_idx = get_txh_idx(TX_16X8);
   const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type];
   const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type];
-  const int8_t bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
-  av1_load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip);
-  av1_round_shift_32_8xn_avx2(in, 16, shift[0], 1);
+  load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip);
+  round_shift_32_8xn_avx2(in, 16, shift[0], 1);
   col_txfm(in, out, bit, 2, 2);
-  av1_round_shift_32_8xn_avx2(out, 16, shift[1], 1);
-  av1_fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
-  av1_fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
+  round_shift_32_8xn_avx2(out, 16, shift[1], 1);
+  fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
+  fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
   row_txfm(in, out, bit, 1, 1);
-  av1_fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
-  av1_fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
+  fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
+  fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
   av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
-  av1_store_buffer_avx2(in, coeff, 8, 16);
+  store_buffer_avx2(in, coeff, 8, 16);
   (void)bd;
 }
 void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
   __m256i in[32], out[32];
   const TX_SIZE tx_size = TX_16X16;
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
   const int width = tx_size_wide[tx_size];
@@ -1415,196 +1414,196 @@
   const int size = (height << 1);
   switch (tx_type) {
     case DCT_DCT:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case ADST_DCT:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case DCT_ADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case ADST_ADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case FLIPADST_DCT:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case DCT_FLIPADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case FLIPADST_FLIPADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case ADST_FLIPADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case FLIPADST_ADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case IDTX:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case V_DCT:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case H_DCT:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case V_ADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case H_ADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case V_FLIPADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     case H_FLIPADST:
-      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
-      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
-      av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
-                      width_div8);
-      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
-                       width_div8);
-      av1_fwd_txfm_transpose_16x16_avx2(out, in);
-      av1_store_buffer_avx2(in, coeff, 8, 32);
+      load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+      round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                   width_div8);
+      fwd_txfm_transpose_16x16_avx2(out, in);
+      store_buffer_avx2(in, coeff, 8, 32);
       break;
     default: assert(0);
   }
   (void)bd;
 }
-static INLINE void av1_fdct32_avx2(__m256i *input, __m256i *output,
-                                   const int8_t cos_bit, const int instride,
-                                   const int outstride) {
+static INLINE void fdct32_avx2(__m256i *input, __m256i *output,
+                               const int8_t cos_bit, const int instride,
+                               const int outstride) {
   __m256i buf0[32];
   __m256i buf1[32];
   const int32_t *cospi;
@@ -2009,51 +2008,51 @@
   }
 }
 static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {
-  av1_fdct32_avx2,  // DCT_DCT
-  NULL,             // ADST_DCT
-  NULL,             // DCT_ADST
-  NULL,             // ADST_ADST
-  NULL,             // FLIPADST_DCT
-  NULL,             // DCT_FLIPADST
-  NULL,             // FLIPADST_FLIPADST
-  NULL,             // ADST_FLIPADST
-  NULL,             // FLIPADST_ADST
-  idtx32x32_avx2,   // IDTX
-  NULL,             // V_DCT
-  NULL,             // H_DCT
-  NULL,             // V_ADST
-  NULL,             // H_ADST
-  NULL,             // V_FLIPADST
-  NULL              // H_FLIPADST
+  fdct32_avx2,     // DCT_DCT
+  NULL,            // ADST_DCT
+  NULL,            // DCT_ADST
+  NULL,            // ADST_ADST
+  NULL,            // FLIPADST_DCT
+  NULL,            // DCT_FLIPADST
+  NULL,            // FLIPADST_FLIPADST
+  NULL,            // ADST_FLIPADST
+  NULL,            // FLIPADST_ADST
+  idtx32x32_avx2,  // IDTX
+  NULL,            // V_DCT
+  NULL,            // H_DCT
+  NULL,            // V_ADST
+  NULL,            // H_ADST
+  NULL,            // V_FLIPADST
+  NULL             // H_FLIPADST
 };
 static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {
-  av1_fdct32_avx2,  // DCT_DCT
-  NULL,             // ADST_DCT
-  NULL,             // DCT_ADST
-  NULL,             // ADST_ADST
-  NULL,             // FLIPADST_DCT
-  NULL,             // DCT_FLIPADST
-  NULL,             // FLIPADST_FLIPADST
-  NULL,             // ADST_FLIPADST
-  NULL,             // FLIPADST_ADST
-  idtx32x32_avx2,   // IDTX
-  NULL,             // V_DCT
-  NULL,             // H_DCT
-  NULL,             // V_ADST
-  NULL,             // H_ADST
-  NULL,             // V_FLIPADST
-  NULL              // H_FLIPADST
+  fdct32_avx2,     // DCT_DCT
+  NULL,            // ADST_DCT
+  NULL,            // DCT_ADST
+  NULL,            // ADST_ADST
+  NULL,            // FLIPADST_DCT
+  NULL,            // DCT_FLIPADST
+  NULL,            // FLIPADST_FLIPADST
+  NULL,            // ADST_FLIPADST
+  NULL,            // FLIPADST_ADST
+  idtx32x32_avx2,  // IDTX
+  NULL,            // V_DCT
+  NULL,            // H_DCT
+  NULL,            // V_ADST
+  NULL,            // H_ADST
+  NULL,            // V_FLIPADST
+  NULL             // H_FLIPADST
 };
 void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   __m256i buf0[128], buf1[128];
   const int tx_size = TX_32X32;
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
@@ -2063,25 +2062,23 @@
   const int width_div8 = (width >> 3);
 
   for (int i = 0; i < width_div16; i++) {
-    av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
-                              width_div8, 0, 0);
-    av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
-    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0],
-                                width_div8);
+    load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
+                          width_div8, 0, 0);
+    round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
+    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
     col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8,
              width_div8);
     col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
              width_div8);
-    av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
-    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1],
-                                width_div8);
+    round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
+    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
   }
 
   for (r = 0; r < height; r += 8) {
     for (c = 0; c < width_div8; c++) {
-      av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
-                                      &buf1[c * 8 * width_div8 + (r >> 3)],
-                                      width_div8, width_div8);
+      fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+                                  &buf1[c * 8 * width_div8 + (r >> 3)],
+                                  width_div8, width_div8);
     }
   }
 
@@ -2090,26 +2087,24 @@
              width_div8);
     row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8,
              width_div8);
-    av1_round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
-    av1_round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2],
-                                width_div8);
+    round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
+    round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8);
   }
 
   for (r = 0; r < height; r += 8) {
     for (c = 0; c < width_div8; c++) {
-      av1_fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c],
-                                      &buf0[c * 8 * width_div8 + (r >> 3)],
-                                      width_div8, width_div8);
+      fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c],
+                                  &buf0[c * 8 * width_div8 + (r >> 3)],
+                                  width_div8, width_div8);
     }
   }
 
-  av1_store_buffer_avx2(buf0, output, 8, 128);
+  store_buffer_avx2(buf0, output, 8, 128);
 }
-static INLINE void av1_fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
-                                          __m256i *cospi_m32,
-                                          __m256i *cospi_p32,
-                                          const __m256i *__rounding,
-                                          int8_t cos_bit) {
+static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
+                                      __m256i *cospi_m32, __m256i *cospi_p32,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
   x2[0] = _mm256_add_epi32(x1[0], x1[31]);
   x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
   x2[1] = _mm256_add_epi32(x1[1], x1[30]);
@@ -2175,11 +2170,10 @@
   x2[62] = x1[62];
   x2[63] = x1[63];
 }
-static INLINE void av1_fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
-                                          __m256i *cospi_m32,
-                                          __m256i *cospi_p32,
-                                          const __m256i *__rounding,
-                                          int8_t cos_bit) {
+static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
+                                      __m256i *cospi_m32, __m256i *cospi_p32,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
   x3[0] = _mm256_add_epi32(x2[0], x2[15]);
   x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
   x3[1] = _mm256_add_epi32(x2[1], x2[14]);
@@ -2245,10 +2239,12 @@
   x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
   x3[56] = _mm256_add_epi32(x2[56], x2[55]);
 }
-static INLINE void av1_fdct64_stage4_avx2(
-    __m256i *x3, __m256i *x4, __m256i *cospi_m32, __m256i *cospi_p32,
-    __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
-    const __m256i *__rounding, int8_t cos_bit) {
+static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4,
+                                      __m256i *cospi_m32, __m256i *cospi_p32,
+                                      __m256i *cospi_m16, __m256i *cospi_p48,
+                                      __m256i *cospi_m48,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
   x4[0] = _mm256_add_epi32(x3[0], x3[7]);
   x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
   x4[1] = _mm256_add_epi32(x3[1], x3[6]);
@@ -2314,10 +2310,12 @@
   x4[62] = x3[62];
   x4[63] = x3[63];
 }
-static INLINE void av1_fdct64_stage5_avx2(
-    __m256i *x4, __m256i *x5, __m256i *cospi_m32, __m256i *cospi_p32,
-    __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
-    const __m256i *__rounding, int8_t cos_bit) {
+static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5,
+                                      __m256i *cospi_m32, __m256i *cospi_p32,
+                                      __m256i *cospi_m16, __m256i *cospi_p48,
+                                      __m256i *cospi_m48,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
   x5[0] = _mm256_add_epi32(x4[0], x4[3]);
   x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
   x5[1] = _mm256_add_epi32(x4[1], x4[2]);
@@ -2383,7 +2381,7 @@
   x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
   x5[60] = _mm256_add_epi32(x4[60], x4[59]);
 }
-static INLINE void av1_fdct64_stage6_avx2(
+static INLINE void fdct64_stage6_avx2(
     __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
     __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
     __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
@@ -2454,11 +2452,13 @@
   x6[62] = x5[62];
   x6[63] = x5[63];
 }
-static INLINE void av1_fdct64_stage7_avx2(
-    __m256i *x6, __m256i *x7, __m256i *cospi_p08, __m256i *cospi_p56,
-    __m256i *cospi_p40, __m256i *cospi_p24, __m256i *cospi_m08,
-    __m256i *cospi_m56, __m256i *cospi_m40, __m256i *cospi_m24,
-    const __m256i *__rounding, int8_t cos_bit) {
+static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7,
+                                      __m256i *cospi_p08, __m256i *cospi_p56,
+                                      __m256i *cospi_p40, __m256i *cospi_p24,
+                                      __m256i *cospi_m08, __m256i *cospi_m56,
+                                      __m256i *cospi_m40, __m256i *cospi_m24,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
   x7[0] = x6[0];
   x7[1] = x6[1];
   x7[2] = x6[2];
@@ -2524,10 +2524,10 @@
   x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
   x7[62] = _mm256_add_epi32(x6[62], x6[61]);
 }
-static INLINE void av1_fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
-                                          const int32_t *cospi,
-                                          const __m256i *__rounding,
-                                          int8_t cos_bit) {
+static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
+                                      const int32_t *cospi,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
   __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
   __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
   __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
@@ -2611,10 +2611,10 @@
   x8[60] = x7[60];
   x8[63] = x7[63];
 }
-static INLINE void av1_fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
-                                          const int32_t *cospi,
-                                          const __m256i *__rounding,
-                                          int8_t cos_bit) {
+static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
+                                      const int32_t *cospi,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
   __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
   __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
   __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
@@ -2697,10 +2697,10 @@
   x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
   x9[63] = _mm256_add_epi32(x8[63], x8[62]);
 }
-static INLINE void av1_fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
-                                           const int32_t *cospi,
-                                           const __m256i *__rounding,
-                                           int8_t cos_bit) {
+static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
+                                       const int32_t *cospi,
+                                       const __m256i *__rounding,
+                                       int8_t cos_bit) {
   __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
   __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
   __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
@@ -2799,8 +2799,8 @@
   btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48],
                         *__rounding, cos_bit);
 }
-static void av1_fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
-                            const int instride, const int outstride) {
+static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
+                        const int instride, const int outstride) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
   __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
@@ -2951,30 +2951,29 @@
 
   // stage 2
   __m256i x2[64];
-  av1_fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+  fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
   // stage 3
-  av1_fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+  fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
   // stage 4
-  av1_fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
-                         &cospi_m48, &__rounding, cos_bit);
+  fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+                     &cospi_m48, &__rounding, cos_bit);
   // stage 5
-  av1_fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
-                         &cospi_m48, &__rounding, cos_bit);
+  fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+                     &cospi_m48, &__rounding, cos_bit);
   // stage 6
-  av1_fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
-                         &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56,
-                         &cospi_m40, &cospi_p24, &cospi_m24, &__rounding,
-                         cos_bit);
+  fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
+                     &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40,
+                     &cospi_p24, &cospi_m24, &__rounding, cos_bit);
   // stage 7
-  av1_fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
-                         &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
-                         &__rounding, cos_bit);
+  fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
+                     &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
+                     &__rounding, cos_bit);
   // stage 8
-  av1_fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
+  fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
   // stage 9
-  av1_fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
+  fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
   // stage 10
-  av1_fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);
+  fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);
 
   startidx = 0 * outstride;
   endidx = 63 * outstride;
@@ -3114,37 +3113,35 @@
   assert(tx_type == DCT_DCT);
   const TX_SIZE tx_size = TX_64X64;
   __m256i buf0[512], buf1[512];
-  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
-  const transform_1d_avx2 col_txfm = av1_fdct64_avx2;
-  const transform_1d_avx2 row_txfm = av1_fdct64_avx2;
+  const transform_1d_avx2 col_txfm = fdct64_avx2;
+  const transform_1d_avx2 row_txfm = fdct64_avx2;
   const int width_div16 = (width >> 4);
   const int width_div8 = (width >> 3);
   int r, c;
   for (int i = 0; i < width_div16; i++) {
-    av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height,
-                              width_div8, 0, 0);
-    av1_round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8);
-    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0],
-                                width_div8);
+    load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height,
+                          width_div8, 0, 0);
+    round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8);
+    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
     col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8);
     col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
              width_div8);
-    av1_round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8);
-    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1],
-                                width_div8);
+    round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8);
+    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
   }
 
   for (r = 0; r < height; r += 8) {
     for (c = 0; c < width_div8; c++) {
-      av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
-                                      &buf1[c * 8 * width_div8 + (r >> 3)],
-                                      width_div8, width_div8);
+      fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+                                  &buf1[c * 8 * width_div8 + (r >> 3)],
+                                  width_div8, width_div8);
     }
   }
 
@@ -3153,18 +3150,18 @@
              width_div16);
     row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8,
              width_div16);
-    av1_round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2],
-                                width_div16);
-    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2],
-                                width_div16);
+    round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2],
+                            width_div16);
+    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2],
+                            width_div16);
   }
 
   for (r = 0; r < (height >> 1); r += 8) {
     for (c = 0; c < width_div16; c++) {
-      av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div16 + c],
-                                      &buf1[c * 8 * width_div16 + (r >> 3)],
-                                      width_div16, width_div16);
+      fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div16 + c],
+                                  &buf1[c * 8 * width_div16 + (r >> 3)],
+                                  width_div16, width_div16);
     }
   }
-  av1_store_buffer_avx2(buf1, output, 8, 128);
+  store_buffer_avx2(buf1, output, 8, 128);
 }
diff --git a/libaom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/libaom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index d105977..73afc5d 100644
--- a/libaom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/libaom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -215,105 +215,105 @@
 void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
                                int input_stride, TX_TYPE tx_type, int bd) {
   __m128i in[4];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
   const int txw_idx = get_txw_idx(TX_4X4);
   const int txh_idx = get_txh_idx(TX_4X4);
 
   switch (tx_type) {
     case DCT_DCT:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case ADST_DCT:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case DCT_ADST:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case ADST_ADST:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case FLIPADST_DCT:
       load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fdct4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case DCT_FLIPADST:
       load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
-      fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case ADST_FLIPADST:
       load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case FLIPADST_ADST:
       load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case IDTX:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      idtx4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case V_DCT:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case H_DCT:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case V_ADST:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case H_ADST:
       load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
-      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case V_FLIPADST:
       load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     case H_FLIPADST:
       load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
-      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
     default: assert(0);
@@ -844,152 +844,152 @@
 void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
   __m128i in[16], out[16];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
   const int txw_idx = get_txw_idx(TX_8X8);
   const int txh_idx = get_txh_idx(TX_8X8);
 
   switch (tx_type) {
     case DCT_DCT:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_DCT:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case DCT_ADST:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_ADST:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case FLIPADST_DCT:
       load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case DCT_FLIPADST:
       load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_FLIPADST:
       load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case FLIPADST_ADST:
       load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case IDTX:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case V_DCT:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case H_DCT:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case V_ADST:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case H_ADST:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case V_FLIPADST:
       load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case H_FLIPADST:
       load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
-      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
@@ -1750,152 +1750,168 @@
 void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
                                  int stride, TX_TYPE tx_type, int bd) {
   __m128i in[64], out[64];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
   const int txw_idx = get_txw_idx(TX_16X16);
   const int txh_idx = get_txh_idx(TX_16X16);
   const int col_num = 4;
   switch (tx_type) {
     case DCT_DCT:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_DCT:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+                        col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case DCT_ADST:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+                        col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_ADST:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+                        col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+                        col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case FLIPADST_DCT:
       load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+                        col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case DCT_FLIPADST:
       load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+                        col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+                        col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+                        col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_FLIPADST:
       load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+                        col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+                        col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case FLIPADST_ADST:
       load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+                        col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+                        col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case IDTX:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      idtx16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      idtx16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case V_DCT:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      idtx16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case H_DCT:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      idtx16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case V_ADST:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+                        col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      idtx16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case H_ADST:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      idtx16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+                        col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case V_FLIPADST:
       load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+                        col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      idtx16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case H_FLIPADST:
       load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
-      idtx16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+                        col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
@@ -2078,22 +2094,22 @@
 };
 
 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = {
-  av1_fdct32_new_sse4_1,  // DCT_DCT
-  NULL,                   // ADST_DCT
-  NULL,                   // DCT_ADST
-  NULL,                   // ADST_ADST
-  NULL,                   // FLIPADST_DCT
-  NULL,                   // DCT_FLIPADST
-  NULL,                   // FLIPADST_FLIPADST
-  NULL,                   // ADST_FLIPADST
-  NULL,                   // FLIPADST_ADST
-  av1_idtx32_new_sse4_1,  // IDTX
-  NULL,                   // V_DCT
-  NULL,                   // H_DCT
-  NULL,                   // V_ADST
-  NULL,                   // H_ADST
-  NULL,                   // V_FLIPADST
-  NULL                    // H_FLIPADST
+  av1_fdct32_sse4_1,  // DCT_DCT
+  NULL,               // ADST_DCT
+  NULL,               // DCT_ADST
+  NULL,               // ADST_ADST
+  NULL,               // FLIPADST_DCT
+  NULL,               // DCT_FLIPADST
+  NULL,               // FLIPADST_FLIPADST
+  NULL,               // ADST_FLIPADST
+  NULL,               // FLIPADST_ADST
+  av1_idtx32_sse4_1,  // IDTX
+  NULL,               // V_DCT
+  NULL,               // H_DCT
+  NULL,               // V_ADST
+  NULL,               // H_ADST
+  NULL,               // V_FLIPADST
+  NULL                // H_FLIPADST
 };
 
 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = {
@@ -2118,12 +2134,12 @@
 void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[32], out[32];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
   const int txw_idx = get_txw_idx(TX_16X8);
   const int txh_idx = get_txh_idx(TX_16X8);
   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
-  int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
@@ -2153,12 +2169,12 @@
 void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[32], out[32];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
   const int txw_idx = get_txw_idx(TX_8X16);
   const int txh_idx = get_txh_idx(TX_8X16);
   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
-  int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
@@ -2182,13 +2198,13 @@
                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[16];
   __m128i *outcoeff128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_4X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
   const int txw_idx = get_txw_idx(TX_4X16);
   const int txh_idx = get_txh_idx(TX_4X16);
   const int txfm_size_col = tx_size_wide[TX_4X16];
   const int txfm_size_row = tx_size_high[TX_4X16];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
 
@@ -2211,13 +2227,13 @@
                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[16];
   __m128i *outcoeff128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
   const int txw_idx = get_txw_idx(TX_16X4);
   const int txh_idx = get_txh_idx(TX_16X4);
   const int txfm_size_col = tx_size_wide[TX_16X4];
   const int txfm_size_row = tx_size_high[TX_16X4];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
   int ud_flip, lr_flip;
@@ -2242,13 +2258,13 @@
                                  int stride, TX_TYPE tx_type, int bd) {
   __m128i in[128];
   __m128i *outcoef128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
   const int txw_idx = get_txw_idx(TX_16X32);
   const int txh_idx = get_txh_idx(TX_16X32);
   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
 
   // column transform
   load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
@@ -2274,20 +2290,20 @@
   (void)tx_type;
   __m128i in[512];
   __m128i *outcoef128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_32X64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
   const int txw_idx = get_txw_idx(TX_32X64);
   const int txh_idx = get_txh_idx(TX_32X64);
   const int txfm_size_col = tx_size_wide[TX_32X64];
   const int txfm_size_row = tx_size_high[TX_32X64];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int num_row = txfm_size_row >> 2;
   const int num_col = txfm_size_col >> 2;
 
   // column transform
   load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
   for (int i = 0; i < num_col; i++) {
-    av1_fdct64_new_sse4_1((in + i), (in + i), bitcol, num_col, num_col);
+    av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col);
   }
   for (int i = 0; i < num_col; i++) {
     col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
@@ -2296,7 +2312,7 @@
 
   // row transform
   for (int i = 0; i < num_row; i++) {
-    av1_fdct32_new_sse4_1((outcoef128 + i), (in + i), bitrow, num_row);
+    av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row);
   }
   transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
   av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512, -shift[2],
@@ -2309,13 +2325,13 @@
   (void)tx_type;
   __m128i in[512];
   __m128i *outcoef128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_64X32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
   const int txw_idx = get_txw_idx(TX_64X32);
   const int txh_idx = get_txh_idx(TX_64X32);
   const int txfm_size_col = tx_size_wide[TX_64X32];
   const int txfm_size_row = tx_size_high[TX_64X32];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const int num_row = txfm_size_row >> 2;
   const int num_col = txfm_size_col >> 2;
 
@@ -2331,7 +2347,7 @@
   }
 
   for (int i = 0; i < num_col; i++) {
-    av1_fdct32_new_sse4_1((in + i), (in + i), bitcol, num_col);
+    av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col);
   }
 
   for (int i = 0; i < num_row; i++) {
@@ -2341,7 +2357,7 @@
 
   // row transform
   for (int i = 0; i < num_row; i++) {
-    av1_fdct64_new_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row);
+    av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row);
   }
   transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col >> 1);
   av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512 >> 1,
@@ -2353,13 +2369,13 @@
                                  int stride, TX_TYPE tx_type, int bd) {
   __m128i in[128];
   __m128i *outcoef128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
   const int txw_idx = get_txw_idx(TX_32X16);
   const int txh_idx = get_txh_idx(TX_32X16);
   const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
 
   // column transform
   load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
@@ -2382,13 +2398,13 @@
                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[64];
   __m128i *outcoef128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
   const int txw_idx = get_txw_idx(TX_8X32);
   const int txh_idx = get_txh_idx(TX_8X32);
   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
 
   const int txfm_size_col = tx_size_wide[TX_8X32];
   const int txfm_size_row = tx_size_high[TX_8X32];
@@ -2417,13 +2433,13 @@
                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[64];
   __m128i *outcoef128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
   const int txw_idx = get_txw_idx(TX_32X8);
   const int txh_idx = get_txh_idx(TX_32X8);
   const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
 
   const int txfm_size_col = tx_size_wide[TX_32X8];
   const int txfm_size_row = tx_size_high[TX_32X8];
@@ -2450,13 +2466,13 @@
                                TX_TYPE tx_type, int bd) {
   __m128i in[8];
   __m128i *outcoeff128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_4X8];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
   const int txw_idx = get_txw_idx(TX_4X8);
   const int txh_idx = get_txh_idx(TX_4X8);
   const int txfm_size_col = tx_size_wide[TX_4X8];
   const int txfm_size_row = tx_size_high[TX_4X8];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
 
@@ -2480,13 +2496,13 @@
                                TX_TYPE tx_type, int bd) {
   __m128i in[8];
   __m128i *outcoeff128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
   const int txw_idx = get_txw_idx(TX_8X4);
   const int txh_idx = get_txh_idx(TX_8X4);
   const int txfm_size_col = tx_size_wide[TX_8X4];
   const int txfm_size_row = tx_size_high[TX_8X4];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type];
   int ud_flip, lr_flip;
@@ -2510,13 +2526,13 @@
                                  int stride, TX_TYPE tx_type, int bd) {
   __m128i in[256];
   __m128i *outcoeff128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
   const int txw_idx = get_txw_idx(TX_16X64);
   const int txh_idx = get_txh_idx(TX_16X64);
   const int txfm_size_col = tx_size_wide[TX_16X64];
   const int txfm_size_row = tx_size_high[TX_16X64];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   const int num_col = txfm_size_col >> 2;
@@ -2533,7 +2549,7 @@
   }
 
   for (int i = 0; i < num_col; i++) {
-    av1_fdct64_new_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
+    av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
   }
 
   col_txfm_16x16_rounding(outcoeff128, -shift[1]);
@@ -2552,13 +2568,13 @@
                                  int stride, TX_TYPE tx_type, int bd) {
   __m128i in[256];
   __m128i *outcoeff128 = (__m128i *)coeff;
-  const int8_t *shift = fwd_txfm_shift_ls[TX_64X16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
   const int txw_idx = get_txw_idx(TX_64X16);
   const int txh_idx = get_txh_idx(TX_64X16);
   const int txfm_size_col = tx_size_wide[TX_64X16];
   const int txfm_size_row = tx_size_high[TX_64X16];
-  int bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   // col tranform
@@ -2581,7 +2597,7 @@
 
   transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
   for (int i = 0; i < 4; i++) {
-    av1_fdct64_new_sse4_1(in + i, in + i, bitrow, 4, 4);
+    av1_fdct64_sse4_1(in + i, in + i, bitrow, 4, 4);
   }
   transpose_8nx8n(in, outcoeff128, txfm_size_row, 32);
   (void)bd;
diff --git a/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c b/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c
deleted file mode 100644
index f199b0f..0000000
--- a/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c
+++ /dev/null
@@ -1,954 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/av1_rtcd.h"
-#include "aom/aom_integer.h"
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/temporal_filter.h"
-#include "av1/encoder/x86/temporal_filter_constants.h"
-
-// Compute (a-b)**2 for 8 pixels with size 16-bit
-static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
-                                       uint32_t *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
-  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
-  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
-  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
-  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
-
-  __m128i dist_first, dist_second;
-
-  dist_first = _mm_sub_epi32(a_first, b_first);
-  dist_second = _mm_sub_epi32(a_second, b_second);
-  dist_first = _mm_mullo_epi32(dist_first, dist_first);
-  dist_second = _mm_mullo_epi32(dist_second, dist_second);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
-}
-
-// Sum up three neighboring distortions for the pixels
-static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
-  __m128i dist_reg, dist_left, dist_right;
-
-  dist_reg = _mm_loadu_si128((const __m128i *)dist);
-  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
-  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
-
-  *sum = _mm_add_epi32(dist_reg, dist_left);
-  *sum = _mm_add_epi32(*sum, dist_right);
-}
-
-static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
-                                    __m128i *sum_second) {
-  highbd_get_sum_4(dist, sum_first);
-  highbd_get_sum_4(dist + 4, sum_second);
-}
-
-// Average the value based on the number of values summed (9 for pixels away
-// from the border, 4 for pixels in corners, and 6 for other edge values, plus
-// however many values from y/uv plane are).
-//
-// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
-// by weight.
-static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
-                                    const __m128i *mul_constants,
-                                    const int strength, const int rounding,
-                                    const int weight) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u32 = _mm_set1_epi32(rounding);
-  const __m128i weight_u32 = _mm_set1_epi32(weight);
-  const __m128i sixteen = _mm_set1_epi32(16);
-  const __m128i zero = _mm_setzero_si128();
-
-  // modifier * 3 / index;
-  const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
-  const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
-  const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
-  const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
-
-  const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
-  const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
-  const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
-  const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
-
-  // Now we have
-  //   mul_lo: 00 a1 00 a0
-  //   mul_hi: 00 a3 00 a2
-  // Unpack as 64 bit words to get even and odd elements
-  //   unpack_lo: 00 a2 00 a0
-  //   unpack_hi: 00 a3 00 a1
-  // Then we can shift and OR the results to get everything in 32-bits
-  const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
-  const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
-  const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
-  const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
-
-  // Round
-  *output = _mm_add_epi32(mul, rounding_u32);
-  *output = _mm_srl_epi32(*output, strength_u128);
-
-  // Multiply with the weight
-  *output = _mm_min_epu32(*output, sixteen);
-  *output = _mm_sub_epi32(sixteen, *output);
-  *output = _mm_mullo_epi32(*output, weight_u32);
-}
-
-static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
-                                    const __m128i *sum_0_u32,
-                                    const __m128i *sum_1_u32,
-                                    const __m128i *mul_constants_0,
-                                    const __m128i *mul_constants_1,
-                                    const int strength, const int rounding,
-                                    const int weight) {
-  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
-                   weight);
-  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
-                   weight);
-}
-
-// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
-static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
-                                                 const __m128i sum_second_u32,
-                                                 const uint16_t *pred,
-                                                 uint16_t *count,
-                                                 uint32_t *accumulator) {
-  // Cast down to 16-bit ints
-  const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
-  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
-
-  __m128i pred_0_u32, pred_1_u32;
-  __m128i accum_0_u32, accum_1_u32;
-
-  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
-  _mm_storeu_si128((__m128i *)count, count_u16);
-
-  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-}
-
-static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
-  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
-}
-
-static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
-                                      __m128i *reg_second) {
-  highbd_read_dist_4(dist, reg_first);
-  highbd_read_dist_4(dist + 4, reg_second);
-}
-
-static INLINE void highbd_read_chroma_dist_row_8(
-    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
-    __m128i *u_second, __m128i *v_first, __m128i *v_second) {
-  if (!ss_x) {
-    // If there is no chroma subsampling in the horizontal direction, then we
-    // need to load 8 entries from chroma.
-    highbd_read_dist_8(u_dist, u_first, u_second);
-    highbd_read_dist_8(v_dist, v_first, v_second);
-  } else {  // ss_x == 1
-    // Otherwise, we only need to load 8 entries
-    __m128i u_reg, v_reg;
-
-    highbd_read_dist_4(u_dist, &u_reg);
-
-    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
-    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
-
-    highbd_read_dist_4(v_dist, &v_reg);
-
-    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
-    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
-  }
-}
-
-static void av1_highbd_apply_temporal_filter_luma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
-    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
-    const uint32_t *const *neighbors_second, int top_weight,
-    int bottom_weight) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_first, mul_second;
-
-  __m128i sum_row_1_first, sum_row_1_second;
-  __m128i sum_row_2_first, sum_row_2_second;
-  __m128i sum_row_3_first, sum_row_3_second;
-
-  __m128i u_first, u_second;
-  __m128i v_first, v_second;
-
-  __m128i sum_row_first;
-  __m128i sum_row_second;
-
-  // Loop variables
-  unsigned int h;
-
-  assert(strength >= 4 && strength <= 14 &&
-         "invalid adjusted temporal filter strength");
-  assert(block_width == 8);
-
-  (void)block_width;
-
-  // First row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Add luma values
-  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
-  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
-  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
-  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
-  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
-
-  // Add chroma values
-  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                &v_first, &v_second);
-
-  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
-  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-
-  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-  // Get modifier and store result
-  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
-                   weight);
-
-  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                y_accum);
-
-  y_src += y_src_stride;
-  y_pre += y_pre_stride;
-  y_count += y_pre_stride;
-  y_accum += y_pre_stride;
-  y_dist += DIST_STRIDE;
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-
-  // Then all the rows except the last one
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
-
-  for (h = 1; h < block_height - 1; ++h) {
-    // Move the weight to bottom half
-    if (!use_whole_blk && h == block_height / 2) {
-      weight = bottom_weight;
-    }
-    // Shift the rows up
-    sum_row_1_first = sum_row_2_first;
-    sum_row_1_second = sum_row_2_second;
-    sum_row_2_first = sum_row_3_first;
-    sum_row_2_second = sum_row_3_second;
-
-    // Add luma values to the modifier
-    sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
-    sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
-
-    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-    sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
-
-    // Add chroma values to the modifier
-    if (ss_y == 0 || h % 2 == 0) {
-      // Only calculate the new chroma distortion if we are at a pixel that
-      // corresponds to a new chroma row
-      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                    &v_first, &v_second);
-
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
-      u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
-      v_dist += DIST_STRIDE;
-    }
-
-    sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-    sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-    // Get modifier and store result
-    highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                     &sum_row_second, &mul_first, &mul_second, strength,
-                     rounding, weight);
-    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                  y_accum);
-
-    y_src += y_src_stride;
-    y_pre += y_pre_stride;
-    y_count += y_pre_stride;
-    y_accum += y_pre_stride;
-    y_dist += DIST_STRIDE;
-  }
-
-  // The last row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Shift the rows up
-  sum_row_1_first = sum_row_2_first;
-  sum_row_1_second = sum_row_2_second;
-  sum_row_2_first = sum_row_3_first;
-  sum_row_2_second = sum_row_3_second;
-
-  // Add luma values to the modifier
-  sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
-  sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
-
-  // Add chroma values to the modifier
-  if (ss_y == 0) {
-    // Only calculate the new chroma distortion if we are at a pixel that
-    // corresponds to a new chroma row
-    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                  &v_first, &v_second);
-  }
-
-  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-  // Get modifier and store result
-  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
-                   weight);
-  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                y_accum);
-}
-
-// Perform temporal filter for the luma component.
-static void av1_highbd_apply_temporal_filter_luma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
-    const uint32_t *u_dist, const uint32_t *v_dist) {
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
-  const unsigned int mid_width = block_width >> 1,
-                     last_width = block_width - blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const uint32_t *const *neighbors_first;
-  const uint32_t *const *neighbors_second;
-
-  // Left
-  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
-  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  av1_highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
-      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
-      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-      neighbors_first, neighbors_second, top_weight, bottom_weight);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  for (; blk_col < mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    av1_highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; blk_col < last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    av1_highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
-  }
-
-  // Right
-  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
-  av1_highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
-      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
-      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-      neighbors_first, neighbors_second, top_weight, bottom_weight);
-}
-
-// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
-// subsampling in x direction, then we have 16 lumas, else we have 8.
-static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
-    const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
-    __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
-  __m128i y_reg_fst, y_reg_snd;
-  if (!ss_x) {
-    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-      y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
-      y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
-    }
-  } else {
-    // Temporary
-    __m128i y_fst, y_snd;
-
-    // First 8
-    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-
-      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
-      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
-    }
-
-    y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
-
-    // Second 8
-    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-
-      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
-      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
-    }
-
-    y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
-  }
-
-  *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
-  *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
-  *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
-  *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
-}
-
-// Apply temporal filter to the chroma components. This performs temporal
-// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
-// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void av1_highbd_apply_temporal_filter_chroma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
-    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
-    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
-    int top_weight, int bottom_weight, const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_fst, mul_snd;
-
-  __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
-  __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
-  __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
-  __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
-
-  __m128i u_sum_row_fst, v_sum_row_fst;
-  __m128i u_sum_row_snd, v_sum_row_snd;
-
-  // Loop variable
-  unsigned int h;
-
-  (void)uv_block_width;
-
-  // First row
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
-
-  // Add chroma values
-  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
-  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
-
-  u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
-  u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
-
-  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
-  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
-
-  v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
-  v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
-
-  // Add luma values
-  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                       &u_sum_row_snd, &v_sum_row_fst,
-                                       &v_sum_row_snd);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-  } else {
-    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-  }
-  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                u_accum);
-  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                v_accum);
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-  u_count += uv_pre_stride;
-  u_accum += uv_pre_stride;
-  v_count += uv_pre_stride;
-  v_accum += uv_pre_stride;
-
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
-  y_dist += DIST_STRIDE * (1 + ss_y);
-
-  // Then all the rows except the last one
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]);
-
-  for (h = 1; h < uv_block_height - 1; ++h) {
-    // Move the weight pointer to the bottom half of the blocks
-    if (h == uv_block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-
-    // Shift the rows up
-    u_sum_row_1_fst = u_sum_row_2_fst;
-    u_sum_row_2_fst = u_sum_row_3_fst;
-    u_sum_row_1_snd = u_sum_row_2_snd;
-    u_sum_row_2_snd = u_sum_row_3_snd;
-
-    v_sum_row_1_fst = v_sum_row_2_fst;
-    v_sum_row_2_fst = v_sum_row_3_fst;
-    v_sum_row_1_snd = v_sum_row_2_snd;
-    v_sum_row_2_snd = v_sum_row_3_snd;
-
-    // Add chroma values
-    u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
-    u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
-    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
-    u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
-    u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
-
-    v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
-    v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
-    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
-    v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
-    v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
-
-    // Add luma values
-    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                         &u_sum_row_snd, &v_sum_row_fst,
-                                         &v_sum_row_snd);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                       rounding, blk_fw[0]);
-      highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                       rounding, blk_fw[1]);
-
-      highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                       rounding, blk_fw[0]);
-      highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                       rounding, blk_fw[1]);
-
-    } else {
-      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                       &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                       weight);
-      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                       &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                       weight);
-    }
-
-    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                  u_accum);
-    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                  v_accum);
-
-    u_src += uv_src_stride;
-    u_pre += uv_pre_stride;
-    u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
-    v_pre += uv_pre_stride;
-    v_dist += DIST_STRIDE;
-    u_count += uv_pre_stride;
-    u_accum += uv_pre_stride;
-    v_count += uv_pre_stride;
-    v_accum += uv_pre_stride;
-
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
-    y_dist += DIST_STRIDE * (1 + ss_y);
-  }
-
-  // The last row
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
-
-  // Shift the rows up
-  u_sum_row_1_fst = u_sum_row_2_fst;
-  u_sum_row_2_fst = u_sum_row_3_fst;
-  u_sum_row_1_snd = u_sum_row_2_snd;
-  u_sum_row_2_snd = u_sum_row_3_snd;
-
-  v_sum_row_1_fst = v_sum_row_2_fst;
-  v_sum_row_2_fst = v_sum_row_3_fst;
-  v_sum_row_1_snd = v_sum_row_2_snd;
-  v_sum_row_2_snd = v_sum_row_3_snd;
-
-  // Add chroma values
-  u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
-  v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
-  u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
-  v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
-
-  // Add luma values
-  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                       &u_sum_row_snd, &v_sum_row_fst,
-                                       &v_sum_row_snd);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-  } else {
-    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-  }
-
-  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                u_accum);
-  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                v_accum);
-}
-
-// Perform temporal filter for the chroma components.
-static void av1_highbd_apply_temporal_filter_chroma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
-  const unsigned int uv_width = block_width >> ss_x,
-                     uv_height = block_height >> ss_y;
-
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
-  const unsigned int uv_mid_width = uv_width >> 1,
-                     uv_last_width = uv_width - uv_blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const uint32_t *const *neighbors_fst;
-  const uint32_t *const *neighbors_snd;
-
-  if (uv_width == 8) {
-    // Special Case: We are subsampling in x direction on a 16x16 block. Since
-    // we are operating on a row of 8 chroma pixels, we can't use the usual
-    // left-middle-right pattern.
-    assert(ss_x);
-
-    if (ss_y) {
-      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-    } else {
-      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-    }
-
-    if (use_whole_blk) {
-      av1_highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-    } else {
-      av1_highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  if (ss_x && ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  av1_highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  if (ss_x && ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  for (; uv_blk_col < uv_mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    av1_highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; uv_blk_col < uv_last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    av1_highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-  }
-
-  // Right
-  if (ss_x && ss_y) {
-    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
-  }
-
-  av1_highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
-}
-
-void av1_highbd_apply_temporal_filter_sse4_1(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
-    uint32_t *v_accum, uint16_t *v_count) {
-  const unsigned int chroma_height = block_height >> ss_y,
-                     chroma_width = block_width >> ss_x;
-
-  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
-
-  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
-           *v_dist_ptr = v_dist + 1;
-  const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
-                 *u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
-                 *v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
-  const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
-                 *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
-                 *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
-
-  // Loop variables
-  unsigned int row, blk_col;
-
-  assert(block_width <= BW && "block width too large");
-  assert(block_height <= BH && "block height too large");
-  assert(block_width % 16 == 0 && "block width must be multiple of 16");
-  assert(block_height % 2 == 0 && "block height must be even");
-  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
-         "invalid chroma subsampling");
-  assert(strength >= 4 && strength <= 14 &&
-         "invalid adjusted temporal filter strength");
-  assert(blk_fw[0] >= 0 && "filter weight must be positive");
-  assert(
-      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
-      "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
-  assert(
-      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
-      "subblock filter weight must be less than 2");
-
-  // Precompute the difference squared
-  for (row = 0; row < block_height; row++) {
-    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
-      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
-                          y_dist_ptr + blk_col);
-    }
-    y_src_ptr += y_src_stride;
-    y_pre_ptr += y_pre_stride;
-    y_dist_ptr += DIST_STRIDE;
-  }
-
-  for (row = 0; row < chroma_height; row++) {
-    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
-      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
-                          u_dist_ptr + blk_col);
-      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
-                          v_dist_ptr + blk_col);
-    }
-
-    u_src_ptr += uv_src_stride;
-    u_pre_ptr += uv_pre_stride;
-    u_dist_ptr += DIST_STRIDE;
-    v_src_ptr += uv_src_stride;
-    v_pre_ptr += uv_pre_stride;
-    v_dist_ptr += DIST_STRIDE;
-  }
-
-  y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
-  u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
-  v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
-  y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
-  u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
-  v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
-
-  y_dist_ptr = y_dist + 1;
-  u_dist_ptr = u_dist + 1;
-  v_dist_ptr = v_dist + 1;
-
-  av1_highbd_apply_temporal_filter_luma(
-      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
-      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
-      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum,
-      y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
-
-  av1_highbd_apply_temporal_filter_chroma(
-      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
-      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
-      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum,
-      u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
-}
diff --git a/libaom/av1/encoder/x86/ml_sse3.c b/libaom/av1/encoder/x86/ml_sse3.c
index c520c3c..89b1e6a 100644
--- a/libaom/av1/encoder/x86/ml_sse3.c
+++ b/libaom/av1/encoder/x86/ml_sse3.c
@@ -151,7 +151,7 @@
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
 void av1_nn_predict_sse3(const float *input_nodes,
-                         const NN_CONFIG *const nn_config,
+                         const NN_CONFIG *const nn_config, int reduce_prec,
                          float *const output) {
   float buf[2][NN_MAX_NODES_PER_LAYER];
   int buf_index = 0;
@@ -162,7 +162,7 @@
     const float *layer_weights = nn_config->weights[layer];
     const float *layer_bias = nn_config->bias[layer];
     bool output_layer = (layer == nn_config->num_hidden_layers);
-    float *const output_nodes = output_layer ? output : buf[buf_index];
+    float *const output_nodes = output_layer ? output : &buf[buf_index][0];
     const int num_outputs = output_layer ? nn_config->num_outputs
                                          : nn_config->num_hidden_nodes[layer];
 
@@ -240,4 +240,5 @@
     num_inputs = num_outputs;
     buf_index = 1 - buf_index;
   }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
 }
diff --git a/libaom/av1/encoder/x86/pickrst_avx2.c b/libaom/av1/encoder/x86/pickrst_avx2.c
index d00fca0..f8703a2 100644
--- a/libaom/av1/encoder/x86/pickrst_avx2.c
+++ b/libaom/av1/encoder/x86/pickrst_avx2.c
@@ -22,9 +22,9 @@
                                  const __m128i *shuffle, const __m256i *kl) {
   const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
   const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
-  const __m256i dst0 = yy_loadu_256(dst);
+  const __m256i dst0 = yy_load_256(dst);
   const __m256i r0 = _mm256_add_epi32(dst0, d0);
-  yy_storeu_256(dst, r0);
+  yy_store_256(dst, r0);
 }
 
 static INLINE void acc_stat_win7_one_line_avx2(
@@ -74,7 +74,9 @@
 
   int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+
+  DECLARE_ALIGNED(32, int32_t,
+                  H_int32[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
   int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
   int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int32_t sumX = 0;
@@ -120,6 +122,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd,
                                         const __m256i *shuffle,
                                         const __m256i *dgd_ijkl) {
@@ -145,14 +148,14 @@
   // Take the lower-half of d0, extend to u64, add it on to dst (H)
   const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0));
   // d0l = [a b] [c d] as u64
-  const __m256i dst0 = yy_loadu_256(dst);
-  yy_storeu_256(dst, _mm256_add_epi64(d0l, dst0));
+  const __m256i dst0 = yy_load_256(dst);
+  yy_store_256(dst, _mm256_add_epi64(d0l, dst0));
 
   // Take the upper-half of d0, extend to u64, add it on to dst (H)
   const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1));
   // d0h = [e f] [g h] as u64
-  const __m256i dst1 = yy_loadu_256(dst + 4);
-  yy_storeu_256(dst + 4, _mm256_add_epi64(d0h, dst1));
+  const __m256i dst1 = yy_load_256(dst + 4);
+  yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1));
 }
 
 static INLINE void acc_stat_highbd_win7_one_line_avx2(
@@ -216,7 +219,7 @@
       find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
   int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
   int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int32_t sumX = 0;
   const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
@@ -316,7 +319,9 @@
       find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
   int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  DECLARE_ALIGNED(
+      32, int64_t,
+      H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
   int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int32_t sumX = 0;
   const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
@@ -376,6 +381,7 @@
                                v_end, dgd_stride, src_stride, M, H, bit_depth);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static INLINE void acc_stat_win5_one_line_avx2(
     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
@@ -423,7 +429,9 @@
 
   int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  DECLARE_ALIGNED(
+      32, int32_t,
+      H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
   int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
   int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int32_t sumX = 0;
@@ -485,7 +493,7 @@
   }
 }
 
-static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {
+static INLINE __m256i pair_set_epi16(int a, int b) {
   return _mm256_set1_epi32(
       (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
 }
@@ -622,6 +630,238 @@
   return err;
 }
 
+// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
+// C and H need to be computed.
+static AOM_INLINE void calc_proj_params_r0_r1_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m256i h00, h01, h11, c0, c1;
+  const __m256i zero = _mm256_setzero_si256();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu8_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu8_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f1 = _mm256_sub_epi32(f1, d);
+      f2 = _mm256_sub_epi32(f2, d);
+
+      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f1, 32));
+      h00 = _mm256_add_epi64(h00, h00_even);
+      h00 = _mm256_add_epi64(h00, h00_odd);
+
+      const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+      const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h01 = _mm256_add_epi64(h01, h01_even);
+      h01 = _mm256_add_epi64(h01, h01_odd);
+
+      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h11 = _mm256_add_epi64(h11, h11_even);
+      h11 = _mm256_add_epi64(h11, h11_odd);
+
+      const __m256i c0_even = _mm256_mul_epi32(f1, s);
+      const __m256i c0_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+      c0 = _mm256_add_epi64(c0, c0_even);
+      c0 = _mm256_add_epi64(c0, c0_odd);
+
+      const __m256i c1_even = _mm256_mul_epi32(f2, s);
+      const __m256i c1_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+      c1 = _mm256_add_epi64(c1, c1_even);
+      c1 = _mm256_add_epi64(c1, c1_odd);
+    }
+  }
+
+  __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+  const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+  c_low = _mm256_add_epi64(c_low, c_high);
+  const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+                                         _mm256_castsi256_si128(c_low));
+
+  __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+  const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+  h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+  const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+                                           _mm256_castsi256_si128(h0x_low));
+
+  // Using the symmetric properties of H,  calculations of H[1][0] are not
+  // needed.
+  __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+  const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+  h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+  const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+                                           _mm256_castsi256_si128(h1x_low));
+
+  xx_storeu_128(C, c_128bit);
+  xx_storeu_128(H[0], h0x_128bit);
+  xx_storeu_128(H[1], h1x_128bit);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width,
+                                                int height, int src_stride,
+                                                const uint8_t *dat8,
+                                                int dat_stride, int32_t *flt0,
+                                                int flt0_stride,
+                                                int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m256i h00, c0;
+  const __m256i zero = _mm256_setzero_si256();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu8_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu8_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f1 = _mm256_sub_epi32(f1, d);
+
+      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f1, 32));
+      h00 = _mm256_add_epi64(h00, h00_even);
+      h00 = _mm256_add_epi64(h00, h00_odd);
+
+      const __m256i c0_even = _mm256_mul_epi32(f1, s);
+      const __m256i c0_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+      c0 = _mm256_add_epi64(c0, c0_even);
+      c0 = _mm256_add_epi64(c0, c0_odd);
+    }
+  }
+  const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+                                           _mm256_castsi256_si128(h00));
+  const __m128i h00_val =
+      _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+  const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+                                          _mm256_castsi256_si128(c0));
+  const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width,
+                                                int height, int src_stride,
+                                                const uint8_t *dat8,
+                                                int dat_stride, int32_t *flt1,
+                                                int flt1_stride,
+                                                int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m256i h11, c1;
+  const __m256i zero = _mm256_setzero_si256();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu8_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu8_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f2 = _mm256_sub_epi32(f2, d);
+
+      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h11 = _mm256_add_epi64(h11, h11_even);
+      h11 = _mm256_add_epi64(h11, h11_odd);
+
+      const __m256i c1_even = _mm256_mul_epi32(f2, s);
+      const __m256i c1_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+      c1 = _mm256_add_epi64(c1, c1_even);
+      c1 = _mm256_add_epi64(c1, c1_odd);
+    }
+  }
+
+  const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+                                           _mm256_castsi256_si128(h11));
+  const __m128i h11_val =
+      _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+  const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+                                          _mm256_castsi256_si128(c1));
+  const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_c.
+void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height,
+                               int src_stride, const uint8_t *dat8,
+                               int dat_stride, int32_t *flt0, int flt0_stride,
+                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
+                               int64_t C[2], const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8,
+                                dat_stride, flt0, flt0_stride, flt1,
+                                flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride,
+                             flt1, flt1_stride, H, C);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_pixel_proj_error_avx2(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
@@ -841,3 +1081,4 @@
   err += sum[0] + sum[1] + sum[2] + sum[3];
   return err;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/av1/encoder/x86/pickrst_sse4.c b/libaom/av1/encoder/x86/pickrst_sse4.c
index a94e169..a2f65a5 100644
--- a/libaom/av1/encoder/x86/pickrst_sse4.c
+++ b/libaom/av1/encoder/x86/pickrst_sse4.c
@@ -125,6 +125,7 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd,
                                          const __m128i *shuffle,
                                          const __m128i *dgd_ijkl) {
@@ -386,6 +387,7 @@
                                v_end, dgd_stride, src_stride, M, H, bit_depth);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static INLINE void acc_stat_win5_one_line_sse4_1(
     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
@@ -495,7 +497,7 @@
   }
 }
 
-static INLINE __m128i pair_set_epi16(uint16_t a, uint16_t b) {
+static INLINE __m128i pair_set_epi16(int a, int b) {
   return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
 }
 
@@ -622,6 +624,7 @@
   return err;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_pixel_proj_error_sse4_1(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
@@ -827,3 +830,4 @@
   err += sum[0] + sum[1];
   return err;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/av1/encoder/x86/temporal_filter_avx2.c b/libaom/av1/encoder/x86/temporal_filter_avx2.c
new file mode 100644
index 0000000..a11f791
--- /dev/null
+++ b/libaom/av1/encoder/x86/temporal_filter_avx2.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 2)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+  { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+  { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+  { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+  { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = {
+  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 }
+};
+
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint16_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint8_t *src1 = frame1;
+  const uint8_t *src2 = frame2;
+  uint16_t *dst = frame_sse;
+  for (int i = 0; i < block_height; i++) {
+    __m128i vf1_128, vf2_128;
+    __m256i vf1, vf2, vdiff1, vsqdiff1;
+
+    vf1_128 = _mm_loadu_si128((__m128i *)(src1));
+    vf2_128 = _mm_loadu_si128((__m128i *)(src2));
+    vf1 = _mm256_cvtepu8_epi16(vf1_128);
+    vf2 = _mm256_cvtepu8_epi16(vf2_128);
+    vdiff1 = _mm256_sub_epi16(vf1, vf2);
+    vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+
+    _mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
+    // Set zero to uninitialized memory to avoid uninitialized loads later
+    *(uint32_t *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+    src1 += stride, src2 += stride2;
+    dst += sse_stride;
+  }
+}
+
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint16_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint8_t *src1 = frame1;
+  const uint8_t *src2 = frame2;
+  uint16_t *dst = frame_sse;
+  for (int i = 0; i < block_height; i++) {
+    __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
+
+    vsrc1 = _mm256_loadu_si256((__m256i *)src1);
+    vsrc2 = _mm256_loadu_si256((__m256i *)src2);
+    vmax = _mm256_max_epu8(vsrc1, vsrc2);
+    vmin = _mm256_min_epu8(vsrc1, vsrc2);
+    vdiff = _mm256_subs_epu8(vmax, vmin);
+
+    __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
+    __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
+    vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
+    vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
+
+    vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+    vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
+    _mm256_storeu_si256((__m256i *)(dst), vres1);
+    _mm256_storeu_si256((__m256i *)(dst + 16), vres2);
+    // Set zero to uninitialized memory to avoid uninitialized loads later
+    *(uint32_t *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+    src1 += stride;
+    src2 += stride2;
+    dst += sse_stride;
+  }
+}
+
+static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col,
+                                                int block_width) {
+  __m128i v128tmp = _mm_loadu_si128((__m128i *)(src));
+  if (col == 0) {
+    // For the first column, replicate the first element twice to the left
+    v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]);
+  }
+  if (col == block_width - 4) {
+    // For the last column, replicate the last element twice to the right
+    v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]);
+  }
+  return _mm256_cvtepu16_epi32(v128tmp);
+}
+
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+  // Mask the required 5 values inside the vector
+  __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+  __m128i v128a, v128b;
+  // Extract 256b as two 128b registers A and B
+  v128a = _mm256_castsi256_si128(vtmp);
+  v128b = _mm256_extracti128_si256(vtmp, 1);
+  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+  v128a = _mm_add_epi32(v128a, v128b);
+  // B = [A2+B2, A3+B3, 0, 0]
+  v128b = _mm_srli_si128(v128a, 8);
+  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+  v128a = _mm_add_epi32(v128a, v128b);
+  // B = [A1+B1+A3+B3, 0, 0, 0]
+  v128b = _mm_srli_si128(v128a, 4);
+  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+  v128a = _mm_add_epi32(v128a, v128b);
+  return _mm_extract_epi32(v128a, 0);
+}
+
+static void apply_temporal_filter_planewise(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const double sigma, const int decay_control, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
+    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
+  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
+  assert(((block_width == 32) && (block_height == 32)) ||
+         ((block_width == 16) && (block_height == 16)));
+  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
+
+  uint32_t acc_5x5_sse[BH][BW];
+  const double h = decay_control * (0.7 + log(sigma + 1.0));
+  const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
+  uint16_t *frame_sse =
+      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
+
+  if (block_width == 32) {
+    get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  } else {
+    get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  }
+
+  __m256i vsrc[5];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  for (int col = 0; col < block_width; col += 4) {
+    uint16_t *src = (col) ? frame_sse + col - 2 : frame_sse;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      vsrc[i] = xx_load_and_pad(src, col, block_width);
+      src += SSE_STRIDE;
+    }
+
+    // Copy first row to first 2 vectors
+    vsrc[0] = vsrc[2];
+    vsrc[1] = vsrc[2];
+
+    for (int row = 0; row < block_height; row++) {
+      __m256i vsum = _mm256_setzero_si256();
+
+      // Add 5 consecutive rows
+      for (int i = 0; i < 5; i++) {
+        vsum = _mm256_add_epi32(vsum, vsrc[i]);
+      }
+
+      // Push all elements by one element to the top
+      for (int i = 0; i < 4; i++) {
+        vsrc[i] = vsrc[i + 1];
+      }
+
+      // Load next row to the last element
+      if (row <= block_width - 4) {
+        vsrc[4] = xx_load_and_pad(src, col, block_width);
+        src += SSE_STRIDE;
+      } else {
+        vsrc[4] = vsrc[3];
+      }
+
+      // Accumulate the sum horizontally
+      for (int i = 0; i < 4; i++) {
+        acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i);
+      }
+    }
+  }
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+
+      int diff_sse = acc_5x5_sse[i][j];
+      int num_ref_pixels =
+          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
+
+      // Filter U-plane and V-plane using Y-plane. This is because motion
+      // search is only done on Y-plane, so the information from Y-plane will
+      // be more accurate.
+      if (plane != PLANE_TYPE_Y) {
+        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+            const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+            const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+            diff_sse += luma_sq_error[yy * SSE_STRIDE + xx];
+            ++num_ref_pixels;
+          }
+        }
+      }
+
+      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error =
+          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
+      const double scaled_diff =
+          AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
+      const int adjusted_weight =
+          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+
+      count[k] += adjusted_weight;
+      accumulator[k] += adjusted_weight * pixel_value;
+    }
+  }
+}
+
+void av1_apply_temporal_filter_planewise_avx2(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
+  if (is_high_bitdepth) {
+    assert(0 && "Only support low bit-depth with avx2!");
+  }
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y;
+  const int decay_control = frame_height >= 720 ? 4 : 3;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  uint16_t luma_sq_error[SSE_STRIDE * BH];
+  uint16_t *chroma_sq_error =
+      (num_planes > 0)
+          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
+          : NULL;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+
+    apply_temporal_filter_planewise(
+        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
+        noise_levels[plane], decay_control, use_subblock, block_mse,
+        subblock_mses, q_factor, accum + mb_pels * plane,
+        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
+        ss_x_shift, ss_y_shift);
+  }
+  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
+}
diff --git a/libaom/av1/encoder/x86/temporal_filter_constants.h b/libaom/av1/encoder/x86/temporal_filter_constants.h
index b3a10dd..7cd61d7 100644
--- a/libaom/av1/encoder/x86/temporal_filter_constants.h
+++ b/libaom/av1/encoder/x86/temporal_filter_constants.h
@@ -373,29 +373,35 @@
   HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1
 };
 
-static const uint32_t *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] =
-    { HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 };
+static const uint32_t
+    *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
+    };
 
 static const uint32_t
     *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
       HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
     };
 
-static const uint32_t *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] =
-    { HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2,
-      HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 };
+static const uint32_t
+    *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
+    };
 
-static const uint32_t *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] =
-    { HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 };
+static const uint32_t
+    *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4
+    };
 
 static const uint32_t
     *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
       HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4
     };
 
-static const uint32_t *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] =
-    { HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4,
-      HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 };
+static const uint32_t
+    *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4
+    };
 
 #define DIST_STRIDE ((BW) + 2)
 #endif  // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/libaom/av1/encoder/x86/temporal_filter_sse2.c b/libaom/av1/encoder/x86/temporal_filter_sse2.c
new file mode 100644
index 0000000..98a6b82
--- /dev/null
+++ b/libaom/av1/encoder/x86/temporal_filter_sse2.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding for 4 samples
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+  { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
+
+static void get_squared_error(const uint8_t *frame1, const unsigned int stride,
+                              const uint8_t *frame2, const unsigned int stride2,
+                              const int block_width, const int block_height,
+                              uint16_t *frame_sse,
+                              const unsigned int dst_stride) {
+  const uint8_t *src1 = frame1;
+  const uint8_t *src2 = frame2;
+  uint16_t *dst = frame_sse;
+
+  for (int i = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j += 16) {
+      // Set zero to uninitialized memory to avoid uninitialized loads later
+      *(uint32_t *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+      __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+      __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+      __m128i vmax = _mm_max_epu8(vsrc1, vsrc2);
+      __m128i vmin = _mm_min_epu8(vsrc1, vsrc2);
+      __m128i vdiff = _mm_subs_epu8(vmax, vmin);
+
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero);
+      __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero);
+
+      __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1);
+      __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2);
+
+      _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+      _mm_storeu_si128((__m128i *)(dst + j + 10), vres2);
+    }
+
+    // Set zero to uninitialized memory to avoid uninitialized loads later
+    *(uint32_t *)(dst + block_width + 2) =
+        _mm_cvtsi128_si32(_mm_setzero_si128());
+
+    src1 += stride;
+    src2 += stride2;
+    dst += dst_stride;
+  }
+}
+
+static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col,
+                            int block_width) {
+  __m128i vtmp = _mm_loadu_si128((__m128i *)src);
+  __m128i vzero = _mm_setzero_si128();
+  __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero);
+  __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero);
+  // For the first column, replicate the first element twice to the left
+  dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+  // For the last column, replicate the last element twice to the right
+  dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
+}
+
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+  __m128i veca, vecb;
+  // Mask and obtain the required 5 values inside the vector
+  veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+  vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+  veca = _mm_add_epi32(veca, vecb);
+  // B = [A2+B2, A3+B3, 0, 0]
+  vecb = _mm_srli_si128(veca, 8);
+  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+  veca = _mm_add_epi32(veca, vecb);
+  // B = [A1+B1+A3+B3, 0, 0, 0]
+  vecb = _mm_srli_si128(veca, 4);
+  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+  veca = _mm_add_epi32(veca, vecb);
+  return _mm_cvtsi128_si32(veca);
+}
+
+static void apply_temporal_filter_planewise(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const double sigma, const int decay_control, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
+    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
+  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
+  assert(((block_width == 32) && (block_height == 32)) ||
+         ((block_width == 16) && (block_height == 16)));
+  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
+
+  uint32_t acc_5x5_sse[BH][BW];
+  const double h = decay_control * (0.7 + log(sigma + 1.0));
+  const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
+  uint16_t *frame_sse =
+      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
+
+  get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+                    frame_sse, SSE_STRIDE);
+
+  __m128i vsrc[5][2];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  for (int col = 0; col < block_width; col += 4) {
+    uint16_t *src = frame_sse + col;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      xx_load_and_pad(src, vsrc[i], col, block_width);
+      src += SSE_STRIDE;
+    }
+
+    // Padding for top 2 rows
+    vsrc[0][0] = vsrc[2][0];
+    vsrc[0][1] = vsrc[2][1];
+    vsrc[1][0] = vsrc[2][0];
+    vsrc[1][1] = vsrc[2][1];
+
+    for (int row = 0; row < block_height; row++) {
+      __m128i vsum1 = _mm_setzero_si128();
+      __m128i vsum2 = _mm_setzero_si128();
+
+      // Add 5 consecutive rows
+      for (int i = 0; i < 5; i++) {
+        vsum1 = _mm_add_epi32(vsrc[i][0], vsum1);
+        vsum2 = _mm_add_epi32(vsrc[i][1], vsum2);
+      }
+
+      // Push all elements by one element to the top
+      for (int i = 0; i < 4; i++) {
+        vsrc[i][0] = vsrc[i + 1][0];
+        vsrc[i][1] = vsrc[i + 1][1];
+      }
+
+      if (row <= block_height - 4) {
+        // Load next row
+        xx_load_and_pad(src, vsrc[4], col, block_width);
+        src += SSE_STRIDE;
+      } else {
+        // Padding for bottom 2 rows
+        vsrc[4][0] = vsrc[3][0];
+        vsrc[4][1] = vsrc[3][1];
+      }
+
+      // Accumulate the sum horizontally
+      for (int i = 0; i < 4; i++) {
+        acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i);
+      }
+    }
+  }
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+
+      int diff_sse = acc_5x5_sse[i][j];
+      int num_ref_pixels =
+          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
+
+      // Filter U-plane and V-plane using Y-plane. This is because motion
+      // search is only done on Y-plane, so the information from Y-plane will
+      // be more accurate.
+      if (plane != PLANE_TYPE_Y) {
+        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+            const int yy = (i << ss_y_shift) + ii;      // Y-coord on Y-plane.
+            const int xx = (j << ss_x_shift) + jj + 2;  // X-coord on Y-plane.
+            const int ww = SSE_STRIDE;                  // Stride of Y-plane.
+            diff_sse += luma_sq_error[yy * ww + xx];
+            ++num_ref_pixels;
+          }
+        }
+      }
+
+      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error =
+          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
+      const double scaled_diff =
+          AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
+      const int adjusted_weight =
+          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+
+      count[k] += adjusted_weight;
+      accumulator[k] += adjusted_weight * pixel_value;
+    }
+  }
+}
+
+void av1_apply_temporal_filter_planewise_sse2(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
+  if (is_high_bitdepth) {
+    assert(0 && "Only support low bit-depth with sse2!");
+  }
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y;
+  const int decay_control = frame_height >= 720 ? 4 : 3;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  uint16_t luma_sq_error[SSE_STRIDE * BH];
+  uint16_t *chroma_sq_error =
+      (num_planes > 0)
+          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
+          : NULL;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+
+    apply_temporal_filter_planewise(
+        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
+        noise_levels[plane], decay_control, use_subblock, block_mse,
+        subblock_mses, q_factor, accum + mb_pels * plane,
+        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
+        ss_x_shift, ss_y_shift);
+  }
+  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
+}
diff --git a/libaom/av1/encoder/x86/temporal_filter_sse4.c b/libaom/av1/encoder/x86/temporal_filter_sse4.c
index 556d00c..e3f9f5f 100644
--- a/libaom/av1/encoder/x86/temporal_filter_sse4.c
+++ b/libaom/av1/encoder/x86/temporal_filter_sse4.c
@@ -18,6 +18,10 @@
 #include "av1/encoder/temporal_filter.h"
 #include "av1/encoder/x86/temporal_filter_constants.h"
 
+//////////////////////////
+// Low bit-depth Begins //
+//////////////////////////
+
 // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
 // difference squared, and store as unsigned 16-bit integer to dst.
 static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
@@ -325,7 +329,7 @@
 // filtering on a luma block of 16 X block_height. Use blk_fw as an array of
 // size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
 // else use top_weight for top half, and bottom weight for bottom half.
-static void av1_apply_temporal_filter_luma_16(
+static void apply_temporal_filter_luma_16(
     const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
     int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
     int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
@@ -517,7 +521,7 @@
 }
 
 // Perform temporal filter for the luma component.
-static void av1_apply_temporal_filter_luma(
+static void apply_temporal_filter_luma(
     const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
     int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
     int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
@@ -541,7 +545,7 @@
     neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
     neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
     if (use_whole_blk) {
-      av1_apply_temporal_filter_luma_16(
+      apply_temporal_filter_luma_16(
           y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
           u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
           u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
@@ -550,7 +554,7 @@
           v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
           bottom_weight, NULL);
     } else {
-      av1_apply_temporal_filter_luma_16(
+      apply_temporal_filter_luma_16(
           y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
           u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
           u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
@@ -565,7 +569,7 @@
   // Left
   neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
   neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  av1_apply_temporal_filter_luma_16(
+  apply_temporal_filter_luma_16(
       y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
       u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
       v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
@@ -580,7 +584,7 @@
   neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
   for (; blk_col < mid_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    av1_apply_temporal_filter_luma_16(
+    apply_temporal_filter_luma_16(
         y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
         u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
         u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
@@ -598,7 +602,7 @@
   // Middle Second
   for (; blk_col < last_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    av1_apply_temporal_filter_luma_16(
+    apply_temporal_filter_luma_16(
         y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
         u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
         u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
@@ -610,7 +614,7 @@
 
   // Right
   neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
-  av1_apply_temporal_filter_luma_16(
+  apply_temporal_filter_luma_16(
       y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
       u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
       v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
@@ -623,7 +627,7 @@
 // filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
 // blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
 // else use top_weight for top half, and bottom weight for bottom half.
-static void av1_apply_temporal_filter_chroma_8(
+static void apply_temporal_filter_chroma_8(
     const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
     int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
     int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
@@ -788,7 +792,7 @@
 }
 
 // Perform temporal filter for the chroma components.
-static void av1_apply_temporal_filter_chroma(
+static void apply_temporal_filter_chroma(
     const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
     int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
     int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
@@ -820,7 +824,7 @@
     }
 
     if (use_whole_blk) {
-      av1_apply_temporal_filter_chroma_8(
+      apply_temporal_filter_chroma_8(
           y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
           u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
           u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
@@ -829,7 +833,7 @@
           y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
           top_weight, bottom_weight, NULL);
     } else {
-      av1_apply_temporal_filter_chroma_8(
+      apply_temporal_filter_chroma_8(
           y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
           u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
           u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
@@ -851,7 +855,7 @@
     neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
   }
 
-  av1_apply_temporal_filter_chroma_8(
+  apply_temporal_filter_chroma_8(
       y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
       u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
       v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
@@ -874,7 +878,7 @@
 
   for (; uv_blk_col < uv_mid_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    av1_apply_temporal_filter_chroma_8(
+    apply_temporal_filter_chroma_8(
         y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
         u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
         u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
@@ -892,7 +896,7 @@
   // Middle Second
   for (; uv_blk_col < uv_last_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    av1_apply_temporal_filter_chroma_8(
+    apply_temporal_filter_chroma_8(
         y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
         u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
         u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
@@ -911,7 +915,7 @@
     neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
   }
 
-  av1_apply_temporal_filter_chroma_8(
+  apply_temporal_filter_chroma_8(
       y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
       u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
       v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
@@ -921,14 +925,47 @@
       bottom_weight, NULL);
 }
 
-void av1_apply_temporal_filter_sse4_1(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
-    uint32_t *v_accum, uint16_t *v_count) {
+static void apply_temporal_filter_yuv(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  const int use_whole_blk = !use_subblock;
+  const int *blk_fw = subblock_filter_weights;
+
+  // Block information (Y-plane).
+  const unsigned int block_height = block_size_high[block_size];
+  const unsigned int block_width = block_size_wide[block_size];
+  const int mb_pels = block_height * block_width;
+  const int y_src_stride = ref_frame->y_stride;
+  const int y_pre_stride = block_width;
+  const int mb_y_src_offset =
+      mb_row * block_height * ref_frame->y_stride + mb_col * block_width;
+
+  // Block information (UV-plane).
+  const int ss_y = mbd->plane[1].subsampling_y;
+  const int ss_x = mbd->plane[1].subsampling_x;
+  const unsigned int uv_height = block_height >> ss_y;
+  const unsigned int uv_width = block_width >> ss_x;
+  const int uv_src_stride = ref_frame->uv_stride;
+  const int uv_pre_stride = block_width >> ss_x;
+  const int mb_uv_src_offset =
+      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
+
+  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
+  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
+  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
+  const uint8_t *y_pre = pred;
+  const uint8_t *u_pre = pred + mb_pels;
+  const uint8_t *v_pre = pred + mb_pels * 2;
+  uint32_t *y_accum = accum;
+  uint32_t *u_accum = accum + mb_pels;
+  uint32_t *v_accum = accum + mb_pels * 2;
+  uint16_t *y_count = count;
+  uint16_t *u_count = count + mb_pels;
+  uint16_t *v_count = count + mb_pels * 2;
+
   const unsigned int chroma_height = block_height >> ss_y,
                      chroma_width = block_width >> ss_x;
 
@@ -992,15 +1029,1016 @@
   u_dist_ptr = u_dist + 1;
   v_dist_ptr = v_dist + 1;
 
-  av1_apply_temporal_filter_luma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
-      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
-      strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr,
-      u_dist_ptr, v_dist_ptr);
+  apply_temporal_filter_luma(y_src, y_src_stride, y_pre, y_pre_stride, u_src,
+                             v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride,
+                             block_width, block_height, ss_x, ss_y, strength,
+                             blk_fw_ptr, use_whole_blk, y_accum, y_count,
+                             y_dist_ptr, u_dist_ptr, v_dist_ptr);
 
-  av1_apply_temporal_filter_chroma(
+  apply_temporal_filter_chroma(
       y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
       u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
       strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
       y_dist_ptr, u_dist_ptr, v_dist_ptr);
 }
+
+////////////////////////
+// Low bit-depth Ends //
+////////////////////////
+
+///////////////////////////
+// High bit-depth Begins //
+///////////////////////////
+
+// Compute (a-b)**2 for 8 pixels with size 16-bit
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi32(a_first, b_first);
+  dist_second = _mm_sub_epi32(a_second, b_second);
+  dist_first = _mm_mullo_epi32(dist_first, dist_first);
+  dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+  *sum = _mm_add_epi32(dist_reg, dist_left);
+  *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+                                    __m128i *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from y/uv plane are).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+                                    const __m128i *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u32 = _mm_set1_epi32(rounding);
+  const __m128i weight_u32 = _mm_set1_epi32(weight);
+  const __m128i sixteen = _mm_set1_epi32(16);
+  const __m128i zero = _mm_setzero_si128();
+
+  // modifier * 3 / index;
+  const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
+  const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
+  const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
+  const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
+
+  const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
+  const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
+  const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
+  const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
+
+  // Now we have
+  //   mul_lo: 00 a1 00 a0
+  //   mul_hi: 00 a3 00 a2
+  // Unpack as 64 bit words to get even and odd elements
+  //   unpack_lo: 00 a2 00 a0
+  //   unpack_hi: 00 a3 00 a1
+  // Then we can shift and OR the results to get everything in 32-bits
+  const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
+  const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
+  const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
+  const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
+
+  // Round
+  *output = _mm_add_epi32(mul, rounding_u32);
+  *output = _mm_srl_epi32(*output, strength_u128);
+
+  // Multiply with the weight
+  *output = _mm_min_epu32(*output, sixteen);
+  *output = _mm_sub_epi32(sixteen, *output);
+  *output = _mm_mullo_epi32(*output, weight_u32);
+}
+
+static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
+                                    const __m128i *sum_0_u32,
+                                    const __m128i *sum_1_u32,
+                                    const __m128i *mul_constants_0,
+                                    const __m128i *mul_constants_1,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+                   weight);
+  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+                   weight);
+}
+
+// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
+                                                 const __m128i sum_second_u32,
+                                                 const uint16_t *pred,
+                                                 uint16_t *count,
+                                                 uint32_t *accumulator) {
+  // Cast down to 16-bit ints
+  const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
+  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+
+  __m128i pred_0_u32, pred_1_u32;
+  __m128i accum_0_u32, accum_1_u32;
+
+  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+  _mm_storeu_si128((__m128i *)count, count_u16);
+
+  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
+  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
+                                      __m128i *reg_second) {
+  highbd_read_dist_4(dist, reg_first);
+  highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
+    __m128i *u_second, __m128i *v_first, __m128i *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 8 entries from chroma.
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    __m128i u_reg, v_reg;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
+
+    highbd_read_dist_4(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
+  }
+}
+
+static void highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
+    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
+    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_first, mul_second;
+
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 0 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
+  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_src += y_src_stride;
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_src += uv_src_stride;
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_src += uv_src_stride;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+    sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                    &v_first, &v_second);
+
+      u_src += uv_src_stride;
+      u_pre += uv_pre_stride;
+      u_dist += DIST_STRIDE;
+      v_src += uv_src_stride;
+      v_pre += uv_pre_stride;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+    sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+    // Get modifier and store result
+    highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                     &sum_row_second, &mul_first, &mul_second, strength,
+                     rounding, weight);
+    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                  y_accum);
+
+    y_src += y_src_stride;
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+  sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                  &v_first, &v_second);
+  }
+
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void highbd_apply_temporal_filter_luma(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
+    const uint32_t *u_dist, const uint32_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_first;
+  const uint32_t *const *neighbors_second;
+
+  // Left
+  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  highbd_apply_temporal_filter_luma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_luma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
+        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+        bottom_weight);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_luma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
+        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+        bottom_weight);
+  }
+
+  // Right
+  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+  highbd_apply_temporal_filter_luma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
+// subsampling in x direction, then we have 16 lumas, else we have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+    const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
+    __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
+  __m128i y_reg_fst, y_reg_snd;
+  if (!ss_x) {
+    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+      y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
+      y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
+    }
+  } else {
+    // Temporary
+    __m128i y_fst, y_snd;
+
+    // First 8
+    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+    }
+
+    y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
+
+    // Second 8
+    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+    }
+
+    y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
+  }
+
+  *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
+  *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
+  *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
+  *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void highbd_apply_temporal_filter_chroma_8(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int uv_block_width,
+    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+    int top_weight, int bottom_weight, const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_fst, mul_snd;
+
+  __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+  __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+  __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+  __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+  __m128i u_sum_row_fst, v_sum_row_fst;
+  __m128i u_sum_row_snd, v_sum_row_snd;
+
+  // Loop variable
+  unsigned int h;
+
+  (void)uv_block_width;
+
+  // First row
+  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
+  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
+
+  // Add chroma values
+  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+  u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
+  u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+  v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
+  v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+
+  u_src += uv_src_stride;
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_src += uv_src_stride;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_src += y_src_stride * (1 + ss_y);
+  y_pre += y_pre_stride * (1 + ss_y);
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]);
+  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        blk_fw += 2;
+      } else {
+        weight = bottom_weight;
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1_fst = u_sum_row_2_fst;
+    u_sum_row_2_fst = u_sum_row_3_fst;
+    u_sum_row_1_snd = u_sum_row_2_snd;
+    u_sum_row_2_snd = u_sum_row_3_snd;
+
+    v_sum_row_1_fst = v_sum_row_2_fst;
+    v_sum_row_2_fst = v_sum_row_3_fst;
+    v_sum_row_1_snd = v_sum_row_2_snd;
+    v_sum_row_2_snd = v_sum_row_3_snd;
+
+    // Add chroma values
+    u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+    u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+    u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
+    u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
+
+    v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+    v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+    v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
+    v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
+
+    // Add luma values
+    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                         &u_sum_row_snd, &v_sum_row_fst,
+                                         &v_sum_row_snd);
+
+    // Get modifier and store result
+    if (blk_fw) {
+      highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+      highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+    } else {
+      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                       &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                       &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+    }
+
+    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                  u_accum);
+    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                  v_accum);
+
+    u_src += uv_src_stride;
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_src += uv_src_stride;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_src += y_src_stride * (1 + ss_y);
+    y_pre += y_pre_stride * (1 + ss_y);
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
+  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
+
+  // Shift the rows up
+  u_sum_row_1_fst = u_sum_row_2_fst;
+  u_sum_row_2_fst = u_sum_row_3_fst;
+  u_sum_row_1_snd = u_sum_row_2_snd;
+  u_sum_row_2_snd = u_sum_row_3_snd;
+
+  v_sum_row_1_fst = v_sum_row_2_fst;
+  v_sum_row_2_fst = v_sum_row_3_fst;
+  v_sum_row_1_snd = v_sum_row_2_snd;
+  v_sum_row_2_snd = v_sum_row_3_snd;
+
+  // Add chroma values
+  u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+  v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+  u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+  v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void highbd_apply_temporal_filter_chroma(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_fst;
+  const uint32_t *const *neighbors_snd;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    } else {
+      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      highbd_apply_temporal_filter_chroma_8(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+    } else {
+      highbd_apply_temporal_filter_chroma_8(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  highbd_apply_temporal_filter_chroma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
+      top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_chroma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_chroma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  highbd_apply_temporal_filter_chroma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
+      top_weight, bottom_weight, NULL);
+}
+
+static void highbd_apply_temporal_filter_yuv(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  const int use_whole_blk = !use_subblock;
+  const int *blk_fw = subblock_filter_weights;
+
+  // Block information (Y-plane).
+  const unsigned int block_height = block_size_high[block_size];
+  const unsigned int block_width = block_size_wide[block_size];
+  const int mb_pels = block_height * block_width;
+  const int y_src_stride = ref_frame->y_stride;
+  const int y_pre_stride = block_width;
+  const int mb_y_src_offset =
+      mb_row * block_height * ref_frame->y_stride + mb_col * block_width;
+
+  // Block information (UV-plane).
+  const int ss_y = mbd->plane[1].subsampling_y;
+  const int ss_x = mbd->plane[1].subsampling_x;
+  const unsigned int uv_height = block_height >> ss_y;
+  const unsigned int uv_width = block_width >> ss_x;
+  const int uv_src_stride = ref_frame->uv_stride;
+  const int uv_pre_stride = block_width >> ss_x;
+  const int mb_uv_src_offset =
+      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
+
+  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
+  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
+  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
+  const uint8_t *y_pre = pred;
+  const uint8_t *u_pre = pred + mb_pels;
+  const uint8_t *v_pre = pred + mb_pels * 2;
+  uint32_t *y_accum = accum;
+  uint32_t *u_accum = accum + mb_pels;
+  uint32_t *v_accum = accum + mb_pels * 2;
+  uint16_t *y_count = count;
+  uint16_t *u_count = count + mb_pels;
+  uint16_t *v_count = count + mb_pels * 2;
+
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
+  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
+                 *u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
+                 *v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
+  const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
+                 *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
+                 *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 0 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                          y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                          u_dist_ptr + blk_col);
+      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                          v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
+  u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
+  v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
+  y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
+  u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
+  v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  highbd_apply_temporal_filter_luma(
+      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
+      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
+      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum,
+      y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+
+  highbd_apply_temporal_filter_chroma(
+      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
+      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
+      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum,
+      u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
+
+/////////////////////////
+// High bit-depth Ends //
+/////////////////////////
+
+void av1_apply_temporal_filter_yuv_sse4_1(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
+  // TODO(any): Need to support when `num_planes != 3`, like C implementation.
+  assert(num_planes == 3);
+  (void)num_planes;
+  if (is_high_bitdepth) {
+    highbd_apply_temporal_filter_yuv(
+        ref_frame, mbd, block_size, mb_row, mb_col, strength, use_subblock,
+        subblock_filter_weights, pred, accum, count);
+  } else {
+    apply_temporal_filter_yuv(ref_frame, mbd, block_size, mb_row, mb_col,
+                              strength, use_subblock, subblock_filter_weights,
+                              pred, accum, count);
+  }
+}
diff --git a/libaom/av1/encoder/x86/wedge_utils_avx2.c b/libaom/av1/encoder/x86/wedge_utils_avx2.c
index 2a792f1..c06bad8 100644
--- a/libaom/av1/encoder/x86/wedge_utils_avx2.c
+++ b/libaom/av1/encoder/x86/wedge_utils_avx2.c
@@ -84,8 +84,8 @@
 /**
  * See av1_wedge_sign_from_residuals_c
  */
-int av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
-                                       int N, int64_t limit) {
+int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+                                          int N, int64_t limit) {
   int64_t acc;
   __m256i v_acc0_d = _mm256_setzero_si256();
 
diff --git a/libaom/av1/encoder/x86/wedge_utils_sse2.c b/libaom/av1/encoder/x86/wedge_utils_sse2.c
index 4d2e99f..f3f4b8a 100644
--- a/libaom/av1/encoder/x86/wedge_utils_sse2.c
+++ b/libaom/av1/encoder/x86/wedge_utils_sse2.c
@@ -97,8 +97,8 @@
 /**
  * See av1_wedge_sign_from_residuals_c
  */
-int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
-                                       int N, int64_t limit) {
+int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+                                          int N, int64_t limit) {
   int64_t acc;
 
   __m128i v_sign_d;
@@ -193,8 +193,8 @@
  */
 void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
                                           const int16_t *b, int N) {
-  const __m128i v_neg_w =
-      _mm_set_epi16(0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff, 0);
+  const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0,
+                                        (short)0xffff, 0, (short)0xffff, 0);
 
   assert(N % 64 == 0);
 
diff --git a/libaom/build/cmake/aom_config_defaults.cmake b/libaom/build/cmake/aom_config_defaults.cmake
index f498acd..f9e70eb 100644
--- a/libaom/build/cmake/aom_config_defaults.cmake
+++ b/libaom/build/cmake/aom_config_defaults.cmake
@@ -20,46 +20,45 @@
 # in this file.
 #
 
-set_aom_detect_var(INLINE "" STRING "Sets INLINE value for current target.")
+set_aom_detect_var(INLINE "" "Sets INLINE value for current target.")
 
 # CPUs.
-set_aom_detect_var(ARCH_ARM 0 NUMBER "Enables ARM architecture.")
-set_aom_detect_var(ARCH_MIPS 0 NUMBER "Enables MIPS architecture.")
-set_aom_detect_var(ARCH_PPC 0 NUMBER "Enables PPC architecture.")
-set_aom_detect_var(ARCH_X86 0 NUMBER "Enables X86 architecture.")
-set_aom_detect_var(ARCH_X86_64 0 NUMBER "Enables X86_64 architecture.")
+set_aom_detect_var(ARCH_ARM 0 "Enables ARM architecture.")
+set_aom_detect_var(ARCH_MIPS 0 "Enables MIPS architecture.")
+set_aom_detect_var(ARCH_PPC 0 "Enables PPC architecture.")
+set_aom_detect_var(ARCH_X86 0 "Enables X86 architecture.")
+set_aom_detect_var(ARCH_X86_64 0 "Enables X86_64 architecture.")
 
 # ARM feature flags.
-set_aom_detect_var(HAVE_NEON 0 NUMBER "Enables NEON intrinsics optimizations.")
+set_aom_detect_var(HAVE_NEON 0 "Enables NEON intrinsics optimizations.")
 
 # MIPS feature flags.
-set_aom_detect_var(HAVE_DSPR2 0 NUMBER "Enables DSPR2 optimizations.")
-set_aom_detect_var(HAVE_MIPS32 0 NUMBER "Enables MIPS32 optimizations.")
-set_aom_detect_var(HAVE_MIPS64 0 NUMBER "Enables MIPS64 optimizations. ")
-set_aom_detect_var(HAVE_MSA 0 NUMBER "Enables MSA optimizations.")
+set_aom_detect_var(HAVE_DSPR2 0 "Enables DSPR2 optimizations.")
+set_aom_detect_var(HAVE_MIPS32 0 "Enables MIPS32 optimizations.")
+set_aom_detect_var(HAVE_MIPS64 0 "Enables MIPS64 optimizations. ")
+set_aom_detect_var(HAVE_MSA 0 "Enables MSA optimizations.")
 
 # PPC feature flags.
-set_aom_detect_var(HAVE_VSX 0 NUMBER "Enables VSX optimizations.")
+set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.")
 
 # x86/x86_64 feature flags.
-set_aom_detect_var(HAVE_AVX 0 NUMBER "Enables AVX optimizations.")
-set_aom_detect_var(HAVE_AVX2 0 NUMBER "Enables AVX2 optimizations.")
-set_aom_detect_var(HAVE_MMX 0 NUMBER "Enables MMX optimizations. ")
-set_aom_detect_var(HAVE_SSE 0 NUMBER "Enables SSE optimizations.")
-set_aom_detect_var(HAVE_SSE2 0 NUMBER "Enables SSE2 optimizations.")
-set_aom_detect_var(HAVE_SSE3 0 NUMBER "Enables SSE3 optimizations.")
-set_aom_detect_var(HAVE_SSE4_1 0 NUMBER "Enables SSE 4.1 optimizations.")
-set_aom_detect_var(HAVE_SSE4_2 0 NUMBER "Enables SSE 4.2 optimizations.")
-set_aom_detect_var(HAVE_SSSE3 0 NUMBER "Enables SSSE3 optimizations.")
+set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
+set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")
+set_aom_detect_var(HAVE_MMX 0 "Enables MMX optimizations. ")
+set_aom_detect_var(HAVE_SSE 0 "Enables SSE optimizations.")
+set_aom_detect_var(HAVE_SSE2 0 "Enables SSE2 optimizations.")
+set_aom_detect_var(HAVE_SSE3 0 "Enables SSE3 optimizations.")
+set_aom_detect_var(HAVE_SSE4_1 0 "Enables SSE 4.1 optimizations.")
+set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.")
+set_aom_detect_var(HAVE_SSSE3 0 "Enables SSSE3 optimizations.")
 
 # Flags describing the build environment.
-set_aom_detect_var(HAVE_FEXCEPT 0 NUMBER
+set_aom_detect_var(HAVE_FEXCEPT 0
                    "Internal flag, GNU fenv.h present for target.")
-set_aom_detect_var(HAVE_PTHREAD_H 0 NUMBER
-                   "Internal flag, target pthread support.")
-set_aom_detect_var(HAVE_UNISTD_H 0 NUMBER
+set_aom_detect_var(HAVE_PTHREAD_H 0 "Internal flag, target pthread support.")
+set_aom_detect_var(HAVE_UNISTD_H 0
                    "Internal flag, unistd.h present for target.")
-set_aom_detect_var(HAVE_WXWIDGETS 0 NUMBER "WxWidgets present.")
+set_aom_detect_var(HAVE_WXWIDGETS 0 "WxWidgets present.")
 
 #
 # Variables in this section can be set from the CMake command line or from
@@ -67,72 +66,75 @@
 #
 
 # Build configuration flags.
-set_aom_config_var(AOM_RTCD_FLAGS "" STRING
+set_aom_config_var(AOM_RTCD_FLAGS ""
                    "Arguments to pass to rtcd.pl. Separate with ';'")
-set_aom_config_var(CONFIG_AV1_DECODER 1 NUMBER "Enable AV1 decoder.")
-set_aom_config_var(CONFIG_AV1_ENCODER 1 NUMBER "Enable AV1 encoder.")
-set_aom_config_var(CONFIG_BIG_ENDIAN 0 NUMBER "Internal flag.")
-set_aom_config_var(CONFIG_GCC 0 NUMBER "Building with GCC (detect).")
-set_aom_config_var(CONFIG_GCOV 0 NUMBER "Enable gcov support.")
-set_aom_config_var(CONFIG_GPROF 0 NUMBER "Enable gprof support.")
-set_aom_config_var(CONFIG_LIBYUV 1 NUMBER
-                   "Enables libyuv scaling/conversion support.")
+set_aom_config_var(CONFIG_AV1_DECODER 1 "Enable AV1 decoder.")
+set_aom_config_var(CONFIG_AV1_ENCODER 1 "Enable AV1 encoder.")
+set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.")
+set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).")
+set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.")
+set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.")
+set_aom_config_var(CONFIG_LIBYUV 1 "Enables libyuv scaling/conversion support.")
 
-set_aom_config_var(CONFIG_MULTITHREAD 1 NUMBER "Multithread support.")
-set_aom_config_var(CONFIG_OS_SUPPORT 0 NUMBER "Internal flag.")
-set_aom_config_var(CONFIG_PIC 0 NUMBER "Build with PIC enabled.")
-set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 NUMBER
-                   "Runtime CPU detection support.")
-set_aom_config_var(CONFIG_SHARED 0 NUMBER "Build shared libs.")
-set_aom_config_var(CONFIG_STATIC 1 NUMBER "Build static libs.")
-set_aom_config_var(CONFIG_WEBM_IO 1 NUMBER "Enables WebM support.")
+set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.")
+set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.")
+set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.")
+set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.")
+set_aom_config_var(CONFIG_SHARED 0 "Build shared libs.")
+set_aom_config_var(CONFIG_WEBM_IO 1 "Enables WebM support.")
 
 # Debugging flags.
-set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 NUMBER "Bitstream debugging flag.")
-set_aom_config_var(CONFIG_DEBUG 0 NUMBER "Debug build flag.")
-set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 NUMBER "Mismatch debugging flag.")
+set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 "Bitstream debugging flag.")
+set_aom_config_var(CONFIG_DEBUG 0 "Debug build flag.")
+set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 "Mismatch debugging flag.")
 
 # AV1 feature flags.
-set_aom_config_var(CONFIG_ACCOUNTING 0 NUMBER "Enables bit accounting.")
-set_aom_config_var(CONFIG_ANALYZER 0 NUMBER "Enables bit stream analyzer.")
-set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0 NUMBER
+set_aom_config_var(CONFIG_ACCOUNTING 0 "Enables bit accounting.")
+set_aom_config_var(CONFIG_ANALYZER 0 "Enables bit stream analyzer.")
+set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0
                    "Coefficient range check.")
-set_aom_config_var(CONFIG_DENOISE 1 NUMBER
+set_aom_config_var(CONFIG_DENOISE 1
                    "Denoise/noise modeling support in encoder.")
-set_aom_config_var(CONFIG_FILEOPTIONS 1 NUMBER
-                   "Enables encoder config file support.")
-set_aom_config_var(CONFIG_INSPECTION 0 NUMBER "Enables bitstream inspection.")
-set_aom_config_var(CONFIG_INTERNAL_STATS 0 NUMBER
-                   "Enables internal encoder stats.")
-set_aom_config_var(CONFIG_LOWBITDEPTH 1 NUMBER
-                   "Enables 8-bit optimized pipeline.")
-set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 NUMBER
+set_aom_config_var(CONFIG_INSPECTION 0 "Enables bitstream inspection.")
+set_aom_config_var(CONFIG_INTERNAL_STATS 0 "Enables internal encoder stats.")
+set_aom_config_var(FORCE_HIGHBITDEPTH_DECODING 0
+                   "Force high bitdepth decoding pipeline on 8-bit input.")
+mark_as_advanced(FORCE_HIGHBITDEPTH_DECODING)
+set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2
                    "Max profile to support decoding.")
-set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 NUMBER
-                   "Only enables normal tile mode.")
-set_aom_config_var(CONFIG_SIZE_LIMIT 0 NUMBER "Limit max decode width/height.")
-set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 NUMBER "Spatial resampling.")
-set_aom_config_var(DECODE_HEIGHT_LIMIT 0 NUMBER "Set limit for decode height.")
-set_aom_config_var(DECODE_WIDTH_LIMIT 0 NUMBER "Set limit for decode width.")
+set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 "Only enables normal tile mode.")
+set_aom_config_var(CONFIG_SIZE_LIMIT 0 "Limit max decode width/height.")
+set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 "Spatial resampling.")
+set_aom_config_var(DECODE_HEIGHT_LIMIT 0 "Set limit for decode height.")
+set_aom_config_var(DECODE_WIDTH_LIMIT 0 "Set limit for decode width.")
+set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.")
 
 # AV1 experiment flags.
-set_aom_config_var(CONFIG_SPEED_STATS 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_DIST_8X8 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_ENTROPY_STATS 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_RD_DEBUG 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 NUMBER
-                   "AV1 experiment flag.")
-set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3 NUMBER
-                   "AV1 experiment flag.")
-set_aom_config_var(CONFIG_SHARP_SETTINGS 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 NUMBER
+set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_DIST_8X8 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_SHARP_SETTINGS 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
                    "Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
-set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0 NUMBER
+set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0
                    "Collect stats on partition decisions.")
-set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0 NUMBER
+set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0
                    "Collect encoding component timing information.")
+set_aom_config_var(CONFIG_LPF_MASK 0
+                   "Enable the use loop filter bitmasks for optimizations.")
+set_aom_config_var(CONFIG_HTB_TRELLIS 0
+                   "Enable the use of hash table for trellis optimizations.")
+set_aom_config_var(CONFIG_REALTIME_ONLY 0
+                   "Build for RTC-only to reduce binary size.")
+set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1
+                   "Build with high bitdepth support.")
+set_aom_config_var(CONFIG_NN_V2 0 "Fully-connected neural nets ver.2.")
+set_aom_config_var(CONFIG_SUPERRES_IN_RECODE 1
+                   "Enable encoding both full-res and superres in recode loop"
+                   "when SUPERRES_AUTO mode is used.")
 #
 # Variables in this section control optional features of the build system.
 #
@@ -171,10 +173,10 @@
                    ON)
 
 # x86/x86_64 assembly/intrinsics flags.
-set_aom_option_var(ENABLE_MMX
-                   "Enables MMX optimizations on x86/x86_64 targets." ON)
-set_aom_option_var(ENABLE_SSE
-                   "Enables SSE optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_MMX "Enables MMX optimizations on x86/x86_64 targets."
+                   ON)
+set_aom_option_var(ENABLE_SSE "Enables SSE optimizations on x86/x86_64 targets."
+                   ON)
 set_aom_option_var(ENABLE_SSE2
                    "Enables SSE2 optimizations on x86/x86_64 targets." ON)
 set_aom_option_var(ENABLE_SSE3
@@ -185,7 +187,7 @@
                    "Enables SSE4_1 optimizations on x86/x86_64 targets." ON)
 set_aom_option_var(ENABLE_SSE4_2
                    "Enables SSE4_2 optimizations on x86/x86_64 targets." ON)
-set_aom_option_var(ENABLE_AVX
-                   "Enables AVX optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets."
+                   ON)
 set_aom_option_var(ENABLE_AVX2
                    "Enables AVX2 optimizations on x86/x86_64 targets." ON)
diff --git a/libaom/build/cmake/aom_configure.cmake b/libaom/build/cmake/aom_configure.cmake
index cb64713..b870a94 100644
--- a/libaom/build/cmake/aom_configure.cmake
+++ b/libaom/build/cmake/aom_configure.cmake
@@ -24,6 +24,22 @@
 include("${AOM_ROOT}/build/cmake/compiler_tests.cmake")
 include("${AOM_ROOT}/build/cmake/util.cmake")
 
+if(DEFINED CONFIG_LOWBITDEPTH)
+  message(WARNING "CONFIG_LOWBITDEPTH has been removed. \
+    Use -DFORCE_HIGHBITDEPTH_DECODING=1 instead of -DCONFIG_LOWBITDEPTH=0 \
+    and -DFORCE_HIGHBITDEPTH_DECODING=0 instead of -DCONFIG_LOWBITDEPTH=1.")
+  if(NOT CONFIG_LOWBITDEPTH)
+    set(FORCE_HIGHBITDEPTH_DECODING
+        1
+        CACHE STRING "${cmake_cmdline_helpstring}" FORCE)
+  endif()
+endif()
+
+if(FORCE_HIGHBITDEPTH_DECODING AND NOT CONFIG_AV1_HIGHBITDEPTH)
+  change_config_and_warn(CONFIG_AV1_HIGHBITDEPTH 1
+                         "FORCE_HIGHBITDEPTH_DECODING")
+endif()
+
 # Generate the user config settings.
 list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS})
 foreach(cache_var ${aom_build_vars})
@@ -36,28 +52,29 @@
 
 # Detect target CPU.
 if(NOT AOM_TARGET_CPU)
-  if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR
-     "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
+  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
+  if("${cpu_lowercase}" STREQUAL "amd64"
+     OR "${cpu_lowercase}" STREQUAL "x86_64")
     if(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
       set(AOM_TARGET_CPU "x86")
     elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8)
       set(AOM_TARGET_CPU "x86_64")
     else()
-      message(FATAL_ERROR
-                "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n"
-                "      CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n"
-                "      CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n"
-                "      CMAKE_GENERATOR=${CMAKE_GENERATOR}\n")
+      message(
+        FATAL_ERROR "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n"
+                    "      CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n"
+                    "      CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n"
+                    "      CMAKE_GENERATOR=${CMAKE_GENERATOR}\n")
     endif()
-  elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386" OR
-         "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86")
+  elseif("${cpu_lowercase}" STREQUAL "i386"
+         OR "${cpu_lowercase}" STREQUAL "x86")
     set(AOM_TARGET_CPU "x86")
-  elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^arm" OR
-         "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^mips")
-    set(AOM_TARGET_CPU "${CMAKE_SYSTEM_PROCESSOR}")
-  elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64")
+  elseif("${cpu_lowercase}" MATCHES "^arm"
+         OR "${cpu_lowercase}" MATCHES "^mips")
+    set(AOM_TARGET_CPU "${cpu_lowercase}")
+  elseif("${cpu_lowercase}" MATCHES "aarch64")
     set(AOM_TARGET_CPU "arm64")
-  elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^ppc")
+  elseif("${cpu_lowercase}" MATCHES "^ppc")
     set(AOM_TARGET_CPU "ppc")
   else()
     message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
@@ -69,7 +86,7 @@
 if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string.
   if(IS_ABSOLUTE "${CMAKE_TOOLCHAIN_FILE}")
     file(RELATIVE_PATH toolchain_path "${AOM_CONFIG_DIR}"
-                       "${CMAKE_TOOLCHAIN_FILE}")
+         "${CMAKE_TOOLCHAIN_FILE}")
   else()
     set(toolchain_path "${CMAKE_TOOLCHAIN_FILE}")
   endif()
@@ -95,7 +112,6 @@
 if(BUILD_SHARED_LIBS)
   set(CONFIG_PIC 1)
   set(CONFIG_SHARED 1)
-  set(CONFIG_STATIC 0)
 endif()
 
 if(NOT MSVC)
@@ -104,8 +120,8 @@
     # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to
     # work.
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-    if("${AOM_TARGET_SYSTEM}" STREQUAL "Linux" AND "${AOM_TARGET_CPU}" MATCHES
-       "^armv[78]")
+    if("${AOM_TARGET_SYSTEM}" STREQUAL "Linux"
+       AND "${AOM_TARGET_CPU}" MATCHES "^armv[78]")
       set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1)
     else()
       set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC)
@@ -124,10 +140,11 @@
   endif()
 
   if(NOT AS_EXECUTABLE)
-    message(FATAL_ERROR
-              "Unable to find assembler. Install 'yasm' or 'nasm.' "
-              "To build without optimizations, add -DAOM_TARGET_CPU=generic to "
-              "your cmake command line.")
+    message(
+      FATAL_ERROR
+        "Unable to find assembler. Install 'yasm' or 'nasm.' "
+        "To build without optimizations, add -DAOM_TARGET_CPU=generic to "
+        "your cmake command line.")
   endif()
   get_asm_obj_format("objformat")
   set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS})
@@ -136,20 +153,26 @@
   if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
     set(AS_EXECUTABLE as)
     set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT})
-  elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
-    if(NOT AS_EXECUTABLE)
-      set(AS_EXECUTABLE as)
-    endif()
   elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
     if(NOT AS_EXECUTABLE)
       set(AS_EXECUTABLE ${CMAKE_C_COMPILER} -c -mimplicit-it=always)
     endif()
+  else()
+    if(NOT AS_EXECUTABLE)
+      set(AS_EXECUTABLE as)
+    endif()
   endif()
-  if(NOT AS_EXECUTABLE)
-    message(FATAL_ERROR
-              "Unknown assembler for: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
+  find_program(as_executable_found ${AS_EXECUTABLE})
+  if(NOT as_executable_found)
+    message(
+      FATAL_ERROR
+        "Unable to find assembler and optimizations are enabled."
+        "Searched for ${AS_EXECUTABLE}. Install it, add it to your path, or "
+        "set the assembler directly by adding -DAS_EXECUTABLE=<assembler path> "
+        "to your CMake command line."
+        "To build without optimizations, add -DAOM_TARGET_CPU=generic to your "
+        "cmake command line.")
   endif()
-
   string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
 endif()
 
@@ -174,7 +197,7 @@
   require_compiler_flag("-pg" YES)
 endif()
 
-if("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows")
+if("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows\|Android")
   set(CONFIG_OS_SUPPORT 1)
 endif()
 
@@ -199,15 +222,12 @@
 
 if(NOT MSVC)
   aom_push_var(CMAKE_REQUIRED_LIBRARIES "m")
-  aom_check_c_compiles(
-    "fenv_check"
-    "#define _GNU_SOURCE
+  aom_check_c_compiles("fenv_check" "#define _GNU_SOURCE
                         #include <fenv.h>
                         void unused(void) {
                           (void)unused;
                           (void)feenableexcept(FE_DIVBYZERO | FE_INVALID);
-                        }"
-    HAVE_FEXCEPT)
+                        }" HAVE_FEXCEPT)
   aom_pop_var(CMAKE_REQUIRED_LIBRARIES)
 endif()
 
@@ -260,6 +280,7 @@
   add_compiler_flag_if_supported("-Wimplicit-function-declaration")
   add_compiler_flag_if_supported("-Wlogical-op")
   add_compiler_flag_if_supported("-Wpointer-arith")
+  add_compiler_flag_if_supported("-Wshorten-64-to-32")
   add_compiler_flag_if_supported("-Wsign-compare")
   add_compiler_flag_if_supported("-Wstring-conversion")
   add_compiler_flag_if_supported("-Wtype-limits")
@@ -267,22 +288,21 @@
   add_compiler_flag_if_supported("-Wunused")
   add_compiler_flag_if_supported("-Wvla")
 
-  if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND "${SANITIZE}" MATCHES
-     "address|undefined")
+  if(CMAKE_C_COMPILER_ID MATCHES "GNU"
+     AND "${SANITIZE}" MATCHES "address|undefined")
 
     # This combination has more stack overhead, so we account for it by
     # providing higher stack limit than usual.
     add_c_flag_if_supported("-Wstack-usage=170000")
     add_cxx_flag_if_supported("-Wstack-usage=270000")
+  elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected.
+    add_c_flag_if_supported("-Wstack-usage=117000")
+    add_cxx_flag_if_supported("-Wstack-usage=240000")
   else()
     add_c_flag_if_supported("-Wstack-usage=100000")
     add_cxx_flag_if_supported("-Wstack-usage=240000")
   endif()
 
-  # TODO(jzern): this could be added as a cxx flags for test/*.cc only, avoiding
-  # third_party.
-  add_c_flag_if_supported("-Wshorten-64-to-32")
-
   # Add -Wshadow only for C files to avoid massive gtest warning spam.
   add_c_flag_if_supported("-Wshadow")
 
@@ -317,10 +337,10 @@
 # Generate aom_config templates.
 set(aom_config_asm_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
 set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake")
-execute_process(COMMAND
-                  ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-                  -DAOM_ROOT=${AOM_ROOT} -P
-                  "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake")
+execute_process(
+  COMMAND ${CMAKE_COMMAND}
+          -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -P
+          "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake")
 
 # Generate aom_config.{asm,h}.
 configure_file("${aom_config_asm_template}"
@@ -344,14 +364,14 @@
 endif()
 
 set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
-    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
-    "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl")
+                              "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
+                              "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl")
 set(AOM_RTCD_HEADER_FILE_LIST "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
-    "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
-    "${AOM_CONFIG_DIR}/config/av1_rtcd.h")
+                              "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+                              "${AOM_CONFIG_DIR}/config/av1_rtcd.h")
 set(AOM_RTCD_SOURCE_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
-    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
-    "${AOM_ROOT}/av1/common/av1_rtcd.c")
+                              "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+                              "${AOM_ROOT}/av1/common/av1_rtcd.c")
 set(AOM_RTCD_SYMBOL_LIST aom_dsp_rtcd aom_scale_rtcd av1_rtcd)
 list(LENGTH AOM_RTCD_SYMBOL_LIST AOM_RTCD_CUSTOM_COMMAND_COUNT)
 math(EXPR AOM_RTCD_CUSTOM_COMMAND_COUNT "${AOM_RTCD_CUSTOM_COMMAND_COUNT} - 1")
@@ -361,16 +381,18 @@
   list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE)
   list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE)
   list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL)
-  execute_process(COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl"
-                          --arch=${AOM_TARGET_CPU}
-                          --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
-                          --config=${AOM_CONFIG_DIR}/config/aom_config.h
-                          ${AOM_RTCD_CONFIG_FILE}
-                  OUTPUT_FILE ${AOM_RTCD_HEADER_FILE})
+  execute_process(
+    COMMAND
+      ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl"
+      --arch=${AOM_TARGET_CPU}
+      --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
+      --config=${AOM_CONFIG_DIR}/config/aom_config.h ${AOM_RTCD_CONFIG_FILE}
+    OUTPUT_FILE ${AOM_RTCD_HEADER_FILE})
 endforeach()
 
 # Generate aom_version.h.
-execute_process(COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+execute_process(COMMAND ${CMAKE_COMMAND}
+                        -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
                         -DAOM_ROOT=${AOM_ROOT}
                         -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
                         -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
diff --git a/libaom/build/cmake/aom_install.cmake b/libaom/build/cmake/aom_install.cmake
index 47206d8..cd40fe4 100644
--- a/libaom/build/cmake/aom_install.cmake
+++ b/libaom/build/cmake/aom_install.cmake
@@ -36,20 +36,20 @@
     add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES})
 
     # Setup a rule to generate aom.pc.
-    add_custom_command(OUTPUT "${AOM_PKG_CONFIG_FILE}"
-                       COMMAND
-                         ${CMAKE_COMMAND} ARGS
-                         -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-                         -DAOM_ROOT=${AOM_ROOT}
-                         -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
-                         -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-                         -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
-                         -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-                         -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
-                         -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
-                         -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P
-                         "${AOM_ROOT}/build/cmake/pkg_config.cmake"
-                       COMMENT "Writing aom.pc" VERBATIM)
+    add_custom_command(
+      OUTPUT "${AOM_PKG_CONFIG_FILE}"
+      COMMAND ${CMAKE_COMMAND} ARGS
+              -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT}
+              -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+              -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+              -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
+              -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+              -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+              -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
+              -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P
+              "${AOM_ROOT}/build/cmake/pkg_config.cmake"
+      COMMENT "Writing aom.pc"
+      VERBATIM)
 
     # Explicitly add a dependency on the pkg-config file to ensure it's built.
     get_property(aom_pc_sources TARGET aom_pc PROPERTY SOURCES)
@@ -72,13 +72,19 @@
       endif()
     endif()
 
-    set(AOM_INSTALL_LIBS aom)
+    if(BUILD_SHARED_LIBS)
+      set(AOM_INSTALL_LIBS aom aom_static)
+    else()
+      set(AOM_INSTALL_LIBS aom)
+    endif()
 
     # Setup the install rules.
-    install(FILES ${AOM_INSTALL_INCS} DESTINATION
-                  "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/aom")
-    install(FILES "${AOM_PKG_CONFIG_FILE}" DESTINATION
-                  "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+    install(
+      FILES ${AOM_INSTALL_INCS}
+      DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/aom")
+    install(
+      FILES "${AOM_PKG_CONFIG_FILE}"
+      DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
     install(TARGETS ${AOM_INSTALL_LIBS} DESTINATION
                     "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 
diff --git a/libaom/build/cmake/aom_optimization.cmake b/libaom/build/cmake/aom_optimization.cmake
index be32a32..d8b258f 100644
--- a/libaom/build/cmake/aom_optimization.cmake
+++ b/libaom/build/cmake/aom_optimization.cmake
@@ -35,11 +35,10 @@
 # $opt_name is used to name the target. $target_to_update is made dependent upon
 # the created target.
 #
-# Note: the libaom target is always updated because OBJECT libraries have rules
-# that disallow the direct addition of .o files to them as dependencies. Static
-# libraries do not have this limitation.
-function(add_intrinsics_object_library flag opt_name target_to_update sources
-         dependent_target)
+# Note: this function always updates the aom, and aom_static targets because
+# OBJECT libraries have rules that disallow the direct addition of .o files to
+# them as dependencies. Static and shared libraries do not have this limitation.
+function(add_intrinsics_object_library flag opt_name target_to_update sources)
   if("${${sources}}" STREQUAL "")
     return()
   endif()
@@ -50,12 +49,29 @@
     get_msvc_intrinsic_flag(${flag} "flag")
   endif()
 
+  if("${flag}" STREQUAL "-mavx2")
+    unset(FLAG_SUPPORTED)
+    check_c_compiler_flag("-mno-avx256-split-unaligned-load" FLAG_SUPPORTED)
+    if(${FLAG_SUPPORTED})
+      set(flag "${flag} -mno-avx256-split-unaligned-load")
+    endif()
+
+    unset(FLAG_SUPPORTED)
+    check_c_compiler_flag("-mno-avx256-split-unaligned-store" FLAG_SUPPORTED)
+    if(${FLAG_SUPPORTED})
+      set(flag "${flag} -mno-avx256-split-unaligned-store")
+    endif()
+  endif()
+
   if(flag)
     separate_arguments(flag)
     target_compile_options(${target_name} PUBLIC ${flag})
   endif()
 
-  target_sources(${dependent_target} PRIVATE $<TARGET_OBJECTS:${target_name}>)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:${target_name}>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:${target_name}>)
+  endif()
 
   # Add the new lib target to the global list of aom library targets.
   list(APPEND AOM_LIB_TARGETS ${target_name})
@@ -83,28 +99,24 @@
   if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
     if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
       set(objformat "macho64")
-    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
-      set(objformat "elf64")
-    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}"
-           STREQUAL "Windows")
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS"
+           OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
       set(objformat "win64")
     else()
-      message(FATAL_ERROR "Unknown obj format: ${AOM_TARGET_SYSTEM}")
+      set(objformat "elf64")
     endif()
   elseif("${AOM_TARGET_CPU}" STREQUAL "x86")
     if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
       set(objformat "macho32")
-    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
-      set(objformat "elf32")
-    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}"
-           STREQUAL "Windows")
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS"
+           OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
       set(objformat "win32")
     else()
-      message(FATAL_ERROR "Unknown obj format: ${AOM_TARGET_SYSTEM}")
+      set(objformat "elf32")
     endif()
   else()
-    message(FATAL_ERROR
-              "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
+    message(
+      FATAL_ERROR "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
   endif()
 
   set(${out_format} ${objformat} PARENT_SCOPE)
@@ -112,10 +124,10 @@
 
 # Adds library target named $lib_name for ASM files in variable named by
 # $asm_sources. Builds an output directory path from $lib_name. Links $lib_name
-# into $dependent_target. Generates a dummy C file with a dummy function to
-# ensure that all cmake generators can determine the linker language, and that
-# build tools don't complain that an object exposes no symbols.
-function(add_asm_library lib_name asm_sources dependent_target)
+# into the aom library target(s). Generates a dummy C file with a dummy function
+# to ensure that all cmake generators can determine the linker language, and
+# that build tools don't complain that an object exposes no symbols.
+function(add_asm_library lib_name asm_sources)
   if("${${asm_sources}}" STREQUAL "")
     return()
   endif()
@@ -138,8 +150,12 @@
                                "${asm_object}" "${asm_source}"
                        DEPENDS "${asm_source}"
                        COMMENT "Building ASM object ${asm_object}"
-                       WORKING_DIRECTORY "${AOM_CONFIG_DIR}" VERBATIM)
+                       WORKING_DIRECTORY "${AOM_CONFIG_DIR}"
+                       VERBATIM)
     target_sources(aom PRIVATE "${asm_object}")
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE "${asm_object}")
+    endif()
   endforeach()
 
   # The above created a target containing only ASM sources. Cmake needs help
@@ -161,32 +177,44 @@
   execute_process(COMMAND ${AS_EXECUTABLE} -hf OUTPUT_VARIABLE nasm_helptext)
 
   if(NOT "${nasm_helptext}" MATCHES "-Ox")
-    message(FATAL_ERROR
-              "Unsupported nasm: multipass optimization not supported.")
+    message(
+      FATAL_ERROR "Unsupported nasm: multipass optimization not supported.")
   endif()
 
   if("${AOM_TARGET_CPU}" STREQUAL "x86")
     if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
       if(NOT "${nasm_helptext}" MATCHES "macho32")
-        message(FATAL_ERROR
-                  "Unsupported nasm: macho32 object format not supported.")
+        message(
+          FATAL_ERROR "Unsupported nasm: macho32 object format not supported.")
       endif()
-    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS"
+           OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+      if(NOT "${nasm_helptext}" MATCHES "win32")
+        message(
+          FATAL_ERROR "Unsupported nasm: win32 object format not supported.")
+      endif()
+    else()
       if(NOT "${nasm_helptext}" MATCHES "elf32")
-        message(FATAL_ERROR
-                  "Unsupported nasm: elf32 object format not supported.")
+        message(
+          FATAL_ERROR "Unsupported nasm: elf32 object format not supported.")
       endif()
     endif()
   else()
     if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
       if(NOT "${nasm_helptext}" MATCHES "macho64")
-        message(FATAL_ERROR
-                  "Unsupported nasm: macho64 object format not supported.")
+        message(
+          FATAL_ERROR "Unsupported nasm: macho64 object format not supported.")
       endif()
-    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS"
+           OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+      if(NOT "${nasm_helptext}" MATCHES "win64")
+        message(
+          FATAL_ERROR "Unsupported nasm: win64 object format not supported.")
+      endif()
+    else()
       if(NOT "${nasm_helptext}" MATCHES "elf64")
-        message(FATAL_ERROR
-                  "Unsupported nasm: elf64 object format not supported.")
+        message(
+          FATAL_ERROR "Unsupported nasm: elf64 object format not supported.")
       endif()
     endif()
   endif()
@@ -197,16 +225,16 @@
 # include file, $source is the C source file, and $symbol is used for the symbol
 # argument passed to rtcd.pl.
 function(add_rtcd_build_step config output source symbol)
-  add_custom_command(OUTPUT ${output}
-                     COMMAND ${PERL_EXECUTABLE} ARGS
-                             "${AOM_ROOT}/build/cmake/rtcd.pl"
-                             --arch=${AOM_TARGET_CPU}
-                             --sym=${symbol} ${AOM_RTCD_FLAGS}
-                             --config=${AOM_CONFIG_DIR}/config/aom_config.h
-                             ${config} > ${output}
-                     DEPENDS ${config}
-                     COMMENT "Generating ${output}"
-                     WORKING_DIRECTORY ${AOM_CONFIG_DIR} VERBATIM)
+  add_custom_command(
+    OUTPUT ${output}
+    COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/cmake/rtcd.pl"
+            --arch=${AOM_TARGET_CPU}
+            --sym=${symbol} ${AOM_RTCD_FLAGS}
+            --config=${AOM_CONFIG_DIR}/config/aom_config.h ${config} > ${output}
+    DEPENDS ${config}
+    COMMENT "Generating ${output}"
+    WORKING_DIRECTORY ${AOM_CONFIG_DIR}
+    VERBATIM)
   set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output})
   set_property(SOURCE ${output} PROPERTY GENERATED)
 endfunction()
diff --git a/libaom/build/cmake/compiler_flags.cmake b/libaom/build/cmake/compiler_flags.cmake
index 79192c1..24484bc 100644
--- a/libaom/build/cmake/compiler_flags.cmake
+++ b/libaom/build/cmake/compiler_flags.cmake
@@ -131,8 +131,8 @@
   message("Checking C compiler flag support for: " ${c_flag})
   check_c_compiler_flag("${c_flag}" HAVE_C_FLAG)
   if(NOT HAVE_C_FLAG)
-    message(FATAL_ERROR
-              "${PROJECT_NAME} requires support for C flag: ${c_flag}.")
+    message(
+      FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${c_flag}.")
   endif()
 
   if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
@@ -167,8 +167,8 @@
   message("Checking C compiler flag support for: " ${cxx_flag})
   check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG)
   if(NOT HAVE_CXX_FLAG)
-    message(FATAL_ERROR
-              "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.")
+    message(
+      FATAL_ERROR "${PROJECT_NAME} requires support for C++ flag: ${cxx_flag}.")
   endif()
 
   if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
diff --git a/libaom/build/cmake/compiler_tests.cmake b/libaom/build/cmake/compiler_tests.cmake
index f115610..0402832 100644
--- a/libaom/build/cmake/compiler_tests.cmake
+++ b/libaom/build/cmake/compiler_tests.cmake
@@ -87,10 +87,12 @@
     set(${result_var} ${C_TEST_COMPILED} PARENT_SCOPE)
 
     if(C_TEST_COMPILED)
-      set(AOM_C_PASSED_TESTS "${AOM_C_PASSED_TESTS} ${test_name}"
+      set(AOM_C_PASSED_TESTS
+          "${AOM_C_PASSED_TESTS} ${test_name}"
           CACHE STRING "" FORCE)
     else()
-      set(AOM_C_FAILED_TESTS "${AOM_C_FAILED_TESTS} ${test_name}"
+      set(AOM_C_FAILED_TESTS
+          "${AOM_C_FAILED_TESTS} ${test_name}"
           CACHE STRING "" FORCE)
       message("C Compiler test ${test_name} failed.")
     endif()
@@ -123,10 +125,12 @@
     set(${result_var} ${CXX_TEST_COMPILED} PARENT_SCOPE)
 
     if(CXX_TEST_COMPILED)
-      set(AOM_CXX_PASSED_TESTS "${AOM_CXX_PASSED_TESTS} ${test_name}"
+      set(AOM_CXX_PASSED_TESTS
+          "${AOM_CXX_PASSED_TESTS} ${test_name}"
           CACHE STRING "" FORCE)
     else()
-      set(AOM_CXX_FAILED_TESTS "${AOM_CXX_FAILED_TESTS} ${test_name}"
+      set(AOM_CXX_FAILED_TESTS
+          "${AOM_CXX_FAILED_TESTS} ${test_name}"
           CACHE STRING "" FORCE)
       message("CXX Compiler test ${test_name} failed.")
     endif()
diff --git a/libaom/build/cmake/dist.cmake b/libaom/build/cmake/dist.cmake
index 6f81736..5b9fc95 100644
--- a/libaom/build/cmake/dist.cmake
+++ b/libaom/build/cmake/dist.cmake
@@ -17,8 +17,8 @@
   set(${out_string} "${${out_string}}" PARENT_SCOPE)
 endfunction()
 
-set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR"
-    "AOM_DIST_INCLUDES" "AOM_DIST_LIBS" "ENABLE_DOCS")
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR" "AOM_DIST_INCLUDES"
+                  "AOM_DIST_LIBS" "ENABLE_DOCS")
 
 foreach(arg ${REQUIRED_ARGS})
   if("${${arg}}" STREQUAL "")
diff --git a/libaom/build/cmake/exports.cmake b/libaom/build/cmake/exports.cmake
index b6e14d9..fa7842c 100644
--- a/libaom/build/cmake/exports.cmake
+++ b/libaom/build/cmake/exports.cmake
@@ -28,15 +28,19 @@
   set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}")
 
   add_custom_target(generate_exports
-                    COMMAND ${CMAKE_COMMAND} -DAOM_ROOT="${AOM_ROOT}"
+                    COMMAND ${CMAKE_COMMAND}
+                            -DAOM_ROOT="${AOM_ROOT}"
                             -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
                             -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
-                            -DAOM_SYM_FILE="${aom_sym_file}" -DAOM_MSVC=${MSVC}
-                            -DAOM_XCODE=${XCODE} -DCONFIG_NAME=$<CONFIG>
+                            -DAOM_SYM_FILE="${aom_sym_file}"
+                            -DAOM_MSVC=${MSVC}
+                            -DAOM_XCODE=${XCODE}
+                            -DCONFIG_NAME=$<CONFIG>
                             -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
                             -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
                             -DCONFIG_INSPECTION=${CONFIG_INSPECTION}
-                            -DENABLE_TESTS=${ENABLE_TESTS} -P
+                            -DENABLE_TESTS=${ENABLE_TESTS}
+                            -P
                             "${AOM_ROOT}/build/cmake/generate_exports.cmake"
                     SOURCES ${AOM_EXPORTS_SOURCES}
                     DEPENDS ${AOM_EXPORTS_SOURCES})
@@ -46,21 +50,25 @@
   add_dependencies(aom generate_exports)
 
   if(APPLE)
-    set_property(TARGET aom APPEND_STRING
+    set_property(TARGET aom
+                 APPEND_STRING
                  PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}")
   elseif(WIN32)
     if(NOT MSVC)
-      set_property(TARGET aom APPEND_STRING
+      set_property(TARGET aom
+                   APPEND_STRING
                    PROPERTY LINK_FLAGS "-Wl,--version-script ${aom_sym_file}")
     else()
-      set_property(TARGET aom APPEND_STRING
+      set_property(TARGET aom
+                   APPEND_STRING
                    PROPERTY LINK_FLAGS "/DEF:${aom_sym_file}")
     endif()
 
     # TODO(tomfinegan): Sort out the import lib situation and flags for MSVC.
 
   else()
-    set_property(TARGET aom APPEND_STRING
+    set_property(TARGET aom
+                 APPEND_STRING
                  PROPERTY LINK_FLAGS "-Wl,--version-script,${aom_sym_file}")
   endif()
 endfunction()
diff --git a/libaom/build/cmake/generate_aom_config_templates.cmake b/libaom/build/cmake/generate_aom_config_templates.cmake
index b91c036..529daaf 100644
--- a/libaom/build/cmake/generate_aom_config_templates.cmake
+++ b/libaom/build/cmake/generate_aom_config_templates.cmake
@@ -11,9 +11,7 @@
 cmake_minimum_required(VERSION 3.5)
 
 string(TIMESTAMP year "%Y")
-set(
-  asm_file_header_block
-  "\;
+set(asm_file_header_block "\;
 \; Copyright (c) ${year}, Alliance for Open Media. All rights reserved
 \;
 \; This source code is subject to the terms of the BSD 2 Clause License and
@@ -23,11 +21,8 @@
 \; Media Patent License 1.0 was not distributed with this source code in the
 \; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 \;
-"
-  )
-set(
-  h_file_header_block
-  "/*
+")
+set(h_file_header_block "/*
  * Copyright (c) ${year}, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
@@ -39,11 +34,8 @@
  */
 \#ifndef AOM_CONFIG_H_
 \#define AOM_CONFIG_H_
-"
-  )
-set(
-  cmake_file_header_block
-  "##
+")
+set(cmake_file_header_block "##
 ## Copyright (c) ${year}, Alliance for Open Media. All rights reserved
 ##
 ## This source code is subject to the terms of the BSD 2 Clause License and
@@ -53,8 +45,7 @@
 ## Media Patent License 1.0 was not distributed with this source code in the
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
-"
-  )
+")
 
 # Terminates cmake execution when $var_name is an empty string, or the variable
 # name it contains does not expand to an existing directory.
@@ -73,8 +64,8 @@
 
 set(AOM_DEFAULTS "${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
 if(NOT EXISTS "${AOM_DEFAULTS}")
-  message(FATAL_ERROR
-            "Configuration default values file (${AOM_DEFAULTS}) missing.")
+  message(
+    FATAL_ERROR "Configuration default values file (${AOM_DEFAULTS}) missing.")
 endif()
 
 include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
@@ -86,7 +77,7 @@
 foreach(aom_var ${aom_build_vars})
   if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS")
     file(APPEND "${aom_config_h_template}"
-                "\#define ${aom_var} \${${aom_var}}\n")
+         "\#define ${aom_var} \${${aom_var}}\n")
   endif()
 endforeach()
 file(APPEND "${aom_config_h_template}" "\#endif  // AOM_CONFIG_H_")
@@ -94,8 +85,8 @@
 set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
 file(WRITE "${aom_asm_config_template}" ${asm_file_header_block})
 foreach(aom_var ${aom_build_vars})
-  if(NOT "${aom_var}" STREQUAL "INLINE" AND NOT "${aom_var}" STREQUAL
-     "AOM_RTCD_FLAGS")
+  if(NOT "${aom_var}" STREQUAL "INLINE"
+     AND NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS")
     file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n")
   endif()
 endforeach()
diff --git a/libaom/build/cmake/generate_exports.cmake b/libaom/build/cmake/generate_exports.cmake
index 7ab5aae..f1d15a0 100644
--- a/libaom/build/cmake/generate_exports.cmake
+++ b/libaom/build/cmake/generate_exports.cmake
@@ -10,8 +10,8 @@
 #
 cmake_minimum_required(VERSION 3.5)
 
-set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM"
-    "AOM_SYM_FILE" "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER")
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM" "AOM_SYM_FILE"
+                  "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER")
 
 foreach(arg ${REQUIRED_ARGS})
   if("${${arg}}" STREQUAL "")
diff --git a/libaom/build/cmake/pkg_config.cmake b/libaom/build/cmake/pkg_config.cmake
index 358c1a2..c3914d7 100644
--- a/libaom/build/cmake/pkg_config.cmake
+++ b/libaom/build/cmake/pkg_config.cmake
@@ -11,8 +11,9 @@
 cmake_minimum_required(VERSION 3.5)
 
 set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX"
-    "CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR" "CMAKE_INSTALL_LIBDIR"
-    "CMAKE_PROJECT_NAME" "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H")
+                  "CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR"
+                  "CMAKE_INSTALL_LIBDIR" "CMAKE_PROJECT_NAME"
+                  "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H")
 
 foreach(arg ${REQUIRED_ARGS})
   if("${${arg}}" STREQUAL "")
@@ -47,9 +48,8 @@
 file(APPEND "${pkgconfig_file}" "libdir=\${exec_prefix}/${libdir}\n\n")
 file(APPEND "${pkgconfig_file}" "Name: ${pkg_name}\n")
 file(
-  APPEND
-    "${pkgconfig_file}"
-    "Description: Alliance for Open Media AV1 codec library v${aom_version}.\n")
+  APPEND "${pkgconfig_file}"
+  "Description: Alliance for Open Media AV1 codec library v${aom_version}.\n")
 file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n")
 file(APPEND "${pkgconfig_file}" "Requires:\n")
 file(APPEND "${pkgconfig_file}" "Conflicts:\n")
diff --git a/libaom/build/cmake/sanitizers.cmake b/libaom/build/cmake/sanitizers.cmake
index 77708e1..bcb600c 100644
--- a/libaom/build/cmake/sanitizers.cmake
+++ b/libaom/build/cmake/sanitizers.cmake
@@ -21,9 +21,17 @@
 
 string(TOLOWER ${SANITIZE} SANITIZE)
 
-# Require the sanitizer requested.
-require_linker_flag("-fsanitize=${SANITIZE}")
-require_compiler_flag("-fsanitize=${SANITIZE}" YES)
+# Require the sanitizer requested. cfi sanitizer requires all the flags in order
+# for the compiler to accept it.
+if("${SANITIZE}" MATCHES "cfi" AND CMAKE_C_COMPILER_ID MATCHES "Clang")
+  require_linker_flag("-fsanitize=${SANITIZE} -flto -fno-sanitize-trap=cfi \
+    -fuse-ld=gold" YES)
+  require_compiler_flag("-fsanitize=${SANITIZE} -flto -fvisibility=hidden \
+    -fno-sanitize-trap=cfi" YES)
+else()
+  require_linker_flag("-fsanitize=${SANITIZE}")
+  require_compiler_flag("-fsanitize=${SANITIZE}" YES)
+endif()
 
 # Make callstacks accurate.
 require_compiler_flag("-fno-omit-frame-pointer -fno-optimize-sibling-calls" YES)
@@ -31,8 +39,8 @@
 # Fix link errors due to missing rt compiler lib in 32-bit builds.
 # http://llvm.org/bugs/show_bug.cgi?id=17693
 if(CMAKE_C_COMPILER_ID MATCHES "Clang")
-  if(${CMAKE_SIZEOF_VOID_P} EQUAL 4 AND "${SANITIZE}" MATCHES
-     "integer|undefined")
+  if(${CMAKE_SIZEOF_VOID_P} EQUAL 4
+     AND "${SANITIZE}" MATCHES "integer|undefined")
     require_linker_flag("--rtlib=compiler-rt -lgcc_s")
   endif()
 endif()
diff --git a/libaom/build/cmake/toolchains/arm-ios-common.cmake b/libaom/build/cmake/toolchains/arm-ios-common.cmake
index 8f40951..053e33a 100644
--- a/libaom/build/cmake/toolchains/arm-ios-common.cmake
+++ b/libaom/build/cmake/toolchains/arm-ios-common.cmake
@@ -21,6 +21,6 @@
 set(CMAKE_CXX_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 
 # No runtime cpu detect for arm*-ios targets.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
 
 # TODO(tomfinegan): Handle bit code embedding.
diff --git a/libaom/build/cmake/toolchains/arm64-android-clang.cmake b/libaom/build/cmake/toolchains/arm64-android-clang.cmake
new file mode 100644
index 0000000..c13b1d9
--- /dev/null
+++ b/libaom/build/cmake/toolchains/arm64-android-clang.cmake
@@ -0,0 +1,48 @@
+#
+# Copyright (c) 2019, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_ANDROID_CLANG_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_ANDROID_CLANG_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_ANDROID_CLANG_CMAKE_ 1)
+
+if(NOT ANDROID_PLATFORM)
+  set(ANDROID_PLATFORM android-21)
+endif()
+
+if(NOT ANDROID_ABI)
+  set(ANDROID_ABI arm64-v8a)
+endif()
+
+set(AS_EXECUTABLE as)
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable when loaded the first time.
+if(AOM_ANDROID_NDK_PATH)
+  set(ENV{_AOM_ANDROID_NDK_PATH} "${AOM_ANDROID_NDK_PATH}")
+else()
+  set(AOM_ANDROID_NDK_PATH "$ENV{_AOM_ANDROID_NDK_PATH}")
+endif()
+
+if("${AOM_ANDROID_NDK_PATH}" STREQUAL "")
+  message(FATAL_ERROR "AOM_ANDROID_NDK_PATH not set.")
+  return()
+endif()
+
+include("${AOM_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
+
+# No intrinsics flag required for arm64-android-clang.
+set(AOM_NEON_INTRIN_FLAG "")
+
+# No runtime cpu detect for arm64-android-clang.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
+
+set(CMAKE_SYSTEM_NAME "Android")
diff --git a/libaom/build/cmake/toolchains/arm64-linux-gcc.cmake b/libaom/build/cmake/toolchains/arm64-linux-gcc.cmake
index 590a97a..a6c9543 100644
--- a/libaom/build/cmake/toolchains/arm64-linux-gcc.cmake
+++ b/libaom/build/cmake/toolchains/arm64-linux-gcc.cmake
@@ -33,4 +33,4 @@
 set(AOM_NEON_INTRIN_FLAG "")
 
 # No runtime cpu detect for arm64-linux-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake b/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
index bfeac92..a8e15cb 100644
--- a/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -26,4 +26,4 @@
 set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
 
 # No runtime cpu detect for arm64-mingw-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/libaom/build/cmake/toolchains/armv7-ios.cmake b/libaom/build/cmake/toolchains/armv7-ios.cmake
index 32a1b53..11f7e16 100644
--- a/libaom/build/cmake/toolchains/armv7-ios.cmake
+++ b/libaom/build/cmake/toolchains/armv7-ios.cmake
@@ -28,4 +28,4 @@
 set(AOM_NEON_INTRIN_FLAG "")
 
 # No runtime cpu detect for armv7s-ios.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake b/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
index 6cbc2a8..b898b4b 100644
--- a/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
+++ b/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -31,10 +31,10 @@
 set(CMAKE_C_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
 set(CMAKE_CXX_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
 set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
-    ${AOM_EXTRA_TOOLCHAIN_FLAGS})
+                 ${AOM_EXTRA_TOOLCHAIN_FLAGS})
 set(CMAKE_SYSTEM_PROCESSOR "armv7")
 
 set(AOM_NEON_INTRIN_FLAG "-mfpu=neon")
 
 # No runtime cpu detect for armv7-linux-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake b/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
index eb488ec..2dc4b18 100644
--- a/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
@@ -26,4 +26,4 @@
 set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
 
 # No runtime cpu detect for armv7-mingw-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/libaom/build/cmake/toolchains/armv7s-ios.cmake b/libaom/build/cmake/toolchains/armv7s-ios.cmake
index 0940a6e..faa2933 100644
--- a/libaom/build/cmake/toolchains/armv7s-ios.cmake
+++ b/libaom/build/cmake/toolchains/armv7s-ios.cmake
@@ -28,4 +28,4 @@
 set(AOM_NEON_INTRIN_FLAG "")
 
 # No runtime cpu detect for armv7s-ios.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/libaom/build/cmake/toolchains/mips32-linux-gcc.cmake b/libaom/build/cmake/toolchains/mips32-linux-gcc.cmake
index 0f93490..c644eec 100644
--- a/libaom/build/cmake/toolchains/mips32-linux-gcc.cmake
+++ b/libaom/build/cmake/toolchains/mips32-linux-gcc.cmake
@@ -74,4 +74,4 @@
   message("--- CONFIG_RUNTIME_CPU_DETECT not supported for mips32 targets.")
 endif()
 
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "" FORCE)
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "" FORCE)
diff --git a/libaom/build/cmake/toolchains/mips64-linux-gcc.cmake b/libaom/build/cmake/toolchains/mips64-linux-gcc.cmake
index ad9aab0..442d910 100644
--- a/libaom/build/cmake/toolchains/mips64-linux-gcc.cmake
+++ b/libaom/build/cmake/toolchains/mips64-linux-gcc.cmake
@@ -51,4 +51,4 @@
   message("--- CONFIG_RUNTIME_CPU_DETECT not supported for mips64 targets.")
 endif()
 
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "" FORCE)
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "" FORCE)
diff --git a/libaom/build/cmake/toolchains/ppc-linux-gcc.cmake b/libaom/build/cmake/toolchains/ppc-linux-gcc.cmake
index c86cc27..54db99b 100644
--- a/libaom/build/cmake/toolchains/ppc-linux-gcc.cmake
+++ b/libaom/build/cmake/toolchains/ppc-linux-gcc.cmake
@@ -26,4 +26,4 @@
 set(AS_EXECUTABLE ${CROSS}as)
 set(CMAKE_SYSTEM_PROCESSOR "ppc")
 
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/libaom/build/cmake/toolchains/x86-ios-simulator.cmake b/libaom/build/cmake/toolchains/x86-ios-simulator.cmake
index 6b6f52c..caacb8c 100644
--- a/libaom/build/cmake/toolchains/x86-ios-simulator.cmake
+++ b/libaom/build/cmake/toolchains/x86-ios-simulator.cmake
@@ -23,6 +23,6 @@
 set(CMAKE_OSX_ARCHITECTURES "i386")
 
 # Avoid noisy PIC/PIE warnings.
-set(CONFIG_PIC 1 CACHE NUMBER "")
+set(CONFIG_PIC 1 CACHE STRING "")
 
 include("${CMAKE_CURRENT_LIST_DIR}/ios-simulator-common.cmake")
diff --git a/libaom/build/cmake/toolchains/x86-macos.cmake b/libaom/build/cmake/toolchains/x86-macos.cmake
index 7a46e06..095ef18 100644
--- a/libaom/build/cmake/toolchains/x86-macos.cmake
+++ b/libaom/build/cmake/toolchains/x86-macos.cmake
@@ -15,4 +15,4 @@
 set(CMAKE_CXX_COMPILER_ARG1 "-arch i386")
 
 # Apple tools always complain in 32 bit mode without PIC.
-set(CONFIG_PIC 1 CACHE NUMBER "")
+set(CONFIG_PIC 1 CACHE STRING "")
diff --git a/libaom/build/cmake/util.cmake b/libaom/build/cmake/util.cmake
index 5337941..9b3da84 100644
--- a/libaom/build/cmake/util.cmake
+++ b/libaom/build/cmake/util.cmake
@@ -21,12 +21,11 @@
 # variable referred to by $out_file_list_var parameter.
 macro(create_dummy_source_file basename extension out_file_list_var)
   set(dummy_source_file "${AOM_GEN_SRC_DIR}/${basename}_dummy.${extension}")
-  file(
-    WRITE
-      "${dummy_source_file}" "// Generated file. DO NOT EDIT!\n"
-      "// ${target_name} needs a ${extension} file to force link language, \n"
-      "// or to silence a harmless CMake warning: Ignore me.\n"
-      "void ${target_name}_dummy_function(void) {}\n")
+  file(WRITE "${dummy_source_file}"
+       "// Generated file. DO NOT EDIT!\n"
+       "// ${target_name} needs a ${extension} file to force link language, \n"
+       "// or to silence a harmless CMake warning: Ignore me.\n"
+       "void aom_${target_name}_dummy_function(void) {}\n")
   list(APPEND "${out_file_list_var}" "${dummy_source_file}")
 endmacro()
 
@@ -85,8 +84,8 @@
     set(CMAKE_CXX_COMPILER_LAUNCHER "${launcher_path}" PARENT_SCOPE)
     message("--- Using ${launcher_name} as compiler launcher.")
   else()
-    message(WARNING
-              "--- Cannot find ${launcher_name}, ${launcher_flag} ignored.")
+    message(
+      WARNING "--- Cannot find ${launcher_name}, ${launcher_flag} ignored.")
   endif()
 endfunction()
 
@@ -102,7 +101,7 @@
 #
 # The names of variables defaulted through this macro are added to
 # $AOM_CONFIG_VARS to facilitate build logging and diagnostics.
-macro(set_aom_detect_var name value type helpstring)
+macro(set_aom_detect_var name value helpstring)
   unset(list_index)
   list(FIND AOM_DETECT_VARS ${name} list_index)
   if(${list_index} EQUAL -1)
@@ -114,7 +113,7 @@
   unset(cache_helpstring)
   get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
   if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
-    set(${name} ${value} CACHE ${type} "${helpstring}")
+    set(${name} ${value} CACHE STRING "${helpstring}")
     mark_as_advanced(${name})
   else()
     message(
@@ -132,7 +131,7 @@
 #
 # The names of variables defaulted through this macro are added to
 # $AOM_CONFIG_VARS to facilitate build logging and diagnostics.
-macro(set_aom_config_var name value type helpstring)
+macro(set_aom_config_var name value helpstring)
   unset(list_index)
   list(FIND AOM_CONFIG_VARS ${name} list_index)
   if(${list_index} EQUAL -1)
@@ -144,7 +143,7 @@
   unset(cache_helpstring)
   get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
   if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
-    set(${name} ${value} CACHE ${type} "${helpstring}")
+    set(${name} ${value} CACHE STRING "${helpstring}")
   endif()
 endmacro()
 
diff --git a/libaom/build/cmake/version.cmake b/libaom/build/cmake/version.cmake
index d169b12..dd953a3 100644
--- a/libaom/build/cmake/version.cmake
+++ b/libaom/build/cmake/version.cmake
@@ -11,7 +11,7 @@
 cmake_minimum_required(VERSION 3.5)
 
 set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "GIT_EXECUTABLE"
-    "PERL_EXECUTABLE")
+                  "PERL_EXECUTABLE")
 
 foreach(arg ${REQUIRED_ARGS})
   if("${${arg}}" STREQUAL "")
@@ -25,13 +25,20 @@
 unset(aom_version)
 if(EXISTS "${GIT_EXECUTABLE}")
   execute_process(COMMAND ${GIT_EXECUTABLE} --git-dir=${AOM_ROOT}/.git describe
-                  OUTPUT_VARIABLE aom_version ERROR_QUIET)
-  string(STRIP "${aom_version}" aom_version)
+                  OUTPUT_VARIABLE aom_version
+                  ERROR_QUIET
+                  RESULT_VARIABLE version_check_result)
 
-  # Remove the leading 'v' from the version string.
-  string(FIND "${aom_version}" "v" v_pos)
-  if(${v_pos} EQUAL 0)
-    string(SUBSTRING "${aom_version}" 1 -1 aom_version)
+  if(${version_check_result} EQUAL 0)
+    string(STRIP "${aom_version}" aom_version)
+
+    # Remove the leading 'v' from the version string.
+    string(FIND "${aom_version}" "v" v_pos)
+    if(${v_pos} EQUAL 0)
+      string(SUBSTRING "${aom_version}" 1 -1 aom_version)
+    endif()
+  else()
+    set(aom_version "")
   endif()
 endif()
 
@@ -40,18 +47,19 @@
 endif()
 
 unset(last_aom_version)
-if(EXISTS "${AOM_CONFIG_DIR}/config/aom_version.h")
-  extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h"
-                         last_aom_version)
+set(version_file "${AOM_CONFIG_DIR}/config/aom_version.h")
+if(EXISTS "${version_file}")
+  extract_version_string("${version_file}" last_aom_version)
+  if("${aom_version}" MATCHES "CHANGELOG$")
+    set(aom_version "${last_aom_version}")
+  endif()
 endif()
 
 if(NOT "${aom_version}" STREQUAL "${last_aom_version}")
-
   # TODO(tomfinegan): Perl dependency is unnecessary. CMake can do everything
-  # that is done by version.pl on its own (if a bit more verbose...).
-  execute_process(COMMAND
-                    ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/version.pl"
-                    --version_data=${aom_version}
-                    --version_filename=${AOM_CONFIG_DIR}/config/aom_version.h
-                    VERBATIM)
+  # that is done by version.pl on its own (if a bit more verbosely...).
+  execute_process(COMMAND ${PERL_EXECUTABLE}
+                          "${AOM_ROOT}/build/cmake/version.pl"
+                          --version_data=${aom_version}
+                          --version_filename=${version_file})
 endif()
diff --git a/libaom/common/args.c b/libaom/common/args.c
index 7131e24..ec2a863 100644
--- a/libaom/common/args.c
+++ b/libaom/common/args.c
@@ -17,6 +17,7 @@
 
 #include "aom/aom_integer.h"
 #include "aom_ports/msvc.h"
+#include "aom/aom_codec.h"
 
 #if defined(__GNUC__) && __GNUC__
 extern void die(const char *fmt, ...) __attribute__((noreturn));
@@ -48,20 +49,31 @@
   if (end >= str) end[1] = '\0';
 }
 
-int arg_cfg(int *argc, char ***argv, const char *file) {
-  char **argv_local = (char **)*argv;
-  char **argv_org = (char **)*argv;
+static const char kSbSizeWarningString[] =
+    "super_block_size has to be 64 or 128.";
+static const char kMinpartWarningString[] =
+    "min_partition_size has to be smaller or equal to max_partition_size.";
+static const char kMaxpartWarningString[] =
+    "max_partition_size has to be smaller or equal to super_block_size.";
+
+int parse_cfg(const char *file, cfg_options_t *config) {
   char line[1024 * 10];
   FILE *f = fopen(file, "r");
   if (!f) return 1;
 
+#define GET_PARAMS(field)          \
+  if (strcmp(left, #field) == 0) { \
+    config->field = atoi(right);   \
+    continue;                      \
+  }
+
   while (fgets(line, sizeof(line) - 1, f)) {
     char *actual_line = ignore_front_spaces(line);
     char *left, *right, *comment;
     size_t length = strlen(actual_line);
 
     if (length == 0 || actual_line[0] == '#') continue;
-    right = strchr(actual_line, ':');
+    right = strchr(actual_line, '=');
     if (right == NULL) continue;
     right[0] = '\0';
 
@@ -74,23 +86,61 @@
     ignore_end_spaces(left);
     ignore_end_spaces(right);
 
-    char **new_args = argv_dup(*argc, (const char **)argv_local);
-    char *new_line = (char *)malloc(sizeof(*new_line) * 128);
+    GET_PARAMS(super_block_size);
+    GET_PARAMS(max_partition_size);
+    GET_PARAMS(min_partition_size);
+    GET_PARAMS(disable_ab_partition_type);
+    GET_PARAMS(disable_rect_partition_type);
+    GET_PARAMS(disable_1to4_partition_type);
+    GET_PARAMS(disable_flip_idtx);
+    GET_PARAMS(disable_cdef);
+    GET_PARAMS(disable_lr);
+    GET_PARAMS(disable_obmc);
+    GET_PARAMS(disable_warp_motion);
+    GET_PARAMS(disable_global_motion);
+    GET_PARAMS(disable_dist_wtd_comp);
+    GET_PARAMS(disable_diff_wtd_comp);
+    GET_PARAMS(disable_inter_intra_comp);
+    GET_PARAMS(disable_masked_comp);
+    GET_PARAMS(disable_one_sided_comp);
+    GET_PARAMS(disable_palette);
+    GET_PARAMS(disable_intrabc);
+    GET_PARAMS(disable_cfl);
+    GET_PARAMS(disable_smooth_intra);
+    GET_PARAMS(disable_filter_intra);
+    GET_PARAMS(disable_dual_filter);
+    GET_PARAMS(disable_intra_angle_delta);
+    GET_PARAMS(disable_intra_edge_filter);
+    GET_PARAMS(disable_tx_64x64);
+    GET_PARAMS(disable_smooth_inter_intra);
+    GET_PARAMS(disable_inter_inter_wedge);
+    GET_PARAMS(disable_inter_intra_wedge);
+    GET_PARAMS(disable_paeth_intra);
+    GET_PARAMS(disable_trellis_quant);
+    GET_PARAMS(disable_ref_frame_mv);
+    GET_PARAMS(reduced_reference_set);
+    GET_PARAMS(reduced_tx_type_set);
 
-    if (argv_local != argv_org) free(argv_local);
-
-    if (!strcmp(right, "ON"))
-      snprintf(new_line, sizeof(*new_line) * 128, "--%s", left);
-    else
-      snprintf(new_line, sizeof(*new_line) * 128, "--%s=%s", left, right);
-
-    new_args[(*argc) - 1] = new_args[(*argc) - 2];
-    new_args[(*argc) - 2] = new_line;
-    argv_local = new_args;
-    *argv = new_args;
-    (*argc)++;
+    fprintf(stderr, "\nInvalid parameter: %s", left);
+    exit(-1);
   }
+
+  if (config->super_block_size != 128 && config->super_block_size != 64) {
+    fprintf(stderr, "\n%s", kSbSizeWarningString);
+    exit(-1);
+  }
+  if (config->min_partition_size > config->max_partition_size) {
+    fprintf(stderr, "\n%s", kMinpartWarningString);
+    exit(-1);
+  }
+  if (config->max_partition_size > config->super_block_size) {
+    fprintf(stderr, "\n%s", kMaxpartWarningString);
+    exit(-1);
+  }
+
   fclose(f);
+  config->init_by_cfg_file = 1;
+
   return 0;
 }
 
@@ -209,10 +259,6 @@
   return 0;
 }
 
-struct aom_rational {
-  int num; /**< fraction numerator */
-  int den; /**< fraction denominator */
-};
 struct aom_rational arg_parse_rational(const struct arg *arg) {
   long int rawval;
   char *endptr;
diff --git a/libaom/common/args.h b/libaom/common/args.h
index 6a26642..286f7dd 100644
--- a/libaom/common/args.h
+++ b/libaom/common/args.h
@@ -13,6 +13,9 @@
 #define AOM_COMMON_ARGS_H_
 #include <stdio.h>
 
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -50,7 +53,7 @@
 int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
 char *ignore_front_spaces(const char *str);
 void ignore_end_spaces(char *str);
-int arg_cfg(int *argc, char ***argv, const char *file);
+int parse_cfg(const char *file, cfg_options_t *config);
 const char *arg_next(struct arg *arg);
 void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
 char **argv_dup(int argc, const char **argv);
diff --git a/libaom/common/av1_config.c b/libaom/common/av1_config.c
index 90955fb..9f5b020 100644
--- a/libaom/common/av1_config.c
+++ b/libaom/common/av1_config.c
@@ -237,9 +237,9 @@
   // The reader instance is local to this function, but a pointer to the
   // reader instance is used within this function and throughout this file to
   // allow use of the helper macros that reduce parse error checking verbosity.
-  struct aom_read_bit_buffer reader_instance = {
-    buffer, buffer + length, 0, &result, bitreader_error_handler
-  };
+  struct aom_read_bit_buffer reader_instance = { buffer, buffer + length, 0,
+                                                 &result,
+                                                 bitreader_error_handler };
   struct aom_read_bit_buffer *reader = &reader_instance;
 
   AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3);
@@ -416,9 +416,9 @@
   *bytes_read = 0;
 
   int result = 0;
-  struct aom_read_bit_buffer reader_instance = {
-    buffer, buffer + buffer_length, 0, &result, bitreader_error_handler
-  };
+  struct aom_read_bit_buffer reader_instance = { buffer, buffer + buffer_length,
+                                                 0, &result,
+                                                 bitreader_error_handler };
   struct aom_read_bit_buffer *reader = &reader_instance;
 
   memset(config, 0, sizeof(*config));
diff --git a/libaom/common/ivfdec.h b/libaom/common/ivfdec.h
index ea294fa..dbc7733 100644
--- a/libaom/common/ivfdec.h
+++ b/libaom/common/ivfdec.h
@@ -11,6 +11,7 @@
 #ifndef AOM_COMMON_IVFDEC_H_
 #define AOM_COMMON_IVFDEC_H_
 
+#include "aom/aom_codec.h"
 #include "common/tools_common.h"
 
 #ifdef __cplusplus
@@ -18,8 +19,6 @@
 #endif
 
 int file_is_ivf(struct AvxInputContext *input);
-
-typedef int64_t aom_codec_pts_t;
 int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
                    size_t *buffer_size, aom_codec_pts_t *pts);
 
diff --git a/libaom/common/obudec.c b/libaom/common/obudec.c
index bd9f98d..650f997 100644
--- a/libaom/common/obudec.c
+++ b/libaom/common/obudec.c
@@ -438,13 +438,15 @@
     return -1;
   }
 #endif
-  uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size);
-  if (!new_buffer) {
-    free(*buffer);
-    fprintf(stderr, "obudec: Out of memory.\n");
-    return -1;
+  if (tu_size > 0) {
+    uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size);
+    if (!new_buffer) {
+      free(*buffer);
+      fprintf(stderr, "obudec: Out of memory.\n");
+      return -1;
+    }
+    *buffer = new_buffer;
   }
-  *buffer = new_buffer;
   *bytes_read = tu_size;
   *buffer_size = tu_size;
 
@@ -465,10 +467,11 @@
         memcpy(*buffer, &tuheader[0], length_of_temporal_unit_size);
         offset = length_of_temporal_unit_size;
       } else {
-        memcpy(*buffer, obu_ctx->buffer, obu_ctx->bytes_buffered);
-        offset = obu_ctx->bytes_buffered;
-        data_size = tu_size - obu_ctx->bytes_buffered;
-        obu_ctx->bytes_buffered = 0;
+        const size_t copy_size = AOMMIN(obu_ctx->bytes_buffered, tu_size);
+        memcpy(*buffer, obu_ctx->buffer, copy_size);
+        offset = copy_size;
+        data_size = tu_size - copy_size;
+        obu_ctx->bytes_buffered -= copy_size;
       }
 
       if (fread(*buffer + offset, 1, data_size, f) != data_size) {
diff --git a/libaom/common/tools_common.h b/libaom/common/tools_common.h
index d9a68f0..1ed0045 100644
--- a/libaom/common/tools_common.h
+++ b/libaom/common/tools_common.h
@@ -144,20 +144,103 @@
 
 int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame);
 
+///////////////////////////////////////////////////////////////////////////////
+// A description of the interfaces used to access the AOM codecs
+///////////////////////////////////////////////////////////////////////////////
+//
+// There are three levels of interfaces used to access the AOM codec: the
+// AVXInterface, the aom_codec_iface, and the aom_codec_ctx. Each of these
+// is described in detail here.
+//
+//
+// 1. AVXInterface
+//    (Related files: common/tools_common.c, common/tools_common.h)
+//
+// The high-level interface to the AVx encoders / decoders. Each AvxInterface
+// contains the name of the codec (e.g., "av1"), the four character code
+// associated with it, and a function pointer to the actual interface (see the
+// documentation on aom_codec_iface_t for more info). This API
+// is meant for lookup / iteration over all known codecs.
+//
+// For the encoder, call get_aom_encoder_by_name(...) if you know the name
+// (e.g., "av1"); to iterate over all known encoders, use
+// get_aom_encoder_count() and get_aom_encoder_by_index(i). To get the
+// encoder specifically for large scale tile encoding, use
+// get_aom_lst_encoder().
+//
+// For the decoder, similar functions are available. There is also a
+// get_aom_decoder_by_fourcc(fourcc) to get the decoder based on the four
+// character codes.
+//
+// The main purpose of the AVXInterface is to get a reference to the
+// aom_codec_iface_t, pointed to by its codec_interface variable.
+//
+//
+// 2. aom_codec_iface_t
+//    (Related files: aom/aom_codec.h, aom/src/aom_codec.c,
+//    aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c,
+//    av1/av1_dx_iface.c)
+//
+// Used to initialize the codec context, which contains the configuration
+// for modifying the encoder/decoder during run-time. See the documentation of
+// aom/aom_codec.h for more details. For the most part, users will call the
+// helper functions listed there, such as aom_codec_iface_name,
+// aom_codec_get_caps, etc., to interact with it.
+//
+// The main purpose of the aom_codec_iface_t is to provide a way to generate
+// a default codec config, find out what capabilities the implementation has,
+// and create an aom_codec_ctx_t (which is actually used to interact with the
+// codec).
+//
+// Note that the implementations of the aom_codec_iface_t are located in
+// av1/av1_cx_iface.c and av1/av1_dx_iface.c
+//
+//
+// 3. aom_codec_ctx_t
+//  (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c,
+//   aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c)
+//
+// The actual interface between user code and the codec. It stores the name
+// of the codec, a pointer back to the aom_codec_iface_t that initialized it,
+// initialization flags, a config for either encoder or the decoder, and a
+// pointer to internal data.
+//
+// The codec is configured / queried through calls to aom_codec_control,
+// which takes a control code (listed in aomcx.h and aomdx.h) and a parameter.
+// In the case of "getter" control codes, the parameter is modified to have
+// the requested value; in the case of "setter" control codes, the codec's
+// configuration is changed based on the parameter. Note that an
+// is returned, which indicates if the operation was successful or not.
+//
+// Note that for the encoder, the aom_codec_alg_priv_t points to the
+// aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder,
+// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored
+// here and also used in the core algorithm.
+//
+// At the end, aom_codec_destroy should be called for each initialized
+// aom_codec_ctx_t.
+
 typedef struct AvxInterface {
   const char *const name;
   const uint32_t fourcc;
+  // Pointer to a function of zero arguments that returns an aom_codec_iface_t
+  // pointer. E.g.:
+  //   aom_codec_iface_t *codec = interface->codec_interface();
   aom_codec_iface_t *(*const codec_interface)();
 } AvxInterface;
 
 int get_aom_encoder_count(void);
+// Lookup the interface by index -- it must be the case that
+// i < get_aom_encoder_count()
 const AvxInterface *get_aom_encoder_by_index(int i);
+// Lookup the interface by name -- returns NULL if no match.
 const AvxInterface *get_aom_encoder_by_name(const char *name);
 const AvxInterface *get_aom_lst_encoder(void);
 
 int get_aom_decoder_count(void);
 const AvxInterface *get_aom_decoder_by_index(int i);
 const AvxInterface *get_aom_decoder_by_name(const char *name);
+// Lookup the interface by the fourcc -- returns NULL if no match.
 const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc);
 
 void aom_img_write(const aom_image_t *img, FILE *file);
diff --git a/libaom/common/video_writer.c b/libaom/common/video_writer.c
index 2b42e36..1d4328a 100644
--- a/libaom/common/video_writer.c
+++ b/libaom/common/video_writer.c
@@ -41,8 +41,10 @@
     if (!file) return NULL;
 
     writer = malloc(sizeof(*writer));
-    if (!writer) return NULL;
-
+    if (!writer) {
+      fclose(file);
+      return NULL;
+    }
     writer->frame_count = 0;
     writer->info = *info;
     writer->file = file;
diff --git a/libaom/common/webmdec.cc b/libaom/common/webmdec.cc
index 17ac53c..33bda59 100644
--- a/libaom/common/webmdec.cc
+++ b/libaom/common/webmdec.cc
@@ -197,6 +197,17 @@
   return frame.Read(reader, *buffer) ? -1 : 0;
 }
 
+// Calculate the greatest common divisor between two numbers.
+static int gcd(int a, int b) {
+  int remainder;
+  while (b > 0) {
+    remainder = a % b;
+    a = b;
+    b = remainder;
+  }
+  return a;
+}
+
 int webm_guess_framerate(struct WebmInputContext *webm_ctx,
                          struct AvxInputContext *aom_ctx) {
   uint32_t i = 0;
@@ -213,6 +224,14 @@
   aom_ctx->framerate.numerator = (i - 1) * 1000000;
   aom_ctx->framerate.denominator =
       static_cast<int>(webm_ctx->timestamp_ns / 1000);
+  // Fraction might be represented in large numbers, like 49000000/980000
+  // for 50fps. Simplify as much as possible.
+  int g = gcd(aom_ctx->framerate.numerator, aom_ctx->framerate.denominator);
+  if (g != 0) {
+    aom_ctx->framerate.numerator /= g;
+    aom_ctx->framerate.denominator /= g;
+  }
+
   delete[] buffer;
   webm_ctx->buffer = NULL;
 
diff --git a/libaom/docs.cmake b/libaom/docs.cmake
index b5bfa9b..28ca5c0 100644
--- a/libaom/docs.cmake
+++ b/libaom/docs.cmake
@@ -20,111 +20,123 @@
 set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox")
 set(AOM_DOXYGEN_SECTIONS "av1")
 
-set(AOM_DOXYGEN_SOURCES
-    "${AOM_ROOT}/aom/aom.h"
-    "${AOM_ROOT}/aom/aom_codec.h"
-    "${AOM_ROOT}/aom/aom_decoder.h"
-    "${AOM_ROOT}/aom/aom_encoder.h"
-    "${AOM_ROOT}/aom/aom_frame_buffer.h"
-    "${AOM_ROOT}/aom/aom_image.h"
-    "${AOM_ROOT}/aom/aom_integer.h"
-    "${AOM_ROOT}/keywords.dox"
-    "${AOM_ROOT}/mainpage.dox"
-    "${AOM_ROOT}/usage.dox")
+set(AOM_DOXYGEN_SOURCES "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h"
+                        "${AOM_ROOT}/aom/aom_decoder.h"
+                        "${AOM_ROOT}/aom/aom_encoder.h"
+                        "${AOM_ROOT}/aom/aom_frame_buffer.h"
+                        "${AOM_ROOT}/aom/aom_image.h"
+                        "${AOM_ROOT}/aom/aom_integer.h"
+                        "${AOM_ROOT}/keywords.dox" "${AOM_ROOT}/mainpage.dox"
+                        "${AOM_ROOT}/usage.dox")
 
 if(CONFIG_AV1_DECODER)
   set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-      "${AOM_ROOT}/apps/aomdec.c" "${AOM_ROOT}/examples/decode_to_md5.c"
-      "${AOM_ROOT}/examples/decode_with_drops.c"
-      "${AOM_ROOT}/examples/simple_decoder.c")
+                                  "${AOM_ROOT}/apps/aomdec.c"
+                                  "${AOM_ROOT}/examples/decode_to_md5.c"
+                                  "${AOM_ROOT}/examples/decode_with_drops.c"
+                                  "${AOM_ROOT}/examples/simple_decoder.c")
 
   set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Full featured decoder." "Frame by frame MD5 checksum."
-      "Drops frames while decoding." "Simplified decoder loop.")
+                                       "Full featured decoder."
+                                       "Frame by frame MD5 checksum."
+                                       "Drops frames while decoding."
+                                       "Simplified decoder loop.")
 
   set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder")
 
   set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h"
-      "${AOM_ROOT}/usage_dx.dox")
+                          "${AOM_ROOT}/usage_dx.dox")
 
   if(CONFIG_ANALYZER)
     set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-        "${AOM_ROOT}/examples/analyzer.cc")
+                                    "${AOM_ROOT}/examples/analyzer.cc")
 
     set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-        "Bitstream analyzer.")
+                                         "Bitstream analyzer.")
   endif()
 
   if(CONFIG_INSPECTION)
     set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-        "${AOM_ROOT}/examples/inspect.c")
+                                    "${AOM_ROOT}/examples/inspect.c")
 
     set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-        "Bitstream inspector.")
+                                         "Bitstream inspector.")
   endif()
 endif()
 
 if(CONFIG_AV1_ENCODER)
   set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-      "${AOM_ROOT}/apps/aomenc.c" "${AOM_ROOT}/examples/lossless_encoder.c"
-      "${AOM_ROOT}/examples/set_maps.c" "${AOM_ROOT}/examples/simple_encoder.c"
-      "${AOM_ROOT}/examples/twopass_encoder.c")
+                                  "${AOM_ROOT}/apps/aomenc.c"
+                                  "${AOM_ROOT}/examples/lossless_encoder.c"
+                                  "${AOM_ROOT}/examples/set_maps.c"
+                                  "${AOM_ROOT}/examples/simple_encoder.c"
+                                  "${AOM_ROOT}/examples/twopass_encoder.c")
 
   set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Full featured encoder." "Simplified lossless encoder."
-      "Set active and ROI maps." "Simplified encoder loop."
-      "Two-pass encoder loop.")
+                                       "Full featured encoder."
+                                       "Simplified lossless encoder."
+                                       "Set active and ROI maps."
+                                       "Simplified encoder loop."
+                                       "Two-pass encoder loop.")
 
   set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-      "${AOM_ROOT}/examples/scalable_encoder.c")
+                                  "${AOM_ROOT}/examples/scalable_encoder.c")
 
   set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Scalable encoder loop.")
+                                       "Scalable encoder loop.")
+
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+                                  "${AOM_ROOT}/examples/svc_encoder_rtc.c")
+
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+                                       "Layered encoder for RTC.")
 
   set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_encoder encoder")
 
   set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h"
-      "${AOM_ROOT}/usage_cx.dox")
+                          "${AOM_ROOT}/usage_cx.dox")
 endif()
 
 if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
   set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-      "${AOM_ROOT}/examples/aom_cx_set_ref.c")
+                                  "${AOM_ROOT}/examples/aom_cx_set_ref.c")
 
   set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Set encoder reference frame.")
+                                       "Set encoder reference frame.")
 endif()
 
 if(CONFIG_AV1_ENCODER)
   set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-      "${AOM_ROOT}/examples/lightfield_encoder.c")
+                                  "${AOM_ROOT}/examples/lightfield_encoder.c")
 
   set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Lightfield encoder example.")
+                                       "Lightfield encoder example.")
 endif()
 
 if(CONFIG_AV1_DECODER)
-  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES
+      ${AOM_DOXYGEN_EXAMPLE_SOURCES}
       "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c")
 
   set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Lightfield tile list decoder example.")
+                                       "Lightfield tile list decoder example.")
 endif()
 
 if(CONFIG_AV1_DECODER)
   set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-      "${AOM_ROOT}/examples/lightfield_decoder.c")
+                                  "${AOM_ROOT}/examples/lightfield_decoder.c")
 
   set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Lightfield decoder example.")
+                                       "Lightfield decoder example.")
 endif()
 
 if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
-  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES
+      ${AOM_DOXYGEN_EXAMPLE_SOURCES}
       "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c")
 
   set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Lightfield bitstream parsing example.")
+                                       "Lightfield bitstream parsing example.")
 endif()
 
 # Iterates over list named by $list_name and appends each item to $AOM_DOXYFILE
@@ -169,25 +181,19 @@
 
   # Generate samples.dox, an index page that refers to the example_basename.dox
   # files that were just created.
-  set(
-    samples_header
-    "
+  set(samples_header "
 /*!\\page samples Sample Code
 This SDK includes a number of sample applications. Each sample documents a
 feature of the SDK in both prose and the associated C code. The following
 samples are included:
-"
-    )
+")
 
-  set(
-    utils_desc
-    "
+  set(utils_desc "
 In addition, the SDK contains a number of utilities. Since these utilities are
 built upon the concepts described in the sample code listed above, they are not
 documented in pieces like the samples are. Their source is included here for
 reference. The following utilities are included:
-"
-    )
+")
 
   # Write the description for the samples section.
   set(samples_dox "${AOM_CONFIG_DIR}/samples.dox")
@@ -230,11 +236,11 @@
   file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data)
   file(APPEND "${AOM_DOXYFILE}" ${doxygen_template_data})
   file(APPEND "${AOM_DOXYFILE}"
-              "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n")
+       "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n")
   file(APPEND "${AOM_DOXYFILE}"
-              "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n")
+       "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n")
   file(APPEND "${AOM_DOXYFILE}"
-              "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n")
+       "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n")
   write_cmake_list_to_doxygen_config_var("INPUT" "AOM_DOXYGEN_SOURCES")
   write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS"
                                          "AOM_DOXYGEN_SECTIONS")
diff --git a/libaom/examples/analyzer.cc b/libaom/examples/analyzer.cc
index 261d085..3598821 100644
--- a/libaom/examples/analyzer.cc
+++ b/libaom/examples/analyzer.cc
@@ -15,7 +15,7 @@
 
 #include "aom/aom_decoder.h"
 #include "aom/aomdx.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/decoder/accounting.h"
 #include "av1/decoder/inspection.h"
 #include "common/tools_common.h"
@@ -528,8 +528,8 @@
   wxMenuBar *mb = new wxMenuBar();
 
   fileMenu = new wxMenu();
-  fileMenu->Append(wxID_OPEN, _("&Open...\tCtrl-O"), _("Open daala file"));
-  fileMenu->Append(wxID_CLOSE, _("&Close\tCtrl-W"), _("Close daala file"));
+  fileMenu->Append(wxID_OPEN, _("&Open...\tCtrl-O"), _("Open AV1 file"));
+  fileMenu->Append(wxID_CLOSE, _("&Close\tCtrl-W"), _("Close AV1 file"));
   fileMenu->Enable(wxID_CLOSE, false);
   fileMenu->Append(wxID_EXIT, _("E&xit\tCtrl-Q"), _("Quit this program"));
   mb->Append(fileMenu, _("&File"));
diff --git a/libaom/examples/aom_cx_set_ref.c b/libaom/examples/aom_cx_set_ref.c
index 5a78af9..2f4f658 100644
--- a/libaom/examples/aom_cx_set_ref.c
+++ b/libaom/examples/aom_cx_set_ref.c
@@ -262,7 +262,7 @@
     die("Failed to allocate image.");
   }
 
-  if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
   // Allocate memory with the border so that it can be used as a reference.
   if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width,
                                  info.frame_height, 32, 8,
@@ -283,7 +283,7 @@
   cfg.g_lag_in_frames = 3;
   cfg.g_bit_depth = AOM_BITS_8;
 
-  flags |= (cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH)
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING)
                ? AOM_CODEC_USE_HIGHBITDEPTH
                : 0;
 
@@ -311,7 +311,7 @@
     if (limit && frame_in >= limit) break;
     aom_image_t *frame_to_encode;
 
-    if (!CONFIG_LOWBITDEPTH) {
+    if (FORCE_HIGHBITDEPTH_DECODING) {
       // Need to allocate larger buffer to use hbd internal.
       int input_shift = 0;
       if (!allocated_raw_shift) {
diff --git a/libaom/examples/av1_dec_fuzzer.cc b/libaom/examples/av1_dec_fuzzer.cc
index 937c944..1cddc8c 100644
--- a/libaom/examples/av1_dec_fuzzer.cc
+++ b/libaom/examples/av1_dec_fuzzer.cc
@@ -38,7 +38,7 @@
   aom_codec_ctx_t codec;
   // Set thread count in the range [1, 64].
   const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1;
-  aom_codec_dec_cfg_t cfg = { threads, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
+  aom_codec_dec_cfg_t cfg = { threads, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
   if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) {
     return 0;
   }
diff --git a/libaom/examples/build_av1_dec_fuzzer.sh b/libaom/examples/build_av1_dec_fuzzer.sh
index 78fe66e..0dcb254 100755
--- a/libaom/examples/build_av1_dec_fuzzer.sh
+++ b/libaom/examples/build_av1_dec_fuzzer.sh
@@ -50,9 +50,9 @@
 EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
 cd "${BUILD_DIR}"
 cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
-  -DCONFIG_SCALABILITY=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_AV1_ENCODER=0 \
-  -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 -DCONFIG_SIZE_LIMIT=1 \
-  -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \
+  -DCONFIG_SCALABILITY=0 -DFORCE_HIGHBITDEPTH_DECODING=0 \
+  -DCONFIG_AV1_ENCODER=0 -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 \
+  -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \
   -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \
   -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=fuzzer-no-link,address
 
diff --git a/libaom/examples/inspect.c b/libaom/examples/inspect.c
index 9ca2a02..526bdc1 100644
--- a/libaom/examples/inspect.c
+++ b/libaom/examples/inspect.c
@@ -29,7 +29,7 @@
 
 #include "aom/aom_decoder.h"
 #include "aom/aomdx.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 
 #if CONFIG_ACCOUNTING
 #include "av1/decoder/accounting.h"
@@ -42,7 +42,7 @@
 #include "common/video_reader.h"
 
 // Max JSON buffer size.
-const int MAX_BUFFER = 1024 * 1024 * 32;
+const int MAX_BUFFER = 1024 * 1024 * 256;
 
 typedef enum {
   ACCOUNTING_LAYER = 1,
@@ -248,9 +248,9 @@
 
 const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM };
 
-const map_entry intrabc_map[] = {
-  { "INTRABC", 1 }, { "NO_INTRABC", 0 }, LAST_ENUM
-};
+const map_entry intrabc_map[] = { { "INTRABC", 1 },
+                                  { "NO_INTRABC", 0 },
+                                  LAST_ENUM };
 
 const map_entry palette_map[] = {
   { "ZERO_COLORS", 0 },  { "TWO_COLORS", 2 },   { "THREE_COLORS", 3 },
diff --git a/libaom/examples/lightfield_bitstream_parsing.c b/libaom/examples/lightfield_bitstream_parsing.c
index afacf44..ffcbcb9 100644
--- a/libaom/examples/lightfield_bitstream_parsing.c
+++ b/libaom/examples/lightfield_bitstream_parsing.c
@@ -148,14 +148,14 @@
     size_t frame_size = frame_sizes[image_idx];
     const unsigned char *frame = frames[image_idx];
 
-    aom_codec_control_(codec, AV1_SET_DECODE_TILE_ROW, tr);
-    aom_codec_control_(codec, AV1_SET_DECODE_TILE_COL, tc);
+    AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr);
+    AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc);
 
     aom_codec_err_t aom_status =
         aom_codec_decode(codec, frame, frame_size, NULL);
     if (aom_status) die_codec(codec, "Failed to decode tile.");
 
-    aom_codec_control_(codec, AV1D_GET_TILE_DATA, &tile_data);
+    AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_DATA, &tile_data);
 
     // Copy over tile info.
     //  uint8_t anchor_frame_idx;
@@ -228,7 +228,7 @@
     die_codec(&codec, "Failed to initialize decoder.");
 
   // Decode anchor frames.
-  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0);
 
   printf("Reading %d reference images.\n", num_references);
   for (i = 0; i < num_references; ++i) {
@@ -248,8 +248,8 @@
   }
 
   // Decode camera frames.
-  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 1);
-  aom_codec_control_(&codec, AV1D_EXT_TILE_DEBUG, 1);
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1);
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, 1);
 
   FILE *infile = aom_video_reader_get_file(reader);
   // Record the offset of the first camera image.
@@ -291,14 +291,15 @@
 
     // Need to decode frame header to get camera frame header info. So, here
     // decoding 1 tile is enough.
-    aom_codec_control_(&codec, AV1_SET_DECODE_TILE_ROW, 0);
-    aom_codec_control_(&codec, AV1_SET_DECODE_TILE_COL, 0);
+    AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_ROW, 0);
+    AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_COL, 0);
 
     aom_codec_err_t aom_status =
         aom_codec_decode(&codec, frame, frame_size, NULL);
     if (aom_status) die_codec(&codec, "Failed to decode tile.");
 
-    aom_codec_control_(&codec, AV1D_GET_FRAME_HEADER_INFO, &frame_header_info);
+    AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_HEADER_INFO,
+                                  &frame_header_info);
 
     size_t obu_size_offset =
         (uint8_t *)frame_header_info.coded_tile_data - frame;
@@ -330,13 +331,13 @@
 
   // Read out the image format.
   aom_img_fmt_t ref_fmt = 0;
-  if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
     die_codec(&codec, "Failed to get the image format");
   const int bps = get_image_bps(ref_fmt);
   if (!bps) die_codec(&codec, "Invalid image format.");
   // read out the tile size.
   unsigned int tile_size = 0;
-  if (aom_codec_control(&codec, AV1D_GET_TILE_SIZE, &tile_size))
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_TILE_SIZE, &tile_size))
     die_codec(&codec, "Failed to get the tile size");
   const unsigned int tile_width = tile_size >> 16;
   const unsigned int tile_height = tile_size & 65535;
diff --git a/libaom/examples/lightfield_decoder.c b/libaom/examples/lightfield_decoder.c
index 7a445f0..a292e9c 100644
--- a/libaom/examples/lightfield_decoder.c
+++ b/libaom/examples/lightfield_decoder.c
@@ -95,16 +95,16 @@
                  aom_image_t *reference_images, aom_image_t *output,
                  int *tile_idx, unsigned int *output_bit_depth,
                  aom_image_t **img_ptr, int output_format) {
-  aom_codec_control_(codec, AV1_SET_TILE_MODE, 1);
-  aom_codec_control_(codec, AV1D_EXT_TILE_DEBUG, 1);
-  aom_codec_control_(codec, AV1_SET_DECODE_TILE_ROW, tr);
-  aom_codec_control_(codec, AV1_SET_DECODE_TILE_COL, tc);
+  AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_TILE_MODE, 1);
+  AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_EXT_TILE_DEBUG, 1);
+  AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr);
+  AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc);
 
   av1_ref_frame_t ref;
   ref.idx = 0;
   ref.use_external_ref = 1;
   ref.img = reference_images[ref_idx];
-  if (aom_codec_control(codec, AV1_SET_REFERENCE, &ref)) {
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_REFERENCE, &ref)) {
     die_codec(codec, "Failed to set reference frame.");
   }
 
@@ -126,11 +126,12 @@
   if (output_format != YUV1D) {
     // read out the tile size.
     unsigned int tile_size = 0;
-    if (aom_codec_control(codec, AV1D_GET_TILE_SIZE, &tile_size))
+    if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size))
       die_codec(codec, "Failed to get the tile size");
     const unsigned int tile_width = tile_size >> 16;
     const unsigned int tile_height = tile_size & 65535;
-    const uint8_t output_frame_width_in_tiles = output_frame_width / tile_width;
+    const uint32_t output_frame_width_in_tiles =
+        output_frame_width / tile_width;
 
     // Copy the tile to the output frame.
     const int row_offset =
@@ -197,12 +198,13 @@
   if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
     die_codec(&codec, "Failed to initialize decoder.");
 
-  if (aom_codec_control(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) {
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB,
+                                    info->is_annexb)) {
     die("Failed to set annex b status");
   }
 
   // Decode anchor frames.
-  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0);
   for (i = 0; i < num_references; ++i) {
     aom_video_reader_read_frame(reader);
     frame = aom_video_reader_get_frame(reader, &frame_size);
@@ -210,11 +212,11 @@
       die_codec(&codec, "Failed to decode frame.");
 
     if (i == 0) {
-      if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
         die_codec(&codec, "Failed to get the image format");
 
       int frame_res[2];
-      if (aom_codec_control(&codec, AV1D_GET_FRAME_SIZE, frame_res))
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res))
         die_codec(&codec, "Failed to get the image frame size");
 
       // Allocate memory to store decoded references. Allocate memory with the
@@ -229,8 +231,8 @@
       }
     }
 
-    if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
-                          &reference_images[i]))
+    if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+                                      &reference_images[i]))
       die_codec(&codec, "Failed to copy decoded reference frame");
 
     aom_codec_iter_t iter = NULL;
@@ -276,7 +278,7 @@
   if (output_format != YUV1D) {
     // Allocate the output frame.
     aom_img_fmt_t out_fmt = ref_fmt;
-    if (!CONFIG_LOWBITDEPTH) out_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+    if (FORCE_HIGHBITDEPTH_DECODING) out_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
     if (!aom_img_alloc(&output, out_fmt, output_frame_width,
                        output_frame_height, 32))
       die("Failed to allocate output image.");
@@ -285,6 +287,7 @@
   printf("Decoding tile list from file.\n");
   char line[1024];
   FILE *tile_list_fptr = fopen(tile_list_file, "r");
+  if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file.");
   int tile_list_cnt = 0;
   int tile_list_writes = 0;
   int tile_idx = 0;
diff --git a/libaom/examples/lightfield_encoder.c b/libaom/examples/lightfield_encoder.c
index 4dd71ca..e80fe24 100644
--- a/libaom/examples/lightfield_encoder.c
+++ b/libaom/examples/lightfield_encoder.c
@@ -52,7 +52,7 @@
   exit(EXIT_FAILURE);
 }
 
-static int aom_img_size_bytes(aom_image_t *img) {
+static int img_size_bytes(aom_image_t *img) {
   int image_size_bytes = 0;
   int plane;
   for (plane = 0; plane < 3; ++plane) {
@@ -117,7 +117,7 @@
 
 static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw,
                           aom_image_t *raw_shift) {
-  if (!CONFIG_LOWBITDEPTH) {
+  if (FORCE_HIGHBITDEPTH_DECODING) {
     // Need to allocate larger buffer to use hbd internal.
     int input_shift = 0;
     aom_img_upshift(raw_shift, raw, input_shift);
@@ -134,7 +134,7 @@
                              aom_image_t *raw_shift) {
   aom_codec_ctx_t codec;
   int frame_count = 0;
-  int image_size_bytes = aom_img_size_bytes(raw);
+  int image_size_bytes = img_size_bytes(raw);
   int u_blocks, v_blocks;
   int bu, bv;
   aom_fixed_buf_t stats = { NULL, 0 };
@@ -242,7 +242,7 @@
   AvxVideoWriter *writer = NULL;
   aom_codec_ctx_t codec;
   int frame_count = 0;
-  int image_size_bytes = aom_img_size_bytes(raw);
+  int image_size_bytes = img_size_bytes(raw);
   int bu, bv;
   int u_blocks, v_blocks;
   aom_image_t *frame_to_encode;
@@ -259,6 +259,11 @@
     die_codec(&codec, "Failed to turn off auto altref");
   if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
     die_codec(&codec, "Failed to set frame parallel decoding");
+  if (aom_codec_control(&codec, AV1E_ENABLE_EXT_TILE_DEBUG, 1))
+    die_codec(&codec, "Failed to enable encoder ext_tile debug");
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 1))
+    die_codec(&codec, "Failed to set cpu-used");
+
   // Note: The superblock is a sequence parameter and has to be the same for 1
   // sequence. In lightfield application, must choose the superblock size(either
   // 64x64 or 128x128) before the encoding starts. Otherwise, the default is
@@ -272,8 +277,18 @@
   v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
 
   reference_image_num = u_blocks * v_blocks;
+  // Set the max gf group length so the references are guaranteed to be in
+  // a different gf group than any of the regular frames. This avoids using
+  // both vbr and constant quality mode in a single group. The number of
+  // references now cannot surpass 17 because of the enforced MAX_GF_INTERVAL of
+  // 16. If it is necessary to exceed this reference frame limit, one will have
+  // to do some additional handling to ensure references are in separate gf
+  // groups from the regular frames.
+  if (aom_codec_control(&codec, AV1E_SET_MAX_GF_INTERVAL,
+                        reference_image_num - 1))
+    die_codec(&codec, "Failed to set max gf interval");
   aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
-  if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
   // Allocate memory with the border so that it can be used as a reference.
   int border_in_pixels =
       (codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode)
@@ -457,7 +472,7 @@
   if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 32)) {
     die("Failed to allocate image.");
   }
-  if (!CONFIG_LOWBITDEPTH) {
+  if (FORCE_HIGHBITDEPTH_DECODING) {
     // Need to allocate larger buffer to use hbd internal.
     aom_img_alloc(&raw_shift, AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, w, h,
                   32);
@@ -479,7 +494,7 @@
   cfg.kf_mode = AOM_KF_DISABLED;
   cfg.large_scale_tile = 0;  // Only set it to 1 for camera frame encoding.
   cfg.g_bit_depth = AOM_BITS_8;
-  flags |= (cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH)
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING)
                ? AOM_CODEC_USE_HIGHBITDEPTH
                : 0;
 
@@ -499,7 +514,7 @@
         lf_blocksize, flags, &raw_shift);
   free(stats.buf);
 
-  if (!CONFIG_LOWBITDEPTH) aom_img_free(&raw_shift);
+  if (FORCE_HIGHBITDEPTH_DECODING) aom_img_free(&raw_shift);
   aom_img_free(&raw);
   fclose(infile);
 
diff --git a/libaom/examples/lightfield_tile_list_decoder.c b/libaom/examples/lightfield_tile_list_decoder.c
index 87a8b43..3b928df 100644
--- a/libaom/examples/lightfield_tile_list_decoder.c
+++ b/libaom/examples/lightfield_tile_list_decoder.c
@@ -51,14 +51,14 @@
                              FILE *file) {
   // read out the tile size.
   unsigned int tile_size = 0;
-  if (aom_codec_control(codec, AV1D_GET_TILE_SIZE, &tile_size))
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size))
     die_codec(codec, "Failed to get the tile size");
   const unsigned int tile_width = tile_size >> 16;
   const unsigned int tile_height = tile_size & 65535;
-  const uint8_t output_frame_width_in_tiles = img->d_w / tile_width;
+  const uint32_t output_frame_width_in_tiles = img->d_w / tile_width;
 
   unsigned int tile_count = 0;
-  if (aom_codec_control(codec, AV1D_GET_TILE_COUNT, &tile_count))
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_COUNT, &tile_count))
     die_codec(codec, "Failed to get the tile size");
 
   // Write tile to file.
@@ -136,12 +136,13 @@
   if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
     die_codec(&codec, "Failed to initialize decoder.");
 
-  if (aom_codec_control(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) {
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB,
+                                    info->is_annexb)) {
     die("Failed to set annex b status");
   }
 
   // Decode anchor frames.
-  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0);
   for (i = 0; i < num_references; ++i) {
     aom_video_reader_read_frame(reader);
     frame = aom_video_reader_get_frame(reader, &frame_size);
@@ -150,11 +151,11 @@
 
     if (i == 0) {
       aom_img_fmt_t ref_fmt = 0;
-      if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
         die_codec(&codec, "Failed to get the image format");
 
       int frame_res[2];
-      if (aom_codec_control(&codec, AV1D_GET_FRAME_SIZE, frame_res))
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res))
         die_codec(&codec, "Failed to get the image frame size");
 
       // Allocate memory to store decoded references. Allocate memory with the
@@ -169,8 +170,8 @@
       }
     }
 
-    if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
-                          &reference_images[i]))
+    if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+                                      &reference_images[i]))
       die_codec(&codec, "Failed to copy decoded reference frame");
 
     aom_codec_iter_t iter = NULL;
@@ -186,11 +187,11 @@
   }
 
   // Decode the lightfield.
-  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 1);
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1);
 
   // Set external references.
   av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references };
-  aom_codec_control_(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref);
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref);
   // Must decode the camera frame header first.
   aom_video_reader_read_frame(reader);
   frame = aom_video_reader_get_frame(reader, &frame_size);
diff --git a/libaom/examples/lossless_encoder.c b/libaom/examples/lossless_encoder.c
index 438ff21..e0253d2 100644
--- a/libaom/examples/lossless_encoder.c
+++ b/libaom/examples/lossless_encoder.c
@@ -113,7 +113,7 @@
   if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
     die_codec(&codec, "Failed to initialize encoder");
 
-  if (aom_codec_control_(&codec, AV1E_SET_LOSSLESS, 1))
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1E_SET_LOSSLESS, 1))
     die_codec(&codec, "Failed to use lossless mode");
 
   // Encode frames.
diff --git a/libaom/examples/noise_model.c b/libaom/examples/noise_model.c
index 45f1b4d..d07443f 100644
--- a/libaom/examples/noise_model.c
+++ b/libaom/examples/noise_model.c
@@ -180,6 +180,7 @@
   aom_image_t renoised;
   grain->apply_grain = 1;
   grain->random_seed = 7391;
+  grain->bit_depth = raw->bit_depth;
   aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1);
 
   if (av1_add_film_grain(grain, denoised, &renoised)) {
@@ -313,7 +314,7 @@
                      info.frame_height, 1)) {
     die("Failed to allocate image.");
   }
-  infile = fopen(args.input, "r");
+  infile = fopen(args.input, "rb");
   if (!infile) {
     die("Failed to open input file:", args.input);
   }
diff --git a/libaom/examples/resize_util.c b/libaom/examples/resize_util.c
index 6a84d57..5692c20 100644
--- a/libaom/examples/resize_util.c
+++ b/libaom/examples/resize_util.c
@@ -83,6 +83,7 @@
   }
   fpout = fopen(fout, "wb");
   if (fpout == NULL) {
+    fclose(fpin);
     printf("Can't open file %s to write\n", fout);
     usage();
     return 1;
diff --git a/libaom/examples/svc_encoder_rtc.c b/libaom/examples/svc_encoder_rtc.c
new file mode 100644
index 0000000..1316c6c
--- /dev/null
+++ b/libaom/examples/svc_encoder_rtc.c
@@ -0,0 +1,907 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+//  This is an example demonstrating how to implement a multi-layer AOM
+//  encoding scheme for RTC video applications.
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+#include "aom_ports/aom_timer.h"
+
+#define zero(Dest) memset(&(Dest), 0, sizeof(Dest));
+
+static const char *exec_name;
+
+void usage_exit(void) { exit(EXIT_FAILURE); }
+
+static int mode_to_num_temporal_layers[10] = { 1, 2, 3, 3, 2, 1, 1, 3, 3, 3 };
+static int mode_to_num_spatial_layers[10] = { 1, 1, 1, 1, 1, 2, 3, 3, 3, 3 };
+static int mode_to_num_layers[10] = { 1, 2, 3, 3, 2, 2, 3, 9, 9, 9 };
+
+// For rate control encoding stats.
+struct RateControlMetrics {
+  // Number of input frames per layer.
+  int layer_input_frames[AOM_MAX_TS_LAYERS];
+  // Number of encoded non-key frames per layer.
+  int layer_enc_frames[AOM_MAX_TS_LAYERS];
+  // Framerate per layer (cumulative).
+  double layer_framerate[AOM_MAX_TS_LAYERS];
+  // Target average frame size per layer (per-frame-bandwidth per layer).
+  double layer_pfb[AOM_MAX_LAYERS];
+  // Actual average frame size per layer.
+  double layer_avg_frame_size[AOM_MAX_LAYERS];
+  // Average rate mismatch per layer (|target - actual| / target).
+  double layer_avg_rate_mismatch[AOM_MAX_LAYERS];
+  // Actual encoding bitrate per layer (cumulative across temporal layers).
+  double layer_encoding_bitrate[AOM_MAX_LAYERS];
+  // Average of the short-time encoder actual bitrate.
+  // TODO(marpan): Should we add these short-time stats for each layer?
+  double avg_st_encoding_bitrate;
+  // Variance of the short-time encoder actual bitrate.
+  double variance_st_encoding_bitrate;
+  // Window (number of frames) for computing short-time encoding bitrate.
+  int window_size;
+  // Number of window measurements.
+  int window_count;
+  int layer_target_bitrate[AOM_MAX_LAYERS];
+};
+
+static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) {
+  FILE *f = input_ctx->file;
+  y4m_input *y4m = &input_ctx->y4m;
+  int shortread = 0;
+
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
+    if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
+  } else {
+    shortread = read_yuv_frame(input_ctx, img);
+  }
+
+  return !shortread;
+}
+
+static int file_is_y4m(const char detect[4]) {
+  if (memcmp(detect, "YUV4", 4) == 0) {
+    return 1;
+  }
+  return 0;
+}
+
+static int fourcc_is_ivf(const char detect[4]) {
+  if (memcmp(detect, "DKIF", 4) == 0) {
+    return 1;
+  }
+  return 0;
+}
+
+static void close_input_file(struct AvxInputContext *input) {
+  fclose(input->file);
+  if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
+}
+
+static void open_input_file(struct AvxInputContext *input,
+                            aom_chroma_sample_position_t csp) {
+  /* Parse certain options from the input file, if possible */
+  input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb")
+                                             : set_binary_mode(stdin);
+
+  if (!input->file) fatal("Failed to open input file");
+
+  if (!fseeko(input->file, 0, SEEK_END)) {
+    /* Input file is seekable. Figure out how long it is, so we can get
+     * progress info.
+     */
+    input->length = ftello(input->file);
+    rewind(input->file);
+  }
+
+  /* Default to 1:1 pixel aspect ratio. */
+  input->pixel_aspect_ratio.numerator = 1;
+  input->pixel_aspect_ratio.denominator = 1;
+
+  /* For RAW input sources, these bytes will applied on the first frame
+   *  in read_frame().
+   */
+  input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+  input->detect.position = 0;
+
+  if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp,
+                       input->only_i420) >= 0) {
+      input->file_type = FILE_TYPE_Y4M;
+      input->width = input->y4m.pic_w;
+      input->height = input->y4m.pic_h;
+      input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+      input->pixel_aspect_ratio.denominator = input->y4m.par_d;
+      input->framerate.numerator = input->y4m.fps_n;
+      input->framerate.denominator = input->y4m.fps_d;
+      input->fmt = input->y4m.aom_fmt;
+      input->bit_depth = input->y4m.bit_depth;
+    } else {
+      fatal("Unsupported Y4M stream.");
+    }
+  } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
+    fatal("IVF is not supported as input.");
+  } else {
+    input->file_type = FILE_TYPE_RAW;
+  }
+}
+
+// Note: these rate control metrics assume only 1 key frame in the
+// sequence (i.e., first frame only). So for temporal pattern# 7
+// (which has key frame for every frame on base layer), the metrics
+// computation will be off/wrong.
+// TODO(marpan): Update these metrics to account for multiple key frames
+// in the stream.
+static void set_rate_control_metrics(struct RateControlMetrics *rc,
+                                     double framerate,
+                                     unsigned int ss_number_layers,
+                                     unsigned int ts_number_layers) {
+  int ts_rate_decimator[AOM_MAX_TS_LAYERS] = { 1 };
+  ts_rate_decimator[0] = 1;
+  if (ts_number_layers == 2) {
+    ts_rate_decimator[0] = 2;
+    ts_rate_decimator[1] = 1;
+  }
+  if (ts_number_layers == 3) {
+    ts_rate_decimator[0] = 4;
+    ts_rate_decimator[1] = 2;
+    ts_rate_decimator[2] = 1;
+  }
+  // Set the layer (cumulative) framerate and the target layer (non-cumulative)
+  // per-frame-bandwidth, for the rate control encoding stats below.
+  for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
+    unsigned int i = sl * ts_number_layers;
+    rc->layer_framerate[0] = framerate / ts_rate_decimator[0];
+    rc->layer_pfb[i] =
+        1000.0 * rc->layer_target_bitrate[i] / rc->layer_framerate[0];
+    for (unsigned int tl = 0; tl < ts_number_layers; ++tl) {
+      i = sl * ts_number_layers + tl;
+      if (tl > 0) {
+        rc->layer_framerate[tl] = framerate / ts_rate_decimator[tl];
+        rc->layer_pfb[i] =
+            1000.0 *
+            (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
+            (rc->layer_framerate[tl] - rc->layer_framerate[tl - 1]);
+      }
+      rc->layer_input_frames[tl] = 0;
+      rc->layer_enc_frames[tl] = 0;
+      rc->layer_encoding_bitrate[i] = 0.0;
+      rc->layer_avg_frame_size[i] = 0.0;
+      rc->layer_avg_rate_mismatch[i] = 0.0;
+    }
+  }
+  rc->window_count = 0;
+  rc->window_size = 15;
+  rc->avg_st_encoding_bitrate = 0.0;
+  rc->variance_st_encoding_bitrate = 0.0;
+}
+
+static void printout_rate_control_summary(struct RateControlMetrics *rc,
+                                          int frame_cnt,
+                                          unsigned int ss_number_layers,
+                                          unsigned int ts_number_layers) {
+  int tot_num_frames = 0;
+  double perc_fluctuation = 0.0;
+  printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
+  printf("Rate control layer stats for %d layer(s):\n\n", ts_number_layers);
+  for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
+    tot_num_frames = 0;
+    for (unsigned int tl = 0; tl < ts_number_layers; ++tl) {
+      unsigned int i = sl * ts_number_layers + tl;
+      const int num_dropped =
+          tl > 0 ? rc->layer_input_frames[tl] - rc->layer_enc_frames[tl]
+                 : rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] - 1;
+      tot_num_frames += rc->layer_input_frames[tl];
+      rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[tl] *
+                                      rc->layer_encoding_bitrate[i] /
+                                      tot_num_frames;
+      rc->layer_avg_frame_size[i] =
+          rc->layer_avg_frame_size[i] / rc->layer_enc_frames[tl];
+      rc->layer_avg_rate_mismatch[i] =
+          100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[tl];
+      printf("For layer#: %d %d \n", sl, tl);
+      printf("Bitrate (target vs actual): %d %f\n", rc->layer_target_bitrate[i],
+             rc->layer_encoding_bitrate[i]);
+      printf("Average frame size (target vs actual): %f %f\n", rc->layer_pfb[i],
+             rc->layer_avg_frame_size[i]);
+      printf("Average rate_mismatch: %f\n", rc->layer_avg_rate_mismatch[i]);
+      printf(
+          "Number of input frames, encoded (non-key) frames, "
+          "and perc dropped frames: %d %d %f\n",
+          rc->layer_input_frames[tl], rc->layer_enc_frames[tl],
+          100.0 * num_dropped / rc->layer_input_frames[tl]);
+      printf("\n");
+    }
+  }
+  rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count;
+  rc->variance_st_encoding_bitrate =
+      rc->variance_st_encoding_bitrate / rc->window_count -
+      (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+  perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) /
+                     rc->avg_st_encoding_bitrate;
+  printf("Short-time stats, for window of %d frames:\n", rc->window_size);
+  printf("Average, rms-variance, and percent-fluct: %f %f %f\n",
+         rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate),
+         perc_fluctuation);
+  if (frame_cnt - 1 != tot_num_frames)
+    die("Error: Number of input frames not equal to output!\n");
+}
+
+// Layer pattern configuration.
+static int set_layer_pattern(int layering_mode, int superframe_cnt,
+                             aom_svc_layer_id_t *layer_id,
+                             aom_svc_ref_frame_config_t *ref_frame_config,
+                             int *use_svc_control, int spatial_layer_id,
+                             int is_key_frame, int ksvc_mode) {
+  int i;
+  int shift = (layering_mode == 7) ? 2 : 0;
+  *use_svc_control = 1;
+  layer_id->spatial_layer_id = spatial_layer_id;
+  // Set the reference map buffer idx for the 7 references:
+  // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+  // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i;
+  for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0;
+  for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+  // Note for this layered patterns only use LAST and GF for prediction in
+  // non-rd mode (speed >= 7).
+  int layer_flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                    AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+                    AOM_EFLAG_NO_REF_ARF2;
+  if (ksvc_mode) {
+    // Same pattern as case 8.
+    layering_mode = 8;
+    if (!is_key_frame)
+      // No inter-layer prediction on inter-frames.
+      layer_flags |= AOM_EFLAG_NO_REF_GF;
+  }
+  switch (layering_mode) {
+    case 0:
+      // 1-layer: update LAST on every frame, reference LAST and GF.
+      layer_id->temporal_layer_id = 0;
+      ref_frame_config->refresh[0] = 1;
+      break;
+    case 1:
+      // 2-temporal layer.
+      //    1    3    5
+      //  0    2    4
+      if (superframe_cnt % 2 == 0) {
+        layer_id->temporal_layer_id = 0;
+        // Update LAST on layer 0, reference LAST and GF.
+        ref_frame_config->refresh[0] = 1;
+      } else {
+        layer_id->temporal_layer_id = 1;
+        // No updates on layer 1, only reference LAST (TL0).
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      }
+      break;
+    case 2:
+      // 3-temporal layer:
+      //   1    3   5    7
+      //     2        6
+      // 0        4        8
+      if (superframe_cnt % 4 == 0) {
+        // Base layer.
+        layer_id->temporal_layer_id = 0;
+        // Update LAST on layer 0, reference LAST and GF.
+        ref_frame_config->refresh[0] = 1;
+      } else if ((superframe_cnt - 1) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // First top layer: no updates, only reference LAST (TL0).
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      } else if ((superframe_cnt - 2) % 4 == 0) {
+        layer_id->temporal_layer_id = 1;
+        // Middle layer (TL1): update LAST2, only reference LAST (TL0).
+        ref_frame_config->refresh[1] = 1;
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      } else if ((superframe_cnt - 3) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // Second top layer: no updates, only reference LAST.
+        // Set buffer idx for LAST to slot 1, since that was the slot
+        // updated in previous frame. So LAST is TL1 frame.
+        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->ref_idx[1] = 0;
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      }
+      break;
+    case 3:
+      // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will
+      // only reference GF (not LAST). Other frames only reference LAST.
+      //   1    3   5    7
+      //     2        6
+      // 0        4        8
+      if (superframe_cnt % 4 == 0) {
+        // Base layer.
+        layer_id->temporal_layer_id = 0;
+        // Update LAST on layer 0, only reference LAST.
+        ref_frame_config->refresh[0] = 1;
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      } else if ((superframe_cnt - 1) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // First top layer: no updates, only reference LAST (TL0).
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      } else if ((superframe_cnt - 2) % 4 == 0) {
+        layer_id->temporal_layer_id = 1;
+        // Middle layer (TL1): update GF, only reference LAST (TL0).
+        ref_frame_config->refresh[3] = 1;
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      } else if ((superframe_cnt - 3) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // Second top layer: no updates, only reference GF.
+        layer_flags |= AOM_EFLAG_NO_REF_LAST;
+      }
+      break;
+    case 4:
+      // 2-temporal layer with the old update flags, not with the new
+      // SVC control.
+      *use_svc_control = 0;
+      //    1    3    5
+      //  0    2    4
+      if (superframe_cnt % 2 == 0) {
+        layer_id->temporal_layer_id = 0;
+        // Update LAST on layer 0, reference LAST and GF.
+        layer_flags |= AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+      } else {
+        layer_id->temporal_layer_id = 1;
+        // No updates on layer 1, only reference LAST (TL0).
+        layer_flags |= AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                       AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_REF_GF;
+      }
+      break;
+    case 5:
+      // 2 spatial layers, 1 temporal.
+      layer_id->temporal_layer_id = 0;
+      if (layer_id->spatial_layer_id == 0) {
+        // Reference LAST, update LAST.
+        ref_frame_config->refresh[0] = 1;
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
+        // and GOLDEN to slot 0. Update slot 1 (LAST).
+        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->ref_idx[3] = 0;
+        ref_frame_config->refresh[1] = 1;
+      }
+      break;
+    case 6:
+      // 3 spatial layers, 1 temporal.
+      // Note for this case, we set the buffer idx for all references to be
+      // either LAST or GOLDEN, which are always valid references, since decoder
+      // will check if any of the 7 references is valid scale in
+      // valid_ref_frame_size().
+      layer_id->temporal_layer_id = 0;
+      if (layer_id->spatial_layer_id == 0) {
+        // Reference LAST, update LAST. Set all buffer_idx to 0.
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->refresh[0] = 1;
+        layer_flags |= AOM_EFLAG_NO_REF_GF;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
+        // and GOLDEN (and all other refs) to slot 0.
+        // Update slot 1 (LAST).
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->refresh[1] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2
+        // and GOLDEN (and all other refs) to slot 1.
+        // Update slot 2 (LAST).
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 1;
+        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->refresh[2] = 1;
+      }
+      break;
+    case 7:
+      // 3 spatial and 3 temporal layer.
+      // Same as case 8 but overlap in the buffer slot updates.
+      // (shift = 2). The slots 3 and 4 updated by first TL2 are
+      // reused for update in TL1 superframe.
+      // Note for this case, frame order hint must be disabled for
+      // lower resolutions (operating points > 0) to be decodable.
+    case 8:
+      // 3 spatial and 3 temporal layer.
+      // No overlap in buffer updates between TL2 and TL1.
+      // TL2 updates slot 3 and 4, TL1 updates 5, 6, 7.
+      // Set the references via the svc_ref_frame_config control.
+      layer_flags = 0;
+      // Always reference LAST.
+      ref_frame_config->reference[0] = 1;
+      if (superframe_cnt % 4 == 0) {
+        // Base temporal layer.
+        layer_id->temporal_layer_id = 0;
+        if (layer_id->spatial_layer_id == 0) {
+          // Reference LAST, update LAST.
+          // Set all buffer_idx to 0.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->refresh[0] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+          // GOLDEN (and all other refs) to slot 0.
+          // Update slot 1 (LAST).
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[0] = 1;
+          ref_frame_config->refresh[1] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+          // GOLDEN (and all other refs) to slot 1.
+          // Update slot 2 (LAST).
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 1;
+          ref_frame_config->ref_idx[0] = 2;
+          ref_frame_config->refresh[2] = 1;
+        }
+      } else if ((superframe_cnt - 1) % 4 == 0) {
+        // First top temporal enhancement layer.
+        layer_id->temporal_layer_id = 2;
+        if (layer_id->spatial_layer_id == 0) {
+          // Reference LAST (slot 0).
+          // Set GOLDEN to slot 3 and update slot 3.
+          // Set all other buffer_idx to slot 0.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->refresh[3] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+          // GOLDEN (and all other refs) to slot 3.
+          // Set LAST2 to slot 4 and Update slot 4.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 3;
+          ref_frame_config->ref_idx[0] = 1;
+          ref_frame_config->ref_idx[1] = 4;
+          ref_frame_config->refresh[4] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+          // GOLDEN (and all other refs) to slot 4.
+          // No update.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 4;
+          ref_frame_config->ref_idx[0] = 2;
+        }
+      } else if ((superframe_cnt - 2) % 4 == 0) {
+        // Middle temporal enhancement layer.
+        layer_id->temporal_layer_id = 1;
+        if (layer_id->spatial_layer_id == 0) {
+          // Reference LAST.
+          // Set all buffer_idx to 0.
+          // Set GOLDEN to slot 5 and update slot 5.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[3] = 5 - shift;
+          ref_frame_config->refresh[5 - shift] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+          // GOLDEN (and all other refs) to slot 5.
+          // Set LAST2 to slot 6 and update slot 6.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 5 - shift;
+          ref_frame_config->ref_idx[0] = 1;
+          ref_frame_config->ref_idx[2] = 6 - shift;
+          ref_frame_config->refresh[6 - shift] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+          // GOLDEN (and all other refs) to slot 6.
+          // Set LAST2 to slot 7 and update slot 7.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 6 - shift;
+          ref_frame_config->ref_idx[0] = 2;
+          ref_frame_config->ref_idx[2] = 7 - shift;
+          ref_frame_config->refresh[7 - shift] = 1;
+        }
+      } else if ((superframe_cnt - 3) % 4 == 0) {
+        // Second top temporal enhancement layer.
+        layer_id->temporal_layer_id = 2;
+        if (layer_id->spatial_layer_id == 0) {
+          // Set LAST to slot 5 and reference LAST.
+          // Set GOLDEN to slot 3 and update slot 3.
+          // Set all other buffer_idx to 0.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[0] = 5 - shift;
+          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->refresh[3] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+          // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[0] = 6 - shift;
+          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->ref_idx[1] = 4;
+          ref_frame_config->refresh[4] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+          // GOLDEN to slot 4. No update.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[0] = 7 - shift;
+          ref_frame_config->ref_idx[3] = 4;
+        }
+      }
+      if (layer_id->spatial_layer_id > 0)
+        ref_frame_config->reference[3] = 1;  // Reference GOLDEN.
+      break;
+    default: assert(0); die("Error: Unsupported temporal layering mode!\n");
+  }
+  return layer_flags;
+}
+
+int main(int argc, char **argv) {
+  AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL };
+  aom_codec_ctx_t codec;
+  aom_codec_enc_cfg_t cfg;
+  int frame_cnt = 0;
+  aom_image_t raw;
+  aom_codec_err_t res;
+  unsigned int width;
+  unsigned int height;
+  uint32_t error_resilient = 0;
+  int speed;
+  int frame_avail;
+  int got_data = 0;
+  int flags = 0;
+  unsigned i;
+  int pts = 0;             // PTS starts at 0.
+  int frame_duration = 1;  // 1 timebase tick per frame.
+  int layering_mode = 0;
+  aom_svc_layer_id_t layer_id;
+  aom_svc_params_t svc_params;
+  aom_svc_ref_frame_config_t ref_frame_config;
+  const AvxInterface *encoder = NULL;
+  struct AvxInputContext input_ctx;
+  struct RateControlMetrics rc;
+  int64_t cx_time = 0;
+  const int min_args_base = 13;
+  const int min_args = min_args_base;
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate = 30.0;
+  int use_svc_control = 1;
+  zero(rc.layer_target_bitrate);
+  memset(&layer_id, 0, sizeof(aom_svc_layer_id_t));
+  memset(&input_ctx, 0, sizeof(input_ctx));
+  memset(&svc_params, 0, sizeof(svc_params));
+
+  // Flag to test dynamic scaling of source frames for single
+  // spatial stream, using the scaling_mode control.
+  const int test_dynamic_scaling_single_layer = 0;
+
+  /* Setup default input stream settings */
+  input_ctx.framerate.numerator = 30;
+  input_ctx.framerate.denominator = 1;
+  input_ctx.only_i420 = 1;
+  input_ctx.bit_depth = 0;
+  unsigned int ts_number_layers = 1;
+  unsigned int ss_number_layers = 1;
+  exec_name = argv[0];
+  // Check usage and arguments.
+  if (argc < min_args) {
+    die("Usage: %s <infile> <outfile> <codec_type(av1)> <width> <height> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> "
+        "<error_resilient> <threads> <mode> "
+        "<Rate_0> ... <Rate_nlayers-1>\n",
+        argv[0]);
+  }
+
+  encoder = get_aom_encoder_by_name(argv[3]);
+
+  width = (unsigned int)strtoul(argv[4], NULL, 0);
+  height = (unsigned int)strtoul(argv[5], NULL, 0);
+  if (width < 16 || width % 2 || height < 16 || height % 2) {
+    die("Invalid resolution: %d x %d", width, height);
+  }
+
+  layering_mode = (int)strtol(argv[12], NULL, 0);
+  if (layering_mode < 0 || layering_mode > 13) {
+    die("Invalid layering mode (0..12) %s", argv[12]);
+  }
+
+  if (argc != min_args + mode_to_num_layers[layering_mode]) {
+    die("Invalid number of arguments");
+  }
+
+  ts_number_layers = mode_to_num_temporal_layers[layering_mode];
+  ss_number_layers = mode_to_num_spatial_layers[layering_mode];
+
+  input_ctx.filename = argv[1];
+  open_input_file(&input_ctx, 0);
+
+  // Y4M reader has its own allocation.
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, width, height, 32)) {
+      die("Failed to allocate image", width, height);
+    }
+  }
+
+  // Populate encoder configuration.
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) {
+    printf("Failed to get config: %s\n", aom_codec_err_to_string(res));
+    return EXIT_FAILURE;
+  }
+
+  // Update the default configuration with our settings.
+  cfg.g_w = width;
+  cfg.g_h = height;
+
+  // Timebase format e.g. 30fps: numerator=1, demoninator = 30.
+  cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0);
+  cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0);
+
+  speed = (int)strtol(argv[8], NULL, 0);
+  if (speed < 0 || speed > 8) {
+    die("Invalid speed setting: must be positive");
+  }
+
+  for (i = min_args_base;
+       (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
+    rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0);
+    svc_params.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
+  }
+
+  cfg.rc_target_bitrate =
+      svc_params.layer_target_bitrate[ss_number_layers * ts_number_layers - 1];
+
+  svc_params.framerate_factor[0] = 1;
+  if (ts_number_layers == 2) {
+    svc_params.framerate_factor[0] = 2;
+    svc_params.framerate_factor[1] = 1;
+  } else if (ts_number_layers == 3) {
+    svc_params.framerate_factor[0] = 4;
+    svc_params.framerate_factor[1] = 2;
+    svc_params.framerate_factor[2] = 1;
+  }
+
+  // Real time parameters.
+  cfg.g_usage = AOM_USAGE_REALTIME;
+
+  cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0);
+  cfg.rc_end_usage = AOM_CBR;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 52;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
+  cfg.rc_buf_initial_sz = 600;
+  cfg.rc_buf_optimal_sz = 600;
+  cfg.rc_buf_sz = 1000;
+
+  // Use 1 thread as default.
+  cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0);
+
+  error_resilient = (uint32_t)strtoul(argv[10], NULL, 0);
+  if (error_resilient != 0 && error_resilient != 1) {
+    die("Invalid value for error resilient (0, 1): %d.", error_resilient);
+  }
+  // Enable error resilient mode.
+  cfg.g_error_resilient = error_resilient;
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = AOM_KF_AUTO;
+
+  // Disable automatic keyframe placement.
+  cfg.kf_min_dist = cfg.kf_max_dist = 3000;
+
+  framerate = cfg.g_timebase.den / cfg.g_timebase.num;
+  set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers);
+
+  if (input_ctx.file_type == FILE_TYPE_Y4M) {
+    if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) {
+      die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h);
+    }
+    if (input_ctx.framerate.numerator != cfg.g_timebase.den ||
+        input_ctx.framerate.denominator != cfg.g_timebase.num) {
+      die("Incorrect framerate: numerator %d denominator %d",
+          cfg.g_timebase.num, cfg.g_timebase.den);
+    }
+  }
+
+  // Open an output file for each stream.
+  for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
+    for (unsigned tl = 0; tl < ts_number_layers; ++tl) {
+      i = sl * ts_number_layers + tl;
+      char file_name[PATH_MAX];
+      AvxVideoInfo info;
+      info.codec_fourcc = encoder->fourcc;
+      info.frame_width = cfg.g_w;
+      info.frame_height = cfg.g_h;
+      info.time_base.numerator = cfg.g_timebase.num;
+      info.time_base.denominator = cfg.g_timebase.den;
+
+      snprintf(file_name, sizeof(file_name), "%s_%d.av1", argv[2], i);
+      outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info);
+      if (!outfile[i]) die("Failed to open %s for writing", file_name);
+      assert(outfile[i] != NULL);
+    }
+  }
+
+  // Initialize codec.
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  aom_codec_control(&codec, AOME_SET_CPUUSED, speed);
+  aom_codec_control(&codec, AV1E_SET_AQ_MODE, 3);
+  aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0);
+  aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0);
+
+  svc_params.number_spatial_layers = ss_number_layers;
+  svc_params.number_temporal_layers = ts_number_layers;
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
+    svc_params.max_quantizers[i] = cfg.rc_max_quantizer;
+    svc_params.min_quantizers[i] = cfg.rc_min_quantizer;
+  }
+  for (i = 0; i < ss_number_layers; ++i) {
+    svc_params.scaling_factor_num[i] = 1;
+    svc_params.scaling_factor_den[i] = 1;
+  }
+  if (ss_number_layers == 2) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 2;
+  } else if (ss_number_layers == 3) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 4;
+    svc_params.scaling_factor_num[1] = 1;
+    svc_params.scaling_factor_den[1] = 2;
+  }
+
+  aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params);
+
+  // This controls the maximum target size of the key frame.
+  // For generating smaller key frames, use a smaller max_intra_size_pct
+  // value, like 100 or 200.
+  {
+    const int max_intra_size_pct = 300;
+    aom_codec_control(&codec, AOME_SET_MAX_INTRA_BITRATE_PCT,
+                      max_intra_size_pct);
+  }
+
+  frame_avail = 1;
+  while (frame_avail || got_data) {
+    struct aom_usec_timer timer;
+    frame_avail = read_frame(&input_ctx, &raw);
+    int is_key_frame = (frame_cnt % cfg.kf_max_dist) == 0;
+    // Loop over spatial layers.
+    for (unsigned int slx = 0; slx < ss_number_layers; slx++) {
+      aom_codec_iter_t iter = NULL;
+      const aom_codec_cx_pkt_t *pkt;
+      int layer = 0;
+
+      // Set the reference/update flags, layer_id, and reference_map
+      // buffer index.
+      flags = set_layer_pattern(layering_mode, frame_cnt, &layer_id,
+                                &ref_frame_config, &use_svc_control, slx,
+                                is_key_frame, (layering_mode == 9));
+      aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
+      if (use_svc_control)
+        aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
+                          &ref_frame_config);
+
+      layer = slx * ts_number_layers + layer_id.temporal_layer_id;
+      if (frame_avail && slx == 0) ++rc.layer_input_frames[layer];
+
+      if (test_dynamic_scaling_single_layer) {
+        if (frame_cnt >= 200 && frame_cnt <= 400) {
+          // Scale source down by 2x2.
+          struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        } else {
+          // Source back up to original resolution (no scaling).
+          struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        }
+      }
+
+      // Do the layer encode.
+      aom_usec_timer_start(&timer);
+      if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags))
+        die_codec(&codec, "Failed to encode frame");
+      aom_usec_timer_mark(&timer);
+      cx_time += aom_usec_timer_elapsed(&timer);
+
+      got_data = 0;
+      while ((pkt = aom_codec_get_cx_data(&codec, &iter))) {
+        got_data = 1;
+        switch (pkt->kind) {
+          case AOM_CODEC_CX_FRAME_PKT:
+            for (unsigned int sl = layer_id.spatial_layer_id;
+                 sl < ss_number_layers; ++sl) {
+              for (unsigned tl = layer_id.temporal_layer_id;
+                   tl < ts_number_layers; ++tl) {
+                unsigned int j = sl * ts_number_layers + tl;
+                aom_video_writer_write_frame(outfile[j], pkt->data.frame.buf,
+                                             pkt->data.frame.sz, pts);
+                if (sl == (unsigned int)layer_id.spatial_layer_id)
+                  rc.layer_encoding_bitrate[j] += 8.0 * pkt->data.frame.sz;
+                // Keep count of rate control stats per layer (for non-key).
+                if (tl == (unsigned int)layer_id.temporal_layer_id &&
+                    sl == (unsigned int)layer_id.spatial_layer_id &&
+                    !(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
+                  rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz;
+                  rc.layer_avg_rate_mismatch[j] +=
+                      fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) /
+                      rc.layer_pfb[j];
+                  if (slx == 0) ++rc.layer_enc_frames[tl];
+                }
+              }
+            }
+
+            // Update for short-time encoding bitrate states, for moving window
+            // of size rc->window, shifted by rc->window / 2.
+            // Ignore first window segment, due to key frame.
+            // For spatial layers: only do this for top/highest SL.
+            if (frame_cnt > rc.window_size && slx == ss_number_layers - 1) {
+              sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+              rc.window_size = (rc.window_size <= 0) ? 1 : rc.window_size;
+              if (frame_cnt % rc.window_size == 0) {
+                rc.window_count += 1;
+                rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+                rc.variance_st_encoding_bitrate +=
+                    (sum_bitrate / rc.window_size) *
+                    (sum_bitrate / rc.window_size);
+                sum_bitrate = 0.0;
+              }
+            }
+            // Second shifted window.
+            if (frame_cnt > rc.window_size + rc.window_size / 2 &&
+                slx == ss_number_layers - 1) {
+              sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+              if (frame_cnt > 2 * rc.window_size &&
+                  frame_cnt % rc.window_size == 0) {
+                rc.window_count += 1;
+                rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+                rc.variance_st_encoding_bitrate +=
+                    (sum_bitrate2 / rc.window_size) *
+                    (sum_bitrate2 / rc.window_size);
+                sum_bitrate2 = 0.0;
+              }
+            }
+            break;
+          default: break;
+        }
+      }
+    }  // loop over spatial layers
+    ++frame_cnt;
+    pts += frame_duration;
+  }
+  close_input_file(&input_ctx);
+  printout_rate_control_summary(&rc, frame_cnt, ss_number_layers,
+                                ts_number_layers);
+  printf("\n");
+  printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f\n",
+         frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000),
+         1000000 * (double)frame_cnt / (double)cx_time);
+
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+  // Try to rewrite the output file headers with the actual frame count.
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i)
+    aom_video_writer_close(outfile[i]);
+
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    aom_img_free(&raw);
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/libaom/test/acm_random.h b/libaom/test/acm_random.h
index a14b671..8b1d51a 100644
--- a/libaom/test/acm_random.h
+++ b/libaom/test/acm_random.h
@@ -67,7 +67,7 @@
     // Returns a random value near 0 or near 255, to better exercise
     // saturation behavior.
     const uint8_t r = Rand8();
-    return r < 128 ? r << 4 : r >> 4;
+    return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4);
   }
 
   int PseudoUniform(int range) { return random_.Generate(range); }
diff --git a/libaom/test/altref_test.cc b/libaom/test/altref_test.cc
index dabb147..43df39f 100644
--- a/libaom/test/altref_test.cc
+++ b/libaom/test/altref_test.cc
@@ -50,7 +50,8 @@
 
   virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
     if (frame_num_ == forced_kf_frame_num_) {
-      ASSERT_TRUE(!!(pkt->data.frame.flags & AOM_FRAME_IS_KEY))
+      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY,
+                static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY))
           << "Frame #" << frame_num_ << " isn't a keyframe!";
     }
     ++frame_num_;
diff --git a/libaom/test/aom_integer_test.cc b/libaom/test/aom_integer_test.cc
index fe88a54..d5dfad9 100644
--- a/libaom/test/aom_integer_test.cc
+++ b/libaom/test/aom_integer_test.cc
@@ -20,9 +20,9 @@
 const uint32_t kSizeTestExpectedSizes[kSizeTestNumValues] = {
   1, 1, 2, 3, 4, 5
 };
-const uint64_t kSizeTestInputs[kSizeTestNumValues] = {
-  0, 0x7f, 0x3fff, 0x1fffff, 0xffffff, 0x10000000
-};
+const uint64_t kSizeTestInputs[kSizeTestNumValues] = { 0,        0x7f,
+                                                       0x3fff,   0x1fffff,
+                                                       0xffffff, 0x10000000 };
 
 const uint8_t kOutOfRangeLeb128Value[5] = { 0x80, 0x80, 0x80, 0x80,
                                             0x10 };  // UINT32_MAX + 1
diff --git a/libaom/test/aq_segment_test.cc b/libaom/test/aq_segment_test.cc
index 51557a5..83bfdb6 100644
--- a/libaom/test/aq_segment_test.cc
+++ b/libaom/test/aq_segment_test.cc
@@ -20,7 +20,8 @@
 namespace {
 
 class AqSegmentTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
       public ::libaom_test::EncoderTest {
  protected:
   AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
@@ -65,25 +66,13 @@
   int deltaq_mode_;
 };
 
-// Validate that this AQ segmentation mode (AQ=1, variance_ap)
-// encodes and decodes without a mismatch.
-TEST_P(AqSegmentTest, TestNoMisMatchAQ1) { DoTest(1); }
-
-// Validate that this AQ segmentation mode (AQ=2, complexity_aq)
-// encodes and decodes without a mismatch.
-TEST_P(AqSegmentTest, TestNoMisMatchAQ2) { DoTest(2); }
-
-// Validate that this AQ segmentation mode (AQ=3, cyclic_refresh_aq)
-// encodes and decodes without a mismatch.
-TEST_P(AqSegmentTest, TestNoMisMatchAQ3) { DoTest(3); }
+// Validate that this AQ segmentation mode (1-variance_aq, 2-complexity_aq,
+// 3-cyclic_refresh_aq) encodes and decodes without a mismatch.
+TEST_P(AqSegmentTest, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
 
 class AqSegmentTestLarge : public AqSegmentTest {};
 
-TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ1) { DoTest(1); }
-
-TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ2) { DoTest(2); }
-
-TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ3) { DoTest(3); }
+TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
 
 // Validate that this delta q mode
 // encodes and decodes without a mismatch.
@@ -100,9 +89,9 @@
 AV1_INSTANTIATE_TEST_CASE(AqSegmentTest,
                           ::testing::Values(::libaom_test::kRealTime,
                                             ::libaom_test::kOnePassGood),
-                          ::testing::Range(5, 9));
+                          ::testing::Range(5, 9), ::testing::Range(0, 4));
 AV1_INSTANTIATE_TEST_CASE(AqSegmentTestLarge,
                           ::testing::Values(::libaom_test::kRealTime,
                                             ::libaom_test::kOnePassGood),
-                          ::testing::Range(3, 5));
+                          ::testing::Range(3, 5), ::testing::Range(0, 4));
 }  // namespace
diff --git a/libaom/test/arf_freq_test.cc b/libaom/test/arf_freq_test.cc
index 50b478b..0780cd7 100644
--- a/libaom/test/arf_freq_test.cc
+++ b/libaom/test/arf_freq_test.cc
@@ -214,7 +214,7 @@
 // BWDREF_FRAME is also a non-show frame, and the minimum run between two
 // consecutive BWDREF_FRAME's may vary between 1 and any arbitrary positive
 // number as long as it does not exceed the gf_group interval.
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DISABLED_AV1, ArfFreqTestLarge,
     ::testing::Combine(
         ::testing::Values(
diff --git a/libaom/test/onyxc_int_test.cc b/libaom/test/av1_common_int_test.cc
similarity index 91%
rename from libaom/test/onyxc_int_test.cc
rename to libaom/test/av1_common_int_test.cc
index 3889595..dde2542 100644
--- a/libaom/test/onyxc_int_test.cc
+++ b/libaom/test/av1_common_int_test.cc
@@ -11,9 +11,9 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
 
-TEST(OnyxcInt, TestGetTxSize) {
+TEST(AV1CommonInt, TestGetTxSize) {
   for (int t = TX_4X4; t < TX_SIZES_ALL; t++) {
     TX_SIZE t2 = get_tx_size(tx_size_wide[t], tx_size_high[t]);
     GTEST_ASSERT_EQ(tx_size_wide[t], tx_size_wide[t2]);
diff --git a/libaom/test/av1_config_test.cc b/libaom/test/av1_config_test.cc
index e2f2c53..fca980f 100644
--- a/libaom/test/av1_config_test.cc
+++ b/libaom/test/av1_config_test.cc
@@ -24,20 +24,20 @@
 // Sequence Header OBUs vs Sequence Header OBUs with the
 // reduced_still_image_flag set).
 //
-const uint8_t kAnnexBFullSequenceHeaderObu[] = {
-  0x0c, 0x08, 0x00, 0x00, 0x00, 0x04, 0x45, 0x7e, 0x3e, 0xff, 0xfc, 0xc0, 0x20
-};
+const uint8_t kAnnexBFullSequenceHeaderObu[] = { 0x0c, 0x08, 0x00, 0x00, 0x00,
+                                                 0x04, 0x45, 0x7e, 0x3e, 0xff,
+                                                 0xfc, 0xc0, 0x20 };
 const uint8_t kAnnexBReducedStillImageSequenceHeaderObu[] = {
   0x08, 0x08, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20
 };
 
-const uint8_t kLobfFullSequenceHeaderObu[] = {
-  0x0a, 0x0b, 0x00, 0x00, 0x00, 0x04, 0x45, 0x7e, 0x3e, 0xff, 0xfc, 0xc0, 0x20
-};
+const uint8_t kLobfFullSequenceHeaderObu[] = { 0x0a, 0x0b, 0x00, 0x00, 0x00,
+                                               0x04, 0x45, 0x7e, 0x3e, 0xff,
+                                               0xfc, 0xc0, 0x20 };
 
-const uint8_t kLobfReducedStillImageSequenceHeaderObu[] = {
-  0x0a, 0x07, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20
-};
+const uint8_t kLobfReducedStillImageSequenceHeaderObu[] = { 0x0a, 0x07, 0x18,
+                                                            0x22, 0x2b, 0xf1,
+                                                            0xfe, 0xc0, 0x20 };
 
 const uint8_t kAv1cAllZero[] = { 0, 0, 0, 0 };
 
diff --git a/libaom/test/av1_convolve_2d_test.cc b/libaom/test/av1_convolve_2d_test.cc
index b0cef81..50a58f0 100644
--- a/libaom/test/av1_convolve_2d_test.cc
+++ b/libaom/test/av1_convolve_2d_test.cc
@@ -9,16 +9,20 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/av1_convolve_2d_test_util.h"
 
 using libaom_test::ACMRandom;
 using libaom_test::AV1Convolve2D::AV1Convolve2DSrTest;
 using libaom_test::AV1Convolve2D::AV1JntConvolve2DTest;
+#if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest;
 using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest;
-using ::testing::make_tuple;
-using ::testing::tuple;
+#endif
+using std::make_tuple;
+using std::tuple;
 
 namespace {
 
@@ -26,181 +30,182 @@
 
 TEST_P(AV1Convolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C_COPY, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_copy_sr_c, 0, 0));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C_X, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_c, 1, 0));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C_Y, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_c, 0, 1));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_c, 1, 1));
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1Convolve2DSrTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_convolve_2d_copy_sr_sse2, 0, 0));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1Convolve2DSrTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_convolve_2d_copy_sr_sse2, 0, 0));
+INSTANTIATE_TEST_SUITE_P(
     SSE2_X, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_sse2, 1, 0));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2_Y, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_sse2, 0, 1));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_sse2, 1, 1));
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1Convolve2DSrTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_convolve_2d_copy_sr_avx2, 0, 0));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1Convolve2DSrTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_convolve_2d_copy_sr_avx2, 0, 0));
+INSTANTIATE_TEST_SUITE_P(
     AVX2_X, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_avx2, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2_Y, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_avx2, 0, 1));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1));
 #endif  // HAVE_AVX2
 #endif  // HAVE_SSE2
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON_X, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_neon, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON_Y, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_neon, 0, 1));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, AV1Convolve2DSrTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_neon, 1, 1));
 
-INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1Convolve2DSrTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_convolve_2d_copy_sr_neon, 0, 0));
+INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1Convolve2DSrTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_convolve_2d_copy_sr_neon, 0, 0));
 #endif  // HAVE_NEON
 
 TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
 TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
 
-INSTANTIATE_TEST_CASE_P(C_COPY, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_2d_copy_c, 0, 0));
+INSTANTIATE_TEST_SUITE_P(C_COPY, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_2d_copy_c, 0, 0));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C_X, AV1JntConvolve2DTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_x_c, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C_Y, AV1JntConvolve2DTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_y_c, 0, 1));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_2d_copy_sse2, 0, 0));
-INSTANTIATE_TEST_CASE_P(SSE2, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_2d_sse2, 1, 1));
+INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_2d_copy_sse2, 0, 0));
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_2d_sse2, 1, 1));
 
-INSTANTIATE_TEST_CASE_P(SSE2_X, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_x_sse2, 1, 0));
+INSTANTIATE_TEST_SUITE_P(SSE2_X, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_x_sse2, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(SSE2_Y, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_y_sse2, 0, 1));
+INSTANTIATE_TEST_SUITE_P(SSE2_Y, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_y_sse2, 0, 1));
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_2d_ssse3, 1, 1));
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_2d_ssse3, 1, 1));
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_2d_copy_avx2, 0, 0));
-INSTANTIATE_TEST_CASE_P(AVX2_X, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_x_avx2, 1, 0));
+INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_2d_copy_avx2, 0, 0));
+INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_x_avx2, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_y_avx2, 0, 1));
+INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_y_avx2, 0, 1));
 
-INSTANTIATE_TEST_CASE_P(AVX2, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_2d_avx2, 1, 1));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_2d_avx2, 1, 1));
 #endif  // HAVE_AVX2
 #endif  // HAVE_SSSE3
 #endif  // HAVE_SSE2
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_2d_copy_neon, 0, 0));
+INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_2d_copy_neon, 0, 0));
 
-INSTANTIATE_TEST_CASE_P(NEON, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_2d_neon, 1, 1));
-INSTANTIATE_TEST_CASE_P(NEON_X, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_x_neon, 1, 0));
+INSTANTIATE_TEST_SUITE_P(NEON, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_2d_neon, 1, 1));
+INSTANTIATE_TEST_SUITE_P(NEON_X, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_x_neon, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(NEON_Y, AV1JntConvolve2DTest,
-                        libaom_test::AV1Convolve2D::BuildParams(
-                            av1_dist_wtd_convolve_y_neon, 0, 1));
+INSTANTIATE_TEST_SUITE_P(NEON_Y, AV1JntConvolve2DTest,
+                         libaom_test::AV1Convolve2D::BuildParams(
+                             av1_dist_wtd_convolve_y_neon, 0, 1));
 #endif  // HAVE_NEON
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
 TEST_P(AV1HighbdConvolve2DSrTest, DISABLED_Speed) {
   RunSpeedTest(GET_PARAM(1));
 }
 
-INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_x_sr_c, 1, 0));
+INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_x_sr_c, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_y_sr_c, 0, 1));
+INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_y_sr_c, 0, 1));
 
-INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_2d_copy_sr_c, 0, 0));
+INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_2d_copy_sr_c, 0, 0));
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_2d_copy_sr_sse2, 0, 0));
+INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_2d_copy_sr_sse2, 0, 0));
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_2d_sr_ssse3, 1, 1));
-INSTANTIATE_TEST_CASE_P(SSSE3_X, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_x_sr_ssse3, 1, 0));
-INSTANTIATE_TEST_CASE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_y_sr_ssse3, 0, 1));
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_2d_sr_ssse3, 1, 1));
+INSTANTIATE_TEST_SUITE_P(SSSE3_X, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_x_sr_ssse3, 1, 0));
+INSTANTIATE_TEST_SUITE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_y_sr_ssse3, 0, 1));
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_2d_sr_avx2, 1, 1));
-INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_x_sr_avx2, 1, 0));
-INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_y_sr_avx2, 0, 1));
-INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdConvolve2DSrTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_2d_copy_sr_avx2, 0, 0));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_2d_sr_avx2, 1, 1));
+INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_x_sr_avx2, 1, 0));
+INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_y_sr_avx2, 0, 1));
+INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdConvolve2DSrTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_convolve_2d_copy_sr_avx2, 0, 0));
 #endif  // HAVE_AVX2
 #endif  // HAVE_SSSE3
 #endif  // HAVE_SSE2
@@ -212,43 +217,45 @@
   RunSpeedTest(GET_PARAM(1));
 }
 
-INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_x_c, 1, 0));
+INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_x_c, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_y_c, 0, 1));
+INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_y_c, 0, 1));
 
-INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0));
+INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0));
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0, 0));
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1));
-INSTANTIATE_TEST_CASE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0));
-INSTANTIATE_TEST_CASE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0,
+                             0));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0));
+INSTANTIATE_TEST_SUITE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1));
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0));
-INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1));
-INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_x_avx2, 1, 0));
-INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdJntConvolve2DTest,
-                        libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_dist_wtd_convolve_y_avx2, 0, 1));
+INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1));
+INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_x_avx2, 1, 0));
+INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdJntConvolve2DTest,
+                         libaom_test::AV1HighbdConvolve2D::BuildParams(
+                             av1_highbd_dist_wtd_convolve_y_avx2, 0, 1));
 #endif  // HAVE_AVX2
 #endif  // HAVE_SSE4_1
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/av1_convolve_2d_test_util.cc b/libaom/test/av1_convolve_2d_test_util.cc
index 9cfe3e6..6f103d3 100644
--- a/libaom/test/av1_convolve_2d_test_util.cc
+++ b/libaom/test/av1_convolve_2d_test_util.cc
@@ -15,8 +15,8 @@
 #include "av1/common/common_data.h"
 #include "av1/common/convolve.h"
 
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 namespace libaom_test {
 
@@ -51,7 +51,7 @@
   for (int i = 0; i < h; ++i)
     for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
   for (int i = 0; i < MAX_SB_SQUARE; ++i)
-    output[i] = output2[i] = rnd_.Rand31();
+    output[i] = output2[i] = static_cast<uint8_t>(rnd_.Rand31());
 
   // Make sure that sizes 2xN and Nx2 are also tested for chroma.
   const int num_sizes =
@@ -355,6 +355,7 @@
 }
 }  // namespace AV1Convolve2D
 
+#if CONFIG_AV1_HIGHBITDEPTH
 namespace AV1HighbdConvolve2D {
 ::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
     highbd_convolve_2d_func filter, int has_subx, int has_suby) {
@@ -445,7 +446,7 @@
     for (int j = 0; j < w; ++j)
       input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
   for (int i = 0; i < MAX_SB_SQUARE; ++i)
-    output[i] = output2[i] = rnd_.Rand31();
+    output[i] = output2[i] = static_cast<int16_t>(rnd_.Rand31());
 
   // Make sure that sizes 2xN and Nx2 are also tested for chroma.
   const int num_sizes =
@@ -703,4 +704,5 @@
   }
 }
 }  // namespace AV1HighbdConvolve2D
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace libaom_test
diff --git a/libaom/test/av1_convolve_2d_test_util.h b/libaom/test/av1_convolve_2d_test_util.h
index e0eb584..3c19cfe 100644
--- a/libaom/test/av1_convolve_2d_test_util.h
+++ b/libaom/test/av1_convolve_2d_test_util.h
@@ -12,6 +12,8 @@
 #ifndef AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
 #define AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
 
+#include <tuple>
+
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 
@@ -30,11 +32,10 @@
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const InterpFilterParams *filter_params_y,
-                                 const int subpel_x_q4, const int subpel_y_q4,
+                                 const int subpel_x_qn, const int subpel_y_qn,
                                  ConvolveParams *conv_params);
 
-typedef ::testing::tuple<convolve_2d_func, int, int, BLOCK_SIZE>
-    Convolve2DParam;
+typedef std::tuple<convolve_2d_func, int, int, BLOCK_SIZE> Convolve2DParam;
 
 ::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
     convolve_2d_func filter, int subx_exist, int suby_exist);
@@ -68,14 +69,15 @@
 };
 }  // namespace AV1Convolve2D
 
+#if CONFIG_AV1_HIGHBITDEPTH
 namespace AV1HighbdConvolve2D {
 typedef void (*highbd_convolve_2d_func)(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
-    const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
 
-typedef ::testing::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE>
+typedef std::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE>
     HighbdConvolve2DParam;
 
 ::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
@@ -111,6 +113,7 @@
   libaom_test::ACMRandom rnd_;
 };
 }  // namespace AV1HighbdConvolve2D
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 }  // namespace libaom_test
 
diff --git a/libaom/test/av1_convolve_scale_test.cc b/libaom/test/av1_convolve_scale_test.cc
index a933fc9..ffd0bab 100644
--- a/libaom/test/av1_convolve_scale_test.cc
+++ b/libaom/test/av1_convolve_scale_test.cc
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
 #include <vector>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -33,8 +34,8 @@
 const int kYStepQn = 20;
 
 using libaom_test::ACMRandom;
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP };
 int NTapsToInt(NTaps ntaps) { return 8 + static_cast<int>(ntaps) * 2; }
@@ -269,8 +270,8 @@
 
  protected:
   void SetParams(const BaseParams &params, int bd) {
-    width_ = ::testing::get<0>(params.dims);
-    height_ = ::testing::get<1>(params.dims);
+    width_ = std::get<0>(params.dims);
+    height_ = std::get<1>(params.dims);
     ntaps_x_ = params.ntaps_x;
     ntaps_y_ = params.ntaps_y;
     bd_ = bd;
@@ -454,13 +455,14 @@
 TEST_P(LowBDConvolveScaleTest, Check) { Run(); }
 TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, LowBDConvolveScaleTest,
     ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1),
                        ::testing::ValuesIn(kBlockDim),
                        ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
                        ::testing::Bool()));
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
@@ -520,10 +522,11 @@
 TEST_P(HighBDConvolveScaleTest, Check) { Run(); }
 TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, HighBDConvolveScaleTest,
     ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1),
                        ::testing::ValuesIn(kBlockDim),
                        ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
                        ::testing::Bool(), ::testing::ValuesIn(kBDs)));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/av1_fwd_txfm1d_test.cc b/libaom/test/av1_fwd_txfm1d_test.cc
index 863cb39..abc46ed 100644
--- a/libaom/test/av1_fwd_txfm1d_test.cc
+++ b/libaom/test/av1_fwd_txfm1d_test.cc
@@ -30,11 +30,11 @@
 const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
 
 const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
-  { av1_fdct4_new, av1_fadst4_new, av1_fidentity4_c },
-  { av1_fdct8_new, av1_fadst8_new, av1_fidentity8_c },
-  { av1_fdct16_new, av1_fadst16_new, av1_fidentity16_c },
-  { av1_fdct32_new, NULL, av1_fidentity32_c },
-  { av1_fdct64_new, NULL, NULL },
+  { av1_fdct4, av1_fadst4, av1_fidentity4_c },
+  { av1_fdct8, av1_fadst8, av1_fidentity8_c },
+  { av1_fdct16, av1_fadst16, av1_fidentity16_c },
+  { av1_fdct32, NULL, av1_fidentity32_c },
+  { av1_fdct64, NULL, NULL },
 };
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
diff --git a/libaom/test/av1_fwd_txfm2d_test.cc b/libaom/test/av1_fwd_txfm2d_test.cc
index eb09cb1..dd60665 100644
--- a/libaom/test/av1_fwd_txfm2d_test.cc
+++ b/libaom/test/av1_fwd_txfm2d_test.cc
@@ -12,6 +12,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <tuple>
 #include <vector>
 
 #include "config/av1_rtcd.h"
@@ -32,7 +33,7 @@
 
 namespace {
 // tx_type_, tx_size_, max_error_, max_avg_error_
-typedef ::testing::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam;
+typedef std::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam;
 
 class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
  public:
@@ -196,8 +197,8 @@
   return param_list;
 }
 
-INSTANTIATE_TEST_CASE_P(C, AV1FwdTxfm2d,
-                        ::testing::ValuesIn(GetTxfm2dParamList()));
+INSTANTIATE_TEST_SUITE_P(C, AV1FwdTxfm2d,
+                         ::testing::ValuesIn(GetTxfm2dParamList()));
 
 TEST_P(AV1FwdTxfm2d, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); }
 
@@ -350,7 +351,7 @@
   }
 }
 
-typedef ::testing::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
+typedef std::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
 
 class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
 
@@ -387,9 +388,9 @@
   TX_64X16,
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, AV1FwdTxfm2dTest,
-                        Combine(ValuesIn(fwd_txfm_for_sse2),
-                                Values(av1_lowbd_fwd_txfm_sse2)));
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1FwdTxfm2dTest,
+                         Combine(ValuesIn(fwd_txfm_for_sse2),
+                                 Values(av1_lowbd_fwd_txfm_sse2)));
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE4_1
@@ -400,9 +401,9 @@
   TX_64X32,
 };
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1FwdTxfm2dTest,
-                        Combine(ValuesIn(fwd_txfm_for_sse41),
-                                Values(av1_lowbd_fwd_txfm_sse4_1)));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1FwdTxfm2dTest,
+                         Combine(ValuesIn(fwd_txfm_for_sse41),
+                                 Values(av1_lowbd_fwd_txfm_sse4_1)));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
@@ -412,9 +413,9 @@
   TX_16X4, TX_8X32, TX_32X8,  TX_16X64, TX_64X16,
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, AV1FwdTxfm2dTest,
-                        Combine(ValuesIn(fwd_txfm_for_avx2),
-                                Values(av1_lowbd_fwd_txfm_avx2)));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1FwdTxfm2dTest,
+                         Combine(ValuesIn(fwd_txfm_for_avx2),
+                                 Values(av1_lowbd_fwd_txfm_avx2)));
 #endif  // HAVE_AVX2
 
 typedef void (*Highbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
@@ -543,7 +544,7 @@
   }
 }
 
-typedef ::testing::tuple<TX_SIZE, Highbd_fwd_txfm_func> HighbdFwdTxfm2dParam;
+typedef std::tuple<TX_SIZE, Highbd_fwd_txfm_func> HighbdFwdTxfm2dParam;
 
 class AV1HighbdFwdTxfm2dTest
     : public ::testing::TestWithParam<HighbdFwdTxfm2dParam> {};
@@ -567,16 +568,16 @@
   TX_16X4, TX_8X32, TX_32X8,  TX_16X64, TX_64X16,
 };
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
-                        Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1),
-                                Values(av1_highbd_fwd_txfm)));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
+                         Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1),
+                                 Values(av1_highbd_fwd_txfm)));
 #endif  // HAVE_SSE4_1
 #if HAVE_AVX2
 static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_8X8,   TX_16X16, TX_32X32,
                                               TX_64X64, TX_8X16,  TX_16X8 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdFwdTxfm2dTest,
-                        Combine(ValuesIn(Highbd_fwd_txfm_for_avx2),
-                                Values(av1_highbd_fwd_txfm)));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdFwdTxfm2dTest,
+                         Combine(ValuesIn(Highbd_fwd_txfm_for_avx2),
+                                 Values(av1_highbd_fwd_txfm)));
 #endif  // HAVE_AVX2
 }  // namespace
diff --git a/libaom/test/av1_highbd_iht_test.cc b/libaom/test/av1_highbd_iht_test.cc
index 6d77cbf..8fea500 100644
--- a/libaom/test/av1_highbd_iht_test.cc
+++ b/libaom/test/av1_highbd_iht_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/av1_rtcd.h"
@@ -26,14 +28,31 @@
 namespace {
 
 using libaom_test::ACMRandom;
-using ::testing::tuple;
+using std::tuple;
 
 typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
                           TX_TYPE tx_type, int bd);
 
 typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
                            TX_TYPE tx_type, int bd);
-
+static const char *tx_type_name[] = {
+  "DCT_DCT",
+  "ADST_DCT",
+  "DCT_ADST",
+  "ADST_ADST",
+  "FLIPADST_DCT",
+  "DCT_FLIPADST",
+  "FLIPADST_FLIPADST",
+  "ADST_FLIPADST",
+  "FLIPADST_ADST",
+  "IDTX",
+  "V_DCT",
+  "H_DCT",
+  "V_ADST",
+  "H_ADST",
+  "V_FLIPADST",
+  "H_FLIPADST",
+};
 // Test parameter argument list:
 //   <transform reference function,
 //    optimized inverse transform function,
@@ -138,7 +157,7 @@
 
 TEST_P(AV1HighbdInvHTNxN, InvTransResultCheck) { RunBitexactCheck(); }
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSE4_1
 #define PARAM_LIST_4X4                                   \
@@ -167,27 +186,28 @@
   make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvHTNxN,
-                        ::testing::ValuesIn(kArrayIhtParam));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvHTNxN,
+                         ::testing::ValuesIn(kArrayIhtParam));
 #endif  // HAVE_SSE4_1
 
 typedef void (*HighbdInvTxfm2dFunc)(const int32_t *input, uint8_t *output,
                                     int stride, const TxfmParam *txfm_param);
 
-typedef ::testing::tuple<const HighbdInvTxfm2dFunc> AV1HighbdInvTxfm2dParam;
+typedef std::tuple<const HighbdInvTxfm2dFunc> AV1HighbdInvTxfm2dParam;
 class AV1HighbdInvTxfm2d
     : public ::testing::TestWithParam<AV1HighbdInvTxfm2dParam> {
  public:
   virtual void SetUp() { target_func_ = GET_PARAM(0); }
   void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times,
-                           int bit_depth);
+                           int bit_depth, int gt_int16 = 0);
 
  private:
   HighbdInvTxfm2dFunc target_func_;
 };
 
 void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_,
-                                             int run_times, int bit_depth_) {
+                                             int run_times, int bit_depth_,
+                                             int gt_int16) {
   FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_];
   TxfmParam txfm_param;
   const int BLK_WIDTH = 64;
@@ -234,8 +254,15 @@
       inv_input[scan[i]] = 0;
     }
     txfm_param.eob = eob;
-    aom_usec_timer ref_timer, test_timer;
+    if (gt_int16) {
+      const uint16_t inv_input_mask =
+          static_cast<uint16_t>((1 << (bit_depth_ + 7)) - 1);
+      for (int i = 0; i < eob; i++) {
+        inv_input[scan[i]] = (rnd.Rand31() & inv_input_mask);
+      }
+    }
 
+    aom_usec_timer ref_timer, test_timer;
     aom_usec_timer_start(&ref_timer);
     for (int i = 0; i < run_times; ++i) {
       av1_highbd_inv_txfm_add_c(inv_input, CONVERT_TO_BYTEPTR(ref_output),
@@ -264,7 +291,8 @@
           ASSERT_EQ(ref_output[r * stride + c], output[r * stride + c])
               << "[" << r << "," << c << "] " << cnt
               << " tx_size: " << static_cast<int>(tx_size_)
-              << " tx_type: " << tx_type_ << " eob " << eob;
+              << " bit_depth_: " << bit_depth_
+              << " tx_type: " << tx_type_name[tx_type_] << " eob " << eob;
         }
       }
     }
@@ -272,8 +300,8 @@
 }
 
 TEST_P(AV1HighbdInvTxfm2d, match) {
-  int bitdepth_ar[2] = { 10, 12 };
-  for (int k = 0; k < 2; ++k) {
+  int bitdepth_ar[3] = { 8, 10, 12 };
+  for (int k = 0; k < 3; ++k) {
     int bd = bitdepth_ar[k];
     for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
       for (int i = 0; i < (int)TX_TYPES; ++i) {
@@ -287,6 +315,25 @@
   }
 }
 
+TEST_P(AV1HighbdInvTxfm2d, gt_int16) {
+  int bitdepth_ar[3] = { 8, 10, 12 };
+  static const TX_TYPE types[] = {
+    DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX, V_DCT, H_DCT, H_ADST, H_FLIPADST
+  };
+  for (int k = 0; k < 3; ++k) {
+    int bd = bitdepth_ar[k];
+    for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+      const TX_SIZE sz = static_cast<TX_SIZE>(j);
+      for (uint8_t i = 0; i < sizeof(types) / sizeof(TX_TYPE); ++i) {
+        const TX_TYPE tp = types[i];
+        if (libaom_test::IsTxSizeTypeValid(sz, tp)) {
+          RunAV1InvTxfm2dTest(tp, sz, 1, bd, 1);
+        }
+      }
+    }
+  }
+}
+
 TEST_P(AV1HighbdInvTxfm2d, DISABLED_Speed) {
   int bitdepth_ar[2] = { 10, 12 };
   for (int k = 0; k < 2; ++k) {
@@ -304,13 +351,12 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvTxfm2d,
-                        ::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvTxfm2d,
+                         ::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
 #endif
 
-// TODO(http://crbug.com/aomedia/2350): these cause test vector mismatches.
-#if 0  // HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvTxfm2d,
-                        ::testing::Values(av1_highbd_inv_txfm_add_avx2));
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdInvTxfm2d,
+                         ::testing::Values(av1_highbd_inv_txfm_add_avx2));
 #endif
 }  // namespace
diff --git a/libaom/test/av1_horz_only_frame_superres_test.cc b/libaom/test/av1_horz_only_frame_superres_test.cc
index ffc9136..115fc84 100644
--- a/libaom/test/av1_horz_only_frame_superres_test.cc
+++ b/libaom/test/av1_horz_only_frame_superres_test.cc
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
 #include <vector>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -31,8 +32,8 @@
 const int kHPad = 32;
 
 using libaom_test::ACMRandom;
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 template <typename Pixel>
 class TestImage {
@@ -297,9 +298,10 @@
 TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
 TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, LowBDConvolveHorizRSTest,
-                        ::testing::Values(av1_convolve_horiz_rs_sse4_1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, LowBDConvolveHorizRSTest,
+                         ::testing::Values(av1_convolve_horiz_rs_sse4_1));
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
                                           uint16_t *dst, int dst_stride, int w,
                                           int h, const int16_t *x_filters,
@@ -354,9 +356,10 @@
 TEST_P(HighBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
 TEST_P(HighBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, HighBDConvolveHorizRSTest,
     ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_sse4_1),
                        ::testing::ValuesIn(kBDs)));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 }  // namespace
diff --git a/libaom/test/av1_inv_txfm1d_test.cc b/libaom/test/av1_inv_txfm1d_test.cc
index bf3a44e..01d4a4d 100644
--- a/libaom/test/av1_inv_txfm1d_test.cc
+++ b/libaom/test/av1_inv_txfm1d_test.cc
@@ -16,6 +16,8 @@
 #include "av1/common/av1_inv_txfm1d.h"
 #include "av1/encoder/av1_fwd_txfm1d.h"
 
+typedef TX_SIZE TxSize;
+
 using libaom_test::ACMRandom;
 using libaom_test::input_base;
 
@@ -24,19 +26,15 @@
 const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
 
 const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
-  { av1_fdct4_new, av1_fadst4_new },
-  { av1_fdct8_new, av1_fadst8_new },
-  { av1_fdct16_new, av1_fadst16_new },
-  { av1_fdct32_new, NULL },
-  { av1_fdct64_new, NULL },
+  { av1_fdct4, av1_fadst4 },   { av1_fdct8, av1_fadst8 },
+  { av1_fdct16, av1_fadst16 }, { av1_fdct32, NULL },
+  { av1_fdct64, NULL },
 };
 
 const TxfmFunc inv_txfm_func_ls[][txfm_type_num] = {
-  { av1_idct4_new, av1_iadst4_new },
-  { av1_idct8_new, av1_iadst8_new },
-  { av1_idct16_new, av1_iadst16_new },
-  { av1_idct32_new, NULL },
-  { av1_idct64_new, NULL },
+  { av1_idct4, av1_iadst4 },   { av1_idct8, av1_iadst8 },
+  { av1_idct16, av1_iadst16 }, { av1_idct32, NULL },
+  { av1_idct64, NULL },
 };
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
@@ -77,7 +75,7 @@
   ASSERT_EQ(NELEMENTS(inv_txfm_func_ls), TX_SIZES);
   for (int k = 0; k < count_test_block; ++k) {
     // choose a random transform to test
-    const TX_SIZE tx_size = static_cast<TX_SIZE>(rnd.Rand8() % TX_SIZES);
+    const TxSize tx_size = static_cast<TxSize>(rnd.Rand8() % TX_SIZES);
     const int tx_size_pix = txfm_size_ls[tx_size];
     const TxfmFunc inv_txfm_func = inv_txfm_func_ls[tx_size][0];
 
@@ -88,9 +86,11 @@
     memset(input + 32, 0, 32 * sizeof(input[0]));
 
     int32_t ref_output[64];
+    memset(ref_output, 0, sizeof(ref_output));
     reference_idct_1d_int(input, ref_output, tx_size_pix);
 
     int32_t output[64];
+    memset(output, 0, sizeof(output));
     inv_txfm_func(input, output, cos_bit, range_bit);
 
     for (int i = 0; i < tx_size_pix; ++i) {
diff --git a/libaom/test/av1_inv_txfm2d_test.cc b/libaom/test/av1_inv_txfm2d_test.cc
index 5432130..eacdf85 100644
--- a/libaom/test/av1_inv_txfm2d_test.cc
+++ b/libaom/test/av1_inv_txfm2d_test.cc
@@ -12,6 +12,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <tuple>
 #include <vector>
 
 #include "config/av1_rtcd.h"
@@ -36,11 +37,33 @@
 
 using std::vector;
 
+typedef TX_TYPE TxType;
+typedef TX_SIZE TxSize;
+
 namespace {
 
+static const char *tx_type_name[] = {
+  "DCT_DCT",
+  "ADST_DCT",
+  "DCT_ADST",
+  "ADST_ADST",
+  "FLIPADST_DCT",
+  "DCT_FLIPADST",
+  "FLIPADST_FLIPADST",
+  "ADST_FLIPADST",
+  "FLIPADST_ADST",
+  "IDTX",
+  "V_DCT",
+  "H_DCT",
+  "V_ADST",
+  "H_ADST",
+  "V_FLIPADST",
+  "H_FLIPADST",
+};
+
 // AV1InvTxfm2dParam argument list:
 // tx_type_, tx_size_, max_error_, max_avg_error_
-typedef ::testing::tuple<TX_TYPE, TX_SIZE, int, double> AV1InvTxfm2dParam;
+typedef std::tuple<TxType, TxSize, int, double> AV1InvTxfm2dParam;
 
 class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
  public:
@@ -86,7 +109,7 @@
         }
         double ref_coeffs[64 * 64] = { 0 };
         ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs));
-        ASSERT_EQ(tx_type_, DCT_DCT);
+        ASSERT_EQ(tx_type_, static_cast<TxType>(DCT_DCT));
         libaom_test::reference_hybrid_2d(ref_input, ref_coeffs, tx_type_,
                                          tx_size_);
         DECLARE_ALIGNED(16, int32_t, ref_coeffs_int[64 * 64]) = { 0 };
@@ -139,8 +162,8 @@
 
   int max_error_;
   double max_avg_error_;
-  TX_TYPE tx_type_;
-  TX_SIZE tx_size_;
+  TxType tx_type_;
+  TxSize tx_size_;
 };
 
 static int max_error_ls[TX_SIZES_ALL] = {
@@ -193,8 +216,8 @@
     const int max_error = max_error_ls[s];
     const double avg_error = avg_error_ls[s];
     for (int t = 0; t < TX_TYPES; ++t) {
-      const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
-      const TX_SIZE tx_size = static_cast<TX_SIZE>(s);
+      const TxType tx_type = static_cast<TxType>(t);
+      const TxSize tx_size = static_cast<TxSize>(s);
       if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
         param_list.push_back(
             AV1InvTxfm2dParam(tx_type, tx_size, max_error, avg_error));
@@ -204,8 +227,8 @@
   return param_list;
 }
 
-INSTANTIATE_TEST_CASE_P(C, AV1InvTxfm2d,
-                        ::testing::ValuesIn(GetInvTxfm2dParamList()));
+INSTANTIATE_TEST_SUITE_P(C, AV1InvTxfm2d,
+                         ::testing::ValuesIn(GetInvTxfm2dParamList()));
 
 TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
 
@@ -216,18 +239,18 @@
     int8_t high_range = libaom_test::high_range_arr[bd_idx];
     for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
       for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
-        if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size),
-                                           static_cast<TX_TYPE>(tx_type)) ==
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(tx_size),
+                                           static_cast<TxType>(tx_type)) ==
             false) {
           continue;
         }
         TXFM_2D_FLIP_CFG cfg;
-        av1_get_inv_txfm_cfg(static_cast<TX_TYPE>(tx_type),
-                             static_cast<TX_SIZE>(tx_size), &cfg);
+        av1_get_inv_txfm_cfg(static_cast<TxType>(tx_type),
+                             static_cast<TxSize>(tx_size), &cfg);
         int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
         int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
         av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg,
-                                (TX_SIZE)tx_size, bd);
+                                static_cast<TxSize>(tx_size), bd);
         libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
                                             cfg.cos_bit_col, low_range,
                                             high_range);
@@ -239,18 +262,19 @@
   }
 }
 
-typedef ::testing::tuple<const LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
+typedef std::tuple<const LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
 class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
  public:
   virtual void SetUp() { target_func_ = GET_PARAM(0); }
-  void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times);
+  void RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size, int run_times,
+                           int gt_int16 = 0);
 
  private:
   LbdInvTxfm2dFunc target_func_;
 };
 
-void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size,
-                                          int run_times) {
+void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size,
+                                          int run_times, int gt_int16) {
   FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size];
   InvTxfm2dFunc ref_func_ = libaom_test::inv_txfm_func_ls[tx_size];
   if (fwd_func_ == NULL || ref_func_ == NULL || target_func_ == NULL) {
@@ -275,6 +299,7 @@
   const int16_t eobmax = rows_nonezero * cols_nonezero;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int randTimes = run_times == 1 ? (eobmax + 500) : 1;
+
   for (int cnt = 0; cnt < randTimes; ++cnt) {
     const int16_t max_in = (1 << (bd)) - 1;
     for (int r = 0; r < BLK_WIDTH; ++r) {
@@ -291,7 +316,9 @@
     for (int i = eob; i < eobmax; i++) {
       inv_input[scan[i]] = 0;
     }
-
+    if (gt_int16) {
+      inv_input[scan[eob - 1]] = ((int32_t)INT16_MAX * 100 / 141);
+    }
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int i = 0; i < run_times; ++i) {
@@ -313,10 +340,13 @@
     for (int r = 0; r < rows; ++r) {
       for (int c = 0; c < cols; ++c) {
         uint8_t ref_value = static_cast<uint8_t>(ref_output[r * stride + c]);
+        if (ref_value != output[r * stride + c]) {
+          printf(" ");
+        }
         ASSERT_EQ(ref_value, output[r * stride + c])
             << "[" << r << "," << c << "] " << cnt
             << " tx_size: " << static_cast<int>(tx_size)
-            << " tx_type: " << tx_type << " eob " << eob;
+            << " tx_type: " << tx_type_name[tx_type] << " eob " << eob;
       }
     }
   }
@@ -325,21 +355,34 @@
 TEST_P(AV1LbdInvTxfm2d, match) {
   for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
     for (int i = 0; i < (int)TX_TYPES; ++i) {
-      if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
-                                         static_cast<TX_TYPE>(i))) {
-        RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
-                            1);
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
+                                         static_cast<TxType>(i))) {
+        RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j), 1);
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, gt_int16) {
+  static const TxType types[] = { DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX,
+                                  V_DCT,   H_DCT,    H_ADST,       H_FLIPADST };
+  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+    const TxSize sz = static_cast<TxSize>(j);
+    for (uint8_t i = 0; i < sizeof(types) / sizeof(types[0]); ++i) {
+      const TxType tp = types[i];
+      if (libaom_test::IsTxSizeTypeValid(sz, tp)) {
+        RunAV1InvTxfm2dTest(tp, sz, 1, 1);
       }
     }
   }
 }
 
 TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) {
-  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+  for (int j = 1; j < (int)(TX_SIZES_ALL); ++j) {
     for (int i = 0; i < (int)TX_TYPES; ++i) {
-      if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
-                                         static_cast<TX_TYPE>(i))) {
-        RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
+                                         static_cast<TxType>(i))) {
+        RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j),
                             10000000);
       }
     }
@@ -349,30 +392,31 @@
 #if HAVE_SSSE3
 #if defined(_MSC_VER) || defined(__SSSE3__)
 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1LbdInvTxfm2d,
-                        ::testing::Values(av1_lowbd_inv_txfm2d_add_ssse3));
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_ssse3));
 #endif  // _MSC_VER || __SSSE3__
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
 extern "C" void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input,
                                               uint8_t *output, int stride,
-                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              TxType tx_type, TxSize tx_size,
                                               int eob);
 
-INSTANTIATE_TEST_CASE_P(AVX2, AV1LbdInvTxfm2d,
-                        ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
 #endif  // HAVE_AVX2
 
+// TODO(yunqing): Re-enable this unit test for NEON version after the functions
+// are fixed.
 #if HAVE_NEON
-
 extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob);
 
-INSTANTIATE_TEST_CASE_P(NEON, AV1LbdInvTxfm2d,
-                        ::testing::Values(av1_lowbd_inv_txfm2d_add_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_neon));
 #endif  // HAVE_NEON
 
 }  // namespace
diff --git a/libaom/test/av1_nn_predict_test.cc b/libaom/test/av1_nn_predict_test.cc
index 0574a15..c03cba8 100644
--- a/libaom/test/av1_nn_predict_test.cc
+++ b/libaom/test/av1_nn_predict_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "aom/aom_integer.h"
@@ -25,9 +27,9 @@
 namespace {
 typedef void (*NnPredict_Func)(const float *const input_nodes,
                                const NN_CONFIG *const nn_config,
-                               float *const output);
+                               int reduce_prec, float *const output);
 
-typedef ::testing::tuple<const NnPredict_Func> NnPredictTestParam;
+typedef std::tuple<const NnPredict_Func> NnPredictTestParam;
 
 const float epsilon = 1e-3f;  // Error threshold for functional equivalence
 
@@ -115,8 +117,8 @@
       weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
     }
 
-    av1_nn_predict_c(inputs, &nn_config, outputs_ref);
-    target_func_(inputs, &nn_config, outputs_test);
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+    target_func_(inputs, &nn_config, 0, outputs_test);
     libaom_test::ClearSystemState();
 
     for (int node = 0; node < shape->num_outputs; node++) {
@@ -155,13 +157,13 @@
   aom_usec_timer timer;
   aom_usec_timer_start(&timer);
   for (int i = 0; i < run_times; ++i) {
-    av1_nn_predict_c(inputs, &nn_config, outputs_ref);
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
   }
   aom_usec_timer_mark(&timer);
   const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
   aom_usec_timer_start(&timer);
   for (int i = 0; i < run_times; ++i) {
-    target_func_(inputs, &nn_config, outputs_test);
+    target_func_(inputs, &nn_config, 0, outputs_test);
   }
   aom_usec_timer_mark(&timer);
   libaom_test::ClearSystemState();
@@ -208,8 +210,8 @@
 }
 
 #if HAVE_SSE3
-INSTANTIATE_TEST_CASE_P(SSE3, NnPredictTest,
-                        ::testing::Values(av1_nn_predict_sse3));
+INSTANTIATE_TEST_SUITE_P(SSE3, NnPredictTest,
+                         ::testing::Values(av1_nn_predict_sse3));
 #endif
 
 }  // namespace
diff --git a/libaom/test/av1_quantize_test.cc b/libaom/test/av1_quantize_test.cc
index aaf0939..39a3c33 100644
--- a/libaom/test/av1_quantize_test.cc
+++ b/libaom/test/av1_quantize_test.cc
@@ -73,7 +73,7 @@
     const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
     for (int i = 0; i < numTests; i++) {
       int err_count = 0;
-      ref_eob = eob = -1;
+      ref_eob = eob = UINT16_MAX;
       for (int j = 0; j < count; j++) {
         coeff_ptr[j] = rnd(coeffRange);
       }
@@ -83,7 +83,7 @@
         quant_shift_ptr[j] = rnd.Rand16();
         // int16_t positive
         dequant_ptr[j] = abs(rnd(dequantRange));
-        quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+        quant_ptr[j] = static_cast<int16_t>((1 << 16) / dequant_ptr[j]);
         round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
       }
       for (int j = 2; j < 8; ++j) {
@@ -145,7 +145,7 @@
     const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
 
     for (int i = 0; i < numTests; i++) {
-      ref_eob = eob = -1;
+      ref_eob = eob = UINT16_MAX;
       for (int j = 0; j < count; j++) {
         coeff_ptr[j] = 0;
       }
@@ -218,7 +218,7 @@
                      1024),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
@@ -233,7 +233,7 @@
                      1024),
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
 #endif  // HAVE_AVX2
 
 }  // namespace
diff --git a/libaom/test/av1_round_shift_array_test.cc b/libaom/test/av1_round_shift_array_test.cc
index 61dbed5..993fa9f 100644
--- a/libaom/test/av1_round_shift_array_test.cc
+++ b/libaom/test/av1_round_shift_array_test.cc
@@ -12,6 +12,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <tuple>
 
 #include "config/av1_rtcd.h"
 
@@ -33,7 +34,7 @@
 };
 #endif  // HAVE_SSE4_1 || HAVE_NEON
 
-typedef ::testing::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
+typedef std::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
     CompRoundShiftParam;
 
 class AV1CompRoundShiftTest
@@ -111,7 +112,7 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1CompRoundShiftTest,
     ::testing::Combine(::testing::Values(&av1_round_shift_array_sse4_1),
                        ::testing::ValuesIn(txsize_to_bsize),
@@ -119,7 +120,7 @@
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, AV1CompRoundShiftTest,
     ::testing::Combine(::testing::Values(&av1_round_shift_array_neon),
                        ::testing::ValuesIn(txsize_to_bsize),
diff --git a/libaom/test/av1_txfm_test.cc b/libaom/test/av1_txfm_test.cc
index abbc475..aedd45d 100644
--- a/libaom/test/av1_txfm_test.cc
+++ b/libaom/test/av1_txfm_test.cc
@@ -119,7 +119,7 @@
   }
 }
 
-// TODO(any): Copied from the old 'fadst4' (same as the new 'av1_fadst4_new'
+// TODO(any): Copied from the old 'fadst4' (same as the new 'av1_fadst4'
 // function). Should be replaced by a proper reference function that takes
 // 'double' input & output.
 static void fadst4_new(const tran_low_t *input, tran_low_t *output) {
diff --git a/libaom/test/av1_wedge_utils_test.cc b/libaom/test/av1_wedge_utils_test.cc
index e8fbe69..f9dc838 100644
--- a/libaom/test/av1_wedge_utils_test.cc
+++ b/libaom/test/av1_wedge_utils_test.cc
@@ -221,7 +221,8 @@
 // av1_wedge_sign_from_residuals
 //////////////////////////////////////////////////////////////////////////////
 
-typedef int (*FSign)(const int16_t *ds, const uint8_t *m, int N, int64_t limit);
+typedef int8_t (*FSign)(const int16_t *ds, const uint8_t *m, int N,
+                        int64_t limit);
 typedef libaom_test::FuncParam<FSign> TestFuncsFSign;
 
 class WedgeUtilsSignOptTest : public FunctionEquivalenceTest<FSign> {
@@ -354,34 +355,34 @@
 }
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, WedgeUtilsSSEOptTest,
     ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
                                     av1_wedge_sse_from_residuals_sse2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, WedgeUtilsSignOptTest,
     ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
                                      av1_wedge_sign_from_residuals_sse2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, WedgeUtilsDeltaSquaresOptTest,
     ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c,
                                    av1_wedge_compute_delta_squares_sse2)));
 #endif  // HAVE_SSE2
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, WedgeUtilsSSEOptTest,
     ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_sse2,
                                     av1_wedge_sse_from_residuals_avx2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, WedgeUtilsSignOptTest,
     ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_sse2,
                                      av1_wedge_sign_from_residuals_avx2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, WedgeUtilsDeltaSquaresOptTest,
     ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_sse2,
                                    av1_wedge_compute_delta_squares_avx2)));
diff --git a/libaom/test/avg_test.cc b/libaom/test/avg_test.cc
new file mode 100644
index 0000000..1742aec
--- /dev/null
+++ b/libaom/test/avg_test.cc
@@ -0,0 +1,291 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+template <typename Pixel>
+class AverageTestBase : public ::testing::Test {
+ public:
+  AverageTestBase(int width, int height)
+      : width_(width), height_(height), source_data_(NULL), source_stride_(0),
+        bit_depth_(8) {}
+
+  virtual void TearDown() {
+    aom_free(source_data_);
+    source_data_ = NULL;
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle blocks up to 4 blocks 64x64 with stride up to 128
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 64 * 128;
+
+  virtual void SetUp() {
+    source_data_ = static_cast<Pixel *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_TRUE(source_data_ != NULL);
+    source_stride_ = (width_ + 31) & ~31;
+    bit_depth_ = 8;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  // Sum Pixels
+  static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 8; ++h) {
+      for (int w = 0; w < 8; ++w) average += source[h * pitch + w];
+    }
+    return (average + 32) >> 6;
+  }
+
+  static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 4; ++h) {
+      for (int w = 0; w < 4; ++w) average += source[h * pitch + w];
+    }
+    return (average + 8) >> 4;
+  }
+
+  void FillConstant(Pixel fill_constant) {
+    for (int i = 0; i < width_ * height_; ++i) {
+      source_data_[i] = fill_constant;
+    }
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < width_ * height_; ++i) {
+      source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1);
+    }
+  }
+
+  int width_, height_;
+  Pixel *source_data_;
+  int source_stride_;
+  int bit_depth_;
+
+  ACMRandom rnd_;
+};
+typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch);
+
+// Arguments: width, height, pitch, block size, avg function.
+typedef std::tuple<int, int, int, int, AverageFunction> AvgFunc;
+
+class AverageTest : public AverageTestBase<uint8_t>,
+                    public ::testing::WithParamInterface<AvgFunc> {
+ public:
+  AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  void CheckAverages() {
+    const int block_size = GET_PARAM(3);
+    unsigned int expected = 0;
+    if (block_size == 8) {
+      expected =
+          ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_);
+    } else if (block_size == 4) {
+      expected =
+          ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_);
+    }
+
+    unsigned int actual;
+    ASM_REGISTER_STATE_CHECK(
+        actual = GET_PARAM(4)(source_data_ + GET_PARAM(2), source_stride_));
+
+    EXPECT_EQ(expected, actual);
+  }
+};
+
+TEST_P(AverageTest, MinValue) {
+  FillConstant(0);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, MaxValue) {
+  FillConstant(255);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, Random) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  for (int i = 0; i < 1000; i++) {
+    FillRandom();
+    CheckAverages();
+  }
+}
+
+typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
+                              const int ref_stride, const int height);
+
+// Params: height, asm function, c function.
+typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  IntProRowTest()
+      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  virtual void SetUp() {
+    source_data_ = static_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_TRUE(source_data_ != NULL);
+
+    hbuf_asm_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    hbuf_c_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+  }
+
+  virtual void TearDown() {
+    aom_free(source_data_);
+    source_data_ = NULL;
+    aom_free(hbuf_c_);
+    hbuf_c_ = NULL;
+    aom_free(hbuf_asm_);
+    hbuf_asm_ = NULL;
+  }
+
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+// Params: width, asm function, c function.
+typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t sum_asm_;
+  int16_t sum_c_;
+};
+
+TEST_P(IntProRowTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+using std::make_tuple;
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AverageTest,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &aom_avg_8x8_c),
+                      make_tuple(16, 16, 1, 4, &aom_avg_4x4_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AverageTest,
+    ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_sse2),
+                      make_tuple(16, 16, 5, 8, &aom_avg_8x8_sse2),
+                      make_tuple(32, 32, 15, 8, &aom_avg_8x8_sse2),
+                      make_tuple(16, 16, 0, 4, &aom_avg_4x4_sse2),
+                      make_tuple(16, 16, 5, 4, &aom_avg_4x4_sse2),
+                      make_tuple(32, 32, 15, 4, &aom_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, IntProRowTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(128, &aom_int_pro_row_sse2,
+                                 &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, IntProColTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(128, &aom_int_pro_col_sse2,
+                                 &aom_int_pro_col_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AverageTest,
+    ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_neon),
+                      make_tuple(16, 16, 5, 8, &aom_avg_8x8_neon),
+                      make_tuple(32, 32, 15, 8, &aom_avg_8x8_neon),
+                      make_tuple(16, 16, 0, 4, &aom_avg_4x4_neon),
+                      make_tuple(16, 16, 5, 4, &aom_avg_4x4_neon),
+                      make_tuple(32, 32, 15, 4, &aom_avg_4x4_neon)));
+#endif
+
+}  // namespace
diff --git a/libaom/test/blend_a64_mask_1d_test.cc b/libaom/test/blend_a64_mask_1d_test.cc
index f8844ee..1b6350c 100644
--- a/libaom/test/blend_a64_mask_1d_test.cc
+++ b/libaom/test/blend_a64_mask_1d_test.cc
@@ -194,13 +194,13 @@
                        0, 0);
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, BlendA64Mask1DTest8B,
     ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_c),
                       TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_c)));
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BlendA64Mask1DTest8B,
     ::testing::Values(
         TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_sse4_1),
@@ -208,17 +208,17 @@
 #endif  // HAVE_SSE4_1
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, BlendA64Mask1DTest8B,
-                        ::testing::Values(TestFuncs(blend_a64_hmask_ref,
-                                                    aom_blend_a64_hmask_neon),
-                                          TestFuncs(blend_a64_vmask_ref,
-                                                    aom_blend_a64_vmask_neon)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, BlendA64Mask1DTest8B,
+    ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_neon),
+                      TestFuncs(blend_a64_vmask_ref,
+                                aom_blend_a64_vmask_neon)));
 #endif  // HAVE_NEON
 
 //////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////
-
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
                      uint32_t src0_stride, const uint8_t *src1,
                      uint32_t src1_stride, const uint8_t *mask, int w, int h,
@@ -321,7 +321,7 @@
       BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd);
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, BlendA64Mask1DTestHBD,
     ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
                                    aom_highbd_blend_a64_hmask_c),
@@ -329,11 +329,12 @@
                                    aom_highbd_blend_a64_vmask_c)));
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BlendA64Mask1DTestHBD,
     ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
                                    aom_highbd_blend_a64_hmask_sse4_1),
                       TestFuncsHBD(highbd_blend_a64_vmask_ref,
                                    aom_highbd_blend_a64_vmask_sse4_1)));
 #endif  // HAVE_SSE4_1
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/blend_a64_mask_test.cc b/libaom/test/blend_a64_mask_test.cc
index 7592533..5c2c291 100644
--- a/libaom/test/blend_a64_mask_test.cc
+++ b/libaom/test/blend_a64_mask_test.cc
@@ -246,15 +246,15 @@
   }
 }
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
-                        ::testing::Values(TestFuncs(
-                            aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, BlendA64MaskTest8B,
+                         ::testing::Values(TestFuncs(
+                             aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, BlendA64MaskTest8B,
-                        ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
-                                                    aom_blend_a64_mask_avx2)));
+INSTANTIATE_TEST_SUITE_P(AVX2, BlendA64MaskTest8B,
+                         ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
+                                                     aom_blend_a64_mask_avx2)));
 #endif  // HAVE_AVX2
 
 //////////////////////////////////////////////////////////////////////////////
@@ -342,21 +342,21 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BlendA64MaskTest8B_d16,
     ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
                                     aom_lowbd_blend_a64_d16_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, BlendA64MaskTest8B_d16,
     ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
                                     aom_lowbd_blend_a64_d16_mask_avx2)));
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, BlendA64MaskTest8B_d16,
     ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
                                     aom_lowbd_blend_a64_d16_mask_neon)));
@@ -365,7 +365,7 @@
 //////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////
-
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
                      uint32_t src0_stride, const uint8_t *src1,
                      uint32_t src1_stride, const uint8_t *mask,
@@ -457,7 +457,7 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BlendA64MaskTestHBD,
     ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
                                    aom_highbd_blend_a64_mask_sse4_1)));
@@ -589,19 +589,19 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, BlendA64MaskTestHBD_d16,
     ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, NULL)));
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BlendA64MaskTestHBD_d16,
     ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
                                        aom_highbd_blend_a64_d16_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, BlendA64MaskTestHBD_d16,
     ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
                                        aom_highbd_blend_a64_d16_mask_avx2)));
@@ -610,10 +610,11 @@
 // TODO(slavarnway): Enable the following in the avx2 commit. (56501)
 #if 0
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BlendA64MaskTestHBD,
     ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
                                    aom_highbd_blend_a64_mask_avx2)));
 #endif  // HAVE_AVX2
 #endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/blockd_test.cc b/libaom/test/blockd_test.cc
index ab62400..17e6968 100644
--- a/libaom/test/blockd_test.cc
+++ b/libaom/test/blockd_test.cc
@@ -56,7 +56,7 @@
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
     }, {
                                     BLOCK_INVALID,
-      BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+      BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
@@ -65,7 +65,7 @@
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
     }, {
                                     BLOCK_INVALID,
-      BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+      BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
@@ -74,7 +74,7 @@
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
     }, {
                                     BLOCK_INVALID,
-      BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+      BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
@@ -83,7 +83,7 @@
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
     }, {
                                     BLOCK_INVALID,
-      BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+      BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
       BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
diff --git a/libaom/test/cdef_test.cc b/libaom/test/cdef_test.cc
index 8201818..a2ec1e3 100644
--- a/libaom/test/cdef_test.cc
+++ b/libaom/test/cdef_test.cc
@@ -11,6 +11,7 @@
 
 #include <cstdlib>
 #include <string>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -28,8 +29,8 @@
 
 namespace {
 
-typedef ::testing::tuple<cdef_filter_block_func, cdef_filter_block_func,
-                         BLOCK_SIZE, int, int>
+typedef std::tuple<cdef_filter_block_func, cdef_filter_block_func, BLOCK_SIZE,
+                   int, int>
     cdef_dir_param_t;
 
 class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
@@ -185,7 +186,7 @@
 typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
                           int coeff_shift);
 
-typedef ::testing::tuple<find_dir_t, find_dir_t> find_dir_param_t;
+typedef std::tuple<find_dir_t, find_dir_t> find_dir_param_t;
 
 class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
  public:
@@ -285,140 +286,140 @@
   test_finddir_speed(finddir, ref_finddir);
 }
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 // VS compiling for 32 bit targets does not support vector types in
 // structs as arguments, which makes the v256 type of the intrinsics
 // hard to support, so optimizations for this target are disabled.
 #if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, CDEFBlockTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_sse2),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                      &cdef_find_dir_c)));
 #endif
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, CDEFBlockTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                      &cdef_find_dir_c)));
 #endif
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, CDEFBlockTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                      &cdef_find_dir_c)));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, CDEFBlockTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_avx2),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_avx2,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+                                                      &cdef_find_dir_c)));
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, CDEFBlockTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_neon),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                      &cdef_find_dir_c)));
 #endif
 
 // Test speed for all supported architectures
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, CDEFSpeedTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_sse2),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                      &cdef_find_dir_c)));
 #endif
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, CDEFSpeedTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                      &cdef_find_dir_c)));
 #endif
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, CDEFSpeedTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                      &cdef_find_dir_c)));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, CDEFSpeedTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_avx2),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_avx2,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+                                                      &cdef_find_dir_c)));
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, CDEFSpeedTest,
     ::testing::Combine(::testing::Values(&cdef_filter_block_neon),
                        ::testing::Values(&cdef_filter_block_c),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
                        ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
-INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
-                                                     &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                      &cdef_find_dir_c)));
 #endif
 
 #endif  // defined(_WIN64) || !defined(_MSC_VER)
diff --git a/libaom/test/cfl_test.cc b/libaom/test/cfl_test.cc
index f087dd9..d297315 100644
--- a/libaom/test/cfl_test.cc
+++ b/libaom/test/cfl_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/av1_rtcd.h"
@@ -17,37 +19,44 @@
 #include "test/util.h"
 #include "test/acm_random.h"
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 using libaom_test::ACMRandom;
 
 #define NUM_ITERATIONS (100)
 #define NUM_ITERATIONS_SPEED (INT16_MAX)
 
-#define ALL_CFL_TX_SIZES(function)                                     \
-  make_tuple(TX_4X4, &function), make_tuple(TX_4X8, &function),        \
-      make_tuple(TX_4X16, &function), make_tuple(TX_8X4, &function),   \
-      make_tuple(TX_8X8, &function), make_tuple(TX_8X16, &function),   \
-      make_tuple(TX_8X32, &function), make_tuple(TX_16X4, &function),  \
-      make_tuple(TX_16X8, &function), make_tuple(TX_16X16, &function), \
-      make_tuple(TX_16X32, &function), make_tuple(TX_32X8, &function), \
-      make_tuple(TX_32X16, &function), make_tuple(TX_32X32, &function)
+#define ALL_CFL_TX_SIZES(function)                           \
+  make_tuple(static_cast<TX_SIZE>(TX_4X4), &function),       \
+      make_tuple(static_cast<TX_SIZE>(TX_4X8), &function),   \
+      make_tuple(static_cast<TX_SIZE>(TX_4X16), &function),  \
+      make_tuple(static_cast<TX_SIZE>(TX_8X4), &function),   \
+      make_tuple(static_cast<TX_SIZE>(TX_8X8), &function),   \
+      make_tuple(static_cast<TX_SIZE>(TX_8X16), &function),  \
+      make_tuple(static_cast<TX_SIZE>(TX_8X32), &function),  \
+      make_tuple(static_cast<TX_SIZE>(TX_16X4), &function),  \
+      make_tuple(static_cast<TX_SIZE>(TX_16X8), &function),  \
+      make_tuple(static_cast<TX_SIZE>(TX_16X16), &function), \
+      make_tuple(static_cast<TX_SIZE>(TX_16X32), &function), \
+      make_tuple(static_cast<TX_SIZE>(TX_32X8), &function),  \
+      make_tuple(static_cast<TX_SIZE>(TX_32X16), &function), \
+      make_tuple(static_cast<TX_SIZE>(TX_32X32), &function)
 
-#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422, fun444) \
-  make_tuple(TX_4X4, &fun420, &fun422, &fun444),           \
-      make_tuple(TX_4X8, &fun420, &fun422, &fun444),       \
-      make_tuple(TX_4X16, &fun420, &fun422, &fun444),      \
-      make_tuple(TX_8X4, &fun420, &fun422, &fun444),       \
-      make_tuple(TX_8X8, &fun420, &fun422, &fun444),       \
-      make_tuple(TX_8X16, &fun420, &fun422, &fun444),      \
-      make_tuple(TX_8X32, &fun420, &fun422, &fun444),      \
-      make_tuple(TX_16X4, &fun420, &fun422, &fun444),      \
-      make_tuple(TX_16X8, &fun420, &fun422, &fun444),      \
-      make_tuple(TX_16X16, &fun420, &fun422, &fun444),     \
-      make_tuple(TX_16X32, &fun420, &fun422, &fun444),     \
-      make_tuple(TX_32X8, &fun420, &fun422, &fun444),      \
-      make_tuple(TX_32X16, &fun420, &fun422, &fun444),     \
-      make_tuple(TX_32X32, &fun420, &fun422, &fun444)
+#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422, fun444)                   \
+  make_tuple(static_cast<TX_SIZE>(TX_4X4), &fun420, &fun422, &fun444),       \
+      make_tuple(static_cast<TX_SIZE>(TX_4X8), &fun420, &fun422, &fun444),   \
+      make_tuple(static_cast<TX_SIZE>(TX_4X16), &fun420, &fun422, &fun444),  \
+      make_tuple(static_cast<TX_SIZE>(TX_8X4), &fun420, &fun422, &fun444),   \
+      make_tuple(static_cast<TX_SIZE>(TX_8X8), &fun420, &fun422, &fun444),   \
+      make_tuple(static_cast<TX_SIZE>(TX_8X16), &fun420, &fun422, &fun444),  \
+      make_tuple(static_cast<TX_SIZE>(TX_8X32), &fun420, &fun422, &fun444),  \
+      make_tuple(static_cast<TX_SIZE>(TX_16X4), &fun420, &fun422, &fun444),  \
+      make_tuple(static_cast<TX_SIZE>(TX_16X8), &fun420, &fun422, &fun444),  \
+      make_tuple(static_cast<TX_SIZE>(TX_16X16), &fun420, &fun422, &fun444), \
+      make_tuple(static_cast<TX_SIZE>(TX_16X32), &fun420, &fun422, &fun444), \
+      make_tuple(static_cast<TX_SIZE>(TX_32X8), &fun420, &fun422, &fun444),  \
+      make_tuple(static_cast<TX_SIZE>(TX_32X16), &fun420, &fun422, &fun444), \
+      make_tuple(static_cast<TX_SIZE>(TX_32X32), &fun420, &fun422, &fun444)
 
 namespace {
 
@@ -159,14 +168,14 @@
 };
 
 typedef cfl_subtract_average_fn (*sub_avg_fn)(TX_SIZE tx_size);
-typedef ::testing::tuple<TX_SIZE, sub_avg_fn> sub_avg_param;
+typedef std::tuple<TX_SIZE, sub_avg_fn> sub_avg_param;
 class CFLSubAvgTest : public ::testing::TestWithParam<sub_avg_param>,
                       public CFLTestWithData<int16_t> {
  public:
   virtual void SetUp() {
-    CFLTest::init(::testing::get<0>(this->GetParam()));
-    sub_avg = ::testing::get<1>(this->GetParam())(tx_size);
-    sub_avg_ref = get_subtract_average_fn_c(tx_size);
+    CFLTest::init(std::get<0>(this->GetParam()));
+    sub_avg = std::get<1>(this->GetParam())(tx_size);
+    sub_avg_ref = cfl_get_subtract_average_fn_c(tx_size);
   }
   virtual ~CFLSubAvgTest() {}
 
@@ -209,10 +218,10 @@
                          public CFLTestWithData<I> {
  public:
   virtual void SetUp() {
-    CFLTest::init(::testing::get<0>(this->GetParam()));
-    fun_420 = ::testing::get<1>(this->GetParam())(this->tx_size);
-    fun_422 = ::testing::get<2>(this->GetParam())(this->tx_size);
-    fun_444 = ::testing::get<3>(this->GetParam())(this->tx_size);
+    CFLTest::init(std::get<0>(this->GetParam()));
+    fun_420 = std::get<1>(this->GetParam())(this->tx_size);
+    fun_422 = std::get<2>(this->GetParam())(this->tx_size);
+    fun_444 = std::get<3>(this->GetParam())(this->tx_size);
   }
 
  protected:
@@ -262,8 +271,8 @@
 };
 
 typedef cfl_subsample_lbd_fn (*get_subsample_lbd_fn)(TX_SIZE tx_size);
-typedef ::testing::tuple<TX_SIZE, get_subsample_lbd_fn, get_subsample_lbd_fn,
-                         get_subsample_lbd_fn>
+typedef std::tuple<TX_SIZE, get_subsample_lbd_fn, get_subsample_lbd_fn,
+                   get_subsample_lbd_fn>
     subsample_lbd_param;
 class CFLSubsampleLBDTest
     : public CFLSubsampleTest<subsample_lbd_param, cfl_subsample_lbd_fn,
@@ -303,9 +312,10 @@
   subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand8);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef cfl_subsample_hbd_fn (*get_subsample_hbd_fn)(TX_SIZE tx_size);
-typedef ::testing::tuple<TX_SIZE, get_subsample_hbd_fn, get_subsample_hbd_fn,
-                         get_subsample_hbd_fn>
+typedef std::tuple<TX_SIZE, get_subsample_hbd_fn, get_subsample_hbd_fn,
+                   get_subsample_hbd_fn>
     subsample_hbd_param;
 class CFLSubsampleHBDTest
     : public CFLSubsampleTest<subsample_hbd_param, cfl_subsample_hbd_fn,
@@ -344,16 +354,17 @@
 TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD444SpeedTest) {
   subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
-typedef ::testing::tuple<TX_SIZE, get_predict_fn> predict_param;
+typedef std::tuple<TX_SIZE, get_predict_fn> predict_param;
 class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
                        public CFLTestWithAlignedData<uint8_t> {
  public:
   virtual void SetUp() {
-    CFLTest::init(::testing::get<0>(this->GetParam()));
-    predict = ::testing::get<1>(this->GetParam())(tx_size);
-    predict_ref = get_predict_lbd_fn_c(tx_size);
+    CFLTest::init(std::get<0>(this->GetParam()));
+    predict = std::get<1>(this->GetParam())(tx_size);
+    predict_ref = cfl_get_predict_lbd_fn_c(tx_size);
   }
   virtual ~CFLPredictTest() {}
 
@@ -391,15 +402,16 @@
   assertFaster(ref_elapsed_time, elapsed_time);
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef cfl_predict_hbd_fn (*get_predict_fn_hbd)(TX_SIZE tx_size);
-typedef ::testing::tuple<TX_SIZE, get_predict_fn_hbd> predict_param_hbd;
+typedef std::tuple<TX_SIZE, get_predict_fn_hbd> predict_param_hbd;
 class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
                           public CFLTestWithAlignedData<uint16_t> {
  public:
   virtual void SetUp() {
-    CFLTest::init(::testing::get<0>(this->GetParam()));
-    predict = ::testing::get<1>(this->GetParam())(tx_size);
-    predict_ref = get_predict_hbd_fn_c(tx_size);
+    CFLTest::init(std::get<0>(this->GetParam()));
+    predict = std::get<1>(this->GetParam())(tx_size);
+    predict_ref = cfl_get_predict_hbd_fn_c(tx_size);
   }
   virtual ~CFLPredictHBDTest() {}
 
@@ -438,13 +450,14 @@
   printSpeed(ref_elapsed_time, elapsed_time, width, height);
   assertFaster(ref_elapsed_time, elapsed_time);
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_SSE2
 const sub_avg_param sub_avg_sizes_sse2[] = { ALL_CFL_TX_SIZES(
-    get_subtract_average_fn_sse2) };
+    cfl_get_subtract_average_fn_sse2) };
 
-INSTANTIATE_TEST_CASE_P(SSE2, CFLSubAvgTest,
-                        ::testing::ValuesIn(sub_avg_sizes_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, CFLSubAvgTest,
+                         ::testing::ValuesIn(sub_avg_sizes_sse2));
 
 #endif
 
@@ -455,34 +468,36 @@
                              cfl_get_luma_subsampling_444_lbd_ssse3)
 };
 
+const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
+    cfl_get_predict_lbd_fn_ssse3) };
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleLBDTest,
+                         ::testing::ValuesIn(subsample_lbd_sizes_ssse3));
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictTest,
+                         ::testing::ValuesIn(predict_sizes_ssse3));
+
+#if CONFIG_AV1_HIGHBITDEPTH
 const subsample_hbd_param subsample_hbd_sizes_ssse3[] = {
   ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3,
                              cfl_get_luma_subsampling_422_hbd_ssse3,
                              cfl_get_luma_subsampling_444_hbd_ssse3)
 };
 
-const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
-    get_predict_lbd_fn_ssse3) };
-
 const predict_param_hbd predict_sizes_hbd_ssse3[] = { ALL_CFL_TX_SIZES(
-    get_predict_hbd_fn_ssse3) };
+    cfl_get_predict_hbd_fn_ssse3) };
 
-INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleLBDTest,
-                        ::testing::ValuesIn(subsample_lbd_sizes_ssse3));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleHBDTest,
+                         ::testing::ValuesIn(subsample_hbd_sizes_ssse3));
 
-INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleHBDTest,
-                        ::testing::ValuesIn(subsample_hbd_sizes_ssse3));
-
-INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictTest,
-                        ::testing::ValuesIn(predict_sizes_ssse3));
-
-INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictHBDTest,
-                        ::testing::ValuesIn(predict_sizes_hbd_ssse3));
-#endif
+INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictHBDTest,
+                         ::testing::ValuesIn(predict_sizes_hbd_ssse3));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
 const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES(
-    get_subtract_average_fn_avx2) };
+    cfl_get_subtract_average_fn_avx2) };
 
 const subsample_lbd_param subsample_lbd_sizes_avx2[] = {
   ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_avx2,
@@ -490,38 +505,42 @@
                              cfl_get_luma_subsampling_444_lbd_avx2)
 };
 
+const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
+    cfl_get_predict_lbd_fn_avx2) };
+
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubAvgTest,
+                         ::testing::ValuesIn(sub_avg_sizes_avx2));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleLBDTest,
+                         ::testing::ValuesIn(subsample_lbd_sizes_avx2));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictTest,
+                         ::testing::ValuesIn(predict_sizes_avx2));
+
+#if CONFIG_AV1_HIGHBITDEPTH
 const subsample_hbd_param subsample_hbd_sizes_avx2[] = {
   ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_avx2,
                              cfl_get_luma_subsampling_422_hbd_avx2,
                              cfl_get_luma_subsampling_444_hbd_avx2)
 };
 
-const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
-    get_predict_lbd_fn_avx2) };
-
 const predict_param_hbd predict_sizes_hbd_avx2[] = { ALL_CFL_TX_SIZES(
-    get_predict_hbd_fn_avx2) };
+    cfl_get_predict_hbd_fn_avx2) };
 
-INSTANTIATE_TEST_CASE_P(AVX2, CFLSubAvgTest,
-                        ::testing::ValuesIn(sub_avg_sizes_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleHBDTest,
+                         ::testing::ValuesIn(subsample_hbd_sizes_avx2));
 
-INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleLBDTest,
-                        ::testing::ValuesIn(subsample_lbd_sizes_avx2));
-
-INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleHBDTest,
-                        ::testing::ValuesIn(subsample_hbd_sizes_avx2));
-
-INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictTest,
-                        ::testing::ValuesIn(predict_sizes_avx2));
-
-INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictHBDTest,
-                        ::testing::ValuesIn(predict_sizes_hbd_avx2));
-#endif
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictHBDTest,
+                         ::testing::ValuesIn(predict_sizes_hbd_avx2));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // HAVE_AVX2
 
 #if HAVE_NEON
-
 const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES(
-    get_subtract_average_fn_neon) };
+    cfl_get_subtract_average_fn_neon) };
+
+const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(
+    cfl_get_predict_lbd_fn_neon) };
 
 const subsample_lbd_param subsample_lbd_sizes_neon[] = {
   ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_neon,
@@ -529,39 +548,38 @@
                              cfl_get_luma_subsampling_444_lbd_neon)
 };
 
+INSTANTIATE_TEST_SUITE_P(NEON, CFLSubAvgTest,
+                         ::testing::ValuesIn(sub_avg_sizes_neon));
+
+INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleLBDTest,
+                         ::testing::ValuesIn(subsample_lbd_sizes_neon));
+
+INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictTest,
+                         ::testing::ValuesIn(predict_sizes_neon));
+
+#if CONFIG_AV1_HIGHBITDEPTH
 const subsample_hbd_param subsample_hbd_sizes_neon[] = {
   ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon,
                              cfl_get_luma_subsampling_422_hbd_neon,
                              cfl_get_luma_subsampling_444_hbd_neon)
 };
 
-const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(
-    get_predict_lbd_fn_neon) };
-
 const predict_param_hbd predict_sizes_hbd_neon[] = { ALL_CFL_TX_SIZES(
-    get_predict_hbd_fn_neon) };
+    cfl_get_predict_hbd_fn_neon) };
 
-INSTANTIATE_TEST_CASE_P(NEON, CFLSubAvgTest,
-                        ::testing::ValuesIn(sub_avg_sizes_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleHBDTest,
+                         ::testing::ValuesIn(subsample_hbd_sizes_neon));
 
-INSTANTIATE_TEST_CASE_P(NEON, CFLSubsampleLBDTest,
-                        ::testing::ValuesIn(subsample_lbd_sizes_neon));
-
-INSTANTIATE_TEST_CASE_P(NEON, CFLSubsampleHBDTest,
-                        ::testing::ValuesIn(subsample_hbd_sizes_neon));
-
-INSTANTIATE_TEST_CASE_P(NEON, CFLPredictTest,
-                        ::testing::ValuesIn(predict_sizes_neon));
-
-INSTANTIATE_TEST_CASE_P(NEON, CFLPredictHBDTest,
-                        ::testing::ValuesIn(predict_sizes_hbd_neon));
-#endif
+INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictHBDTest,
+                         ::testing::ValuesIn(predict_sizes_hbd_neon));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // HAVE_NEON
 
 #if HAVE_VSX
 const sub_avg_param sub_avg_sizes_vsx[] = { ALL_CFL_TX_SIZES(
-    get_subtract_average_fn_vsx) };
+    cfl_get_subtract_average_fn_vsx) };
 
-INSTANTIATE_TEST_CASE_P(VSX, CFLSubAvgTest,
-                        ::testing::ValuesIn(sub_avg_sizes_vsx));
+INSTANTIATE_TEST_SUITE_P(VSX, CFLSubAvgTest,
+                         ::testing::ValuesIn(sub_avg_sizes_vsx));
 #endif
 }  // namespace
diff --git a/libaom/test/cnn_test.cc b/libaom/test/cnn_test.cc
new file mode 100644
index 0000000..4410493
--- /dev/null
+++ b/libaom/test/cnn_test.cc
@@ -0,0 +1,2496 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/cnn.h"
+
+#define SQR(x) ((x) * (x))
+
+// Best possible pixelwise guarenteed preicison given each float has at most
+// 3 specified decimals.
+#define PIXELWISE_FLOAT_TOL 1E-2
+
+#define MSE_FLOAT_TOL 1E-6
+#define MSE_INT_TOL 0
+
+namespace {
+
+class CNNTest : public ::testing::Test {
+ protected:
+  static void RunCNNTest(int image_width, int image_height, const float *input,
+                         const float *expected, const CNN_CONFIG *cnn_config,
+                         int in_stride, CNN_THREAD_DATA *thread_data,
+                         double tolerance) {
+    int out_width, out_height, out_channels;
+    av1_find_cnn_output_size(image_width, image_height, cnn_config, &out_width,
+                             &out_height, &out_channels);
+
+    const int out_size = out_width * out_height;
+    const int out_stride = out_width;
+
+    float *output_ =
+        (float *)aom_malloc(sizeof(*output_) * out_size * out_channels);
+    float *output[CNN_MAX_CHANNELS] = { nullptr };
+    for (int channel = 0; channel < out_channels; ++channel) {
+      output[channel] = output_ + (channel * out_size);
+    }
+    const int num_outputs = 1;
+    const int output_chs[1] = { out_channels };
+    const int output_strides[1] = { out_stride };
+    CNN_MULTI_OUT output_struct = { num_outputs, output_chs, output_strides,
+                                    output };
+
+    RunMultiOutCNNTest(&input, image_width, image_height, in_stride, cnn_config,
+                       thread_data, &output_struct, &expected, tolerance);
+
+    aom_free(output_);
+  }
+
+  static void RunMultiOutCNNTest(const float **input, int image_width,
+                                 int image_height, int in_stride,
+                                 const CNN_CONFIG *cnn_config,
+                                 CNN_THREAD_DATA *thread_data,
+                                 CNN_MULTI_OUT *output, const float **expected,
+                                 double tolerance) {
+    const int num_outputs = output->num_outputs;
+    const int *output_chs = output->output_channels;
+
+    int *out_widths = (int *)aom_calloc(num_outputs, sizeof(*out_widths));
+    int *out_heights = (int *)aom_calloc(num_outputs, sizeof(*out_heights));
+    int *not_used = (int *)aom_calloc(num_outputs, sizeof(*not_used));
+
+    av1_find_cnn_output_size(image_width, image_height, cnn_config, out_widths,
+                             out_heights, not_used);
+    av1_cnn_predict(input, image_width, image_height, in_stride, cnn_config,
+                    thread_data, output);
+
+    int channel_offset = 0;
+    for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+      const float *expected_out = expected[output_idx];
+      const int curr_output_chs = output_chs[output_idx];
+      const int out_size = out_widths[output_idx] * out_heights[output_idx];
+
+      double mse = 0;
+      int expected_ite = 0;
+      for (int channel = 0; channel < curr_output_chs; ++channel) {
+        const float *buf_out = output->output_buffer[channel_offset];
+
+        for (int i = 0; i < out_size; ++i) {
+          EXPECT_NEAR(expected_out[expected_ite], buf_out[i],
+                      PIXELWISE_FLOAT_TOL)
+              << " output " << output_idx << " channel " << channel << " pixel "
+              << expected_ite % out_size << ": " << expected_out[expected_ite]
+              << "/" << buf_out[i] << std::endl;
+          mse += SQR(expected_out[expected_ite] - buf_out[i]);
+          expected_ite++;
+        }
+
+        channel_offset++;
+      }
+      mse /= (out_size * curr_output_chs);
+      EXPECT_LE(mse, tolerance) << " output " << output_idx << std::endl;
+    }
+
+    aom_free(out_widths);
+    aom_free(out_heights);
+    aom_free(not_used);
+  }
+
+  static void AssignLayerWeightsBiases(CNN_CONFIG *cnn_config, float *weights,
+                                       float *bias) {
+    size_t weight_offset = 0;
+    size_t bias_offset = 0;
+    for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+      CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
+      layer_config->weights = weights + weight_offset;
+      layer_config->bias = bias + bias_offset;
+      weight_offset += layer_config->filter_width *
+                       layer_config->filter_height * layer_config->in_channels *
+                       layer_config->out_channels;
+      bias_offset += layer_config->out_channels;
+
+      ASSERT_NE(layer_config->weights, nullptr);
+      ASSERT_NE(layer_config->bias, nullptr);
+    }
+  }
+};
+
+}  // namespace
+
+// Verifies a 3-layer convolutional network (4x5 kernels throughout, channel
+// progression 1 -> 3 -> 3 -> 1) against golden outputs for all three padding
+// modes: zero padding, edge replication, and valid (no padding).
+TEST_F(CNNTest, TestMultilayerConvolution) {
+  int image_height = 16;
+  int image_width = 16;
+  int filter_height = 5;
+  int filter_width = 4;
+
+  float input[] = {
+    -3, 1,  -3, 2,  -2, -2, 2,  -2, 1,  -2, -3, 1,  2,  2,  2,  -2, 0,  1,  -1,
+    -3, -1, -1, 1,  0,  -3, 1,  0,  -1, 1,  0,  0,  -3, -3, -3, 0,  2,  1,  -1,
+    2,  0,  1,  -3, -1, 2,  2,  1,  -2, 0,  -1, 0,  -2, -2, -1, 1,  0,  0,  0,
+    -2, -2, -2, 1,  1,  -2, 1,  1,  -2, -2, 1,  -2, -1, -2, -3, 2,  -3, -1, 1,
+    0,  -2, -2, -2, 1,  -2, -2, -1, -1, 2,  2,  2,  -1, 1,  -3, -3, 0,  2,  0,
+    2,  1,  -3, -3, 1,  2,  2,  1,  -2, -3, 0,  -3, 0,  -3, -2, 0,  1,  1,  0,
+    -3, 2,  -1, 2,  1,  0,  1,  -2, 1,  -1, -1, 2,  0,  -2, -3, 1,  1,  -2, -1,
+    -3, -3, -1, 0,  -3, -2, 0,  0,  1,  0,  -3, -2, -1, 1,  0,  2,  1,  0,  -3,
+    -2, -3, -3, -1, 0,  -2, 2,  -1, -3, 0,  -1, -1, 2,  0,  -3, -2, -1, 0,  0,
+    1,  -2, 1,  2,  1,  2,  2,  -3, 2,  -1, 0,  0,  -1, 0,  2,  2,  -1, 2,  -2,
+    1,  1,  -3, -3, 1,  -1, -1, -2, 2,  -2, -2, 2,  -1, -3, 2,  -3, 1,  -1, -1,
+    -3, 1,  -1, 1,  0,  -3, -3, 1,  -3, -3, 0,  2,  2,  -2, -1, 2,  0,  2,  1,
+    -1, -3, 0,  0,  -1, -1, 1,  0,  2,  0,  -3, 2,  1,  0,  1,  -3, 2,  -3, -3,
+    -1, -3, -3, 2,  0,  2,  -2, 1,  -1,
+  };
+
+  // Weights for all three layers, concatenated; AssignLayerWeightsBiases()
+  // slices this buffer per layer using each layer's filter/channel dimensions.
+  float weights[] = {
+    -2, 2,  -2, 2,  -1, -3, 2,  2,  0,  0,  -3, -1, -2, -3, 1,  -1, 0,  0,  0,
+    2,  -2, 2,  -2, -3, 1,  1,  1,  -3, -1, 0,  1,  2,  -2, 0,  -1, -3, -1, -2,
+    2,  -3, -3, 1,  -2, -3, 0,  2,  1,  -3, -3, -1, -3, -2, -1, -3, -1, -3, -2,
+    -1, -3, -1, -2, -2, -3, 2,  0,  -3, 0,  -3, -3, 1,  -3, -1, 0,  -1, 1,  1,
+    -1, 1,  -2, 0,  2,  0,  -3, 1,  -1, -1, 2,  0,  1,  -3, -3, 1,  2,  -3, -3,
+    1,  -3, 2,  0,  -3, 1,  2,  2,  -2, -1, -2, 1,  1,  0,  -2, -2, 1,  2,  -1,
+    -3, 1,  -2, 2,  -3, -2, -3, 2,  1,  0,  -2, 0,  1,  -3, 2,  -2, -2, 0,  2,
+    -3, 2,  0,  0,  1,  -2, 1,  1,  -2, -1, -2, 1,  -2, 0,  -2, -2, 0,  -1, -1,
+    -3, -3, -3, 1,  -3, -2, 2,  -1, 2,  0,  2,  -2, 2,  -2, 1,  -3, -3, -1, 0,
+    2,  2,  1,  -1, -3, -1, -3, 2,  1,  -2, 0,  -3, -1, -3, -1, 2,  1,  0,  2,
+    -1, 1,  0,  1,  2,  -1, -2, 2,  1,  -3, -1, -3, 0,  1,  -2, 0,  -2, -3, 0,
+    -2, 2,  2,  0,  0,  2,  -3, 2,  -3, -2, 1,  2,  -3, -3, -1, -3, 0,  -3, -3,
+    -2, -2, -2, 0,  0,  1,  0,  0,  -1, 0,  0,  -3, 0,  -3, -1, -2, 1,  -2, -1,
+    2,  -2, 0,  0,  1,  0,  -2, -1, 0,  -3, 1,  0,  -1, -3, 1,  -1, 1,  -1, -3,
+    1,  0,  1,  1,  -1, 2,  2,  0,  0,  1,  -3, 2,  -2, -2, -3, -2, -1, -2, 2,
+    0,  2,  -2, -3, -1, -3, 2,  2,  -1, 2,  2,  -1, 0,  -3, 1,
+  };
+
+  // One bias per output channel, all three layers concatenated (3 + 3 + 1).
+  float bias[] = {
+    1, -1, 0, 1, 1, 1, -2,
+  };
+
+  float expected_same[] = {
+    -1125, 2926,  6406,  631,   -1244, 97,    -1454, 2526,  1065,  3292,  3464,
+    2553,  -330,  532,   1038,  1182,  -402,  3758,  3392,  9854,  4365,  1408,
+    4736,  3134,  3838,  2409,  3221,  4350,  6750,  4045,  815,   1188,  2959,
+    9802,  9590,  4572,  5740,  4253,  1701,  7974,  7012,  6854,  7093,  3907,
+    4539,  3886,  4267,  3505,  465,   7824,  9219,  10026, 7968,  957,   2295,
+    5594,  10811, 9641,  5950,  10043, 8783,  3132,  1421,  1110,  4108,  13929,
+    10660, -84,   -61,   3932,  -180,  6811,  13393, 15147, 15640, 9337,  6961,
+    3808,  1604,  1398,  1047,  6739,  10144, 6517,  4698,  2678,  7389,  2595,
+    5248,  12075, 11272, 13951, 8820,  1090,  2199,  2206,  2788,  12116, 6683,
+    2612,  -291,  3183,  9414,  12316, 14524, 12333, 13208, 7832,  4664,  4657,
+    3534,  1298,  -666,  4250,  7707,  9103,  5760,  688,   9571,  15782, 14203,
+    14878, 17339, 14684, 8690,  5671,  875,   1429,  1531,  6173,  2984,  5558,
+    2996,  7928,  6733,  16117, 15262, 12757, 7980,  3923,  4795,  5973,  2051,
+    455,   -1922, 1816,  5906,  3321,  10908, 10910, 7377,  12204, 12809, 11195,
+    7451,  6666,  74,    -1645, -35,   -391,  3813,  7324,  892,   1656,  6095,
+    12193, 14648, 12156, 14663, 10251, 10325, 7821,  3925,  323,   697,   442,
+    1324,  4669,  7002,  5485,  5171,  5086,  10582, 11053, 9709,  11353, 8543,
+    5256,  2873,  235,   -628,  1496,  1878,  -867,  3420,  6865,  5937,  10182,
+    13277, 10069, 10789, 5998,  624,   -2082, 4417,  1258,  -1080, -819,  -1430,
+    1033,  5220,  6335,  8471,  8980,  11908, 14430, 12584, 8404,  1576,  -803,
+    985,   1481,  1367,  -193,  873,   3684,  2288,  6676,  9477,  11155, 9602,
+    9707,  10507, 4739,  3174,  -575,  -178,  3002,  1710,  423,   -477,  554,
+    3088,  2029,  5113,  5000,  3771,  6090,  5365,  1185,  2855,  399,   -312,
+    -1577, 176,   955,
+  };
+
+  float expected_replicate[] = {
+    13768, 13528, 12999, 6906,  4618,  4043,  2611,  9955,  6685,  4776,  2753,
+    1036,  3063,  4544,  5183,  7349,  12451, 12501, 9131,  12753, 8908,  4058,
+    6299,  7542,  7115,  3307,  3360,  3543,  9754,  7808,  5991,  9019,  14320,
+    14919, 12492, 6871,  7373,  3336,  2085,  10604, 9377,  6882,  5009,  3103,
+    6220,  6278,  7588,  10196, 11045, 11563, 11842, 11911, 8279,  2030,  1858,
+    6368,  12123, 9909,  6347,  10345, 9365,  4038,  1673,  3051,  16492, 16649,
+    12276, 408,   -301,  4122,  -654,  7864,  14038, 15279, 15315, 9744,  8243,
+    5298,  746,   380,   9824,  9124,  10895, 6640,  4712,  2669,  6980,  2759,
+    5385,  12345, 11336, 13129, 8600,  2370,  3682,  5219,  12407, 13123, 6784,
+    2612,  -291,  3183,  9414,  12316, 14524, 12333, 13397, 7543,  3916,  4153,
+    4477,  4314,  7983,  8418,  9163,  9103,  5760,  688,   9571,  15782, 14203,
+    14878, 17718, 14570, 7940,  6642,  5094,  7133,  9964,  10219, 3224,  5558,
+    2996,  7928,  6733,  16117, 15262, 12757, 7958,  4401,  5187,  5476,  5529,
+    6055,  2206,  3909,  6015,  3321,  10908, 10910, 7377,  12204, 12809, 11195,
+    6967,  6840,  481,   -1600, 274,   1,     10373, 8514,  1123,  2117,  6758,
+    12736, 16223, 13585, 15988, 11771, 10600, 7918,  4156,  2840,  3111,  3287,
+    6359,  7652,  8813,  6530,  6967,  7789,  13671, 13990, 13247, 13241, 9836,
+    5251,  3024,  2313,  1834,  4187,  2637,  -1312, 2139,  7378,  7665,  11933,
+    15591, 15314, 15678, 9531,  2820,  -1516, 3400,  1314,  22,    363,   -2896,
+    -898,  5906,  7308,  10650, 12975, 16978, 20370, 18817, 12381, 4118,  -861,
+    -137,  236,   1802,  1632,  -350,  2334,  3400,  8680,  14064, 18216, 18675,
+    21765, 22871, 11491, 4937,  -1555, -11,   1669,  2392,  3265,  -5254, -217,
+    5001,  8063,  13444, 18884, 19706, 22794, 21064, 9545,  6689,  -7,    289,
+    -2021, 504,   2347,
+  };
+
+  float expected_valid[] = {
+    2612,  -291,  3183,  9414,  12316, 14524, 12333, 9103,  5760,  688,
+    9571,  15782, 14203, 14878, 5558,  2996,  7928,  6733,  16117, 15262,
+    12757, 3321,  10908, 10910, 7377,  12204, 12809, 11195,
+  };
+
+  // Positional aggregate initializer: 3 layers; per-layer fields follow the
+  // CNN_LAYER_CONFIG declaration order (channels, kernel dims, strides,
+  // padding, activation, branch settings) -- see av1/encoder/cnn.h.
+  CNN_CONFIG cnn_config = { 3,
+                            0,
+                            0,
+                            0,
+                            0,
+                            {
+                                {
+                                    1,
+                                    filter_width,
+                                    filter_height,
+                                    3,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    3,
+                                    filter_width,
+                                    filter_height,
+                                    3,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    3,
+                                    filter_width,
+                                    filter_height,
+                                    1,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    0,
+                                },
+                            } };
+
+  // Weights and biases need to be specified separately because
+  // of the offset.
+  AssignLayerWeightsBiases(&cnn_config, weights, bias);
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected_same, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Re-run with edge-replication padding on every layer.
+  for (int i = 0; i < cnn_config.num_layers; ++i) {
+    cnn_config.layer_config[i].pad = PADDING_SAME_REPLICATE;
+  }
+
+  RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Re-run with no padding (valid convolution) on every layer.
+  for (int i = 0; i < cnn_config.num_layers; ++i) {
+    cnn_config.layer_config[i].pad = PADDING_VALID;
+  }
+
+  RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Verifies a single convolutional layer with a RELU activation: all negative
+// pre-activation values in the expected outputs appear as zeros. Each of the
+// three padding modes is checked against its own golden output.
+TEST_F(CNNTest, TestRELUSingleLayer) {
+  int image_width = 8;
+  int image_height = 8;
+  int filter_height = 5;
+  int filter_width = 4;
+  float input[] = {
+    0, -2, -3, 1,  -1, 2,  -2, 1,  -3, -1, 0,  1,  -2, -3, -2, -2,
+    1, -3, 2,  -3, -1, -1, 2,  0,  -2, -3, 0,  -2, -3, 1,  -1, -1,
+    2, -2, 0,  -2, -3, -3, 1,  1,  -1, 1,  0,  1,  -3, 0,  2,  2,
+    0, -3, 1,  -3, 2,  -2, 1,  -1, -1, -2, -3, -2, -1, -3, -2, -1,
+  };
+  float expected_same[] = {
+    9,  0,  1,  1,  0,  3,  0,  19, 0,  12, 10, 0,  0,  0,  5, 0,
+    0,  18, 21, 7,  19, 4,  3,  0,  0,  9,  16, 0,  11, 16, 0, 11,
+    12, 2,  0,  11, 0,  16, 6,  0,  8,  22, 13, 10, 12, 0,  0, 0,
+    0,  1,  2,  12, 29, 6,  10, 0,  13, 0,  0,  5,  8,  10, 0, 0,
+  };
+  float expected_replicate[] = {
+    18, 17, 12, 2,  0,  0,  5,  11, 0,  17, 22, 6,  0,  0,  17, 0,
+    0,  18, 21, 7,  19, 4,  3,  5,  3,  9,  16, 0,  11, 16, 0,  3,
+    3,  2,  0,  11, 0,  16, 6,  0,  17, 22, 13, 10, 12, 0,  0,  0,
+    0,  4,  1,  10, 30, 7,  10, 0,  23, 8,  0,  13, 15, 19, 8,  10,
+  };
+  float expected_valid[] = {
+    18, 21, 7, 19, 4, 9, 16, 0, 11, 16, 2, 0, 11, 0, 16, 22, 13, 10, 12, 0,
+  };
+  float weights[] = {
+    -2, -3, 1, 2, 2, -2, -3, 0, -3, 2, 2, -3, -3, -2, 0, 1, 2, 0, -1, -1,
+  };
+  float bias[] = { -3 };
+
+  // Single layer, 1 -> 1 channel, stride 1, RELU activation; weights/bias are
+  // assigned directly here (no offset slicing needed for one layer).
+  CNN_CONFIG cnn_config = { 1,
+                            0,
+                            0,
+                            0,
+                            0,
+                            { {
+                                1,
+                                filter_width,
+                                filter_height,
+                                1,
+                                1,
+                                1,
+                                0,
+                                weights,
+                                bias,
+                                PADDING_SAME_ZERO,
+                                RELU,
+                                0,
+                                0,
+                                BRANCH_NO_COPY,
+                                BRANCH_NOC,
+                                {},
+                                {},
+                                0,
+                            } } };
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected_same, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  cnn_config.layer_config[0].pad = PADDING_SAME_REPLICATE;
+
+  RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  cnn_config.layer_config[0].pad = PADDING_VALID;
+
+  RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Exercises a single 4x11 convolution on a non-square 17x24 image with four
+// different (skip_width, skip_height) stride pairs, each checked against its
+// own golden output; output dimensions shrink as strides grow.
+TEST_F(CNNTest, TestVaryingStridesVaryingDimImages) {
+  float weights[] = {
+    1,  -5, -3, -4, -1, 1,  2,  -3, 2,  2,  -1, 1,  -5, 1,  1,
+    -3, -5, 3,  1,  4,  -2, -5, -2, -3, -5, 0,  -1, -5, 2,  -2,
+    -2, 1,  -2, -4, 1,  3,  -2, 2,  0,  -3, 2,  -3, -2, -3,
+  };
+  float bias[] = { 2 };
+
+  // Initial strides are skip_width = 7, skip_height = 6 (fields 5 and 6 of
+  // the layer initializer); later runs overwrite them in place.
+  CNN_CONFIG cnn_config = { 1,
+                            0,
+                            0,
+                            0,
+                            0,
+                            {
+                                {
+                                    1,
+                                    4,
+                                    11,
+                                    1,
+                                    7,
+                                    6,
+                                    0,
+                                    weights,
+                                    bias,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    0,
+                                },
+                            } };
+
+  int image_height = 24;
+  int image_width = 17;
+  float input[] = {
+    -1, -3, 4,  4,  -5, 4,  3,  -5, -1, -3, 4,  -4, 2,  -3, 3,  -5, 2,  -1, -5,
+    1,  -1, 3,  1,  -3, -3, 4,  0,  2,  -3, -5, -5, -4, 0,  -5, -2, -3, -1, -2,
+    2,  -5, 4,  4,  0,  -4, -3, 1,  -3, -5, -4, -4, 1,  -2, -3, 3,  -3, -3, -1,
+    -5, -5, -2, 3,  1,  -1, -5, -5, 1,  -4, -2, -1, -2, -4, -4, 2,  -2, 2,  1,
+    -2, -4, -1, 1,  -2, -5, 3,  -2, -1, -1, -5, -3, 1,  -2, -2, -3, -1, -2, -4,
+    -2, 1,  -4, -1, 4,  3,  -4, 0,  4,  2,  2,  4,  -3, -5, 2,  2,  1,  -1, -4,
+    -2, 1,  3,  2,  0,  4,  -1, -3, 2,  1,  -4, 2,  2,  -4, -2, 0,  -2, -1, 4,
+    4,  2,  3,  -4, 2,  -4, -5, 4,  -1, -3, -1, 0,  -4, 1,  3,  -1, -3, -5, 3,
+    -2, -4, 1,  2,  -2, -3, -3, -5, 1,  -3, -1, 0,  -1, 3,  -4, -1, -5, -5, 1,
+    0,  0,  -2, -2, 2,  -2, 0,  0,  2,  0,  -3, 0,  -1, -4, -4, -1, 3,  -4, -4,
+    -1, 0,  -5, -3, -2, 4,  -3, -4, -4, 0,  -5, 1,  -2, -3, -3, -4, 4,  3,  4,
+    3,  3,  -1, 3,  1,  -3, -2, 3,  3,  0,  2,  -4, -3, 2,  2,  0,  -2, 4,  -2,
+    2,  -2, -1, -4, -2, 2,  -4, 3,  -1, 4,  1,  1,  4,  -1, -4, -4, 1,  1,  -2,
+    4,  -1, 3,  2,  -3, 4,  3,  1,  4,  0,  -4, 2,  0,  2,  4,  -2, -2, 4,  2,
+    -1, -2, 1,  -3, 2,  3,  -5, -3, 4,  4,  2,  -5, -4, -5, -2, -4, 2,  0,  2,
+    -5, 4,  -4, -2, -5, 2,  1,  0,  4,  1,  -2, -3, -4, -3, -4, 3,  3,  2,  0,
+    -3, 1,  -5, 4,  0,  4,  -1, 3,  -5, -5, -2, -1, -1, 4,  3,  3,  4,  3,  -4,
+    4,  -3, -3, -1, -4, -1, -4, -1, -2, 4,  -2, -4, 4,  4,  -3, -4, -1, 1,  2,
+    -1, -2, -2, 3,  2,  2,  -3, 0,  -1, 0,  3,  2,  -5, 0,  -4, 0,  0,  2,  -4,
+    -1, -1, 0,  -2, 0,  1,  0,  0,  4,  -5, -1, -5, 2,  -1, 0,  2,  -1, 1,  3,
+    -3, -5, -2, -3, 4,  -2, -2, -1, -3, -4, -1, -2, -4, 1,  4,  -3, -2, -1, 3,
+    -3, -2, 3,  2,  1,  -4, -3, -5, 1,
+  };
+  float expected_1[] = {
+    41, -26, 5, 76, 13, 83, -21, 53, -54, -14, 21, 121,
+  };
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected_1, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Strides (6, 7): same output count as (7, 6) but different sample points.
+  cnn_config.layer_config[0].skip_width = 6;
+  cnn_config.layer_config[0].skip_height = 7;
+
+  float expected_2[] = {
+    21, -50, 41, 20, 72, 127, -21, 103, 62, -37, 83, -3,
+  };
+  RunCNNTest(image_width, image_height, input, expected_2, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Strides (3, 10): narrower horizontal stride yields a wider output grid.
+  cnn_config.layer_config[0].skip_width = 3;
+  cnn_config.layer_config[0].skip_height = 10;
+
+  float expected_3[] = {
+    -26, -21, -35, 69, 49,  4,  -51, -43, -56,
+    -41, 15,  -44, 40, -62, 63, 38,  27,  47,
+  };
+  RunCNNTest(image_width, image_height, input, expected_3, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Strides (10, 3): the transposed stride pair of the previous run.
+  cnn_config.layer_config[0].skip_width = 10;
+  cnn_config.layer_config[0].skip_height = 3;
+
+  float expected_4[] = {
+    21, 49, 28, 87, 50, 40, 102, 81, 58, 85, 51, 66, 36, 19, -37, -45,
+  };
+
+  RunCNNTest(image_width, image_height, input, expected_4, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Verifies a 3x3 convolution combined with pooling at stride 3 on an 8x8
+// image (the 7th layer-config field is 1 here, unlike the other tests --
+// presumably the maxpool flag, matching the test name; confirm against
+// CNN_LAYER_CONFIG in av1/encoder/cnn.h).
+TEST_F(CNNTest, TestMaxPool) {
+  int image_width = 8;
+  int image_height = 8;
+  int stride = 3;
+  float input[] = {
+    1,  -4, -4, 8, 0, 7, -5, -2, 8, 2, 2, 8,  5,  -1, -1, 9,
+    -3, 0,  -2, 0, 6, 3, -4, 8,  7, 8, 7, -1, 4,  -1, 0,  2,
+    -5, -2, 8,  5, 5, 4, 2,  7,  4, 6, 2, 8,  8,  -4, -3, -4,
+    -3, -1, 2,  3, 3, 6, -5, 8,  9, 5, 0, -2, -1, 6,  5,  7,
+  };
+
+  float expected[] = {
+    49, 58, 70, 68, 68, 70, 48, 57, 88,
+  };
+
+  float weights[] = {
+    3, 1, 3, 4, -1, 5, -2, 1, -4,
+  };
+
+  float bias[] = {
+    -3,
+  };
+
+  CNN_CONFIG cnn_config = { 1,
+                            0,
+                            0,
+                            0,
+                            0,
+                            { {
+                                1,
+                                3,
+                                3,
+                                1,
+                                stride,
+                                stride,
+                                1,
+                                weights,
+                                bias,
+                                PADDING_SAME_ZERO,
+                                NONE,
+                                0,
+                                0,
+                                BRANCH_NO_COPY,
+                                BRANCH_NOC,
+                                {},
+                                {},
+                                0,
+                            } } };
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Verifies a single transposed-convolution (deconvolve) layer with no
+// activation on a 4x7 image. The field after the activation in the layer
+// initializer is 1 here (0 in the plain-convolution tests) -- presumably the
+// deconvolve flag, matching the test name. Deconvolution upsamples, so the
+// expected outputs are larger than the input; several kernel sizes and
+// stride pairs are checked for both SAME_ZERO and VALID padding.
+TEST_F(CNNTest, TestDeconvolveNonActivationSingleLayerSingleKernel) {
+  int image_width = 4;
+  int image_height = 7;
+  float input[] = {
+    9,  6,   181, 9,  218, 30, 80,  108, 68,  216, 70, 128, 179, 228,
+    33, 212, 34,  14, 48,  27, 230, 23,  202, 113, 80, 56,  122, 112,
+  };
+
+  float expected_1_same[] = {
+    15,   -30,  36,   -525,  377, -193, 558, 531,  6,   -24,  -15,  124,
+    166,  -561, -356, -754,  -3,  -3,   -3,  -3,   -3,  -3,   -3,   -3,
+    433,  -311, 711,  381,   247, -317, 453, 129,  215, -627, -409, -885,
+    17,   -255, -55,  -647,  -3,  -3,   -3,  -3,   -3,  -3,   -3,   -3,
+    133,  -719, 633,  -225,  785, 191,  463, 79,   65,  9,    77,   -853,
+    -365, -949, -15,  -667,  -3,  -3,   -3,  -3,   -3,  -3,   -3,   -3,
+    355,  -866, 990,  207,   747, 12,   520, -116, 176, -312, -133, -1370,
+    -426, -802, 143,  -771,  -3,  -3,   -3,  -3,   -3,  -3,   -3,   -3,
+    65,   -79,  127,  -59,   135, -90,  195, 114,  31,  -91,  -57,  -133,
+    17,   -176, -72,  -276,  -3,  -3,   -3,  -3,   -3,  -3,   -3,   -3,
+    457,  -302, 733,  58,    470, -475, 829, 490,  227, -670, -440, -790,
+    153,  -588, -294, -1150, -3,  -3,   -3,  -3,   -3,  -3,   -3,   -3,
+    157,  -251, 349,  -185,  409, -293, 587, 251,  77,  -187, -107, -369,
+    7,    -481, -135, -827,  -3,  -3,   -3,  -3,   -3,  -3,   -3,   -3,
+  };
+  float expected_1_valid[] = {
+    -30,  15,   -30,  36,   -525,  377,  -193,  558,  531,  24,   24,   6,
+    6,    -24,  -15,  124,  166,   -561, -356,  -754, -21,  -39,  -3,   -3,
+    -3,   -3,   -3,   -3,   -3,    -3,   -3,    -3,   -3,   -657, 433,  -311,
+    711,  381,  247,  -317, 453,   129,  321,   321,  215,  215,  -627, -409,
+    -885, 17,   -255, -55,  -647,  -219, -435,  -3,   -3,   -3,   -3,   -3,
+    -3,   -3,   -3,   -3,   -3,    -3,   -207,  133,  -719, 633,  -225, 785,
+    191,  463,  79,   381,  381,   65,   65,    9,    77,   -853, -365, -949,
+    -15,  -667, -259, -515, -3,    -3,   -3,    -3,   -3,   -3,   -3,   -3,
+    -3,   -3,   -3,   -540, 355,   -866, 990,   207,  747,  12,   520,  -116,
+    633,  633,  176,  176,  -312,  -133, -1370, -426, -802, 143,  -771, -427,
+    -851, -3,   -3,   -3,   -3,    -3,   -3,    -3,   -3,   -3,   -3,   -3,
+    -105, 65,   -79,  127,  -59,   135,  -90,   195,  114,  78,   78,   31,
+    31,   -91,  -57,  -133, 17,    -176, -72,   -276, -57,  -111, -3,   -3,
+    -3,   -3,   -3,   -3,   -3,    -3,   -3,    -3,   -3,   -693, 457,  -302,
+    733,  58,   470,  -475, 829,   490,  336,   336,  227,  227,  -670, -440,
+    -790, 153,  -588, -294, -1150, -229, -455,  -3,   -3,   -3,   -3,   -3,
+    -3,   -3,   -3,   -3,   -3,    -3,   -243,  157,  -251, 349,  -185, 409,
+    -293, 587,  251,  333,  333,   77,   77,    -187, -107, -369, 7,    -481,
+    -135, -827, -227, -451,
+  };
+  float weights_1[] = { -3, 2, -1, 3, 3, 1, 1, -3, -2, -4 };
+  float bias_1[] = { -3 };
+
+  // 5x2 kernel, strides (2, 3), deconvolve flag set, no activation.
+  CNN_CONFIG cnn_config = { 1,
+                            0,
+                            0,
+                            0,
+                            0,
+                            { {
+                                1,
+                                5,
+                                2,
+                                1,
+                                2,
+                                3,
+                                0,
+                                weights_1,
+                                bias_1,
+                                PADDING_SAME_ZERO,
+                                NONE,
+                                1,
+                                0,
+                                BRANCH_NO_COPY,
+                                BRANCH_NOC,
+                                {},
+                                {},
+                                0,
+                            } } };
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected_1_same, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Change padding to valid
+  cnn_config.layer_config[0].pad = PADDING_VALID;
+
+  RunCNNTest(image_width, image_height, input, expected_1_valid, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  float expected_12_same[] = {
+    15,  -12,  6,    36,   -9,   -528, 377,  -184, 513,  558,  -12,  24,
+    6,   -30,  -15,  -33,  -21,  166,  154,  -546, -356, -718, -30,  -21,
+    433, -221, 561,  711,  -33,  -153, 247,  -83,  -87,  453,  -111, 321,
+    215, -657, -409, -845, -93,  17,   -43,  -243, -55,  -215, -327, -219,
+    133, -71,  -447, 633,  -219, 435,  785,  -73,  -177, 463,  -131, 381,
+    65,  -207, 77,   -59,  -651, -365, -797, -213, -15,  -155, -387, -259,
+    355, -182, -150, 990,  -231, 582,  747,  -36,  -540, 520,  -215, 633,
+    176, -540, -133, -491, -687, -426, -882, -102, 143,  77,   -639, -427,
+    65,  -37,  57,   127,  -17,  -105, 135,  -51,  60,   195,  -30,  78,
+    31,  -105, -57,  -125, -45,  17,   -11,  -147, -72,  -168, -84,  -57,
+    457, -233, 618,  733,  -26,  -540, 470,  -205, 264,  829,  -116, 336,
+    227, -693, -440, -900, -72,  153,  107,  -609, -294, -698, -342, -229,
+    157, -83,  69,   349,  -59,  -201, 409,  -125, 27,   587,  -115, 333,
+    77,  -243, -107, -267, -171, 7,    -105, -369, -135, -379, -339, -227,
+  };
+  float expected_12_valid[] = {
+    -30,  15,   -12,  6,    36,   -9,   -528, 377,  -184, 513,  558,  -12,
+    24,   24,   6,    6,    -30,  -15,  -33,  -21,  166,  154,  -546, -356,
+    -718, -30,  -21,  -39,  -657, 433,  -221, 561,  711,  -33,  -153, 247,
+    -83,  -87,  453,  -111, 321,  321,  215,  215,  -657, -409, -845, -93,
+    17,   -43,  -243, -55,  -215, -327, -219, -435, -207, 133,  -71,  -447,
+    633,  -219, 435,  785,  -73,  -177, 463,  -131, 381,  381,  65,   65,
+    -207, 77,   -59,  -651, -365, -797, -213, -15,  -155, -387, -259, -515,
+    -540, 355,  -182, -150, 990,  -231, 582,  747,  -36,  -540, 520,  -215,
+    633,  633,  176,  176,  -540, -133, -491, -687, -426, -882, -102, 143,
+    77,   -639, -427, -851, -105, 65,   -37,  57,   127,  -17,  -105, 135,
+    -51,  60,   195,  -30,  78,   78,   31,   31,   -105, -57,  -125, -45,
+    17,   -11,  -147, -72,  -168, -84,  -57,  -111, -693, 457,  -233, 618,
+    733,  -26,  -540, 470,  -205, 264,  829,  -116, 336,  336,  227,  227,
+    -693, -440, -900, -72,  153,  107,  -609, -294, -698, -342, -229, -455,
+    -243, 157,  -83,  69,   349,  -59,  -201, 409,  -125, 27,   587,  -115,
+    333,  333,  77,   77,   -243, -107, -267, -171, 7,    -105, -369, -135,
+    -379, -339, -227, -451,
+  };
+
+  // Change skip_width, skip_height to {3, 2}
+  cnn_config.layer_config[0].skip_width = 3;
+  cnn_config.layer_config[0].skip_height = 2;
+  // Set padding to same
+  cnn_config.layer_config[0].pad = PADDING_SAME_ZERO;
+
+  RunCNNTest(image_width, image_height, input, expected_12_same, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Change padding to valid
+  cnn_config.layer_config[0].pad = PADDING_VALID;
+  RunCNNTest(image_width, image_height, input, expected_12_valid, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Switch to a 4x3 kernel with fresh weights and bias.
+  cnn_config.layer_config[0].filter_width = 4;
+  cnn_config.layer_config[0].filter_height = 3;
+  float weights_2[] = { -1, -3, -1, -3, 0, 2, -2, 4, 3, 0, 1, 4 };
+  float bias_2[] = { -4 };
+  cnn_config.layer_config[0].weights = weights_2;
+  cnn_config.layer_config[0].bias = bias_2;
+
+  cnn_config.layer_config[0].skip_width = 5;
+  cnn_config.layer_config[0].skip_height = 2;
+  float expected_2_same[] = {
+    -13,  -31,  -13,  -31,  -4,   -10,  -22,  -10,  -22,  -4,   -185, -547,
+    -185, -547, -4,   -13,  -31,  -13,  -31,  -4,   -4,   14,   -22,  32,
+    -4,   -4,   8,    -16,  20,   -4,   -4,   358,  -366, 720,  -4,   -4,
+    14,   -22,  32,   -4,   -195, -658, -213, -622, -4,   -16,  -94,  -28,
+    -70,  -4,   459,  -244, 97,   480,  -4,   -85,  -328, -103, -292, -4,
+    -4,   432,  -440, 868,  -4,   -4,   56,   -64,  116,  -4,   -4,   156,
+    -164, 316,  -4,   -4,   212,  -220, 428,  -4,   582,  -208, 146,  664,
+    -4,   -130, -652, -190, -532, -4,   166,  -214, 6,    106,  -4,   192,
+    -388, -24,  44,   -4,   -4,   132,  -140, 268,  -4,   -4,   428,  -436,
+    860,  -4,   -4,   136,  -144, 276,  -4,   -4,   252,  -260, 508,  -4,
+    21,   -541, -115, -269, -4,   416,  -688, -16,  176,  -4,   173,  -103,
+    33,   177,  -4,   168,  -640, -88,  -128, -4,   -4,   354,  -362, 712,
+    -4,   -4,   452,  -460, 908,  -4,   -4,   62,   -70,  128,  -4,   -4,
+    420,  -428, 844,  -4,   499,  -106, 141,  610,  -4,   666,  -46,  210,
+    866,  -4,   47,   -148, -19,  -16,  -4,   605,  -85,  181,  763,  -4,
+    -4,   64,   -72,  132,  -4,   -4,   24,   -32,  52,   -4,   -4,   92,
+    -100, 188,  -4,   -4,   50,   -58,  104,  -4,   -132, -694, -200, -558,
+    -4,   15,   -73,  -13,  -17,  -4,   -62,  -610, -158, -418, -4,   -36,
+    -343, -90,  -235, -4,   -4,   456,  -464, 916,  -4,   -4,   42,   -50,
+    88,   -4,   -4,   400,  -408, 804,  -4,   -4,   222,  -230, 448,  -4,
+    606,  -244, 146,  676,  -4,   9,    -172, -37,  -80,  -4,   480,  -370,
+    76,   438,  -4,   223,  -340, -3,   112,  -4,   -4,   156,  -164, 316,
+    -4,   -4,   108,  -116, 220,  -4,   -4,   240,  -248, 484,  -4,   -4,
+    220,  -228, 444,  -4,
+  };
+  float expected_2_valid[] = {
+    -13,  -31,  -13,  -31,  -4,   -10,  -22,  -10,  -22,  -4,   -185, -547,
+    -185, -547, -4,   -13,  -31,  -13,  -31,  -4,   14,   -22,  32,   -4,
+    -4,   8,    -16,  20,   -4,   -4,   358,  -366, 720,  -4,   -4,   14,
+    -22,  32,   -195, -658, -213, -622, -4,   -16,  -94,  -28,  -70,  -4,
+    459,  -244, 97,   480,  -4,   -85,  -328, -103, -292, -4,   432,  -440,
+    868,  -4,   -4,   56,   -64,  116,  -4,   -4,   156,  -164, 316,  -4,
+    -4,   212,  -220, 428,  582,  -208, 146,  664,  -4,   -130, -652, -190,
+    -532, -4,   166,  -214, 6,    106,  -4,   192,  -388, -24,  44,   -4,
+    132,  -140, 268,  -4,   -4,   428,  -436, 860,  -4,   -4,   136,  -144,
+    276,  -4,   -4,   252,  -260, 508,  21,   -541, -115, -269, -4,   416,
+    -688, -16,  176,  -4,   173,  -103, 33,   177,  -4,   168,  -640, -88,
+    -128, -4,   354,  -362, 712,  -4,   -4,   452,  -460, 908,  -4,   -4,
+    62,   -70,  128,  -4,   -4,   420,  -428, 844,  499,  -106, 141,  610,
+    -4,   666,  -46,  210,  866,  -4,   47,   -148, -19,  -16,  -4,   605,
+    -85,  181,  763,  -4,   64,   -72,  132,  -4,   -4,   24,   -32,  52,
+    -4,   -4,   92,   -100, 188,  -4,   -4,   50,   -58,  104,  -132, -694,
+    -200, -558, -4,   15,   -73,  -13,  -17,  -4,   -62,  -610, -158, -418,
+    -4,   -36,  -343, -90,  -235, -4,   456,  -464, 916,  -4,   -4,   42,
+    -50,  88,   -4,   -4,   400,  -408, 804,  -4,   -4,   222,  -230, 448,
+    606,  -244, 146,  676,  -4,   9,    -172, -37,  -80,  -4,   480,  -370,
+    76,   438,  -4,   223,  -340, -3,   112,  -4,   156,  -164, 316,  -4,
+    -4,   108,  -116, 220,  -4,   -4,   240,  -248, 484,  -4,   -4,   220,
+    -228, 444,  236,  -4,   76,   316,  -4,   164,  -4,   52,   220,  -4,
+    362,  -4,   118,  484,  -4,   332,  -4,   108,  444,
+  };
+  // Set padding to same
+  cnn_config.layer_config[0].pad = PADDING_SAME_ZERO;
+
+  RunCNNTest(image_width, image_height, input, expected_2_same, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  cnn_config.layer_config[0].pad = PADDING_VALID;
+
+  RunCNNTest(image_width, image_height, input, expected_2_valid, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  // Transposed stride pair of the previous run: (2, 5).
+  cnn_config.layer_config[0].skip_width = 2;
+  cnn_config.layer_config[0].skip_height = 5;
+  float expected_21_same[] = {
+    -31,  -19,  -49,   -191, -565, -194, -574, -13,  14,   -22,  44,   -16,
+    382,  -366, 738,   -22,  -4,   23,   32,   545,  20,   204,  720,  5,
+    -4,   -4,   -4,    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,
+    -4,   -4,   -4,    -4,   -658, -252, -748, -114, -334, -192, -568, -112,
+    432,  -440, 928,   -64,  276,  -164, 532,  -220, -4,   304,  868,  266,
+    116,  400,  316,   104,  -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,
+    -4,   -4,   -4,    -4,   -4,   -4,   -4,   -4,   -208, -288, -856, -290,
+    -862, -202, -598,  -132, 132,  -140, 700,  -436, 1000, -144, 532,  -260,
+    -4,   712,  268,   422,  860,  450,  276,  124,  -4,   -4,   -4,   -4,
+    -4,   -4,   -4,    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,
+    -541, -411, -1225, -265, -787, -249, -739, -216, 354,  -362, 1168, -460,
+    974,  -70,  552,   -428, -4,   859,  712,  323,  908,  665,  128,  208,
+    -4,   -4,   -4,    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,
+    -4,   -4,   -4,    -4,   -106, -52,  -148, -66,  -190, -79,  -229, -31,
+    64,   -72,  160,   -32,  148,  -100, 242,  -58,  -4,   72,   132,  154,
+    52,   125,  188,   23,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,
+    -4,   -4,   -4,    -4,   -4,   -4,   -4,   -4,   -694, -257, -763, -229,
+    -679, -319, -949,  -117, 456,  -464, 962,  -50,  492,  -408, 1030, -230,
+    -4,   295,  916,   625,  88,   537,  804,  109,  -4,   -4,   -4,   -4,
+    -4,   -4,   -4,    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,
+    -244, -140, -412,  -182, -538, -238, -706, -116, 156,  -164, 428,  -116,
+    464,  -248, 708,   -228, -4,   244,  316,  418,  220,  454,  484,  108,
+    -4,   -4,   -4,    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,
+    -4,   -4,   -4,    -4,
+  };
+  float expected_21_valid[] = {
+    -13,  -31,  -19,  -49,  -191, -565, -194, -574, -13,  -31,   -4,   14,
+    -22,  44,   -16,  382,  -366, 738,  -22,  32,   23,   -4,    23,   32,
+    545,  20,   204,  720,  5,    32,   -4,   -4,   -4,   -4,    -4,   -4,
+    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,    -4,   -4,
+    -4,   -4,   -222, -658, -252, -748, -114, -334, -192, -568,  -112, -328,
+    -4,   432,  -440, 928,  -64,  276,  -164, 532,  -220, 428,   650,  -4,
+    304,  868,  266,  116,  400,  316,  104,  428,  -4,   -4,    -4,   -4,
+    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,    -4,   -4,
+    -4,   -4,   -4,   -4,   -72,  -208, -288, -856, -290, -862,  -202, -598,
+    -132, -388, -4,   132,  -140, 700,  -436, 1000, -144, 532,   -260, 508,
+    200,  -4,   712,  268,  422,  860,  450,  276,  124,  508,   -4,   -4,
+    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,    -4,   -4,
+    -4,   -4,   -4,   -4,   -4,   -4,   -183, -541, -411, -1225, -265, -787,
+    -249, -739, -216, -640, -4,   354,  -362, 1168, -460, 974,   -70,  552,
+    -428, 844,  533,  -4,   859,  712,  323,  908,  665,  128,   208,  844,
+    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,    -4,   -4,
+    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -38,  -106,  -52,  -148,
+    -66,  -190, -79,  -229, -31,  -85,  -4,   64,   -72,  160,   -32,  148,
+    -100, 242,  -58,  104,  98,   -4,   72,   132,  154,  52,    125,  188,
+    23,   104,  -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,    -4,   -4,
+    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,    -234, -694,
+    -257, -763, -229, -679, -319, -949, -117, -343, -4,   456,   -464, 962,
+    -50,  492,  -408, 1030, -230, 448,  686,  -4,   295,  916,   625,  88,
+    537,  804,  109,  448,  -4,   -4,   -4,   -4,   -4,   -4,    -4,   -4,
+    -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,    -4,   -4,
+    -84,  -244, -140, -412, -182, -538, -238, -706, -116, -340,  -4,   156,
+    -164, 428,  -116, 464,  -248, 708,  -228, 444,  236,  -4,    244,  316,
+    418,  220,  454,  484,  108,  444,
+  };
+
+  cnn_config.layer_config[0].pad = PADDING_SAME_ZERO;
+
+  RunCNNTest(image_width, image_height, input, expected_21_same, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+
+  cnn_config.layer_config[0].pad = PADDING_VALID;
+
+  RunCNNTest(image_width, image_height, input, expected_21_valid, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Exercises a single conv layer whose kernel is larger than the input image:
+// a 10x11 input whose expected output is a single value (118), then an 11x10
+// input with unit strides whose expected output has one entry per input
+// pixel (11 * 10 = 110 values).
+TEST_F(CNNTest, TestLargeKernelsAndStrides) {
+  float input_10x11[] = {
+    4,  4,  2,  4,  2,  -5, -2, 3, -1, 0,  0,  1,  2,  0,  -5, -2, -5, 1,  -3,
+    -1, 4,  -3, 2,  -2, 1,  0,  1, -3, -3, -4, -2, -2, 1,  -4, -1, 4,  1,  -4,
+    -4, -4, 3,  2,  -5, 3,  -5, 1, 2,  -4, 1,  -1, 3,  4,  -2, 3,  -3, 3,  0,
+    2,  -4, -5, -5, -2, -1, -2, 1, 1,  1,  -2, 4,  -5, 4,  -1, -1, 2,  3,  -4,
+    2,  2,  3,  0,  0,  1,  0,  3, 2,  3,  1,  -2, 3,  -4, 3,  2,  4,  -2, 0,
+    4,  -4, 1,  -3, -3, -3, -5, 1, -3, -5, 0,  4,  -1, -3, 2,
+  };
+
+  float weights_10x11[] = {
+    -3, 4,  -4, -3, -5, 1,  -2, 3,  1,  -4, -4, 0,  -1, 0,  3,  1,  -3, -2, 0,
+    -1, 1,  3,  -4, -4, -3, -3, -2, 4,  3,  -5, 4,  2,  -3, 4,  -2, -1, 2,  -1,
+    -5, 0,  -3, 0,  3,  -5, -5, 3,  -4, -1, -5, 3,  4,  0,  4,  -5, 2,  -1, 2,
+    -1, -1, -1, -5, 0,  -4, 3,  -1, 1,  1,  -1, 3,  2,  -5, -4, 0,  -4, 4,  -5,
+    -3, 4,  -5, 2,  -5, -4, -4, -1, 3,  3,  0,  2,  -4, 1,  -2, 1,  1,  0,  3,
+    -2, 0,  1,  2,  4,  -3, -1, -5, -5, 2,  -4, 1,  1,  2,  -4, -2, -2, 2,  1,
+    3,  4,  -5, 1,  -1, -3, -3, -1, -2, -5, 1,  -1, 0,  1,  4,  4,  0,  0,  4,
+    -3, -1, -5, -3, 0,  1,  1,  1,  -5, 3,  4,  3,  -5, 3,  -2, -2, 0,  -4, 0,
+    0,  -2, 1,  -4, -1, 0,  -5, -2, -2, -5, -3, -3, 1,  1,  -3, 2,  4,  2,  4,
+    -4, -3, 3,  1,  1,  3,  -4, 4,  -2, -3, -3, -3, -3, -4, -2, 3,  -5, 2,  4,
+    -1, -4, -4, 4,  -2, -1, 3,  -3, -4, -4, -2, 4,  1,  0,  2,  -1, 4,  -3, 1,
+    4,  -3, 4,  4,  0,  -4, 3,  -2, -3, 2,  3,  -1, -3, 2,  1,  4,  -2, -3, 1,
+    4,  -2, 2,  -2, -5, -2, 1,  4,  -1, -4, 4,  -5, 2,  -5, -4, -1, -2, 3,  1,
+    2,  1,  -5, 1,  -5, -4, -1, -2, 2,  -2, -4, -3, -2, -2, 4,  -1, 2,  2,  -4,
+    2,  -2, 4,  -4, -2, -2, 1,  -1, 1,  1,  1,  -4, -5, -2, 3,  -4, -1, 3,  -2,
+    3,  2,  -5, -4, 0,  3,  -2, -4, -5, 3,  -2, -4, 2,  -2, 1,  -4, 0,  2,  -5,
+    1,  -4, -1, -1, 4,  -5, -4, 0,  -5, -4, -3, -5, -4, 0,  2,  0,  -4, 2,  -2,
+    1,  1,  -3, 2,  0,  -4, 0,  -4, 1,  0,  -5, -1, -1, -1, -5, 4,  2,  2,  -4,
+    3,  -2, -2, 2,  -3, -2, -1, 2,  -4, -5, 2,  -2, -4, -5, -5, -1, 2,  -1, 0,
+    -5, -2, -2, -5, 0,  1,  -1, -5, 0,  3,  2,  3,  0,  -3, -2, 0,  -5, -1, -2,
+    2,  -4, -1, 2,  2,  -5, 2,  -4, 0,  3,  -3, 1,  0,  0,  1,  -5, -3, 1,  -1,
+    0,  -4, -3, 2,  -4, -4, 4,  -1, 0,  1,  2,  -4, -5, 4,  -2, 1,  -4, -4, -3,
+    -1, -1, 1,  -1, -4, -1, -4, -3, 2,  -1, -2, -4, 1,  1,  0,  -2, 0,  -4, 3,
+    -3, 0,  -4, -1, -4, 2,  -1, -2, -5, -1, -2, -3, 3,  -1, 0,  -3, 0,  1,  -5,
+    1,  -5, 0,  1,
+  };
+
+  float bias_10x11[] = { 3 };
+
+  float expected_10x11[] = {
+    118,
+  };
+
+  // Positional CNN_CONFIG / layer initializer; the filter and stride fields
+  // set here are overwritten before the second RunCNNTest() below.
+  // NOTE(review): field order assumed to follow av1/encoder/cnn.h -- confirm
+  // when changing any entry.
+  CNN_CONFIG cnn_config = { 1,
+                            0,
+                            0,
+                            0,
+                            0,
+                            { {
+                                1,
+                                23,
+                                20,
+                                1,
+                                15,
+                                20,
+                                0,
+                                weights_10x11,
+                                bias_10x11,
+                                PADDING_SAME_ZERO,
+                                NONE,
+                                0,
+                                0,
+                                BRANCH_NO_COPY,
+                                BRANCH_NOC,
+                                {},
+                                {},
+                                0,
+                            } } };
+
+  int image_height = 10;
+  int image_width = 11;
+
+  // Single-threaded run.
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input_10x11, expected_10x11,
+             &cnn_config, image_width, &thread_data, MSE_INT_TOL);
+
+  float input_11x10[] = {
+    -2, -2, 3,  -5, -1, -3, 1,  3,  2,  1,  1,  -5, 4,  1,  3,  -5, 3,  -3, -5,
+    0,  -1, -3, -3, 1,  1,  -5, -1, -5, -5, -3, 0,  1,  -3, -1, -3, -3, 0,  3,
+    4,  -4, -1, 3,  -3, -1, -3, 1,  -3, -2, -1, -4, -3, 2,  -4, 1,  -4, -1, -3,
+    -5, -1, 2,  3,  0,  2,  2,  -5, 4,  1,  2,  -1, -4, 4,  -4, -4, 0,  -1, 1,
+    -1, 1,  -3, -3, -2, 1,  2,  4,  4,  4,  -3, -3, 0,  1,  0,  1,  4,  1,  3,
+    4,  -3, -2, -4, 4,  2,  0,  3,  4,  -1, 2,  -2, 1,  -3, -2,
+  };
+
+  float weights_11x10[] = {
+    4,  -1, 1,  -1, 2,  4,  3,  3,  -4, 3,  -5, 1,  -1, -1, -2, -2, 0,  2,  -3,
+    -2, 3,  -5, -1, 0,  -1, -2, -2, -1, 2,  4,  3,  1,  0,  0,  -3, 3,  -4, -1,
+    -5, 4,  -2, -2, 1,  2,  -1, -3, 1,  2,  -5, 1,  -3, 3,  3,  0,  -4, -4, -5,
+    -3, -4, -4, 4,  -2, 4,  4,  -2, 2,  -5, -1, -2, -5, -1, 4,  -3, 3,  -2, 0,
+    -4, -3, 0,  -1, -2, 4,  2,  0,  -2, -5, -4, 1,  4,  -4, -2, 2,  -2, 1,  1,
+    -4, 1,  -4, -4, -2, 4,  2,  -1, -5, -5, 1,  -3, -3, 3,  -3, -5, -3, 4,  -1,
+    -1, -3, 0,  -4, 3,  -1, 0,  -2, 0,  -5, -2, -5, 2,  0,  -5, 2,  3,  -2, 2,
+    4,  -1, 1,  -3, 2,  3,  2,  0,  -5, -4, -5, 2,  1,  1,  -1, -2, 3,  4,  2,
+    -2, 4,  -2, 3,  1,  -4, -3, -1, 4,  4,  -3, -5, -2, 2,  0,  3,  -2, 3,  -1,
+    -4, 0,  -2, 0,  3,  4,  -2, -3, -2, 0,  3,  4,  2,  -4, 0,  1,  2,  2,  -1,
+    -1, 4,  1,  4,  -2, -1, -1, -5, 1,  -3, 3,  3,  -1, -4, 3,  -5, 0,  0,  -1,
+    -4, -1, -2, 4,  -2, 3,  3,  -3, 1,  -1, 2,  -1, 4,  4,  -2, -2, 4,  -2, 0,
+    3,  -3, -5, -1, -2, 4,  -4, 2,  -4, 0,  -2, 3,  -3, 2,  2,  -2, -5, -1, 4,
+    3,  -2, -1, 3,  3,  -1, 3,  0,  -3, 0,  4,  2,  0,  -1, 4,  1,  1,  2,  1,
+    3,  1,  1,  1,  -3, -5, -4, 4,  -4, 2,  0,  0,  -4, 1,  4,  -5, 4,  4,  0,
+    1,  0,  -2, -4, -4, -3, 0,  1,  -5, 4,  0,  -3, -2, -4, 2,  4,  1,  -5, 1,
+    -4, 1,  0,  -3, -3, 0,  2,  -5, 4,  3,  -2, -5, 3,  1,  -1, 0,  3,  -2, -2,
+    3,  -2, -5, 4,  1,  -2, 2,  -1, 0,  4,  0,  -5, 3,  -2, 1,  2,  1,  -5, -3,
+    -2, -5, 4,  -4, 0,  3,  2,  -1, -4, -1, 2,  1,  -2, 3,  -1, -4, 2,  0,  -3,
+    1,  -1, 2,  -5, -4, -1, -5, 1,  4,  3,  4,  2,  -3, 1,  -5, -1, 3,  0,  -1,
+    -4, 3,  4,  -5, 4,  4,  -3, 2,  -3, -1, -3, -5, -3, 2,  -3, -2, 1,  1,  0,
+    -5, 3,  2,  1,  -5, 1,  1,  1,  3,  4,  -4, -1, -2, 0,  -5, -3, -5, -2, -4,
+    3,  3,  3,  4,  0,  -4, -1, -5, 0,  -3, 1,  4,  4,  -4, 4,  -5, -5, -1, -2,
+    -5, 3,  -4, 4,  3,  0,  -3, 2,  -2, 0,  0,  4,  4,  0,  -2, 1,  -1, -3, 2,
+    -1, 1,  -3, -5,
+  };
+
+  float bias_11x10[] = {
+    -5,
+  };
+
+  float expected_11x10[] = {
+    36,  -84,  95,   45,  18,   46,   77,  -54, -99,  -149, 66,  49,  161, 11,
+    39,  61,   -66,  61,  4,    -3,   34,  -44, -23,  31,   64,  29,  47,  72,
+    -27, -27,  121,  -3,  100,  1,    30,  -78, -12,  -89,  -59, 8,   -16, 112,
+    91,  -102, -26,  -4,  30,   54,   4,   -84, -24,  -58,  27,  -53, -33, 5,
+    53,  -26,  63,   50,  -103, -130, -23, 6,   -104, -207, 73,  23,  77,  132,
+    38,  32,   -130, -44, -60,  7,    27,  176, 45,   -32,  -2,  99,  -97, 63,
+    69,  126,  47,   63,  136,  -57,  5,   16,  -40,  -157, 8,   38,  -44, -10,
+    91,  7,    122,  140, 30,   -105, 4,   -1,  113,  64,   180, 141,
+  };
+
+  // Re-point the layer at the 11x10 data and switch to a 20x23 filter with
+  // unit strides so every input position produces an output.
+  cnn_config.layer_config[0].weights = weights_11x10;
+  cnn_config.layer_config[0].bias = bias_11x10;
+  cnn_config.layer_config[0].filter_width = 20;
+  cnn_config.layer_config[0].filter_height = 23;
+  cnn_config.layer_config[0].skip_width = 1;
+  cnn_config.layer_config[0].skip_height = 1;
+  image_height = 11;
+  image_width = 10;
+
+  RunCNNTest(image_width, image_height, input_11x10, expected_11x10,
+             &cnn_config, image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Checks the SOFTSIGN activation on a single 4x5-filter conv layer over an
+// 8x8 input, running the same input under zero-padded SAME, replicated SAME
+// and VALID padding against precomputed expected outputs.
+TEST_F(CNNTest, TestSoftsignSingleLayer) {
+  int image_width = 8;
+  int image_height = 8;
+  int filter_height = 5;
+  int filter_width = 4;
+  float input[] = {
+    -0.5220f, 0.8410f,  -0.8990f, -0.0090f, 0.6710f,  -0.9470f, -0.8240f,
+    -0.0870f, 0.5380f,  0.4750f,  0.570f,   -0.3760f, -0.6960f, -0.5940f,
+    -0.3830f, 0.080f,   -0.0980f, -0.4940f, -0.4030f, 0.9460f,  -0.6020f,
+    0.4220f,  0.6190f,  0.6640f,  -0.9210f, -0.1470f, -0.2480f, -0.1120f,
+    -0.580f,  -0.0650f, 0.3330f,  0.9860f,  -0.7430f, 0.7610f,  0.4840f,
+    0.1030f,  0.9570f,  0.6120f,  -0.5240f, -0.1220f, -0.5850f, -0.270f,
+    0.7840f,  -0.9790f, 0.7290f,  -0.30f,   -0.6460f, 0.0780f,  0.4750f,
+    -0.0510f, 0.4550f,  0.3850f,  -0.7230f, 0.4460f,  -0.6260f, -0.810f,
+    0.8720f,  -0.2120f, -0.580f,  -0.9510f, -0.8430f, -0.1340f, -0.0850f,
+    0.9190f,
+  };
+  // SAME padding keeps the full 8x8 output.
+  float expected_same[] = {
+    0.430f,   0.660f,  0.5510f,  -0.610f,  0.450f,  -0.1610f, 0.0520f,  0.3240f,
+    0.6820f,  0.3820f, 0.6360f,  0.7480f,  0.3080f, 0.090f,   0.3910f,  0.1730f,
+    0.340f,   0.6660f, -0.4990f, 0.4280f,  0.1540f, 0.120f,   0.4670f,  0.6150f,
+    -0.3880f, 0.7590f, 0.4190f,  0.7350f,  0.5310f, -0.5160f, -0.1760f, 0.6790f,
+    -0.6780f, 0.5470f, 0.5750f,  -0.6420f, 0.7210f, -0.4620f, 0.5430f,  0.770f,
+    -0.1990f, 0.3950f, 0.7860f,  -0.4380f, 0.7540f, 0.2640f,  -0.6430f, 0.4510f,
+    -0.1260f, 0.1590f, -0.2110f, -0.0560f, 0.6570f, 0.680f,   0.5870f,  0.4720f,
+    0.4040f,  0.3630f, 0.670f,   0.2360f,  0.410f,  0.6980f,  -0.5350f, 0.3940f,
+  };
+  float expected_replicate[] = {
+    0.540f,   0.7230f,  -0.3530f, -0.2130f, 0.7440f,  -0.4470f, -0.6260f,
+    -0.2050f, 0.7230f,  0.4630f,  0.5920f,  0.7440f,  0.6080f,  0.3130f,
+    -0.5670f, -0.4720f, 0.5480f,  0.6660f,  -0.4990f, 0.4280f,  0.1540f,
+    0.120f,   0.3390f,  0.6090f,  0.4160f,  0.7590f,  0.4190f,  0.7350f,
+    0.5310f,  -0.5160f, -0.490f,  0.4450f,  -0.610f,  0.5470f,  0.5750f,
+    -0.6420f, 0.7210f,  -0.4620f, 0.3150f,  0.7370f,  -0.5820f, 0.3950f,
+    0.7860f,  -0.4380f, 0.7540f,  0.2640f,  -0.7430f, -0.5340f, -0.6270f,
+    0.4430f,  0.4730f,  0.4570f,  0.7450f,  0.630f,   0.2620f,  0.3140f,
+    -0.1840f, 0.1810f,  0.7210f,  0.2760f,  0.6430f,  0.6720f,  -0.4390f,
+    0.2040f,
+  };
+  // VALID padding shrinks the output to 5x4 (20 values).
+  float expected_valid[] = {
+    0.6660f,  -0.4990f, 0.4280f,  0.1540f,  0.120f,  0.7590f,  0.4190f,
+    0.7350f,  0.5310f,  -0.5160f, 0.5470f,  0.5750f, -0.6420f, 0.7210f,
+    -0.4620f, 0.3950f,  0.7860f,  -0.4380f, 0.7540f, 0.2640f,
+  };
+  float weights[] = {
+    0.6210f,  0.3710f,  -0.2770f, -0.7230f, -0.2450f, 0.6770f,  0.3080f,
+    -0.9880f, -0.080f,  0.7190f,  -0.6760f, -0.0170f, -0.8970f, 0.8260f,
+    0.7390f,  -0.4550f, -0.4260f, -0.6330f, 0.0880f,  -0.9390f,
+  };
+  float bias[] = {
+    0.750f,
+  };
+
+  // Single layer, unit strides, SOFTSIGN activation.
+  CNN_CONFIG cnn_config = { 1,
+                            0,
+                            0,
+                            0,
+                            0,
+                            { {
+                                1,
+                                filter_width,
+                                filter_height,
+                                1,
+                                1,
+                                1,
+                                0,
+                                weights,
+                                bias,
+                                PADDING_SAME_ZERO,
+                                SOFTSIGN,
+                                0,
+                                0,
+                                BRANCH_NO_COPY,
+                                BRANCH_NOC,
+                                {},
+                                {},
+                                0,
+                            } } };
+
+  // Single-threaded run.
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected_same, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+
+  cnn_config.layer_config[0].pad = PADDING_SAME_REPLICATE;
+
+  RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+
+  cnn_config.layer_config[0].pad = PADDING_VALID;
+
+  RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+}
+
+// Builds a 6-layer graph in which the input of the second layer is copied to
+// a side branch (BRANCH_INPUT), the branch is processed by two layers, and
+// the result is summed back into the main path (BRANCH_ADD) before the final
+// single-channel output layer. NOTE(review): the 0x02 entries are presumed to
+// be branch bitmasks selecting branch 1 -- confirm against CNN_BRANCH_CONFIG.
+TEST_F(CNNTest, TestBranchTensorAdd) {
+  int filter_width = 2;
+  int filter_height = 3;
+
+  int image_width = 4;
+  int image_height = 4;
+
+  float input[] = {
+    -3, -2, -2, 0, -1, 3, 2, -2, 1, 3, 4, 0, 2, -5, -4, 0,
+  };
+
+  float weights[] = {
+    -3, -1, 4,  -1, -3, 3,  3,  0,  2,  0,  3,  2,  4,  4, 4,  -5, 1, -4,
+    2,  -4, 1,  -3, 0,  4,  -5, 4,  0,  -4, -3, -1, 0,  0, -2, 0,  0, 2,
+    -5, -1, 1,  -3, 3,  4,  3,  0,  1,  -1, 1,  1,  2,  4, -2, -5, 2, -2,
+    3,  -2, 4,  -1, 0,  2,  3,  2,  -2, -1, -3, 1,  3,  4, -1, -3, 0, -4,
+    4,  2,  -3, -3, -1, 0,  1,  0,  3,  3,  -3, 0,  3,  2, -5, -3, 4, -5,
+    3,  -1, -1, -3, 0,  1,  -1, -4, 2,  4,  -1, 4,  -1, 1, 3,  4,  4, 4,
+    0,  -1, -3, -3, -3, -3, 2,  -3, -2, 2,  3,  -3,
+  };
+
+  float bias[] = {
+    3, 4, -1, -1, 2, 1, -2, 1, 4, 1, 3,
+  };
+
+  float expected[] = {
+    -11502, -4101, -3424, 668,   -17950, -5470, -5504, 626,
+    4835,   446,   1779,  -3483, 3679,   -4214, 4578,  -105,
+  };
+
+  int channels = 2;
+
+  CNN_CONFIG cnn_config = { 6,
+                            0,
+                            0,
+                            0,
+                            0,
+                            { {
+                                  // Layer 0: 1 -> 2 channels, main path.
+                                  1,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  weights,
+                                  bias,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  0,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_NOC,
+                                  {},
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  // Layer 1: copies its input to the branch
+                                  // (BRANCH_INPUT, mask 0x02).
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  0,
+                                  BRANCH_INPUT,
+                                  BRANCH_NOC,
+                                  {
+                                      0x02,
+                                      0,
+                                      0x00,
+                                  },
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  // Layers 2-3: convolve on the side branch.
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  1,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_NOC,
+                                  {},
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  1,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_NOC,
+                                  {},
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  // Layer 4: adds the branch (mask 0x02) back
+                                  // into the main path (BRANCH_ADD).
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  0,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_ADD,
+                                  {
+                                      0x00,
+                                      0,
+                                      0x02,
+                                  },
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  // Layer 5: reduce to 1 channel; output 0.
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  1,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  0,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_NOC,
+                                  {},
+                                  {},
+                                  0,
+                              } } };
+
+  // Weights and biases need to be specified separately because
+  // of the offset.
+  AssignLayerWeightsBiases(&cnn_config, weights, bias);
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Same 6-layer branch topology as TestBranchTensorAdd, but the side branch is
+// re-joined by concatenation (BRANCH_CAT) instead of addition, so the final
+// layer consumes channels + channels input channels. NOTE(review): the 0x02
+// entries are presumed to be branch bitmasks -- confirm against
+// CNN_BRANCH_CONFIG.
+TEST_F(CNNTest, TestBranchTensorConcatenation) {
+  int filter_width = 2;
+  int filter_height = 3;
+
+  int image_width = 4;
+  int image_height = 4;
+
+  float input[] = {
+    -3, -2, -2, 0, -1, 3, 2, -2, 1, 3, 4, 0, 2, -5, -4, 0,
+  };
+
+  float weights[] = {
+    3,  0,  2,  0,  2,  3,  1,  -3, 1,  -5, -3, 0,  -4, 4,  0,  -5, 0,  -5, -1,
+    -2, -5, 0,  -3, 2,  -4, 2,  0,  2,  -1, 0,  -4, 3,  0,  0,  -1, -5, 2,  -1,
+    4,  -4, -2, -3, -3, 3,  4,  -2, -1, -4, -1, 4,  4,  -1, 4,  3,  -4, 2,  -2,
+    -4, -3, -2, 3,  -3, -5, -1, 3,  -2, 4,  1,  -4, -3, -5, -5, -3, 4,  -2, -2,
+    -1, -5, -5, 0,  -1, -2, -3, 3,  -4, -5, 2,  -3, 1,  0,  -5, 2,  2,  -2, 0,
+    2,  2,  -2, 4,  2,  2,  0,  1,  -5, -3, 0,  2,  -2, 1,  2,  -5, 2,  3,  3,
+    -1, 3,  0,  -3, 3,  -4, -4, 3,  3,  -4, -2, 2,  -2, 2,  -2, -1, 3,  0,
+  };
+
+  float bias[] = {
+    -3, -5, 4, -4, -3, -2, 0, 3, -4, 4, -3,
+  };
+
+  float expected[] = {
+    -33533, -32087, -6741,  -2124, 39979, 41453, 14034, 689,
+    -22611, -42203, -14882, -239,  15781, 15963, 9524,  837,
+  };
+
+  int channels = 2;
+
+  CNN_CONFIG cnn_config = { 6,
+                            0,
+                            0,
+                            0,
+                            0,
+                            { {
+                                  // Layer 0: 1 -> 2 channels, main path.
+                                  1,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  weights,
+                                  bias,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  0,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_NOC,
+                                  {},
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  // Layer 1: copies its input to the branch
+                                  // (BRANCH_INPUT, mask 0x02).
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  0,
+                                  BRANCH_INPUT,
+                                  BRANCH_NOC,
+                                  {
+                                      0x02,
+                                      0,
+                                      0x00,
+                                  },
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  // Layers 2-3: convolve on the side branch.
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  1,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_NOC,
+                                  {},
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  1,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_NOC,
+                                  {},
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  // Layer 4: concatenates the branch (mask
+                                  // 0x02) onto the main path (BRANCH_CAT).
+                                  channels,
+                                  filter_width,
+                                  filter_height,
+                                  channels,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  0,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_CAT,
+                                  {
+                                      0x00,
+                                      0,
+                                      0x02,
+                                  },
+                                  {},
+                                  -1,
+                              },
+                              {
+                                  // Layer 5: input doubled by concatenation;
+                                  // reduce to 1 channel, output 0.
+                                  channels + channels,
+                                  filter_width,
+                                  filter_height,
+                                  1,
+                                  1,
+                                  1,
+                                  0,
+                                  nullptr,
+                                  nullptr,
+                                  PADDING_SAME_ZERO,
+                                  NONE,
+                                  0,
+                                  0,
+                                  BRANCH_NO_COPY,
+                                  BRANCH_NOC,
+                                  {},
+                                  {},
+                                  0,
+                              } } };
+
+  // Weights and biases need to be specified separately because
+  // of the offset.
+  AssignLayerWeightsBiases(&cnn_config, weights, bias);
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// TODO(logangw): Add test to test all combinations of branch_copy_type.
+
+TEST_F(CNNTest, TestBranchCombinations) {
+  int filter_width = 2;
+  int filter_height = 3;
+
+  int image_width = 4;
+  int image_height = 4;
+
+  float input[] = {
+    3, 2, -5, -4, 4, -2, -4, -3, 4, 2, -3, 2, -3, 1, -5, -1,
+  };
+
+  float weights[] = {
+    2,  3,  0,  4,  4,  3,  1,  0,  1,  -5, 4,  -3, 3,  0,  4,  -1, -1, -5,
+    2,  1,  -3, -5, 3,  -1, -3, -2, 0,  -2, 3,  0,  -2, -4, -2, -2, 2,  -5,
+    4,  -5, 0,  1,  -5, -4, -3, -4, 2,  -2, 1,  0,  3,  -2, -4, 3,  4,  -4,
+    -1, -1, -3, -2, -2, -1, 2,  0,  2,  -1, 2,  -4, -4, -1, 2,  0,  3,  -2,
+    -2, 3,  -3, 4,  -2, 4,  3,  4,  1,  0,  -2, -3, -5, 1,  -3, 2,  0,  -2,
+    -2, -1, -1, -5, -2, -3, -1, 3,  3,  4,  4,  0,  2,  1,  3,  -3, 2,  -5,
+    -5, 1,  -5, -1, 3,  3,  2,  -4, -1, 3,  -4, -2, -5, -2, 1,  3,  2,  2,
+    -5, -2, -3, -1, -2, -4, -1, -2, 2,  1,  -4, -4, 2,  0,  2,  0,  2,  -3,
+    -2, -4, 4,  0,  1,  -3, -5, 4,  -1, 2,  3,  -5, -1, 0,  4,  -1, -1, 3,
+    -1, -3, 3,  1,  4,  3,  4,  3,  -4, -5, -1, 3,  3,  -4, 3,  1,  3,  -5,
+    3,  4,  -5, 4,  2,  -1, -5, 2,  1,  0,  4,  0,  -3, 2,  0,  2,  -2, 1,
+    -1, -2, -1, -5, 4,  3,  3,  -2, 2,  4,  -5, -5, -3, -2, 4,  0,  -4, 1,
+  };
+
+  float bias[] = {
+    -1, 4, 0, 2, 2, -2, 0, -4, -5, -1, 1, -2, 3, 0, 4, -2, 1, 0, 0,
+  };
+
+  float expected[] = {
+    149496, 15553,  -24193, -20956, 134094, 86432,  -68283, -6366,
+    -53031, 133739, 67407,  -13539, -53205, -58635, -20033, 1979,
+  };
+
+  int channels = 2;
+
+  CNN_CONFIG cnn_config = { 10,
+                            0,
+                            0,
+                            0,
+                            0,
+                            {
+                                {
+                                    1,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    weights,
+                                    bias,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_INPUT,
+                                    BRANCH_NOC,
+                                    {
+                                        0x06,
+                                        0,
+                                        0x00,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    2,
+                                    BRANCH_OUTPUT,
+                                    BRANCH_NOC,
+                                    {
+                                        0x08,
+                                        0,
+                                        0x00,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    3,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    2,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_ADD,
+                                    {
+                                        0x00,
+                                        0,
+                                        0x08,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    2,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    1,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    1,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_ADD,
+                                    {
+                                        0x00,
+                                        0,
+                                        0x0C,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    channels,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_ADD,
+                                    {
+                                        0x00,
+                                        0,
+                                        0x02,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                {
+                                    channels,
+                                    filter_width,
+                                    filter_height,
+                                    1,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    0,
+                                },
+                            } };
+
+  // Weights and biases need to be specified separately because
+  // of the offset.
+  AssignLayerWeightsBiases(&cnn_config, weights, bias);
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Verifies branch copy/concatenate plumbing: layer 0 forks part of its
+// output into a branch, layer 1 concatenates that branch back onto its own
+// output, and layer 2 collapses the result to one channel. The expected[]
+// values were computed offline for exactly these weights/biases.
+// NOTE(review): the layer structs below are positional aggregate
+// initializers; field order assumed to follow CNN_LAYER_CONFIG in
+// av1/encoder/cnn.h (in_channels, filter_width, filter_height,
+// out_channels, skip_width, skip_height, maxpool, weights, bias, pad,
+// activation, deconvolve, branch, branch_copy_type, branch_combine_type,
+// branch_config, bn_params, output_num) -- confirm against the header.
+TEST_F(CNNTest, TestSplittingTensors) {
+  int filter_width = 2;
+  int filter_height = 3;
+
+  int image_width = 4;
+  int image_height = 4;
+
+  float input[] = {
+    -1, -1, 2, 1, 3, 2, 4, -3, -4, -2, 2, -3, 1, -3, 4, -2,
+  };
+
+  float weights[] = {
+    -4, 1,  0,  2,  3,  4,  4,  -4, -5, -3, 2,  2,  -4, -3, 3,  2,
+    4,  -4, -3, -4, -4, 1,  -3, -5, -3, 4,  2,  -2, 2,  -1, -4, -1,
+    -2, -3, 1,  1,  0,  -5, -1, 3,  3,  -5, -3, 0,  -3, 1,  -3, -1,
+    1,  -3, -2, -2, 4,  -2, 0,  1,  2,  2,  -4, 2,  4,  0,  -5, -2,
+    4,  4,  -5, 1,  0,  2,  -2, -5, -5, -3, -5, -5, 4,  -3, 0,  0,
+    -4, -4, 0,  -5, -4, 0,  0,  -3, -5, -3, -1, 2,  -1, 4,  -1, 2,
+  };
+
+  float bias[] = {
+    -4, -2, -3, -3, 3, 1, -2,
+  };
+
+  float expected[] = {
+    530,  -762,  1469, 777,  849,   -771, -1698, 600,
+    -658, -1821, 98,   -668, -1798, 30,   887,   -971,
+  };
+
+  CNN_CONFIG cnn_config = { 3,
+                            0,
+                            0,
+                            0,
+                            0,
+                            {
+                                // Layer 0: 1 -> 4 channels; BRANCH_OUTPUT
+                                // with config { 0x02, 2, 0x00 } -- presumably
+                                // copies 2 channels into branch 1; verify
+                                // against CNN_BRANCH_CONFIG.
+                                {
+                                    1,
+                                    filter_width,
+                                    filter_height,
+                                    4,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_OUTPUT,
+                                    BRANCH_NOC,
+                                    {
+                                        0x02,
+                                        2,
+                                        0x00,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                // Layer 1: 4 -> 2 channels; BRANCH_CAT with
+                                // combine mask 0x02 concatenates branch 1
+                                // back onto this layer's output.
+                                {
+                                    4,
+                                    filter_width,
+                                    filter_height,
+                                    2,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_CAT,
+                                    {
+                                        0x00,
+                                        0,
+                                        0x02,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                // Layer 2: 4 -> 1 channel; final output
+                                // (output_num == 0).
+                                {
+                                    4,
+                                    filter_width,
+                                    filter_height,
+                                    1,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_NOC,
+                                    {},
+                                    {},
+                                    0,
+                                },
+                            } };
+
+  // Weights and biases need to be specified separately because
+  // of the offset.
+  AssignLayerWeightsBiases(&cnn_config, weights, bias);
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_INT_TOL);
+}
+
+// Checks that output channel counting is correct when branches are copied
+// and concatenated: all weights, biases and inputs are zero, so the test
+// only validates the SHAPE of the output (28 zeros = accumulated channels
+// over the 2x2 image), not arithmetic.
+// NOTE(review): layer structs are positional initializers; field order
+// assumed from CNN_LAYER_CONFIG in av1/encoder/cnn.h -- confirm.
+TEST_F(CNNTest, TestOutputChannelsCount) {
+  int filter_width = 1;
+  int filter_height = 1;
+
+  int image_width = 2;
+  int image_height = 2;
+
+  float input[] = { 0, 0, 0, 0 };
+
+  float weights[] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  float bias[] = { 0, 0, 0, 0, 0, 0 };
+
+  float expected[] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+
+  CNN_CONFIG cnn_config = { 3,
+                            0,
+                            0,
+                            0,
+                            0,
+                            {
+                                // Layer 0: copies the INPUT (BRANCH_INPUT)
+                                // into the branches selected by mask 0x06.
+                                {
+                                    1,
+                                    filter_width,
+                                    filter_height,
+                                    2,
+                                    1,
+                                    1,
+                                    0,
+                                    weights,
+                                    bias,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_INPUT,
+                                    BRANCH_NOC,
+                                    {
+                                        0x06,
+                                        0,
+                                        0x00,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                // Layer 1: runs on branch 2 and concatenates
+                                // branches selected by mask 0x03.
+                                {
+                                    1,
+                                    filter_width,
+                                    filter_height,
+                                    2,
+                                    1,
+                                    1,
+                                    0,
+                                    weights,
+                                    bias,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    2,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_CAT,
+                                    {
+                                        0x00,
+                                        0,
+                                        0x03,
+                                    },
+                                    {},
+                                    -1,
+                                },
+                                // Layer 2: concatenates branch mask 0x04 and
+                                // produces the final output (output_num 0).
+                                {
+                                    2,
+                                    filter_width,
+                                    filter_height,
+                                    2,
+                                    1,
+                                    1,
+                                    0,
+                                    weights,
+                                    bias,
+                                    PADDING_SAME_ZERO,
+                                    NONE,
+                                    0,
+                                    0,
+                                    BRANCH_NO_COPY,
+                                    BRANCH_CAT,
+                                    {
+                                        0x00,
+                                        0,
+                                        0x04,
+                                    },
+                                    {},
+                                    0,
+                                },
+                            } };
+
+  // Weights and biases need to be specified separately because
+  // of the offset.
+  AssignLayerWeightsBiases(&cnn_config, weights, bias);
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+}
+
+// Exercises batch normalization: a single stride-7 7x7 conv layer
+// (PADDING_VALID, RELU) over a 28x28 single-channel image (MNIST-style
+// normalized pixel data) with per-channel bn gamma/beta/mean/std applied.
+// kernel/bias/bn_* and expected[] are reference values computed offline;
+// the test asserts the output matches within MSE_FLOAT_TOL.
+TEST_F(CNNTest, TestBatchNorm) {
+  int image_width = 28;
+  int image_height = 28;
+  int filter_height = 7;
+  int filter_width = 7;
+  float input[] = {
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0117647f,  0.0705882f,  0.0705882f,  0.0705882f,
+    0.494118f,  0.533333f,  0.686275f,   0.101961f,   0.65098f,    1.0f,
+    0.968627f,  0.498039f,  0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.117647f,   0.141176f,   0.368627f,   0.603922f,
+    0.666667f,  0.992157f,  0.992157f,   0.992157f,   0.992157f,   0.992157f,
+    0.882353f,  0.67451f,   0.992157f,   0.94902f,    0.764706f,   0.25098f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.192157f,
+    0.933333f,  0.992157f,  0.992157f,   0.992157f,   0.992157f,   0.992157f,
+    0.992157f,  0.992157f,  0.992157f,   0.984314f,   0.364706f,   0.321569f,
+    0.321569f,  0.219608f,  0.152941f,   0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0705882f,  0.858824f,   0.992157f,
+    0.992157f,  0.992157f,  0.992157f,   0.992157f,   0.776471f,   0.713725f,
+    0.968627f,  0.945098f,  0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.313725f,   0.611765f,   0.419608f,   0.992157f,
+    0.992157f,  0.803922f,  0.0431373f,  0.0f,        0.168627f,   0.603922f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.054902f,  0.00392157f, 0.603922f,   0.992157f,   0.352941f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.545098f,  0.992157f,   0.745098f,   0.00784314f, 0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0431373f,
+    0.745098f,  0.992157f,  0.27451f,    0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.137255f,   0.945098f,
+    0.882353f,  0.627451f,  0.423529f,   0.00392157f, 0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.317647f,   0.941176f,   0.992157f,
+    0.992157f,  0.466667f,  0.0980392f,  0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.176471f,   0.729412f,   0.992157f,   0.992157f,
+    0.588235f,  0.105882f,  0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0627451f, 0.364706f,   0.988235f,   0.992157f,   0.733333f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.976471f,  0.992157f,   0.976471f,   0.25098f,    0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.180392f,   0.509804f,   0.717647f,   0.992157f,
+    0.992157f,  0.811765f,  0.00784314f, 0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.152941f,   0.580392f,
+    0.898039f,  0.992157f,  0.992157f,   0.992157f,   0.980392f,   0.713725f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0941176f, 0.447059f,  0.866667f,   0.992157f,   0.992157f,   0.992157f,
+    0.992157f,  0.788235f,  0.305882f,   0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0901961f,  0.258824f,   0.835294f,   0.992157f,
+    0.992157f,  0.992157f,  0.992157f,   0.776471f,   0.317647f,   0.00784314f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0705882f,  0.670588f,
+    0.858824f,  0.992157f,  0.992157f,   0.992157f,   0.992157f,   0.764706f,
+    0.313725f,  0.0352941f, 0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.215686f,  0.67451f,   0.886275f,   0.992157f,   0.992157f,   0.992157f,
+    0.992157f,  0.956863f,  0.521569f,   0.0431373f,  0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.533333f,   0.992157f,
+    0.992157f,  0.992157f,  0.831373f,   0.529412f,   0.517647f,   0.0627451f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f,        0.0f,        0.0f,
+    0.0f,       0.0f,       0.0f,        0.0f
+  };
+  float expected[] = {
+    -0.836424f, -0.857365f, -1.62739f,  -1.62739f,  -0.836424f, 5.40742f,
+    0.920853f,  -0.692567f, -0.836424f, -0.534405f, -1.62739f,  -0.836424f,
+    1.32602f,   1.36312f,   0.112766f,  -0.836424f, -0.192962f, 1.56975f,
+    2.45777f,   0.944414f,  -0.192962f, -1.5519f,   -1.5519f,   -0.554006f,
+    -0.192962f, 1.4231f,    -1.5519f,   -0.192962f, 1.3661f,    -1.5519f,
+    -1.5519f,   -0.192962f, -0.843708f, -0.359025f, -0.843708f, -0.843708f,
+    -0.843708f, 4.53065f,   0.0429584f, -0.796804f, -0.843708f, 0.3473f,
+    -0.843708f, -0.843708f, -0.114439f, 3.14817f,   0.0811934f, -0.843708f
+  };
+  float kernel[] = {
+    0.119643f,    -0.237864f,   0.0462892f,   0.0502297f,   -0.0134528f,
+    0.146347f,    0.153133f,    0.0513307f,   0.0752369f,   0.0135557f,
+    -0.111434f,   0.0941854f,   0.0788362f,   0.0299412f,   0.111762f,
+    0.144066f,    0.00431504f,  -0.0177954f,  0.0738092f,   -0.0344215f,
+    0.0832582f,   0.053989f,    -0.112691f,   0.0962145f,   0.0186525f,
+    -0.00660205f, -0.111962f,   -0.126801f,   -0.231625f,   0.17309f,
+    0.0748875f,   -0.179569f,   -0.00513812f, -0.156579f,   -0.147322f,
+    0.184168f,    0.189308f,    -0.200359f,   -0.0156733f,  0.140649f,
+    0.0858496f,   -0.0263217f,  -0.0740749f,  -0.112563f,   0.107528f,
+    0.0609729f,   -0.221625f,   0.0769944f,   -0.00900815f, -0.00136441f,
+    -0.0236521f,  -0.0418025f,  -0.00286299f, 0.12241f,     0.0964093f,
+    -0.0150897f,  0.0532171f,   0.0625916f,   0.116939f,    0.118024f,
+    0.161918f,    -0.00909767f, 0.100897f,    -0.054563f,   -0.175179f,
+    -0.0687892f,  0.00734235f,  0.109833f,    -0.113776f,   0.0595405f,
+    -0.170255f,   0.0124815f,   -0.0363301f,  -0.0127038f,  0.0445554f,
+    -0.0729894f,  0.107428f,    -0.0341417f,  0.132619f,    0.00984557f,
+    -0.00443654f, 0.202929f,    0.0945134f,   0.0148725f,   0.00998574f,
+    -0.0226449f,  0.0478197f,   -0.0793442f,  0.0707599f,   -0.084225f,
+    0.0865795f,   0.071104f,    -0.047894f,   0.0838322f,   0.0635493f,
+    -0.00370265f, -0.157247f,   -0.0289622f,  -0.0590963f,  0.13207f,
+    0.00468011f,  -0.0345372f,  0.217939f,    0.18861f,     -0.0290393f,
+    -0.0440664f,  0.0126197f,   -0.129132f,   -0.124943f,   0.0968156f,
+    -0.0853643f,  -0.182305f,   0.00461618f,  -0.147095f,   -0.230282f,
+    0.00856019f,  0.0278893f,   -0.0300229f,  0.0417871f,   0.0804717f,
+    -0.0768571f,  -0.0397085f,  -0.0601096f,  0.100901f,    -0.0184926f,
+    0.0350673f,   0.0971094f,   -0.0171837f,  -0.289644f,   -0.0899041f,
+    0.08998f,     -0.160319f,   -0.0195103f,  0.0392167f,   -0.137864f,
+    -0.0136294f,  0.0330886f,   -0.0409244f,  -0.092533f,   -0.0427934f,
+    -0.191144f,   -0.0969461f,  0.112035f,    0.138611f,    0.128717f,
+    0.191184f,    0.197462f
+  };
+  float bias[] = { 0.186703f, 0.204358f, -0.0230452f };
+
+  // Per-output-channel batch-norm parameters (3 channels).
+  float bn_gamma[] = { 1.32173f, 1.26171f, 1.21966f };
+  float bn_beta[] = { -0.232595f, -0.222652f, -0.232209f };
+  float bn_mean[] = { 0.329233f, 0.199894f, 0.12389f };
+  float bn_std[] = { 0.311986f, 0.189737f, 0.247104f };
+
+  CNN_BATCHNORM_PARAMS bn_params = {
+    bn_gamma,
+    bn_beta,
+    bn_mean,
+    bn_std,
+  };
+
+  // Single layer: 1 -> 3 channels, 7x7 filter with stride 7, no branches,
+  // bn_params applied after RELU activation.
+  // NOTE(review): positional initializer; field order assumed from
+  // CNN_LAYER_CONFIG in av1/encoder/cnn.h -- confirm against the header.
+  CNN_CONFIG cnn_config = {
+    1,
+    0,
+    0,
+    0,
+    0,
+    {
+        {
+            1,
+            filter_width,
+            filter_height,
+            3,
+            7,
+            7,
+            0,
+            kernel,
+            bias,
+            PADDING_VALID,
+            RELU,
+            0,
+            0,
+            BRANCH_NO_COPY,
+            BRANCH_NOC,
+            {},
+            bn_params,
+            0,
+        },
+    },
+  };
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+}
+
+// Verifies that the multithreaded CNN path produces the same result as the
+// single-threaded one: the same config/input is run first with 1 worker,
+// then with 4 aom workers, against the same expected[] values.
+TEST_F(CNNTest, TestMultithreading) {
+  int image_height = 2;
+  int image_width = 2;
+  int filter_height = 3;
+  int filter_width = 3;
+
+  float input[] = {
+    -2,
+    4,
+    1,
+    0,
+  };
+
+  float weights[] = {
+    -4, 2, -2, 0,  -4, 4, -3, -3, -3, -1, 1,  0,  -5, -3, 0, -5, 0, 0,
+    -1, 0, 2,  -5, 0,  1, 4,  2,  1,  0,  -2, -1, -5, -3, 2, -2, 1, -5,
+  };
+
+  float bias[] = {
+    -4,
+    -3,
+    -2,
+    3,
+  };
+
+  float expected[] = {
+    2, 10, -8, -17, -24, 5, -15, 6, -5, -5, 7, -10, 4, 13, 9, -14,
+  };
+
+  // Single layer: 1 -> 4 channels, no branching.
+  // NOTE(review): positional initializer; field order assumed from
+  // CNN_LAYER_CONFIG in av1/encoder/cnn.h -- confirm against the header.
+  CNN_CONFIG cnn_config = {
+    1,
+    0,
+    0,
+    0,
+    0,
+    {
+        {
+            1,
+            filter_width,
+            filter_height,
+            4,
+            1,
+            1,
+            0,
+            weights,
+            bias,
+            PADDING_SAME_ZERO,
+            NONE,
+            0,
+            0,
+            BRANCH_NO_COPY,
+            BRANCH_NOC,
+            {},
+            {},
+            0,
+        },
+    },
+  };
+
+  // First pass: single-threaded baseline.
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+
+  // Second pass: same computation split across 4 workers must match the
+  // same expected output.
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  AVxWorker workers[4];
+
+  for (int i = 0; i < 4; ++i) {
+    winterface->init(&workers[i]);
+  }
+
+  thread_data = { 4, workers };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+
+  for (int i = 0; i < 4; ++i) {
+    winterface->end(&workers[i]);
+  }
+}
+
+TEST_F(CNNTest, TestMultiOutput) {
+  const int image_dim = 8;
+  const int image_ch = 3;
+  const int filter_dim = 2;
+  const int stride = 2;
+  const int num_filters = 2;
+
+  const float input_[] = {
+    1.7537929121f,     0.134331551012f,    0.123580039877f,   0.957731845246f,
+    0.391006834217f,   1.00699352042f,     -0.778177955829f,  -0.814166433059f,
+    -0.656374394915f,  0.321967305228f,    -2.19455719176f,   0.708035038966f,
+    0.409148822266f,   -0.318254408902f,   0.152450211189f,   -0.250210793369f,
+    0.826811563186f,   1.6804156584f,      0.273626975978f,   0.437936241887f,
+    -0.329935520167f,  -0.288761611645f,   0.156937008304f,   0.271054157295f,
+    -0.0224828854332f, 1.70110336895f,     -0.989066699309f,  1.30863131729f,
+    -0.165813705702f,  0.00380178619265f,  -0.0837342367587f, 0.760954783156f,
+    -0.413610373524f,  1.17968204175f,     0.720295719536f,   0.308718974472f,
+    -1.10091337671f,   0.693160033687f,    -0.0202862320697f, 1.0221927503f,
+    -1.24521801881f,   -0.478501952308f,   -1.71648619442f,   -0.182571723636f,
+    0.339292649504f,   2.0806519131f,      0.967974033444f,   0.175248672328f,
+    0.0658124561472f,  0.795504169496f,    0.750592557361f,   -1.46631013249f,
+    -1.79052846838f,   -1.03672179515f,    -0.841985521653f,  1.20995011489f,
+    0.140859718215f,   -0.651552622661f,   0.451065110806f,   1.1189443693f,
+    0.100213260593f,   -0.834076868118f,   -1.28734321611f,   1.22064420095f,
+    -0.364143084361f,  0.750961509335f,    -0.888689074553f,  -0.8253547106f,
+    -1.21800999027f,   -0.966670603566f,   1.37384014741f,    0.47281264834f,
+    -0.420416235531f,  0.520163906493f,    0.501296589423f,   1.53418976951f,
+    0.715234751485f,   0.644551588907f,    0.0763504863375f,  -0.0018541943723f,
+    0.322853189656f,   -0.795099723224f,   -0.125177096675f,  1.4476577471f,
+    -0.585888410088f,  -1.44391754955f,    -0.610543221933f,  -0.221859179799f,
+    0.252060200774f,   -0.86287169623f,    -0.0350246229157f, 1.0932311997f,
+    0.899464648842f,   -0.468806951704f,   -0.300861137168f,  1.15776414206f,
+    1.03268544738f,    -0.171579585622f,   -0.179136557119f,  -0.354091003368f,
+    -0.612298249394f,  -1.20237379258f,    1.54604109659f,    0.130664370287f,
+    0.885225111868f,   1.0362799581f,      0.980561720868f,   -0.619379186999f,
+    -1.33818929924f,   -0.237233737961f,   -1.89335425073f,   0.567821011321f,
+    0.862420368465f,   -1.37380916821f,    0.352190056666f,   0.611261516274f,
+    0.393237747152f,   0.894686247967f,    0.190405182149f,   0.264872662911f,
+    -0.0657009133797f, 0.0580512653493f,   -0.401825294366f,  0.4106081318f,
+    0.49484512188f,    -0.0751103149442f,  -1.43243736382f,   1.79855656009f,
+    -1.1075351975f,    0.000354882733011f, -0.950716438608f,  1.27129831688f,
+    1.00495189838f,    0.110358656713f,    1.08315032822f,    -0.972676676218f,
+    -0.0757668962831f, 1.88932045165f,     -0.0672638136275f, 0.425913010161f,
+    -0.781540372017f,  0.976000248609f,    0.687218504122f,   1.31374513445f,
+    -0.932658930672f,  -1.25339468479f,    0.422071294078f,   -0.24189927912f,
+    0.216906604642f,   -1.88720997548f,    1.99252872889f,    0.353943735777f,
+    0.737434784132f,   -1.17848645017f,    1.70424254896f,    0.775297112968f,
+    -0.516392797501f,  0.398130609129f,    0.737248101457f,   0.166282500886f,
+    1.24699015468f,    0.47116183125f,     1.19091180182f,    -0.372695424578f,
+    0.219773209389f,   -0.829467838962f,   -0.52533122724f,   1.98707754595f,
+    0.553692606972f,   -0.933228902369f,   1.55427751643f,    -1.08813399144f,
+    -0.325686682094f,  0.205091443796f,    -1.70381666435f,   0.466465327942f,
+    1.73126863447f,    -0.939133672634f,   1.48318077459f,    -0.599414038168f,
+    -1.1583078687f,    0.518116190201f,    0.133571482458f,   0.84958342672f,
+    1.02205000597f,    -0.0772082009087f,  -1.69567503859f,   1.4697939436f,
+    1.67813743122f,    -0.627911582938f,   0.131380509137f,   -1.35717850726f,
+  };
+  const float *input[3] = { input_, &input_[image_dim * image_dim],
+                            &input_[2 * image_dim * image_dim] };
+
+  const float bias[] = { 0.0f, 0.0f };
+
+  const float weights_1[] = {
+    -0.489547413618f, 0.141916424749f,  -0.279286485585f,  -0.115322211094f,
+    0.299572786936f,  0.205289980785f,  -0.536254480088f,  -0.253626313744f,
+    -0.422883815849f, -0.169702966298f, -0.540104704793f,  0.495319646763f,
+    0.298799079422f,  -0.10054550901f,  -0.306085047056f,  0.171061886165f,
+    -0.108058703878f, -0.410734629888f, -0.0640674673049f, -0.386524840979f,
+    -0.157203423678f, -0.362138920529f, -0.216206085209f,  0.147502517971f,
+  };
+
+  const float weights_2[] = {
+    0.207580604357f,  0.480821146263f,  -0.29111909562f,   0.47422567493f,
+    0.206892553253f,  -0.235067084092f, 0.354516800602f,   -0.212399370252f,
+    -0.419071343731f, -0.050350731631f, -0.0516457320279f, -0.0359310500731f,
+    0.567044864811f,  -0.060341127522f, 0.0501464839637f,  -0.437785677916f,
+  };
+
+  const float weights_3[] = {
+    -0.0690452401448f, -0.356657338763f,   -0.219464031809f, 0.551288365843f,
+    0.181372090853f,   -0.00245268542109f, 0.409000696276f,  -0.593209108763f,
+    0.587352566749f,   -0.243720660227f,   0.266232713887f,  -0.00439285245097f,
+    0.252883228305f,   0.152646192631f,    0.0918944932026f, 0.398853715057f,
+  };
+
+  const float weights_4[] = {
+    0.207560791573f,   0.194201350401f,   0.227802322443f,  0.206533663345f,
+    0.0557331066805f,  0.0224159800424f,  -0.143939197467f, -0.27703361602f,
+    0.130643888389f,   -0.269456557461f,  0.186242862864f,  -0.162879944774f,
+    -0.145503996718f,  -0.0768822987581f, -0.203127976359f, -0.238119922873f,
+    -0.258806479994f,  0.0357957680385f,  -0.1027606976f,   -0.287920082345f,
+    0.189047820993f,   0.250711538481f,   -0.272815714175f, -0.0431449742024f,
+    0.207261230996f,   -0.0396472677451f, 0.131236557412f,  0.174291832499f,
+    -0.251515885765f,  -0.107164007499f,  0.185824534748f,  -0.00561585838161f,
+    0.273393799578f,   -0.139563699075f,  -0.263922456031f, -0.118859844081f,
+    0.109230982597f,   -0.170170294794f,  0.0123025648515f, -0.0839368964355f,
+    -0.0774058234297f, 0.255847138286f,   -0.208430879637f, 0.279170114319f,
+    -0.272890330712f,  -0.217725903006f,  -0.295923275459f, -0.17008723953f,
+    -0.284281803405f,  0.281406323629f,   0.266910044663f,  -0.209963914338f,
+    0.271980962964f,   0.142013581699f,   -0.143896509026f, -0.290509242975f,
+    -0.305768180935f,  0.196902832117f,   -0.090424189662f, -0.147460802346f,
+    0.217722016651f,   0.12353848977f,    -0.169177363577f, -0.0454230918512f,
+  };
+
+  const float expected_0[] = {
+    -2.04858441055f,  -2.12883075791f,    -0.045177363807f, 0.763949675768f,
+    -0.544361512821f, -1.58123168032f,    1.89319847039f,   0.16859080901f,
+    -1.16023321135f,  -0.396988107751f,   1.76637090744f,   -1.40434786514f,
+    0.908227575669f,  0.817064817605f,    0.215631134908f,  -0.848605613428f,
+    -0.106756747018f, 0.0193027166685f,   0.801345615113f,  -0.395407237598f,
+    -1.79983795658f,  -1.73054496242f,    0.0584392594454f, -0.388786095569f,
+    -0.237269619354f, 0.000843578271263f, -1.24043512104f,  0.487839445893f,
+    -0.394259726605f, 0.559632843424f,    -0.527224052291f, -1.53792340282f,
+  };
+
+  const float expected_1[] = {
+    0.0f, 0.0f,           0.0f, 0.0f, 0.4057888292f, 0.325309571755f,
+    0.0f, 1.22013465602f,
+  };
+
+  const float expected_2[] = {
+    0.156119444687f,
+    0.517385299817f,
+  };
+
+  const float expected_3[] = {
+    0.224177852984f,
+    0.503384419034f,
+    0.156119444687f,
+    0.517385299817f,
+  };
+
+  const float *expected[] = { expected_0, expected_1, expected_2, expected_3 };
+
+  CNN_CONFIG cnn_config = {
+    4,  // num_layers
+    0,  // is_residue
+    0,  // ext_width
+    0,  // ext_height
+    0,  // strict_bounds
+    {
+        // layer_config
+        {
+            image_ch,           // in_channels
+            filter_dim,         // filter_width
+            filter_dim,         // filter_height
+            num_filters,        // out_channels
+            stride,             // skip_width
+            stride,             // skip_height
+            0,                  // max_pool
+            weights_1,          // weights
+            bias,               // bias
+            PADDING_SAME_ZERO,  // pad
+            NONE,               // activation
+            0,                  // deconvolve
+            0,                  // branch
+            BRANCH_OUTPUT,      // branch_copy_type
+            BRANCH_NOC,         // branch_combine_type
+            { 2, 0, 0 },        // branch_config
+            {},                 // bn_params
+            0,                  // output_num
+        },
+        {
+            num_filters,        // in_channels
+            filter_dim,         // filter_width
+            filter_dim,         // filter_height
+            num_filters,        // out_channels
+            stride,             // skip_width
+            stride,             // skip_height
+            0,                  // max_pool
+            weights_2,          // weights
+            bias,               // bias
+            PADDING_SAME_ZERO,  // pad
+            RELU,               // activation
+            0,                  // deconvolve
+            0,                  // branch
+            BRANCH_NO_COPY,     // branch_copy_type
+            BRANCH_NOC,         // branch_combine_type
+            {},                 // branch_config
+            {},                 // bn_params
+            1,                  // output_num
+        },
+        {
+            num_filters,        // in_channels
+            filter_dim,         // filter_width
+            filter_dim,         // filter_height
+            num_filters,        // out_channels
+            stride,             // skip_width
+            stride,             // skip_height
+            0,                  // max_pool
+            weights_3,          // weights
+            bias,               // bias
+            PADDING_SAME_ZERO,  // pad
+            RELU,               // activation
+            0,                  // deconvolve
+            0,                  // branch
+            BRANCH_NO_COPY,     // branch_copy_type
+            BRANCH_NOC,         // branch_combine_type
+            {},                 // branch_config
+            {},                 // bn_params
+            2,                  // output_num
+        },
+        {
+            num_filters,     // in_channels
+            2 * filter_dim,  // filter_width
+            2 * filter_dim,  // filter_height
+            num_filters,     // out_channels
+            2 * stride,      // skip_width
+            2 * stride,      // skip_height
+            0,               // max_pool
+            weights_4,       // weights
+            bias,            // bias
+            PADDING_VALID,   // pad
+            RELU,            // activation
+            0,               // deconvolve
+            1,               // branch
+            BRANCH_NO_COPY,  // branch_copy_type
+            BRANCH_CAT,      // branch_combine_type
+            { 0, 0, 1 },     // branch_config
+            {},              // bn_params
+            3,               // output_num
+        },
+    },
+  };
+
+  CNN_THREAD_DATA thread_data = { 1, NULL };
+
+  const int num_outputs = 4;
+  const int output_chs[4] = { filter_dim, filter_dim, filter_dim,
+                              2 * filter_dim };
+  const int output_dims[4] = { 4, 2, 1, 1 };
+  const int output_sizes[4] = {
+    output_chs[0] * output_dims[0] * output_dims[0],
+    output_chs[1] * output_dims[1] * output_dims[1],
+    output_chs[2] * output_dims[2] * output_dims[2],
+    output_chs[3] * output_dims[3] * output_dims[3],
+  };
+  float *const output_ = (float *)aom_malloc(
+      sizeof(*output_) *
+      (output_sizes[0] + output_sizes[1] + output_sizes[2] + output_sizes[3]));
+  float *output[CNN_MAX_CHANNELS] = { nullptr };
+  int ch_ite = 0;
+  float *output_ite = output_;
+  for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+    for (int channel = 0; channel < output_chs[output_idx]; ++channel) {
+      output[ch_ite++] = output_ite;
+      output_ite += output_dims[output_idx] * output_dims[output_idx];
+    }
+  }
+  CNN_MULTI_OUT output_struct = { num_outputs, output_chs, output_dims,
+                                  output };
+
+  RunMultiOutCNNTest(input, image_dim, image_dim, image_dim, &cnn_config,
+                     &thread_data, &output_struct, expected, MSE_FLOAT_TOL);
+
+  aom_free(output_);
+}
diff --git a/libaom/test/codec_factory.h b/libaom/test/codec_factory.h
index dd99110..801b894 100644
--- a/libaom/test/codec_factory.h
+++ b/libaom/test/codec_factory.h
@@ -11,6 +11,8 @@
 #ifndef AOM_TEST_CODEC_FACTORY_H_
 #define AOM_TEST_CODEC_FACTORY_H_
 
+#include <tuple>
+
 #include "config/aom_config.h"
 
 #include "aom/aom_decoder.h"
@@ -40,11 +42,11 @@
                                  const aom_codec_flags_t flags) const = 0;
 
   virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
-                                 const unsigned long init_flags,
+                                 const aom_codec_flags_t init_flags,
                                  TwopassStatsStore *stats) const = 0;
 
   virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
-                                               int usage) const = 0;
+                                               unsigned int usage) const = 0;
 };
 
 /* Provide CodecTestWith<n>Params classes for a variable number of parameters
@@ -54,27 +56,28 @@
 template <class T1>
 class CodecTestWithParam
     : public ::testing::TestWithParam<
-          ::testing::tuple<const libaom_test::CodecFactory *, T1> > {};
+          std::tuple<const libaom_test::CodecFactory *, T1> > {};
 
 template <class T1, class T2>
 class CodecTestWith2Params
     : public ::testing::TestWithParam<
-          ::testing::tuple<const libaom_test::CodecFactory *, T1, T2> > {};
+          std::tuple<const libaom_test::CodecFactory *, T1, T2> > {};
 
 template <class T1, class T2, class T3>
 class CodecTestWith3Params
     : public ::testing::TestWithParam<
-          ::testing::tuple<const libaom_test::CodecFactory *, T1, T2, T3> > {};
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3> > {};
 
 template <class T1, class T2, class T3, class T4>
 class CodecTestWith4Params
-    : public ::testing::TestWithParam< ::testing::tuple<
-          const libaom_test::CodecFactory *, T1, T2, T3, T4> > {};
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3, T4> > {};
 
 template <class T1, class T2, class T3, class T4, class T5>
 class CodecTestWith5Params
-    : public ::testing::TestWithParam< ::testing::tuple<
-          const libaom_test::CodecFactory *, T1, T2, T3, T4, T5> > {};
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3, T4, T5> > {
+};
 
 /*
  * AV1 Codec Definitions
@@ -98,7 +101,7 @@
 
 class AV1Encoder : public Encoder {
  public:
-  AV1Encoder(aom_codec_enc_cfg_t cfg, const uint32_t init_flags,
+  AV1Encoder(aom_codec_enc_cfg_t cfg, const aom_codec_flags_t init_flags,
              TwopassStatsStore *stats)
       : Encoder(cfg, init_flags, stats) {}
 
@@ -132,7 +135,7 @@
   }
 
   virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
-                                 const unsigned long init_flags,
+                                 const aom_codec_flags_t init_flags,
                                  TwopassStatsStore *stats) const {
 #if CONFIG_AV1_ENCODER
     return new AV1Encoder(cfg, init_flags, stats);
@@ -145,7 +148,7 @@
   }
 
   virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+                                               unsigned int usage) const {
 #if CONFIG_AV1_ENCODER
     return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, usage);
 #else
@@ -159,7 +162,7 @@
 const libaom_test::AV1CodecFactory kAV1;
 
 #define AV1_INSTANTIATE_TEST_CASE(test, ...)                                \
-  INSTANTIATE_TEST_CASE_P(                                                  \
+  INSTANTIATE_TEST_SUITE_P(                                                 \
       AV1, test,                                                            \
       ::testing::Combine(                                                   \
           ::testing::Values(static_cast<const libaom_test::CodecFactory *>( \
diff --git a/libaom/test/coding_path_sync.cc b/libaom/test/coding_path_sync.cc
index 6735236..4c613dc 100644
--- a/libaom/test/coding_path_sync.cc
+++ b/libaom/test/coding_path_sync.cc
@@ -15,12 +15,13 @@
 
 #include "config/aom_config.h"
 
-#include "aom_ports/mem.h"  // ROUND_POWER_OF_TWO
 #include "aom/aomcx.h"
 #include "aom/aomdx.h"
 #include "aom/aom_encoder.h"
 #include "aom/aom_decoder.h"
 
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
 using libaom_test::ACMRandom;
 namespace {
 
diff --git a/libaom/test/comp_avg_pred_test.cc b/libaom/test/comp_avg_pred_test.cc
index 3e5632e..ac625a7 100644
--- a/libaom/test/comp_avg_pred_test.cc
+++ b/libaom/test/comp_avg_pred_test.cc
@@ -14,10 +14,12 @@
 using libaom_test::ACMRandom;
 using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
 using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
+#if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest;
-using ::testing::make_tuple;
-using ::testing::tuple;
+#endif
+using std::make_tuple;
+using std::tuple;
 
 namespace {
 
@@ -26,9 +28,9 @@
 TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1DISTWTDCOMPAVGTest,
-                        libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
-                            aom_dist_wtd_comp_avg_pred_ssse3));
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGTest,
+                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                             aom_dist_wtd_comp_avg_pred_ssse3));
 #endif
 
 TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
@@ -40,11 +42,12 @@
 }
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
-                        libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
-                            aom_dist_wtd_comp_avg_upsampled_pred_ssse3));
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
+                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                             aom_dist_wtd_comp_avg_upsampled_pred_ssse3));
 #endif
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) {
   RunSpeedTest(GET_PARAM(1));
 }
@@ -54,9 +57,9 @@
 }
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest,
-                        libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
-                            aom_highbd_dist_wtd_comp_avg_pred_sse2, 1));
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest,
+                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                             aom_highbd_dist_wtd_comp_avg_pred_sse2, 1));
 #endif
 
 TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
@@ -68,9 +71,10 @@
 }
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
-                        libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
-                            aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
+                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                             aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2));
 #endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 }  // namespace
diff --git a/libaom/test/comp_avg_pred_test.h b/libaom/test/comp_avg_pred_test.h
index 01ea35d..7f73312 100644
--- a/libaom/test/comp_avg_pred_test.h
+++ b/libaom/test/comp_avg_pred_test.h
@@ -12,6 +12,8 @@
 #ifndef AOM_TEST_COMP_AVG_PRED_TEST_H_
 #define AOM_TEST_COMP_AVG_PRED_TEST_H_
 
+#include <tuple>
+
 #include "config/aom_dsp_rtcd.h"
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -38,6 +40,12 @@
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
     int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
 
+typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam;
+
+typedef std::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE>
+    DISTWTDCOMPAVGUPSAMPLEDParam;
+
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*highbddistwtdcompavgupsampled_func)(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
@@ -45,28 +53,11 @@
     int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
     int subpel_search);
 
-typedef ::testing::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam;
-
-typedef ::testing::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE>
-    DISTWTDCOMPAVGUPSAMPLEDParam;
-
-typedef ::testing::tuple<int, distwtdcompavg_func, BLOCK_SIZE>
-    HighbdDISTWTDCOMPAVGParam;
-
-typedef ::testing::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE>
+typedef std::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE>
     HighbdDISTWTDCOMPAVGUPSAMPLEDParam;
 
-::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams(
-    distwtdcompavg_func filter) {
-  return ::testing::Combine(::testing::Values(filter),
-                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
-}
-
-::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam> BuildParams(
-    distwtdcompavgupsampled_func filter) {
-  return ::testing::Combine(::testing::Values(filter),
-                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
-}
+typedef std::tuple<int, distwtdcompavg_func, BLOCK_SIZE>
+    HighbdDISTWTDCOMPAVGParam;
 
 ::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGParam> BuildParams(
     distwtdcompavg_func filter, int is_hbd) {
@@ -82,6 +73,19 @@
                             ::testing::Values(filter),
                             ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams(
+    distwtdcompavg_func filter) {
+  return ::testing::Combine(::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam> BuildParams(
+    distwtdcompavgupsampled_func filter) {
+  return ::testing::Combine(::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
 
 class AV1DISTWTDCOMPAVGTest
     : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> {
@@ -315,6 +319,7 @@
   libaom_test::ACMRandom rnd_;
 };  // class AV1DISTWTDCOMPAVGUPSAMPLEDTest
 
+#if CONFIG_AV1_HIGHBITDEPTH
 class AV1HighBDDISTWTDCOMPAVGTest
     : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> {
  public:
@@ -555,7 +560,8 @@
   }
 
   libaom_test::ACMRandom rnd_;
-};  // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
+};      // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 }  // namespace AV1DISTWTDCOMPAVG
 }  // namespace libaom_test
diff --git a/libaom/test/comp_mask_variance_test.cc b/libaom/test/comp_mask_variance_test.cc
index 53ba56c..b666306 100644
--- a/libaom/test/comp_mask_variance_test.cc
+++ b/libaom/test/comp_mask_variance_test.cc
@@ -11,6 +11,7 @@
 
 #include <cstdlib>
 #include <new>
+#include <tuple>
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
@@ -40,7 +41,7 @@
   BLOCK_16X32, BLOCK_32X8, BLOCK_32X16, BLOCK_32X32,
 };
 #endif
-typedef ::testing::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
+typedef std::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
 
 class AV1CompMaskVarianceTest
     : public ::testing::TestWithParam<CompMaskPredParam> {
@@ -105,8 +106,7 @@
                                              BLOCK_SIZE bsize, int inv) {
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const int wedge_types = get_wedge_types_lookup(bsize);
   for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
 
@@ -123,8 +123,7 @@
                                            BLOCK_SIZE bsize) {
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const int wedge_types = get_wedge_types_lookup(bsize);
   int wedge_index = wedge_types / 2;
   const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
   const int num_loops = 1000000000 / (w + h);
@@ -158,14 +157,14 @@
 }
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, AV1CompMaskVarianceTest,
     ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
                        ::testing::ValuesIn(kValidBlockSize)));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1CompMaskVarianceTest,
     ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
                        ::testing::ValuesIn(kValidBlockSize)));
@@ -189,7 +188,7 @@
                                                BLOCK_SIZE bsize, int inv) {
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const int wedge_types = get_wedge_types_lookup(bsize);
   int subpel_search;
   for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
        ++subpel_search) {
@@ -224,8 +223,7 @@
   const int h = block_size_high[bsize];
   const int subx = havSub ? 3 : 0;
   const int suby = havSub ? 4 : 0;
-
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const int wedge_types = get_wedge_types_lookup(bsize);
   int wedge_index = wedge_types / 2;
   const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
 
@@ -262,14 +260,14 @@
 }
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, AV1CompMaskUpVarianceTest,
     ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
                        ::testing::ValuesIn(kValidBlockSize)));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1CompMaskUpVarianceTest,
     ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
                        ::testing::ValuesIn(kValidBlockSize)));
@@ -277,13 +275,14 @@
 
 #endif  // ifndef aom_comp_mask_pred
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*highbd_comp_mask_pred_func)(uint8_t *comp_pred8,
                                            const uint8_t *pred8, int width,
                                            int height, const uint8_t *ref8,
                                            int ref_stride, const uint8_t *mask,
                                            int mask_stride, int invert_mask);
 
-typedef ::testing::tuple<highbd_comp_mask_pred_func, BLOCK_SIZE, int>
+typedef std::tuple<highbd_comp_mask_pred_func, BLOCK_SIZE, int>
     HighbdCompMaskPredParam;
 
 class AV1HighbdCompMaskVarianceTest
@@ -347,11 +346,9 @@
 void AV1HighbdCompMaskVarianceTest::RunCheckOutput(
     highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
   int bd_ = GET_PARAM(2);
-
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const int wedge_types = get_wedge_types_lookup(bsize);
 
   for (int i = 0; i < MAX_SB_SQUARE; ++i) {
     pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
@@ -381,8 +378,7 @@
 
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const int wedge_types = get_wedge_types_lookup(bsize);
   int wedge_index = wedge_types / 2;
 
   for (int i = 0; i < MAX_SB_SQUARE; ++i) {
@@ -426,7 +422,7 @@
 }
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1HighbdCompMaskVarianceTest,
     ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2),
                        ::testing::ValuesIn(kValidBlockSize),
@@ -434,7 +430,7 @@
 #endif
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, AV1HighbdCompMaskVarianceTest,
     ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2),
                        ::testing::ValuesIn(kValidBlockSize),
@@ -463,7 +459,7 @@
   int bd_ = GET_PARAM(2);
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const int wedge_types = get_wedge_types_lookup(bsize);
 
   for (int i = 0; i < MAX_SB_SQUARE; ++i) {
     pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
@@ -515,8 +511,7 @@
   const int h = block_size_high[bsize];
   const int subx = havSub ? 3 : 0;
   const int suby = havSub ? 4 : 0;
-
-  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const int wedge_types = get_wedge_types_lookup(bsize);
   int wedge_index = wedge_types / 2;
   const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
 
@@ -562,7 +557,7 @@
 }
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1HighbdCompMaskUpVarianceTest,
     ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2),
                        ::testing::ValuesIn(kValidBlockSize),
@@ -570,7 +565,7 @@
 #endif
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, AV1HighbdCompMaskUpVarianceTest,
     ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2),
                        ::testing::ValuesIn(kValidBlockSize),
@@ -578,4 +573,5 @@
 #endif
 
 #endif  // ifndef aom_highbd_comp_mask_pred
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace AV1CompMaskVariance
diff --git a/libaom/test/convolve_round_test.cc b/libaom/test/convolve_round_test.cc
index 2f801e7..4f17b54 100644
--- a/libaom/test/convolve_round_test.cc
+++ b/libaom/test/convolve_round_test.cc
@@ -10,6 +10,7 @@
  */
 
 #include <assert.h>
+#include <tuple>
 
 #include "config/av1_rtcd.h"
 
@@ -52,7 +53,7 @@
 
 typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
 
-using ::testing::tuple;
+using std::tuple;
 
 typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
     ConvolveRoundParam;
@@ -162,7 +163,7 @@
 
 TEST_P(ConvolveRoundTest, BitExactCheck) { ConvolveRoundingRun(); }
 
-using ::testing::make_tuple;
+using std::make_tuple;
 #if HAVE_AVX2
 const ConvolveRoundParam kConvRndParamArray[] = {
   make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,
@@ -177,7 +178,7 @@
              &highbd_convolve_rounding_12<av1_highbd_convolve_rounding_avx2>,
              HIGHBITDEPTH_TEST)
 };
-INSTANTIATE_TEST_CASE_P(AVX2, ConvolveRoundTest,
-                        ::testing::ValuesIn(kConvRndParamArray));
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveRoundTest,
+                         ::testing::ValuesIn(kConvRndParamArray));
 #endif  // HAVE_AVX2
 }  // namespace
diff --git a/libaom/test/convolve_test.cc b/libaom/test/convolve_test.cc
index 760d2c5..0b1eea1 100644
--- a/libaom/test/convolve_test.cc
+++ b/libaom/test/convolve_test.cc
@@ -10,6 +10,7 @@
  */
 
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -47,7 +48,7 @@
   int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
 };
 
-typedef ::testing::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+typedef std::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
 
 #define ALL_SIZES_64(convolve_fn)                                         \
   make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn),         \
@@ -748,8 +749,10 @@
          UUT_->use_highbd_, elapsed_time);
 }
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
+// WRAP macro is only used for high bitdepth build.
+#if CONFIG_AV1_HIGHBITDEPTH
 #define WRAP(func, bd)                                                       \
   static void wrap_##func##_##bd(                                            \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
@@ -793,28 +796,34 @@
 WRAP(convolve8_horiz_avx2, 12)
 WRAP(convolve8_vert_avx2, 12)
 #endif  // HAVE_AVX2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #undef WRAP
 
-const ConvolveFunctions convolve8_c(wrap_convolve_copy_c_8,
-                                    wrap_convolve8_horiz_c_8,
-                                    wrap_convolve8_vert_c_8, 8);
-const ConvolveFunctions convolve10_c(wrap_convolve_copy_c_10,
-                                     wrap_convolve8_horiz_c_10,
-                                     wrap_convolve8_vert_c_10, 10);
-const ConvolveFunctions convolve12_c(wrap_convolve_copy_c_12,
-                                     wrap_convolve8_horiz_c_12,
-                                     wrap_convolve8_vert_c_12, 12);
-const ConvolveParam kArrayConvolve_c[] = {
-  ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c)
-};
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_c(wrap_convolve_copy_c_8,
+                                         wrap_convolve8_horiz_c_8,
+                                         wrap_convolve8_vert_c_8, 8);
+const ConvolveFunctions wrap_convolve10_c(wrap_convolve_copy_c_10,
+                                          wrap_convolve8_horiz_c_10,
+                                          wrap_convolve8_vert_c_10, 10);
+const ConvolveFunctions wrap_convolve12_c(wrap_convolve_copy_c_12,
+                                          wrap_convolve8_horiz_c_12,
+                                          wrap_convolve8_vert_c_12, 12);
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(wrap_convolve8_c),
+                                           ALL_SIZES(wrap_convolve10_c),
+                                           ALL_SIZES(wrap_convolve12_c) };
+#else
+const ConvolveFunctions convolve8_c(aom_convolve_copy_c, aom_convolve8_horiz_c,
+                                    aom_convolve8_vert_c, 0);
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
+#endif
 
-INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_c));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_c));
 
 #if HAVE_SSE2 && ARCH_X86_64
-const ConvolveFunctions convolve8_sse2(aom_convolve_copy_c,
-                                       aom_convolve8_horiz_sse2,
-                                       aom_convolve8_vert_sse2, 0);
+#if CONFIG_AV1_HIGHBITDEPTH
 const ConvolveFunctions wrap_convolve8_sse2(wrap_convolve_copy_sse2_8,
                                             wrap_convolve8_horiz_sse2_8,
                                             wrap_convolve8_vert_sse2_8, 8);
@@ -824,12 +833,17 @@
 const ConvolveFunctions wrap_convolve12_sse2(wrap_convolve_copy_sse2_12,
                                              wrap_convolve8_horiz_sse2_12,
                                              wrap_convolve8_vert_sse2_12, 12);
-const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2),
-                                              ALL_SIZES(wrap_convolve8_sse2),
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(wrap_convolve8_sse2),
                                               ALL_SIZES(wrap_convolve10_sse2),
                                               ALL_SIZES(wrap_convolve12_sse2) };
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve_sse2));
+#else
+const ConvolveFunctions convolve8_sse2(aom_convolve_copy_c,
+                                       aom_convolve8_horiz_sse2,
+                                       aom_convolve8_vert_sse2, 0);
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
+#endif
+INSTANTIATE_TEST_SUITE_P(SSE2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_sse2));
 #endif
 
 #if HAVE_SSSE3
@@ -838,15 +852,12 @@
                                         aom_convolve8_vert_ssse3, 0);
 
 const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
-INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_ssse3));
+INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_ssse3));
 #endif
 
 #if HAVE_AVX2
-const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c,
-                                       aom_convolve8_horiz_avx2,
-                                       aom_convolve8_vert_avx2, 0);
-
+#if CONFIG_AV1_HIGHBITDEPTH
 const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve_copy_avx2_8,
                                             wrap_convolve8_horiz_avx2_8,
                                             wrap_convolve8_vert_avx2_8, 8);
@@ -858,10 +869,17 @@
                                              wrap_convolve8_vert_avx2_12, 12);
 const ConvolveParam kArray_Convolve8_avx2[] = {
   ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2),
-  ALL_SIZES_64(wrap_convolve12_avx2), ALL_SIZES(convolve8_avx2)
+  ALL_SIZES_64(wrap_convolve12_avx2)
 };
-INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
-                        ::testing::ValuesIn(kArray_Convolve8_avx2));
+#else
+const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c,
+                                       aom_convolve8_horiz_avx2,
+                                       aom_convolve8_vert_avx2, 0);
+const ConvolveParam kArray_Convolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
+#endif
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest,
+                         ::testing::ValuesIn(kArray_Convolve8_avx2));
 #endif  // HAVE_AVX2
 
 }  // namespace
diff --git a/libaom/test/corner_match_test.cc b/libaom/test/corner_match_test.cc
index af2baa7..c685dca 100644
--- a/libaom/test/corner_match_test.cc
+++ b/libaom/test/corner_match_test.cc
@@ -8,6 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include <tuple>
+
 #include "config/av1_rtcd.h"
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -28,8 +30,8 @@
                                        int y1, unsigned char *im2, int stride2,
                                        int x2, int y2);
 
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam;
 
 class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
@@ -88,13 +90,13 @@
     int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
 
     double res_c =
-        compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+        av1_compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
     double res_simd = target_func(input1, w, x1, y1, input2, w, x2, y2);
 
     if (run_times > 1) {
       aom_usec_timer_start(&ref_timer);
       for (j = 0; j < run_times; j++) {
-        compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+        av1_compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
       }
       aom_usec_timer_mark(&ref_timer);
       const int elapsed_time_c =
@@ -125,17 +127,17 @@
 TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1CornerMatchTest,
-    ::testing::Values(make_tuple(0, compute_cross_correlation_sse4_1),
-                      make_tuple(1, compute_cross_correlation_sse4_1)));
+    ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_sse4_1),
+                      make_tuple(1, &av1_compute_cross_correlation_sse4_1)));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1CornerMatchTest,
-    ::testing::Values(make_tuple(0, compute_cross_correlation_avx2),
-                      make_tuple(1, compute_cross_correlation_avx2)));
+    ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_avx2),
+                      make_tuple(1, &av1_compute_cross_correlation_avx2)));
 #endif
 }  // namespace AV1CornerMatch
 
diff --git a/libaom/test/datarate_test.cc b/libaom/test/datarate_test.cc
index 1588d3c..053c055 100644
--- a/libaom/test/datarate_test.cc
+++ b/libaom/test/datarate_test.cc
@@ -13,19 +13,26 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
+#include "test/datarate_test.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
 #include "aom/aom_codec.h"
 
+namespace datarate_test {
 namespace {
 
+// Params: test mode, speed, aq mode and index for bitrate array.
 class DatarateTestLarge
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
-      public ::libaom_test::EncoderTest {
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 unsigned int, int>,
+      public DatarateTest {
  public:
-  DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
+  DatarateTestLarge() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
 
  protected:
   virtual ~DatarateTestLarge() {}
@@ -33,135 +40,42 @@
   virtual void SetUp() {
     InitializeConfig();
     SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
     ResetModel();
   }
 
-  virtual void ResetModel() {
-    last_pts_ = 0;
-    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
-    frame_number_ = 0;
-    tot_frame_number_ = 0;
-    first_drop_ = 0;
-    num_drops_ = 0;
-    // Denoiser is off by default.
-    denoiser_on_ = 0;
-    bits_total_ = 0;
-    denoiser_offon_test_ = 0;
-    denoiser_offon_period_ = -1;
-  }
+  virtual void BasicRateTargetingVBRTest() {
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_lag_in_frames = 0;
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
-    if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
-
-    if (denoiser_offon_test_) {
-      ASSERT_GT(denoiser_offon_period_, 0)
-          << "denoiser_offon_period_ is not positive.";
-      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
-        // Flip denoiser_on_ periodically
-        denoiser_on_ ^= 1;
-      }
-    }
-
-    encoder->Control(AV1E_SET_NOISE_SENSITIVITY, denoiser_on_);
-
-    const aom_rational_t tb = video->timebase();
-    timebase_ = static_cast<double>(tb.num) / tb.den;
-    duration_ = 0;
-  }
-
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
-    // Time since last timestamp = duration.
-    aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
-
-    if (duration > 1) {
-      // If first drop not set and we have a drop set it to this time.
-      if (!first_drop_) first_drop_ = last_pts_ + 1;
-      // Update the number of frame drops.
-      num_drops_ += static_cast<int>(duration - 1);
-      // Update counter for total number of frames (#frames input to encoder).
-      // Needed for setting the proper layer_id below.
-      tot_frame_number_ += static_cast<int>(duration - 1);
-    }
-
-    // Add to the buffer the bits we'd expect from a constant bitrate server.
-    bits_in_buffer_model_ += static_cast<int64_t>(
-        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
-
-    // Buffer should not go negative.
-    ASSERT_GE(bits_in_buffer_model_, 0)
-        << "Buffer Underrun at frame " << pkt->data.frame.pts;
-
-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
-
-    // Update the total encoded bits.
-    bits_total_ += frame_size_in_bits;
-
-    // Update the most recent pts.
-    last_pts_ = pkt->data.frame.pts;
-    ++frame_number_;
-    ++tot_frame_number_;
-  }
-
-  virtual void EndPassHook(void) {
-    duration_ = (last_pts_ + 1) * timebase_;
-    // Effective file datarate:
-    effective_datarate_ = (bits_total_ / 1000.0) / duration_;
-  }
-
-  aom_codec_pts_t last_pts_;
-  double timebase_;
-  int frame_number_;      // Counter for number of non-dropped/encoded frames.
-  int tot_frame_number_;  // Counter for total number of input frames.
-  int64_t bits_total_;
-  double duration_;
-  double effective_datarate_;
-  int set_cpu_used_;
-  int64_t bits_in_buffer_model_;
-  aom_codec_pts_t first_drop_;
-  int num_drops_;
-  int denoiser_on_;
-  int denoiser_offon_test_;
-  int denoiser_offon_period_;
-};
-
-// Check basic rate targeting for VBR mode.
-TEST_P(DatarateTestLarge, BasicRateTargetingVBR) {
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.g_error_resilient = 0;
-  cfg_.rc_end_usage = AOM_VBR;
-  cfg_.g_lag_in_frames = 0;
-
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  for (int i = 400; i <= 800; i += 400) {
-    cfg_.rc_target_bitrate = i;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    const int bitrate_array[2] = { 400, 800 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.75)
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.25)
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.3)
         << " The datarate for the file is greater than target by too much!";
   }
-}
 
-// Check basic rate targeting for CBR,
-TEST_P(DatarateTestLarge, BasicRateTargeting) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = AOM_CBR;
-  cfg_.g_lag_in_frames = 0;
+  virtual void BasicRateTargetingCBRTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
 
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  for (int i = 150; i < 800; i += 400) {
-    cfg_.rc_target_bitrate = i;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    const int bitrate_array[2] = { 150, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
@@ -169,25 +83,74 @@
     ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
         << " The datarate for the file is greater than target by too much!";
   }
-}
 
-// Check basic rate targeting for CBR.
-TEST_P(DatarateTestLarge, BasicRateTargeting444) {
-  ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+  virtual void BasicRateTargetingCBRPeriodicKeyFrameTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // Periodic keyframe
+    cfg_.kf_max_dist = 50;
 
-  cfg_.g_profile = 1;
-  cfg_.g_timebase = video.timebase();
+    ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320, 240,
+                                         30, 1, 0, 310);
+    const int bitrate_array[2] = { 150, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
 
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = AOM_CBR;
+  virtual void BasicRateTargetingAQModeOnOffCBRTest() {
+    if (GET_PARAM(4) > 0) return;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_pass = AOM_RC_ONE_PASS;
+    cfg_.g_usage = AOM_USAGE_REALTIME;
+    cfg_.kf_mode = AOM_KF_DISABLED;
 
-  for (int i = 250; i < 900; i += 400) {
-    cfg_.rc_target_bitrate = i;
+    ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320, 240,
+                                         30, 1, 0, 310);
+    const int bitrate_array[1] = { 60 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
+  virtual void BasicRateTargeting444CBRTest() {
+    ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+    cfg_.g_profile = 1;
+    cfg_.g_timebase = video.timebase();
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+
+    const int bitrate_array[2] = { 250, 650 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
@@ -198,58 +161,213 @@
         << " The datarate for the file missed the target!"
         << cfg_.rc_target_bitrate << " " << effective_datarate_;
   }
+};
+
+// Params: test mode, speed, aq mode.
+class DatarateTestFrameDropLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 unsigned int>,
+      public DatarateTest {
+ public:
+  DatarateTestFrameDropLarge() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  virtual ~DatarateTestFrameDropLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void ChangingDropFrameThreshTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_dropframe_thresh = 10;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    // TODO(marpan): Investigate datarate target failures with a smaller
+    // keyframe interval (128).
+    cfg_.kf_max_dist = 9999;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 100);
+
+    const int kDropFrameThreshTestStep = 30;
+    aom_codec_pts_t last_drop = 140;
+    int last_num_drops = 0;
+    for (int i = 40; i < 100; i += kDropFrameThreshTestStep) {
+      cfg_.rc_dropframe_thresh = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.17)
+          << " The datarate for the file is greater than target by too much!";
+      if (last_drop > 0) {
+        ASSERT_LE(first_drop_, last_drop)
+            << " The first dropped frame for drop_thresh " << i
+            << " > first dropped frame for drop_thresh "
+            << i - kDropFrameThreshTestStep;
+      }
+      ASSERT_GE(num_drops_, last_num_drops * 0.7)
+          << " The number of dropped frames for drop_thresh " << i
+          << " < number of dropped frames for drop_thresh "
+          << i - kDropFrameThreshTestStep;
+      last_drop = first_drop_;
+      last_num_drops = num_drops_;
+    }
+  }
+};
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestLarge, BasicRateTargetingVBR) {
+  BasicRateTargetingVBRTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestLarge, BasicRateTargetingCBR) {
+  BasicRateTargetingCBRTest();
+}
+
+// Check basic rate targeting for periodic key frame.
+TEST_P(DatarateTestLarge, PeriodicKeyFrameCBR) {
+  BasicRateTargetingCBRPeriodicKeyFrameTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestLarge, BasicRateTargeting444CBR) {
+  BasicRateTargeting444CBRTest();
 }
 
 // Check that (1) the first dropped frame gets earlier and earlier
 // as the drop frame threshold is increased, and (2) that the total number of
 // frame drops does not decrease as we increase frame drop threshold.
 // Use a lower qp-max to force some frame drops.
-TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_undershoot_pct = 20;
-  cfg_.rc_undershoot_pct = 20;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 50;
-  cfg_.rc_end_usage = AOM_CBR;
-  cfg_.rc_target_bitrate = 200;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.g_error_resilient = 1;
-  // TODO(marpan): Investigate datarate target failures with a smaller keyframe
-  // interval (128).
-  cfg_.kf_max_dist = 9999;
+TEST_P(DatarateTestFrameDropLarge, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
 
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 100);
+TEST_P(DatarateTestLarge, BasicRateTargetingAQModeOnOffCBR) {
+  BasicRateTargetingAQModeOnOffCBRTest();
+}
 
-  const int kDropFrameThreshTestStep = 30;
-  aom_codec_pts_t last_drop = 140;
-  int last_num_drops = 0;
-  for (int i = 40; i < 100; i += kDropFrameThreshTestStep) {
-    cfg_.rc_dropframe_thresh = i;
+class DatarateTestRealtime : public DatarateTestLarge {};
+
+class DatarateTestFrameDropRealtime : public DatarateTestFrameDropLarge {};
+
+// Params: aq mode.
+class DatarateTestSpeedChangeRealtime
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 unsigned int>,
+      public DatarateTest {
+ public:
+  DatarateTestSpeedChangeRealtime() : DatarateTest(GET_PARAM(0)) {
+    aq_mode_ = GET_PARAM(1);
+    speed_change_test_ = true;
+  }
+
+ protected:
+  virtual ~DatarateTestSpeedChangeRealtime() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void ChangingSpeedTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_dropframe_thresh = 10;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    // TODO(marpan): Investigate datarate target failures with a smaller
+    // keyframe interval (128).
+    cfg_.kf_max_dist = 9999;
+    cfg_.rc_dropframe_thresh = 0;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 100);
+
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.83)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.20)
         << " The datarate for the file is greater than target by too much!";
-    ASSERT_LE(first_drop_, last_drop)
-        << " The first dropped frame for drop_thresh " << i
-        << " > first dropped frame for drop_thresh "
-        << i - kDropFrameThreshTestStep;
-    ASSERT_GE(num_drops_, last_num_drops * 0.85)
-        << " The number of dropped frames for drop_thresh " << i
-        << " < number of dropped frames for drop_thresh "
-        << i - kDropFrameThreshTestStep;
-    last_drop = first_drop_;
-    last_num_drops = num_drops_;
   }
+};
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestRealtime, BasicRateTargetingVBR) {
+  BasicRateTargetingVBRTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestRealtime, BasicRateTargetingCBR) {
+  BasicRateTargetingCBRTest();
+}
+
+// Check basic rate targeting for periodic key frame.
+TEST_P(DatarateTestRealtime, PeriodicKeyFrameCBR) {
+  BasicRateTargetingCBRPeriodicKeyFrameTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestRealtime, BasicRateTargeting444CBR) {
+  BasicRateTargeting444CBRTest();
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestFrameDropRealtime, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestSpeedChangeRealtime, ChangingSpeedTest) {
+  ChangingSpeedTest();
 }
 
 AV1_INSTANTIATE_TEST_CASE(DatarateTestLarge,
-                          ::testing::Values(::libaom_test::kOnePassGood,
-                                            ::libaom_test::kRealTime),
-                          ::testing::Values(2, 5));
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(5, 7), ::testing::Values(0, 3),
+                          ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropLarge,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(5, 7), ::testing::Values(0, 3));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(7, 9), ::testing::Values(0, 3),
+                          ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(7, 9), ::testing::Values(0, 3));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestSpeedChangeRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Values(0, 3));
+
 }  // namespace
+}  // namespace datarate_test
diff --git a/libaom/test/datarate_test.h b/libaom/test/datarate_test.h
new file mode 100644
index 0000000..3c15731
--- /dev/null
+++ b/libaom/test/datarate_test.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+
+namespace datarate_test {
+namespace {
+class DatarateTest : public ::libaom_test::EncoderTest {
+ public:
+  explicit DatarateTest(const ::libaom_test::CodecFactory *codec)
+      : EncoderTest(codec), set_cpu_used_(0), aq_mode_(0),
+        speed_change_test_(false) {}
+
+ protected:
+  virtual ~DatarateTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    ResetModel();
+  }
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    tot_frame_number_ = 0;
+    first_drop_ = 0;
+    num_drops_ = 0;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
+    bits_total_ = 0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
+      if (cfg_.g_usage == AOM_USAGE_REALTIME) {
+        encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+        encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+        encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
+        encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+      }
+    }
+
+    if (speed_change_test_) {
+      if (video->frame() == 0) {
+        encoder->Control(AOME_SET_CPUUSED, 8);
+      }
+      if (video->frame() == 30) {
+        encoder->Control(AOME_SET_CPUUSED, 7);
+      }
+      if (video->frame() == 60) {
+        encoder->Control(AOME_SET_CPUUSED, 6);
+      }
+      if (video->frame() == 90) {
+        encoder->Control(AOME_SET_CPUUSED, 7);
+      }
+    }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+    }
+
+    encoder->Control(AV1E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+    const aom_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    if (duration > 1) {
+      // If first drop not set and we have a drop set it to this time.
+      if (!first_drop_) first_drop_ = last_pts_ + 1;
+      // Update the number of frame drops.
+      num_drops_ += static_cast<int>(duration - 1);
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    // Buffer should not go negative.
+    ASSERT_GE(bits_in_buffer_model_, 0)
+        << "Buffer Underrun at frame " << pkt->data.frame.pts;
+
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Update the total encoded bits.
+    bits_total_ += frame_size_in_bits;
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+    ++frame_number_;
+    ++tot_frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    duration_ = (last_pts_ + 1) * timebase_;
+    // Effective file datarate:
+    effective_datarate_ = (bits_total_ / 1000.0) / duration_;
+  }
+
+  aom_codec_pts_t last_pts_;
+  double timebase_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
+  int64_t bits_total_;
+  double duration_;
+  double effective_datarate_;
+  int set_cpu_used_;
+  int64_t bits_in_buffer_model_;
+  aom_codec_pts_t first_drop_;
+  int num_drops_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
+  unsigned int aq_mode_;
+  bool speed_change_test_;
+};
+
+}  // namespace
+}  // namespace datarate_test
diff --git a/libaom/test/decode_api_test.cc b/libaom/test/decode_api_test.cc
index c1beace..910640d 100644
--- a/libaom/test/decode_api_test.cc
+++ b/libaom/test/decode_api_test.cc
@@ -33,19 +33,19 @@
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, NULL, 0, NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, buf, 0, NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-            aom_codec_decode(NULL, buf, NELEMENTS(buf), NULL));
+            aom_codec_decode(NULL, buf, sizeof(buf), NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-            aom_codec_decode(NULL, NULL, NELEMENTS(buf), NULL));
+            aom_codec_decode(NULL, NULL, sizeof(buf), NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL));
   EXPECT_TRUE(aom_codec_error(NULL) != NULL);
 
-  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
+  for (const aom_codec_iface_t *iface : kCodecs) {
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_dec_init(NULL, kCodecs[i], NULL, 0));
+              aom_codec_dec_init(NULL, iface, NULL, 0));
 
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, kCodecs[i], NULL, 0));
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, iface, NULL, 0));
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_decode(&dec, NULL, NELEMENTS(buf), NULL));
+              aom_codec_decode(&dec, NULL, sizeof(buf), NULL));
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL));
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec));
diff --git a/libaom/test/decode_perf_test.cc b/libaom/test/decode_perf_test.cc
index 2f67342..691337c 100644
--- a/libaom/test/decode_perf_test.cc
+++ b/libaom/test/decode_perf_test.cc
@@ -10,6 +10,7 @@
  */
 
 #include <string>
+#include <tuple>
 
 #include "config/aom_version.h"
 
@@ -24,7 +25,7 @@
 #include "test/util.h"
 #include "test/webm_video_source.h"
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 namespace {
 
@@ -37,7 +38,7 @@
 /*
  DecodePerfTest takes a tuple of filename + number of threads to decode with
  */
-typedef ::testing::tuple<const char *, unsigned> DecodePerfParam;
+typedef std::tuple<const char *, unsigned> DecodePerfParam;
 
 // TODO(jimbankoski): Add actual test vectors here when available.
 // const DecodePerfParam kAV1DecodePerfVectors[] = {};
@@ -91,7 +92,7 @@
 }
 
 // TODO(jimbankoski): Enabled when we have actual AV1 Decode vectors.
-// INSTANTIATE_TEST_CASE_P(AV1, DecodePerfTest,
+// INSTANTIATE_TEST_SUITE_P(AV1, DecodePerfTest,
 //                        ::testing::ValuesIn(kAV1DecodePerfVectors));
 
 class AV1NewEncodeDecodePerfTest
diff --git a/libaom/test/decode_test_driver.h b/libaom/test/decode_test_driver.h
index d13e13e..64722f4 100644
--- a/libaom/test/decode_test_driver.h
+++ b/libaom/test/decode_test_driver.h
@@ -67,13 +67,13 @@
 
   void Control(int ctrl_id, const void *arg) {
     InitOnce();
-    const aom_codec_err_t res = aom_codec_control_(&decoder_, ctrl_id, arg);
+    const aom_codec_err_t res = aom_codec_control(&decoder_, ctrl_id, arg);
     ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError();
   }
 
   void Control(int ctrl_id, int arg, aom_codec_err_t expected_value) {
     InitOnce();
-    const aom_codec_err_t res = aom_codec_control_(&decoder_, ctrl_id, arg);
+    const aom_codec_err_t res = aom_codec_control(&decoder_, ctrl_id, arg);
     ASSERT_EQ(expected_value, res) << DecodeError();
   }
 
diff --git a/libaom/test/divu_small_test.cc b/libaom/test/divu_small_test.cc
index 064f8ee..f4d0846 100644
--- a/libaom/test/divu_small_test.cc
+++ b/libaom/test/divu_small_test.cc
@@ -18,7 +18,7 @@
 
 using libaom_test::ACMRandom;
 
-TEST(Daala, TestDIVUuptoMAX) {
+TEST(DivuSmallTest, TestDIVUuptoMAX) {
   for (int d = 1; d <= OD_DIVU_DMAX; d++) {
     for (uint32_t x = 1; x <= 1000000; x++) {
       GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d))
@@ -28,7 +28,7 @@
   }
 }
 
-TEST(Daala, TestDIVUrandI31) {
+TEST(DivuSmallTest, TestDIVUrandI31) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   for (int d = 1; d < OD_DIVU_DMAX; d++) {
     for (int i = 0; i < 1000000; i++) {
diff --git a/libaom/test/dr_prediction_test.cc b/libaom/test/dr_prediction_test.cc
index 4be8489..e8865c0 100644
--- a/libaom/test/dr_prediction_test.cc
+++ b/libaom/test/dr_prediction_test.cc
@@ -8,6 +8,9 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/aom_config.h"
@@ -314,9 +317,9 @@
   }
 }
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, LowbdDrPredTest,
     ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
                                          NULL, AOM_BITS_8, kZ1Start),
@@ -325,6 +328,7 @@
                       DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
                                          NULL, AOM_BITS_8, kZ3Start)));
 
+#if CONFIG_AV1_HIGHBITDEPTH
 class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {};
 
 TEST_P(HighbdDrPredTest, SaturatedValues) {
@@ -337,7 +341,7 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, HighbdDrPredTest,
     ::testing::Values(
         DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
@@ -358,18 +362,17 @@
                                NULL, AOM_BITS_10, kZ3Start),
         DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
                                NULL, AOM_BITS_12, kZ3Start)));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, LowbdDrPredTest,
     ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
                                          &z1_wrapper<av1_dr_prediction_z1_avx2>,
                                          AOM_BITS_8, kZ1Start),
-                      /* TODO(niva213@gmail.com): Re-enable this test after
-                      fixing valgrind issue: https://crbug.com/aomedia/2316
                       DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
                                          &z2_wrapper<av1_dr_prediction_z2_avx2>,
-                                         AOM_BITS_8, kZ2Start), */
+                                         AOM_BITS_8, kZ2Start),
                       DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
                                          &z3_wrapper<av1_dr_prediction_z3_avx2>,
                                          AOM_BITS_8, kZ3Start)));
@@ -400,7 +403,8 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
     AVX2, HighbdDrPredTest,
     ::testing::Values(DrPredFunc<DrPred_Hbd>(
                           &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
@@ -414,8 +418,6 @@
                           &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
                           &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>,
                           AOM_BITS_12, kZ1Start),
-                      /* TODO(niva213@gmail.com): Re-enable these tests after
-                      fixing valgrind issue: https://crbug.com/aomedia/2316
                       DrPredFunc<DrPred_Hbd>(
                           &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
                           &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>,
@@ -427,7 +429,7 @@
                       DrPredFunc<DrPred_Hbd>(
                           &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
                           &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>,
-                          AOM_BITS_12, kZ2Start),*/
+                          AOM_BITS_12, kZ2Start),
                       DrPredFunc<DrPred_Hbd>(
                           &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
                           &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>,
@@ -466,7 +468,7 @@
     }
   }
 }
-
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
 }  // namespace
diff --git a/libaom/test/edge_detect_test.cc b/libaom/test/edge_detect_test.cc
index 77a731f..33fbbc0 100644
--- a/libaom/test/edge_detect_test.cc
+++ b/libaom/test/edge_detect_test.cc
@@ -10,6 +10,8 @@
  */
 
 #include <stdbool.h>
+#include <memory>
+#include <tuple>
 #include "aom_mem/aom_mem.h"
 #include "av1/encoder/rdopt.h"
 #include "test/util.h"
@@ -17,8 +19,8 @@
 
 namespace {
 
-using ::testing::get;
-using ::testing::tuple;
+using std::get;
+using std::tuple;
 
 static int get_pix(uint8_t *buf, int i, bool high_bd) {
   if (high_bd) {
@@ -59,13 +61,18 @@
   } else {
     dst = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * pad_w * pad_h);
   }
+  if (dst == nullptr) {
+    EXPECT_NE(dst, nullptr);
+    return nullptr;
+  }
+
   for (int j = 0; j < pad_h; ++j) {
     for (int i = 0; i < pad_w; ++i) {
       const int v = get_nearest_pix(data, w, h, i - 3, j - 3);
       if (high_bd) {
         *CONVERT_TO_SHORTPTR(dst + i + j * pad_w) = v;
       } else {
-        dst[i + j * pad_w] = v;
+        dst[i + j * pad_w] = static_cast<uint8_t>(v);
       }
     }
   }
@@ -82,6 +89,18 @@
   }
 }
 
+struct Pad8TapConvolveDeleter {
+  Pad8TapConvolveDeleter(const int width, const bool high_bd)
+      : width(width), high_bd(high_bd) {}
+  void operator()(uint8_t *p) {
+    if (p != nullptr) {
+      free_pad_8tap(p, width, high_bd);
+    }
+  }
+  const int width;
+  const bool high_bd;
+};
+
 static uint8_t *malloc_bd(int num_entries, bool high_bd) {
   const int bytes_per_entry = high_bd ? sizeof(uint16_t) : sizeof(uint8_t);
 
@@ -101,6 +120,12 @@
   }
 }
 
+struct MallocBdDeleter {
+  explicit MallocBdDeleter(const bool high_bd) : high_bd(high_bd) {}
+  void operator()(uint8_t *p) { free_bd(p, high_bd); }
+  const bool high_bd;
+};
+
 class EdgeDetectBrightnessTest :
     // Parameters are (brightness, width, height, high bit depth representation,
     // bit depth).
@@ -116,13 +141,15 @@
     const bool high_bd = GET_PARAM(3);
 
     // Create the padded image of uniform brightness.
-    int *orig = (int *)malloc(width * height * sizeof(int));
+    std::unique_ptr<int[]> orig(new int[width * height]);
+    ASSERT_NE(orig, nullptr);
     for (int i = 0; i < width * height; ++i) {
       orig[i] = brightness;
     }
-    input_ = pad_8tap_convolve(orig, width, height, high_bd);
-    free(orig);
+    input_ = pad_8tap_convolve(orig.get(), width, height, high_bd);
+    ASSERT_NE(input_, nullptr);
     output_ = malloc_bd(width * height, high_bd);
+    ASSERT_NE(output_, nullptr);
   }
 
   void TearDown() override {
@@ -168,8 +195,8 @@
   const bool high_bd = GET_PARAM(3);
   const int bd = GET_PARAM(4);
 
-  gaussian_blur(input_, stride_8tap(width), width, height, output_, high_bd,
-                bd);
+  av1_gaussian_blur(input_, stride_8tap(width), width, height, output_, high_bd,
+                    bd);
   for (int i = 0; i < width * height; ++i) {
     ASSERT_EQ(brightness, get_pix(output_, i, high_bd));
   }
@@ -190,20 +217,37 @@
              .magnitude);
 }
 
-INSTANTIATE_TEST_CASE_P(ImageBrightnessTests, EdgeDetectBrightnessTest,
-                        ::testing::Combine(
-                            // Brightness
-                            ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255,
-                                              256, 511, 512, 1023, 1024, 2048,
-                                              4095),
-                            // Width
-                            ::testing::Values(8, 16, 32),
-                            // Height
-                            ::testing::Values(4, 8, 12, 32),
-                            // High bit depth representation
-                            ::testing::Bool(),
-                            // Bit depth
-                            ::testing::Values(8, 10, 12)));
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest,
+                         ::testing::Combine(
+                             // Brightness
+                             ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255,
+                                               256, 511, 512, 1023, 1024, 2048,
+                                               4095),
+                             // Width
+                             ::testing::Values(8, 16, 32),
+                             // Height
+                             ::testing::Values(4, 8, 12, 32),
+                             // High bit depth representation
+                             ::testing::Bool(),
+                             // Bit depth
+                             ::testing::Values(8, 10, 12)));
+#else
+INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest,
+                         ::testing::Combine(
+                             // Brightness
+                             ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255,
+                                               256, 511, 512, 1023, 1024, 2048,
+                                               4095),
+                             // Width
+                             ::testing::Values(8, 16, 32),
+                             // Height
+                             ::testing::Values(4, 8, 12, 32),
+                             // High bit depth representation
+                             ::testing::Values(false),
+                             // Bit depth
+                             ::testing::Values(8)));
+#endif
 
 class EdgeDetectImageTest :
     // Parameters are (width, height, high bit depth representation, bit depth).
@@ -232,7 +276,7 @@
   const int bd = GET_PARAM(3);
 
   const int white = (1 << bd) - 1;
-  int *orig = (int *)malloc(width * height * sizeof(int));
+  std::unique_ptr<int[]> orig(new int[width * height]);
   for (int j = 0; j < height; ++j) {
     for (int i = 0; i < width; ++i) {
       if (i < width / 2) {
@@ -242,17 +286,18 @@
       }
     }
   }
-  uint8_t *padded = pad_8tap_convolve(orig, width, height, high_bd);
-  free(orig);
-  // Value should be between 556 and 560.
-  ASSERT_LE(556, av1_edge_exists(padded, stride_8tap(width), width, height,
-                                 high_bd, bd)
-                     .magnitude);
-  ASSERT_GE(560, av1_edge_exists(padded, stride_8tap(width), width, height,
-                                 high_bd, bd)
-                     .magnitude);
 
-  free_pad_8tap(padded, width, high_bd);
+  std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded(
+      pad_8tap_convolve(orig.get(), width, height, high_bd),
+      Pad8TapConvolveDeleter(width, high_bd));
+  ASSERT_NE(padded, nullptr);
+  // Value should be between 556 and 560.
+  ASSERT_LE(556, av1_edge_exists(padded.get(), stride_8tap(width), width,
+                                 height, high_bd, bd)
+                     .magnitude);
+  ASSERT_GE(560, av1_edge_exists(padded.get(), stride_8tap(width), width,
+                                 height, high_bd, bd)
+                     .magnitude);
 }
 
 // Hardcoded blur tests.
@@ -274,14 +319,18 @@
     if (bd > 8 && !high_bd) {
       break;
     }
-    uint8_t *output = malloc_bd(w * h, high_bd);
-    uint8_t *padded = pad_8tap_convolve(luma, w, h, high_bd);
-    gaussian_blur(padded, stride_8tap(w), w, h, output, high_bd, bd);
+    std::unique_ptr<uint8_t[], MallocBdDeleter> output(
+        malloc_bd(w * h, high_bd), MallocBdDeleter(high_bd));
+    ASSERT_NE(output, nullptr);
+    std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded(
+        pad_8tap_convolve(luma, w, h, high_bd),
+        Pad8TapConvolveDeleter(w, high_bd));
+    ASSERT_NE(padded, nullptr);
+    av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), high_bd,
+                      bd);
     for (int i = 0; i < w * h; ++i) {
-      ASSERT_EQ(expected[i], get_pix(output, i, high_bd));
+      ASSERT_EQ(expected[i], get_pix(output.get(), i, high_bd));
     }
-    free_pad_8tap(padded, w, high_bd);
-    free_bd(output, high_bd);
 
     // If we multiply the inputs by a constant factor, the output should not
     // vary more than 0.5 * factor.
@@ -290,21 +339,23 @@
       for (int i = 0; i < 32; ++i) {
         scaled_luma[i] = luma[i] * c;
       }
-      uint8_t *output = malloc_bd(w * h, high_bd);
-      uint8_t *padded = pad_8tap_convolve(scaled_luma, w, h, high_bd);
-      gaussian_blur(padded, stride_8tap(w), w, h, output, high_bd, bd);
+      padded.reset(pad_8tap_convolve(scaled_luma, w, h, high_bd));
+      ASSERT_NE(padded, nullptr);
+      av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(),
+                        high_bd, bd);
       for (int i = 0; i < w * h; ++i) {
-        ASSERT_GE(c / 2, abs(expected[i] * c - get_pix(output, i, high_bd)));
+        ASSERT_GE(c / 2,
+                  abs(expected[i] * c - get_pix(output.get(), i, high_bd)));
       }
-      free_pad_8tap(padded, w, high_bd);
-      free_bd(output, high_bd);
     }
   }
 }
 
 TEST(EdgeDetectImageTest, HardcodedBlurTest) {
   hardcoded_blur_test_aux(false);
+#if CONFIG_AV1_HIGHBITDEPTH
   hardcoded_blur_test_aux(true);
+#endif
 }
 
 TEST(EdgeDetectImageTest, SobelTest) {
@@ -312,32 +363,47 @@
   const uint8_t buf[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 };
   const int stride = 3;
   bool high_bd = false;
-  sobel_xy result = sobel(buf, stride, 1, 1, high_bd);
+  sobel_xy result = av1_sobel(buf, stride, 1, 1, high_bd);
   ASSERT_EQ(234, result.x);
   ASSERT_EQ(140, result.y);
 
+#if CONFIG_AV1_HIGHBITDEPTH
   // Verify it works for 8-bit values in a high bit-depth buffer.
   const uint16_t buf8_16[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 };
   high_bd = true;
-  result = sobel(CONVERT_TO_BYTEPTR(buf8_16), stride, 1, 1, high_bd);
+  result = av1_sobel(CONVERT_TO_BYTEPTR(buf8_16), stride, 1, 1, high_bd);
   ASSERT_EQ(234, result.x);
   ASSERT_EQ(140, result.y);
 
   // Verify it works for high bit-depth values as well.
   const uint16_t buf16[9] = { 241, 147, 7, 90, 184, 2003, 1028, 186, 2 };
-  result = sobel(CONVERT_TO_BYTEPTR(buf16), stride, 1, 1, high_bd);
+  result = av1_sobel(CONVERT_TO_BYTEPTR(buf16), stride, 1, 1, high_bd);
   ASSERT_EQ(-2566, result.x);
   ASSERT_EQ(-860, result.y);
+#endif
 }
 
-INSTANTIATE_TEST_CASE_P(EdgeDetectImages, EdgeDetectImageTest,
-                        ::testing::Combine(
-                            // Width
-                            ::testing::Values(8, 16, 32),
-                            // Height
-                            ::testing::Values(4, 8, 12, 32),
-                            // High bit depth representation
-                            ::testing::Bool(),
-                            // Bit depth
-                            ::testing::Values(8, 10, 12)));
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(EdgeDetectImages, EdgeDetectImageTest,
+                         ::testing::Combine(
+                             // Width
+                             ::testing::Values(8, 16, 32),
+                             // Height
+                             ::testing::Values(4, 8, 12, 32),
+                             // High bit depth representation
+                             ::testing::Bool(),
+                             // Bit depth
+                             ::testing::Values(8, 10, 12)));
+#else
+INSTANTIATE_TEST_SUITE_P(EdgeDetectImages, EdgeDetectImageTest,
+                         ::testing::Combine(
+                             // Width
+                             ::testing::Values(8, 16, 32),
+                             // Height
+                             ::testing::Values(4, 8, 12, 32),
+                             // High bit depth representation
+                             ::testing::Values(false),
+                             // Bit depth
+                             ::testing::Values(8)));
+#endif
 }  // namespace
diff --git a/libaom/test/encode_api_test.cc b/libaom/test/encode_api_test.cc
index 235480a..25bdb5c 100644
--- a/libaom/test/encode_api_test.cc
+++ b/libaom/test/encode_api_test.cc
@@ -43,17 +43,17 @@
             aom_codec_enc_config_default(NULL, &cfg, 0));
   EXPECT_TRUE(aom_codec_error(NULL) != NULL);
 
-  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
-    SCOPED_TRACE(aom_codec_iface_name(kCodecs[i]));
+  for (const aom_codec_iface_t *iface : kCodecs) {
+    SCOPED_TRACE(aom_codec_iface_name(iface));
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_init(NULL, kCodecs[i], NULL, 0));
+              aom_codec_enc_init(NULL, iface, NULL, 0));
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_init(&enc, kCodecs[i], NULL, 0));
+              aom_codec_enc_init(&enc, iface, NULL, 0));
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_config_default(kCodecs[i], &cfg, 2));
+              aom_codec_enc_config_default(iface, &cfg, 2));
 
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(kCodecs[i], &cfg, 0));
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
 
     EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL));
 
diff --git a/libaom/test/encode_perf_test.cc b/libaom/test/encode_perf_test.cc
index fe649b1..390a6e0 100644
--- a/libaom/test/encode_perf_test.cc
+++ b/libaom/test/encode_perf_test.cc
@@ -123,32 +123,29 @@
 };
 
 TEST_P(AV1EncodePerfTest, PerfTest) {
-  for (size_t i = 0; i < NELEMENTS(kAV1EncodePerfTestVectors); ++i) {
-    for (size_t j = 0; j < NELEMENTS(kEncodePerfTestSpeeds); ++j) {
-      for (size_t k = 0; k < NELEMENTS(kEncodePerfTestThreads); ++k) {
-        if (kAV1EncodePerfTestVectors[i].width < 512 &&
-            kEncodePerfTestThreads[k] > 1)
+  for (const EncodePerfTestVideo &test_video : kAV1EncodePerfTestVectors) {
+    for (int speed : kEncodePerfTestSpeeds) {
+      for (int threads : kEncodePerfTestThreads) {
+        if (test_video.width < 512 && threads > 1)
           continue;
-        else if (kAV1EncodePerfTestVectors[i].width < 1024 &&
-                 kEncodePerfTestThreads[k] > 2)
+        else if (test_video.width < 1024 && threads > 2)
           continue;
 
-        set_threads(kEncodePerfTestThreads[k]);
+        set_threads(threads);
         SetUp();
 
         const aom_rational timebase = { 33333333, 1000000000 };
         cfg_.g_timebase = timebase;
-        cfg_.rc_target_bitrate = kAV1EncodePerfTestVectors[i].bitrate;
+        cfg_.rc_target_bitrate = test_video.bitrate;
 
         init_flags_ = AOM_CODEC_USE_PSNR;
 
-        const unsigned frames = kAV1EncodePerfTestVectors[i].frames;
-        const char *video_name = kAV1EncodePerfTestVectors[i].name;
-        libaom_test::I420VideoSource video(
-            video_name, kAV1EncodePerfTestVectors[i].width,
-            kAV1EncodePerfTestVectors[i].height, timebase.den, timebase.num, 0,
-            kAV1EncodePerfTestVectors[i].frames);
-        set_speed(kEncodePerfTestSpeeds[j]);
+        const unsigned frames = test_video.frames;
+        const char *video_name = test_video.name;
+        libaom_test::I420VideoSource video(video_name, test_video.width,
+                                           test_video.height, timebase.den,
+                                           timebase.num, 0, test_video.frames);
+        set_speed(speed);
 
         aom_usec_timer t;
         aom_usec_timer_start(&t);
@@ -160,10 +157,9 @@
         const double fps = frames / elapsed_secs;
         const double minimum_psnr = min_psnr();
         std::string display_name(video_name);
-        if (kEncodePerfTestThreads[k] > 1) {
+        if (threads > 1) {
           char thread_count[32];
-          snprintf(thread_count, sizeof(thread_count), "_t-%d",
-                   kEncodePerfTestThreads[k]);
+          snprintf(thread_count, sizeof(thread_count), "_t-%d", threads);
           display_name += thread_count;
         }
 
@@ -175,8 +171,8 @@
         printf("\t\"totalFrames\" : %u,\n", frames);
         printf("\t\"framesPerSecond\" : %f,\n", fps);
         printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
-        printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]);
-        printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]);
+        printf("\t\"speed\" : %d,\n", speed);
+        printf("\t\"threads\" : %d\n", threads);
         printf("}\n");
       }
     }
diff --git a/libaom/test/encode_test_driver.cc b/libaom/test/encode_test_driver.cc
index d06168f..01f8d50 100644
--- a/libaom/test/encode_test_driver.cc
+++ b/libaom/test/encode_test_driver.cc
@@ -92,7 +92,11 @@
   switch (mode) {
     case kOnePassGood:
     case kTwoPassGood: break;
-    case kRealTime: cfg_.g_lag_in_frames = 0; break;
+    case kRealTime: {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.g_usage = AOM_USAGE_REALTIME;
+      break;
+    }
     default: ASSERT_TRUE(false) << "Unexpected mode " << mode;
   }
   mode_ = mode;
@@ -219,65 +223,69 @@
     }
 #endif
 
+    number_spatial_layers_ = GetNumSpatialLayers();
+
     bool again;
     for (again = true; again; video->Next()) {
       again = (video->img() != NULL);
 
-      PreEncodeFrameHook(video);
-      PreEncodeFrameHook(video, encoder.get());
-      encoder->EncodeFrame(video, frame_flags_);
+      for (int sl = 0; sl < number_spatial_layers_; sl++) {
+        PreEncodeFrameHook(video);
+        PreEncodeFrameHook(video, encoder.get());
+        encoder->EncodeFrame(video, frame_flags_);
 
-      CxDataIterator iter = encoder->GetCxData();
+        CxDataIterator iter = encoder->GetCxData();
 
-      bool has_cxdata = false;
-      bool has_dxdata = false;
-      while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
-        pkt = MutateEncoderOutputHook(pkt);
-        again = true;
-        switch (pkt->kind) {
-          case AOM_CODEC_CX_FRAME_PKT:
-            has_cxdata = true;
-            if (decoder.get() != NULL && DoDecode()) {
-              aom_codec_err_t res_dec;
-              if (DoDecodeInvisible()) {
-                res_dec = decoder->DecodeFrame(
-                    (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
-              } else {
-                res_dec = decoder->DecodeFrame(
-                    (const uint8_t *)pkt->data.frame.buf +
-                        (pkt->data.frame.sz - pkt->data.frame.vis_frame_size),
-                    pkt->data.frame.vis_frame_size);
+        bool has_cxdata = false;
+        bool has_dxdata = false;
+        while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
+          pkt = MutateEncoderOutputHook(pkt);
+          again = true;
+          switch (pkt->kind) {
+            case AOM_CODEC_CX_FRAME_PKT:
+              has_cxdata = true;
+              if (decoder.get() != NULL && DoDecode()) {
+                aom_codec_err_t res_dec;
+                if (DoDecodeInvisible()) {
+                  res_dec = decoder->DecodeFrame(
+                      (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
+                } else {
+                  res_dec = decoder->DecodeFrame(
+                      (const uint8_t *)pkt->data.frame.buf +
+                          (pkt->data.frame.sz - pkt->data.frame.vis_frame_size),
+                      pkt->data.frame.vis_frame_size);
+                }
+
+                if (!HandleDecodeResult(res_dec, decoder.get())) break;
+
+                has_dxdata = true;
               }
+              ASSERT_GE(pkt->data.frame.pts, last_pts_);
+              if (sl == number_spatial_layers_) last_pts_ = pkt->data.frame.pts;
+              FramePktHook(pkt);
+              break;
 
-              if (!HandleDecodeResult(res_dec, decoder.get())) break;
+            case AOM_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;
 
-              has_dxdata = true;
-            }
-            ASSERT_GE(pkt->data.frame.pts, last_pts_);
-            last_pts_ = pkt->data.frame.pts;
-            FramePktHook(pkt);
-            break;
-
-          case AOM_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;
-
-          default: break;
-        }
-      }
-
-      if (has_dxdata && has_cxdata) {
-        const aom_image_t *img_enc = encoder->GetPreviewFrame();
-        DxDataIterator dec_iter = decoder->GetDxData();
-        const aom_image_t *img_dec = dec_iter.Next();
-        if (img_enc && img_dec) {
-          const bool res =
-              compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL);
-          if (!res) {  // Mismatch
-            MismatchHook(img_enc, img_dec);
+            default: break;
           }
         }
-        if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
-      }
-      if (!Continue()) break;
+
+        if (has_dxdata && has_cxdata) {
+          const aom_image_t *img_enc = encoder->GetPreviewFrame();
+          DxDataIterator dec_iter = decoder->GetDxData();
+          const aom_image_t *img_dec = dec_iter.Next();
+          if (img_enc && img_dec) {
+            const bool res =
+                compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL);
+            if (!res) {  // Mismatch
+              MismatchHook(img_enc, img_dec);
+            }
+          }
+          if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
+        }
+        if (!Continue()) break;
+      }  // Loop over spatial layers
     }
 
     EndPassHook();
diff --git a/libaom/test/encode_test_driver.h b/libaom/test/encode_test_driver.h
index 4f3f855..6319a52 100644
--- a/libaom/test/encode_test_driver.h
+++ b/libaom/test/encode_test_driver.h
@@ -82,7 +82,7 @@
 // level of abstraction will be fleshed out as more tests are written.
 class Encoder {
  public:
-  Encoder(aom_codec_enc_cfg_t cfg, const uint32_t init_flags,
+  Encoder(aom_codec_enc_cfg_t cfg, const aom_codec_flags_t init_flags,
           TwopassStatsStore *stats)
       : cfg_(cfg), init_flags_(init_flags), stats_(stats) {
     memset(&encoder_, 0, sizeof(encoder_));
@@ -105,23 +105,38 @@
   void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); }
 
   void Control(int ctrl_id, int arg) {
-    const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
   }
 
   void Control(int ctrl_id, int *arg) {
-    const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
   }
 
   void Control(int ctrl_id, struct aom_scaling_mode *arg) {
-    const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, struct aom_svc_layer_id *arg) {
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, struct aom_svc_ref_frame_config *arg) {
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, struct aom_svc_params *arg) {
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
   }
 
 #if CONFIG_AV1_ENCODER
   void Control(int ctrl_id, aom_active_map_t *arg) {
-    const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
   }
 #endif
@@ -149,7 +164,7 @@
 
   aom_codec_ctx_t encoder_;
   aom_codec_enc_cfg_t cfg_;
-  unsigned long init_flags_;
+  aom_codec_flags_t init_flags_;
   TwopassStatsStore *stats_;
 };
 
@@ -164,7 +179,7 @@
  protected:
   explicit EncoderTest(const CodecFactory *codec)
       : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
-        last_pts_(0), mode_(kRealTime) {
+        last_pts_(0), mode_(kRealTime), number_spatial_layers_(1) {
     // Default to 1 thread.
     cfg_.g_threads = 1;
   }
@@ -178,9 +193,7 @@
   void SetMode(TestMode mode);
 
   // Set encoder flag.
-  void set_init_flags(unsigned long flag) {  // NOLINT(runtime/int)
-    init_flags_ = flag;
-  }
+  void set_init_flags(aom_codec_flags_t flag) { init_flags_ = flag; }
 
   // Main loop
   virtual void RunLoop(VideoSource *video);
@@ -227,6 +240,8 @@
     return AOM_CODEC_OK == res_dec;
   }
 
+  virtual int GetNumSpatialLayers() { return 1; }
+
   // Hook that can modify the encoder's output data
   virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook(
       const aom_codec_cx_pkt_t *pkt) {
@@ -238,10 +253,11 @@
   aom_codec_enc_cfg_t cfg_;
   unsigned int passes_;
   TwopassStatsStore stats_;
-  unsigned long init_flags_;
+  aom_codec_flags_t init_flags_;
   unsigned long frame_flags_;
   aom_codec_pts_t last_pts_;
   TestMode mode_;
+  int number_spatial_layers_;
 };
 
 }  // namespace libaom_test
diff --git a/libaom/test/encodetxb_test.cc b/libaom/test/encodetxb_test.cc
index 11cc073..385d3f1 100644
--- a/libaom/test/encodetxb_test.cc
+++ b/libaom/test/encodetxb_test.cc
@@ -12,6 +12,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -20,8 +21,8 @@
 
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/idct.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/scan.h"
 #include "av1/common/txb_common.h"
 #include "test/acm_random.h"
@@ -138,7 +139,7 @@
     for (int c = 0; c < eob; ++c) {
       levels_[get_padded_idx(scan[c], bwl)] =
           static_cast<uint8_t>(clamp(rnd_.Rand8(), 0, INT8_MAX));
-      coeff_contexts_[scan[c]] = rnd_.Rand16() >> 1;
+      coeff_contexts_[scan[c]] = static_cast<int8_t>(rnd_.Rand16() >> 1);
     }
 
     memcpy(coeff_contexts_ref_, coeff_contexts_,
@@ -177,15 +178,15 @@
 }
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, EncodeTxbTest,
-                        ::testing::Values(av1_get_nz_map_contexts_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, EncodeTxbTest,
+                         ::testing::Values(av1_get_nz_map_contexts_sse2));
 #endif
 
 typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff,
                                          const int width, const int height,
                                          uint8_t *const levels);
 
-typedef ::testing::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam;
+typedef std::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam;
 
 class EncodeTxbInitLevelTest
     : public ::testing::TestWithParam<TxbInitLevelParam> {
@@ -248,13 +249,13 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, EncodeTxbInitLevelTest,
     ::testing::Combine(::testing::Values(&av1_txb_init_levels_sse4_1),
                        ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
 #endif
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, EncodeTxbInitLevelTest,
     ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
                        ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
diff --git a/libaom/test/end_to_end_test.cc b/libaom/test/end_to_end_test.cc
index 6ea09a6..162a7c7 100644
--- a/libaom/test/end_to_end_test.cc
+++ b/libaom/test/end_to_end_test.cc
@@ -10,6 +10,7 @@
  */
 
 #include <memory>
+#include <ostream>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -57,19 +58,21 @@
   return os << "TestVideoParam { filename:" << test_arg.filename
             << " input_bit_depth:" << test_arg.input_bit_depth
             << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
-            << " profile:" << test_arg.profile << "}";
+            << " profile:" << test_arg.profile << " }";
 }
 
 const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
   { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
   { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+#if CONFIG_AV1_HIGHBITDEPTH
   { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
   { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
   { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
   { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
   { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
   { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+#endif
 };
 
 // Encoding modes tested
diff --git a/libaom/test/error_block_test.cc b/libaom/test/error_block_test.cc
index 3664ccf..462661e 100644
--- a/libaom/test/error_block_test.cc
+++ b/libaom/test/error_block_test.cc
@@ -12,6 +12,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <string>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -35,9 +36,21 @@
                                   const tran_low_t *dqcoeff,
                                   intptr_t block_size, int64_t *ssz, int bps);
 
-typedef ::testing::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
+typedef int64_t (*ErrorBlockFunc8Bits)(const tran_low_t *coeff,
+                                       const tran_low_t *dqcoeff,
+                                       intptr_t block_size, int64_t *ssz);
+
+typedef std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
     ErrorBlockParam;
 
+template <ErrorBlockFunc8Bits fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+                              const tran_low_t *dqcoeff, intptr_t block_size,
+                              int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
+  return fn(coeff, dqcoeff, block_size, ssz);
+}
+
 class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
  public:
   virtual ~ErrorBlockTest() {}
@@ -220,29 +233,57 @@
   }
 }
 
-#if (HAVE_SSE2 || HAVE_AVX)
-using ::testing::make_tuple;
+using std::make_tuple;
 
-INSTANTIATE_TEST_CASE_P(
-    SSE2, ErrorBlockTest,
-    ::testing::Values(make_tuple(&av1_highbd_block_error_sse2,
-                                 &av1_highbd_block_error_c, AOM_BITS_10),
-                      make_tuple(&av1_highbd_block_error_sse2,
-                                 &av1_highbd_block_error_c, AOM_BITS_12),
-                      make_tuple(&av1_highbd_block_error_sse2,
-                                 &av1_highbd_block_error_c, AOM_BITS_8)));
+#if (HAVE_SSE2)
+const ErrorBlockParam kErrorBlockTestParamsSse2[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_10),
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_12),
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_8),
+#endif
+  make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsSse2));
 #endif  // HAVE_SSE2
 
 #if (HAVE_AVX2)
-using ::testing::make_tuple;
+const ErrorBlockParam kErrorBlockTestParamsAvx2[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_10),
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_12),
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_8),
+#endif
+  make_tuple(&BlockError8BitWrapper<av1_block_error_avx2>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+};
 
-INSTANTIATE_TEST_CASE_P(
-    AVX2, ErrorBlockTest,
-    ::testing::Values(make_tuple(&av1_highbd_block_error_avx2,
-                                 &av1_highbd_block_error_c, AOM_BITS_10),
-                      make_tuple(&av1_highbd_block_error_avx2,
-                                 &av1_highbd_block_error_c, AOM_BITS_12),
-                      make_tuple(&av1_highbd_block_error_avx2,
-                                 &av1_highbd_block_error_c, AOM_BITS_8)));
+INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsAvx2));
 #endif  // HAVE_AVX2
+
+#if (HAVE_MSA)
+INSTANTIATE_TEST_SUITE_P(
+    MSA, ErrorBlockTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_msa>,
+                                 &BlockError8BitWrapper<av1_block_error_c>,
+                                 AOM_BITS_8)));
+#endif  // HAVE_MSA
+
+#if (HAVE_NEON)
+INSTANTIATE_TEST_SUITE_P(
+    NEON, ErrorBlockTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
+                                 &BlockError8BitWrapper<av1_block_error_c>,
+                                 AOM_BITS_8)));
+#endif  // HAVE_NEON
 }  // namespace
diff --git a/libaom/test/error_resilience_test.cc b/libaom/test/error_resilience_test.cc
index 10bcd67..1d52bb2 100644
--- a/libaom/test/error_resilience_test.cc
+++ b/libaom/test/error_resilience_test.cc
@@ -152,13 +152,17 @@
     if ((encode_flags & (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
                          AOM_EFLAG_NO_UPD_ARF)) ==
         (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) {
-      ASSERT_TRUE(!!(pkt->data.frame.flags & AOM_FRAME_IS_DROPPABLE));
+      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_DROPPABLE,
+                static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_DROPPABLE));
     }
     if (encode_flags & AOM_EFLAG_SET_S_FRAME) {
-      ASSERT_TRUE(!!(pkt->data.frame.flags & AOM_FRAME_IS_SWITCH));
+      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_SWITCH,
+                static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_SWITCH));
     }
     if (encode_flags & AOM_EFLAG_ERROR_RESILIENT) {
-      ASSERT_TRUE(!!(pkt->data.frame.flags & AOM_FRAME_IS_ERROR_RESILIENT));
+      ASSERT_EQ(
+          pkt->data.frame.flags & AOM_FRAME_IS_ERROR_RESILIENT,
+          static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_ERROR_RESILIENT));
     }
   }
 
diff --git a/libaom/test/external_frame_buffer_test.cc b/libaom/test/external_frame_buffer_test.cc
index 4938a64..1d726a4 100644
--- a/libaom/test/external_frame_buffer_test.cc
+++ b/libaom/test/external_frame_buffer_test.cc
@@ -117,13 +117,11 @@
   // Checks that the aom_image_t data is contained within the external frame
   // buffer private data passed back in the aom_image_t.
   void CheckImageFrameBuffer(const aom_image_t *img) {
-    if (img->fb_priv != NULL) {
-      const struct ExternalFrameBuffer *const ext_fb =
-          reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
+    const struct ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
 
-      ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
-                  img->planes[0] < (ext_fb->data + ext_fb->size));
-    }
+    ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+                img->planes[0] < (ext_fb->data + ext_fb->size));
   }
 
   int num_used_buffers() const { return num_used_buffers_; }
@@ -244,7 +242,7 @@
     expected_md5[32] = '\0';
 
     ::libaom_test::MD5 md5_res;
-#if !CONFIG_LOWBITDEPTH
+#if FORCE_HIGHBITDEPTH_DECODING
     const aom_img_fmt_t shifted_fmt =
         (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
     if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
@@ -258,7 +256,7 @@
     } else {
 #endif
       md5_res.Add(&img);
-#if !CONFIG_LOWBITDEPTH
+#if FORCE_HIGHBITDEPTH_DECODING
     }
 #endif
     const char *const actual_md5 = md5_res.Get();
@@ -266,6 +264,12 @@
     // Check md5 match.
     ASSERT_STREQ(expected_md5, actual_md5)
         << "Md5 checksums don't match: frame number = " << frame_number;
+
+    const struct ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer *>(img.fb_priv);
+
+    ASSERT_TRUE(img.planes[0] >= ext_fb->data &&
+                img.planes[0] < (ext_fb->data + ext_fb->size));
   }
 
   // Callback to get a free external frame buffer. Return value < 0 is an
@@ -311,6 +315,7 @@
     video_->Begin();
 
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
     decoder_ = new libaom_test::AV1Decoder(cfg, 0);
     ASSERT_TRUE(decoder_ != NULL);
   }
@@ -378,6 +383,7 @@
     video_->Begin();
 
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
     decoder_ = new libaom_test::AV1Decoder(cfg, 0);
     ASSERT_TRUE(decoder_ != NULL);
   }
@@ -426,7 +432,7 @@
   OpenMD5File(md5_filename);
 
   // Set decode config.
-  cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH;
+  cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
   set_cfg(cfg);
 
   // Decode frame, and check the md5 matching.
diff --git a/libaom/test/fdct4x4_test.cc b/libaom/test/fdct4x4_test.cc
new file mode 100644
index 0000000..6600f2c
--- /dev/null
+++ b/libaom/test/fdct4x4_test.cc
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+template <typename OutputType>
+using FdctFunc = void (*)(const int16_t *in, OutputType *out, int stride);
+
+template <typename OutputType>
+using FhtFunc = void (*)(const int16_t *in, OutputType *out, int stride,
+                         TxfmParam *txfm_param);
+
+template <typename OutputType>
+using Fdct4x4Param =
+    std::tuple<FdctFunc<OutputType>, FhtFunc<OutputType>, aom_bit_depth_t, int>;
+
+#if HAVE_NEON || HAVE_SSE2
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 TxfmParam * /*txfm_param*/) {
+  aom_fdct4x4_c(in, out, stride);
+}
+
+void fdct4x4_lp_ref(const int16_t *in, int16_t *out, int stride,
+                    TxfmParam * /*txfm_param*/) {
+  aom_fdct4x4_lp_c(in, out, stride);
+}
+#endif
+
+template <typename OutputType>
+class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
+                     public ::testing::TestWithParam<Fdct4x4Param<OutputType>> {
+ public:
+  virtual ~Trans4x4FDCT() {}
+
+  using TxfmBaseOutType = libaom_test::TransformTestBase<OutputType>;
+  virtual void SetUp() {
+    fwd_txfm_ = std::get<0>(this->GetParam());
+    TxfmBaseOutType::pitch_ = 4;
+    TxfmBaseOutType::height_ = 4;
+    TxfmBaseOutType::fwd_txfm_ref = std::get<1>(this->GetParam());
+    TxfmBaseOutType::bit_depth_ = std::get<2>(this->GetParam());
+    TxfmBaseOutType::mask_ = (1 << TxfmBaseOutType::bit_depth_) - 1;
+    TxfmBaseOutType::num_coeffs_ = std::get<3>(this->GetParam());
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+
+  void RunInvTxfm(const OutputType *out, uint8_t *dst, int stride) {
+    (void)out;
+    (void)dst;
+    (void)stride;
+  }
+
+  FdctFunc<OutputType> fwd_txfm_;
+};
+
+using Trans4x4FDCTTranLow = Trans4x4FDCT<tran_low_t>;
+TEST_P(Trans4x4FDCTTranLow, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(Trans4x4FDCTTranLow, MemCheck) { RunMemCheck(); }
+
+using Trans4x4FDCTInt16 = Trans4x4FDCT<int16_t>;
+TEST_P(Trans4x4FDCTInt16, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(Trans4x4FDCTInt16, MemCheck) { RunMemCheck(); }
+
+using std::make_tuple;
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, Trans4x4FDCTTranLow,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_neon,
+                                                      &fdct4x4_ref, AOM_BITS_8,
+                                                      16)));
+
+INSTANTIATE_TEST_SUITE_P(NEON, Trans4x4FDCTInt16,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_lp_neon,
+                                                      &fdct4x4_lp_ref,
+                                                      AOM_BITS_8, 16)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, Trans4x4FDCTTranLow,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_sse2,
+                                                      &fdct4x4_ref, AOM_BITS_8,
+                                                      16)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Trans4x4FDCTInt16,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_lp_sse2,
+                                                      &fdct4x4_lp_ref,
+                                                      AOM_BITS_8, 16)));
+#endif
+}  // namespace
diff --git a/libaom/test/fft_test.cc b/libaom/test/fft_test.cc
index e24e451..d23aa01 100644
--- a/libaom/test/fft_test.cc
+++ b/libaom/test/fft_test.cc
@@ -13,6 +13,7 @@
 
 #include <algorithm>
 #include <complex>
+#include <ostream>
 #include <vector>
 
 #include "aom_dsp/fft_common.h"
@@ -133,16 +134,16 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(C, FFT2DTest,
-                        ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c),
-                                          FFTTestArg(4, aom_fft4x4_float_c),
-                                          FFTTestArg(8, aom_fft8x8_float_c),
-                                          FFTTestArg(16, aom_fft16x16_float_c),
-                                          FFTTestArg(32,
-                                                     aom_fft32x32_float_c)));
+INSTANTIATE_TEST_SUITE_P(C, FFT2DTest,
+                         ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c),
+                                           FFTTestArg(4, aom_fft4x4_float_c),
+                                           FFTTestArg(8, aom_fft8x8_float_c),
+                                           FFTTestArg(16, aom_fft16x16_float_c),
+                                           FFTTestArg(32,
+                                                      aom_fft32x32_float_c)));
 #if ARCH_X86 || ARCH_X86_64
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, FFT2DTest,
     ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2),
                       FFTTestArg(8, aom_fft8x8_float_sse2),
@@ -150,7 +151,7 @@
                       FFTTestArg(32, aom_fft32x32_float_sse2)));
 #endif  // HAVE_SSE2
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, FFT2DTest,
     ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2),
                       FFTTestArg(16, aom_fft16x16_float_avx2),
@@ -227,7 +228,7 @@
     input_[i % (n * n)] = 0;
   }
 }
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, IFFT2DTest,
     ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c),
                       IFFTTestArg(4, aom_ifft4x4_float_c),
@@ -236,7 +237,7 @@
                       IFFTTestArg(32, aom_ifft32x32_float_c)));
 #if ARCH_X86 || ARCH_X86_64
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, IFFT2DTest,
     ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2),
                       IFFTTestArg(8, aom_ifft8x8_float_sse2),
@@ -245,7 +246,7 @@
 #endif  // HAVE_SSE2
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, IFFT2DTest,
     ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2),
                       IFFTTestArg(16, aom_ifft16x16_float_avx2),
diff --git a/libaom/test/filterintra_test.cc b/libaom/test/filterintra_test.cc
index 93e26ae..284353c 100644
--- a/libaom/test/filterintra_test.cc
+++ b/libaom/test/filterintra_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/av1_rtcd.h"
@@ -22,7 +24,7 @@
 namespace {
 
 using libaom_test::ACMRandom;
-using ::testing::tuple;
+using std::tuple;
 
 typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
                           const uint8_t *above, const uint8_t *left, int mode);
@@ -43,9 +45,9 @@
   virtual ~AV1FilterIntraPredTest() {}
   virtual void SetUp() {
     PredFuncMode funcMode = GET_PARAM(0);
-    predFuncRef_ = ::testing::get<0>(funcMode);
-    predFunc_ = ::testing::get<1>(funcMode);
-    mode_ = ::testing::get<2>(funcMode);
+    predFuncRef_ = std::get<0>(funcMode);
+    predFunc_ = std::get<1>(funcMode);
+    mode_ = std::get<2>(funcMode);
     txSize_ = GET_PARAM(1);
 
     alloc_ = new uint8_t[2 * MaxTxSize + 1];
@@ -108,7 +110,7 @@
 
 TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); }
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 const PredFuncMode kPredFuncMdArray[] = {
   make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
@@ -127,7 +129,7 @@
                             TX_8X4,  TX_8X16, TX_16X8,  TX_16X32, TX_32X16,
                             TX_4X16, TX_16X4, TX_8X32,  TX_32X8 };
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1FilterIntraPredTest,
     ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
                        ::testing::ValuesIn(kTxSize)));
diff --git a/libaom/test/frame_error_test.cc b/libaom/test/frame_error_test.cc
new file mode 100644
index 0000000..6d74a68
--- /dev/null
+++ b/libaom/test/frame_error_test.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
+                                    const uint8_t *const dst, int p_width,
+                                    int p_height, int p_stride);
+#if HAVE_AVX2 || HAVE_SSE2
+const int kBlockWidth[] = {
+  832, 834, 640, 1280, 1920,
+};
+const int kBlockHeight[] = {
+  480, 482, 360, 720, 1080,
+};
+#endif
+typedef std::tuple<frame_error_func, int, int> FrameErrorParam;
+
+class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
+ public:
+  virtual ~AV1FrameErrorTest() {}
+  virtual void SetUp() {
+    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RandomValues(frame_error_func test_impl, int width, int height);
+  void ExtremeValues(frame_error_func test_impl, int width, int height);
+  void RunSpeedTest(frame_error_func test_impl, int width, int height);
+  libaom_test::ACMRandom rnd_;
+};
+
+void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
+                                     int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int i = 0; i < max_blk_size; ++i) {
+    dst[i] = rnd_.Rand8();
+    ref[i] = rnd_.Rand8();
+  }
+  const int64_t ref_error =
+      av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
+  const int64_t test_error = test_impl(ref, stride, dst, width, height, stride);
+  ASSERT_EQ(test_error, ref_error) << width << "x" << height;
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
+                                      int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int r = 0; r < 2; r++) {
+    if (r == 0) {
+      memset(dst, 0, max_blk_size);
+      memset(ref, 255, max_blk_size);
+    } else if (r == 1) {
+      memset(dst, 255, max_blk_size);
+      memset(ref, 0, max_blk_size);
+    }
+    const int64_t ref_error =
+        av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
+    const int64_t test_error =
+        test_impl(ref, stride, dst, width, height, stride);
+    ASSERT_EQ(test_error, ref_error) << width << "x" << height;
+  }
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
+                                     int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int i = 0; i < max_blk_size; ++i) {
+    dst[i] = ref[i] = rnd_.Rand8();
+  }
+  const int num_loops = 10000000 / (width + height);
+  frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    frame_error_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(ref, stride, dst, width, height, stride);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  aom_free(dst);
+  aom_free(ref);
+  printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", width, height,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1FrameErrorTest, CheckOutput) {
+  RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+  ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1FrameErrorTest,
+    ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
+                       ::testing::ValuesIn(kBlockWidth),
+                       ::testing::ValuesIn(kBlockHeight)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1FrameErrorTest,
+    ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
+                       ::testing::ValuesIn(kBlockWidth),
+                       ::testing::ValuesIn(kBlockHeight)));
+#endif
+}  // namespace
diff --git a/libaom/test/function_equivalence_test.h b/libaom/test/function_equivalence_test.h
index f270689..a299c48 100644
--- a/libaom/test/function_equivalence_test.h
+++ b/libaom/test/function_equivalence_test.h
@@ -12,6 +12,8 @@
 #ifndef AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
 #define AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
 
+#include <ostream>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
diff --git a/libaom/test/fwd_kf_test.cc b/libaom/test/fwd_kf_test.cc
index 6c428d9..50c2f36 100644
--- a/libaom/test/fwd_kf_test.cc
+++ b/libaom/test/fwd_kf_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <ostream>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -23,19 +25,23 @@
 } FwdKfTestParam;
 
 const FwdKfTestParam kTestParams[] = {
-  { 4, 37.3 },  { 6, 36.5 },  { 8, 35.8 },
-  { 12, 34.3 }, { 16, 34.3 }, { 18, 33.7 }
+  { 4, 33.4 },  { 6, 32.9 },  { 8, 32.6 },
+  { 12, 32.4 }, { 16, 32.3 }, { 18, 32.1 }
 };
 
-// Params: encoding mode and index into the kMaxKfDists array to control
-// kf-max-dist
+std::ostream &operator<<(std::ostream &os, const FwdKfTestParam &test_arg) {
+  return os << "FwdKfTestParam { max_kf_dist:" << test_arg.max_kf_dist
+            << " psnr_thresh:" << test_arg.psnr_thresh << " }";
+}
+
 class ForwardKeyTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 FwdKfTestParam>,
       public ::libaom_test::EncoderTest {
  protected:
   ForwardKeyTest()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
-        kf_max_dist_ind_(GET_PARAM(2)) {}
+        kf_max_dist_param_(GET_PARAM(2)) {}
   virtual ~ForwardKeyTest() {}
 
   virtual void SetUp() {
@@ -44,8 +50,8 @@
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
     cpu_used_ = 2;
-    kf_max_dist_ = kTestParams[kf_max_dist_ind_].max_kf_dist;
-    psnr_threshold_ = kTestParams[kf_max_dist_ind_].psnr_thresh;
+    kf_max_dist_ = kf_max_dist_param_.max_kf_dist;
+    psnr_threshold_ = kf_max_dist_param_.psnr_thresh;
     cfg_.rc_end_usage = AOM_VBR;
     cfg_.rc_target_bitrate = 200;
     cfg_.g_lag_in_frames = 10;
@@ -85,7 +91,7 @@
   double GetPsnrThreshold() { return psnr_threshold_; }
 
   ::libaom_test::TestMode encoding_mode_;
-  const int kf_max_dist_ind_;
+  const FwdKfTestParam kf_max_dist_param_;
   double psnr_threshold_;
   int kf_max_dist_;
   int cpu_used_;
@@ -104,7 +110,7 @@
       << "kf max dist = " << kf_max_dist_;
 }
 
-AV1_INSTANTIATE_TEST_CASE(
-    ForwardKeyTest, ::testing::Values(::libaom_test::kTwoPassGood),
-    ::testing::Range(0, static_cast<int>(GTEST_ARRAY_SIZE_(kTestParams))));
+AV1_INSTANTIATE_TEST_CASE(ForwardKeyTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood),
+                          ::testing::ValuesIn(kTestParams));
 }  // namespace
diff --git a/libaom/test/fwht4x4_test.cc b/libaom/test/fwht4x4_test.cc
index c8d98c5..d2f77b8 100644
--- a/libaom/test/fwht4x4_test.cc
+++ b/libaom/test/fwht4x4_test.cc
@@ -12,7 +12,9 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
+#include "aom_dsp/aom_dsp_common.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/av1_rtcd.h"
@@ -35,7 +37,7 @@
 
 using libaom_test::FhtFunc;
 
-typedef ::testing::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
+typedef std::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
     Dct4x4Param;
 
 void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
@@ -51,7 +53,7 @@
   av1_highbd_iwht4x4_16_add_c(in, out, stride, 12);
 }
 
-class Trans4x4WHT : public libaom_test::TransformTestBase,
+class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
                     public ::testing::TestWithParam<Dct4x4Param> {
  public:
   virtual ~Trans4x4WHT() {}
@@ -87,9 +89,9 @@
 TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
 
 TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-using ::testing::make_tuple;
+using std::make_tuple;
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, Trans4x4WHT,
     ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
                                  AOM_BITS_10, 16),
diff --git a/libaom/test/gf_max_pyr_height_test.cc b/libaom/test/gf_pyr_height_test.cc
similarity index 62%
rename from libaom/test/gf_max_pyr_height_test.cc
rename to libaom/test/gf_pyr_height_test.cc
index 2d78493..b1ade67 100644
--- a/libaom/test/gf_max_pyr_height_test.cc
+++ b/libaom/test/gf_pyr_height_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <ostream>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -17,11 +19,38 @@
 
 namespace {
 
-static const struct GFMaxPyrHeightTestParam {
+static const struct GFPyrHeightTestParam {
+  int gf_min_pyr_height;
   int gf_max_pyr_height;
   double psnr_thresh;
 } kTestParams[] = {
-  { 0, 34.75 }, { 1, 34.75 }, { 2, 35.25 }, { 3, 35.50 }, { 4, 35.50 },
+  // gf_min_pyr_height = 0
+  { 0, 0, 33.40 },
+  { 0, 1, 34.00 },
+  { 0, 2, 34.00 },
+  { 0, 3, 34.20 },
+  { 0, 4, 34.30 },
+  { 0, 5, 34.40 },
+  // gf_min_pyr_height = 1
+  { 1, 1, 34.00 },
+  { 1, 2, 34.00 },
+  { 1, 3, 34.20 },
+  { 1, 4, 34.30 },
+  { 1, 5, 34.40 },
+  // gf_min_pyr_height = 2
+  { 2, 2, 34.00 },
+  { 2, 3, 34.20 },
+  { 2, 4, 34.30 },
+  { 2, 5, 34.40 },
+  // gf_min_pyr_height = 3
+  { 3, 3, 34.20 },
+  { 3, 4, 34.30 },
+  { 3, 5, 34.40 },
+  // gf_min_pyr_height = 4
+  { 4, 4, 34.30 },
+  { 4, 5, 34.40 },
+  // gf_min_pyr_height = 5
+  { 5, 5, 34.40 },
 };
 
 // Compiler may decide to add some padding to the struct above for alignment,
@@ -29,25 +58,28 @@
 // valgrind to complain that the padding is uninitialized. To avoid that, we
 // provide our own function to print the struct.
 // This also makes '--gtest_list_tests' output more understandable.
-std::ostream &operator<<(std::ostream &os, const GFMaxPyrHeightTestParam &p) {
-  os << "GFMaxPyrHeightTestParam { "
+std::ostream &operator<<(std::ostream &os, const GFPyrHeightTestParam &p) {
+  os << "GFPyrHeightTestParam { "
+     << "gf_min_pyr_height = " << p.gf_min_pyr_height << ", "
      << "gf_max_pyr_height = " << p.gf_max_pyr_height << ", "
      << "psnr_thresh = " << p.psnr_thresh << " }";
   return os;
 }
 
-// Params: encoding mode and GFMaxPyrHeightTestParam object.
-class GFMaxPyrHeightTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
-                                                 GFMaxPyrHeightTestParam>,
+// Params: encoding mode, rate control mode and GFPyrHeightTestParam object.
+class GFPyrHeightTest
+    : public ::libaom_test::CodecTestWith3Params<
+          libaom_test::TestMode, aom_rc_mode, GFPyrHeightTestParam>,
       public ::libaom_test::EncoderTest {
  protected:
-  GFMaxPyrHeightTest()
-      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)) {
-    gf_max_pyr_height_ = GET_PARAM(2).gf_max_pyr_height;
-    psnr_threshold_ = GET_PARAM(2).psnr_thresh;
+  GFPyrHeightTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        rc_mode_(GET_PARAM(2)) {
+    gf_min_pyr_height_ = GET_PARAM(3).gf_min_pyr_height;
+    gf_max_pyr_height_ = GET_PARAM(3).gf_max_pyr_height;
+    psnr_threshold_ = GET_PARAM(3).psnr_thresh;
   }
-  virtual ~GFMaxPyrHeightTest() {}
+  virtual ~GFPyrHeightTest() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -55,8 +87,10 @@
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
     cpu_used_ = 4;
-    cfg_.rc_end_usage = AOM_VBR;
-    cfg_.rc_target_bitrate = 200;
+    cfg_.rc_end_usage = rc_mode_;
+    if (rc_mode_ == AOM_VBR) {
+      cfg_.rc_target_bitrate = 200;
+    }
     cfg_.g_lag_in_frames = 19;
     cfg_.g_threads = 0;
     init_flags_ = AOM_CODEC_USE_PSNR;
@@ -76,11 +110,15 @@
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      if (rc_mode_ == AOM_Q) {
+        encoder->Control(AOME_SET_CQ_LEVEL, 32);
+      }
       if (encoding_mode_ != ::libaom_test::kRealTime) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
         encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
       }
+      encoder->Control(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, gf_min_pyr_height_);
       encoder->Control(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, gf_max_pyr_height_);
     }
   }
@@ -93,23 +131,26 @@
   double GetPsnrThreshold() { return psnr_threshold_; }
 
   ::libaom_test::TestMode encoding_mode_;
+  aom_rc_mode rc_mode_;
   double psnr_threshold_;
+  int gf_min_pyr_height_;
   int gf_max_pyr_height_;
   int cpu_used_;
   int nframes_;
   double psnr_;
 };
 
-TEST_P(GFMaxPyrHeightTest, EncodeAndVerifyPSNR) {
+TEST_P(GFPyrHeightTest, EncodeAndVerifyPSNR) {
   libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                      cfg_.g_timebase.den, cfg_.g_timebase.num,
                                      0, 32);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold())
+      << "GF Min Pyramid Height = " << gf_min_pyr_height_ << ", "
       << "GF Max Pyramid Height = " << gf_max_pyr_height_;
 }
 
-AV1_INSTANTIATE_TEST_CASE(GFMaxPyrHeightTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood),
+AV1_INSTANTIATE_TEST_CASE(GFPyrHeightTest, NONREALTIME_TEST_MODES,
+                          ::testing::Values(AOM_Q, AOM_VBR),
                           ::testing::ValuesIn(kTestParams));
 }  // namespace
diff --git a/libaom/test/hadamard_test.cc b/libaom/test/hadamard_test.cc
new file mode 100644
index 0000000..7903259
--- /dev/null
+++ b/libaom/test/hadamard_test.cc
@@ -0,0 +1,261 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <algorithm>
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
+                             tran_low_t *b);
+
+void HadamardLoop(const tran_low_t *a, tran_low_t *out) {
+  tran_low_t b[8];
+  for (int i = 0; i < 8; i += 2) {
+    b[i + 0] = a[i * 8] + a[(i + 1) * 8];
+    b[i + 1] = a[i * 8] - a[(i + 1) * 8];
+  }
+  tran_low_t c[8];
+  for (int i = 0; i < 8; i += 4) {
+    c[i + 0] = b[i + 0] + b[i + 2];
+    c[i + 1] = b[i + 1] + b[i + 3];
+    c[i + 2] = b[i + 0] - b[i + 2];
+    c[i + 3] = b[i + 1] - b[i + 3];
+  }
+  out[0] = c[0] + c[4];
+  out[7] = c[1] + c[5];
+  out[3] = c[2] + c[6];
+  out[4] = c[3] + c[7];
+  out[2] = c[0] - c[4];
+  out[6] = c[1] - c[5];
+  out[1] = c[2] - c[6];
+  out[5] = c[3] - c[7];
+}
+
+void ReferenceHadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
+  tran_low_t input[64];
+  tran_low_t buf[64];
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
+    }
+  }
+  for (int i = 0; i < 8; ++i) HadamardLoop(input + i, buf + i * 8);
+  for (int i = 0; i < 8; ++i) HadamardLoop(buf + i, b + i * 8);
+}
+
+void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
+  /* The source is a 16x16 block. The destination is rearranged to 8x32.
+   * Input is 9 bit. */
+  ReferenceHadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
+  ReferenceHadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
+  ReferenceHadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
+  ReferenceHadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (int i = 0; i < 64; ++i) {
+    /* 8x8 steps the range up to 15 bits. */
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[64];
+    const tran_low_t a2 = b[128];
+    const tran_low_t a3 = b[192];
+
+    /* Prevent the result from escaping int16_t. */
+    const tran_low_t b0 = (a0 + a1) >> 1;
+    const tran_low_t b1 = (a0 - a1) >> 1;
+    const tran_low_t b2 = (a2 + a3) >> 1;
+    const tran_low_t b3 = (a2 - a3) >> 1;
+
+    /* Store a 16 bit value. */
+    b[0] = b0 + b2;
+    b[64] = b1 + b3;
+    b[128] = b0 - b2;
+    b[192] = b1 - b3;
+
+    ++b;
+  }
+}
+
+void ReferenceHadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
+  ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
+  ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
+  ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
+  ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
+
+  for (int i = 0; i < 256; ++i) {
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[256];
+    const tran_low_t a2 = b[512];
+    const tran_low_t a3 = b[768];
+
+    const tran_low_t b0 = (a0 + a1) >> 2;
+    const tran_low_t b1 = (a0 - a1) >> 2;
+    const tran_low_t b2 = (a2 + a3) >> 2;
+    const tran_low_t b3 = (a2 - a3) >> 2;
+
+    b[0] = b0 + b2;
+    b[256] = b1 + b3;
+    b[512] = b0 - b2;
+    b[768] = b1 - b3;
+
+    ++b;
+  }
+}
+
+struct HadamardFuncWithSize {
+  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
+  HadamardFunc func;
+  int block_size;
+};
+
+std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
+  return os << "block size: " << hfs.block_size;
+}
+
+class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
+ public:
+  virtual void SetUp() {
+    h_func_ = GetParam().func;
+    bwh_ = GetParam().block_size;
+    block_size_ = bwh_ * bwh_;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual int16_t Rand() = 0;
+
+  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
+                         int bwh) {
+    if (bwh == 32)
+      ReferenceHadamard32x32(a, a_stride, b);
+    else if (bwh == 16)
+      ReferenceHadamard16x16(a, a_stride, b);
+    else
+      ReferenceHadamard8x8(a, a_stride, b);
+  }
+
+  void CompareReferenceRandom() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+
+    tran_low_t b_ref[kMaxBlockSize];
+    memset(b_ref, 0, sizeof(b_ref));
+
+    for (int i = 0; i < block_size_; ++i) a[i] = Rand();
+
+    ReferenceHadamard(a, bwh_, b_ref, bwh_);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + block_size_);
+    std::sort(b_ref, b_ref + block_size_);
+    EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
+  }
+
+  void VaryStride() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
+
+    tran_low_t b_ref[kMaxBlockSize];
+    for (int i = 8; i < 64; i += 8) {
+      memset(b, 0, sizeof(b));
+      memset(b_ref, 0, sizeof(b_ref));
+
+      ReferenceHadamard(a, i, b_ref, bwh_);
+      ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+      // The order of the output is not important. Sort before checking.
+      std::sort(b, b + block_size_);
+      std::sort(b_ref, b_ref + block_size_);
+      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+    }
+  }
+
+  void SpeedTest(int times) {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
+    memset(input, 1, sizeof(input));
+    memset(output, 0, sizeof(output));
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < times; ++i) {
+      h_func_(input, bwh_, output);
+    }
+    aom_usec_timer_mark(&timer);
+
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times,
+           elapsed_time);
+  }
+
+  ACMRandom rnd_;
+
+ private:
+  int bwh_;
+  int block_size_;
+  HadamardFunc h_func_;
+};
+
+class HadamardLowbdTest : public HadamardTestBase {
+ public:
+  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+};
+
+TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
+
+TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_c, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_c, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_c, 32)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_sse2, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_sse2, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_sse2, 32)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_avx2, 32)));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16)));
+#endif  // HAVE_NEON
+
+}  // namespace
diff --git a/libaom/test/hash_test.cc b/libaom/test/hash_test.cc
index e9f7f63..eb964ac 100644
--- a/libaom/test/hash_test.cc
+++ b/libaom/test/hash_test.cc
@@ -11,6 +11,7 @@
 
 #include <cstdlib>
 #include <new>
+#include <tuple>
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
@@ -24,9 +25,9 @@
 namespace {
 
 typedef uint32_t (*get_crc32c_value_func)(void *calculator, uint8_t *p,
-                                          int length);
+                                          size_t length);
 
-typedef ::testing::tuple<get_crc32c_value_func, int> HashParam;
+typedef std::tuple<get_crc32c_value_func, int> HashParam;
 
 class AV1Crc32cHashTest : public ::testing::TestWithParam<HashParam> {
  public:
@@ -45,7 +46,7 @@
   CRC32C calc_;
   uint8_t *buffer_;
   int bsize_;
-  int length_;
+  size_t length_;
 };
 
 AV1Crc32cHashTest::~AV1Crc32cHashTest() { ; }
@@ -58,7 +59,7 @@
   length_ = bsize_ * bsize_ * sizeof(uint16_t);
   buffer_ = new uint8_t[length_];
   ASSERT_TRUE(buffer_ != NULL);
-  for (int i = 0; i < length_; ++i) {
+  for (size_t i = 0; i < length_; ++i) {
     buffer_[i] = rnd_.Rand8();
   }
 }
@@ -118,13 +119,13 @@
 
 const int kValidBlockSize[] = { 64, 32, 8, 4 };
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, AV1Crc32cHashTest,
     ::testing::Combine(::testing::Values(&av1_get_crc32c_value_c),
                        ::testing::ValuesIn(kValidBlockSize)));
 
 #if HAVE_SSE4_2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_2, AV1Crc32cHashTest,
     ::testing::Combine(::testing::Values(&av1_get_crc32c_value_sse4_2),
                        ::testing::ValuesIn(kValidBlockSize)));
diff --git a/libaom/test/hbd_metrics_test.cc b/libaom/test/hbd_metrics_test.cc
index 09df9bd..5b03bee 100644
--- a/libaom/test/hbd_metrics_test.cc
+++ b/libaom/test/hbd_metrics_test.cc
@@ -12,6 +12,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <new>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
@@ -173,7 +174,7 @@
   HBDMetricFunc hbd_metric_;
 };
 
-typedef ::testing::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
+typedef std::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
     MetricTestTParam;
 class HBDMetricsTest : public HBDMetricsTestBase,
                        public ::testing::TestWithParam<MetricTestTParam> {
@@ -197,7 +198,7 @@
 // Allow some extra variation due to rounding error accumulated in dct.
 static const double kPhvs_thresh = 0.3;
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AOMSSIM, HBDMetricsTest,
     ::testing::Values(MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
                                        8, 10, kSsim_thresh),
@@ -207,7 +208,7 @@
                                        8, 12, kSsim_thresh),
                       MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
                                        12, 12, kPhvs_thresh)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     FASTSSIM, HBDMetricsTest,
     ::testing::Values(MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
                                        8, 10, kFSsim_thresh),
@@ -217,7 +218,7 @@
                                        8, 12, kFSsim_thresh),
                       MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
                                        12, 12, kFSsim_thresh)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     PSNRHVS, HBDMetricsTest,
     ::testing::Values(MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
                                        8, 10, kPhvs_thresh),
@@ -227,7 +228,7 @@
                                        8, 12, kPhvs_thresh),
                       MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
                                        12, 12, kPhvs_thresh)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     PSNR, HBDMetricsTest,
     ::testing::Values(
         MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 10, kPhvs_thresh),
diff --git a/libaom/test/hiprec_convolve_test.cc b/libaom/test/hiprec_convolve_test.cc
index dcf8523..59d28e8 100644
--- a/libaom/test/hiprec_convolve_test.cc
+++ b/libaom/test/hiprec_convolve_test.cc
@@ -9,14 +9,18 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/hiprec_convolve_test_util.h"
 
 using libaom_test::ACMRandom;
+#if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest;
+#endif
 using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest;
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 namespace {
 
@@ -25,21 +29,22 @@
   RunSpeedTest(GET_PARAM(3));
 }
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HiprecConvolveTest,
-                        libaom_test::AV1HiprecConvolve::BuildParams(
-                            av1_wiener_convolve_add_src_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1HiprecConvolveTest,
+                         libaom_test::AV1HiprecConvolve::BuildParams(
+                             av1_wiener_convolve_add_src_sse2));
 #endif
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, AV1HiprecConvolveTest,
-                        libaom_test::AV1HiprecConvolve::BuildParams(
-                            av1_wiener_convolve_add_src_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HiprecConvolveTest,
+                         libaom_test::AV1HiprecConvolve::BuildParams(
+                             av1_wiener_convolve_add_src_avx2));
 #endif
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, AV1HiprecConvolveTest,
-                        libaom_test::AV1HiprecConvolve::BuildParams(
-                            av1_wiener_convolve_add_src_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HiprecConvolveTest,
+                         libaom_test::AV1HiprecConvolve::BuildParams(
+                             av1_wiener_convolve_add_src_neon));
 #endif
 
+#if CONFIG_AV1_HIGHBITDEPTH
 #if HAVE_SSSE3 || HAVE_AVX2
 TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
   RunCheckOutput(GET_PARAM(4));
@@ -48,15 +53,16 @@
   RunSpeedTest(GET_PARAM(4));
 }
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdHiprecConvolveTest,
-                        libaom_test::AV1HighbdHiprecConvolve::BuildParams(
-                            av1_highbd_wiener_convolve_add_src_ssse3));
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdHiprecConvolveTest,
+                         libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+                             av1_highbd_wiener_convolve_add_src_ssse3));
 #endif
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdHiprecConvolveTest,
-                        libaom_test::AV1HighbdHiprecConvolve::BuildParams(
-                            av1_highbd_wiener_convolve_add_src_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdHiprecConvolveTest,
+                         libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+                             av1_highbd_wiener_convolve_add_src_avx2));
 #endif
 #endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 }  // namespace
diff --git a/libaom/test/hiprec_convolve_test_util.cc b/libaom/test/hiprec_convolve_test_util.cc
index 2672bce..956af7f 100644
--- a/libaom/test/hiprec_convolve_test_util.cc
+++ b/libaom/test/hiprec_convolve_test_util.cc
@@ -13,38 +13,55 @@
 
 #include "av1/common/restoration.h"
 
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 namespace libaom_test {
 
 // Generate a random pair of filter kernels, using the ranges
 // of possible values from the loop-restoration experiment
 static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
-                             InterpKernel vkernel) {
-  hkernel[0] = hkernel[6] =
-      WIENER_FILT_TAP0_MINV +
-      rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
-  hkernel[1] = hkernel[5] =
-      WIENER_FILT_TAP1_MINV +
-      rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
-  hkernel[2] = hkernel[4] =
-      WIENER_FILT_TAP2_MINV +
-      rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
-  hkernel[3] = -(hkernel[0] + hkernel[1] + hkernel[2]);
-  hkernel[7] = 0;
+                             InterpKernel vkernel, int kernel_type = 2) {
+  if (kernel_type == 0) {
+    // Low possible values for filter coefficients
+    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MINV;
+    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
+    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
+    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = vkernel[7] = 0;
+  } else if (kernel_type == 1) {
+    // Max possible values for filter coefficients
+    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MAXV;
+    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
+    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
+    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = vkernel[7] = 0;
+  } else {
+    // Randomly generated values for filter coefficients
+    hkernel[0] = hkernel[6] =
+        WIENER_FILT_TAP0_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
+    hkernel[1] = hkernel[5] =
+        WIENER_FILT_TAP1_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
+    hkernel[2] = hkernel[4] =
+        WIENER_FILT_TAP2_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
+    hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = 0;
 
-  vkernel[0] = vkernel[6] =
-      WIENER_FILT_TAP0_MINV +
-      rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
-  vkernel[1] = vkernel[5] =
-      WIENER_FILT_TAP1_MINV +
-      rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
-  vkernel[2] = vkernel[4] =
-      WIENER_FILT_TAP2_MINV +
-      rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
-  vkernel[3] = -(vkernel[0] + vkernel[1] + vkernel[2]);
-  vkernel[7] = 0;
+    vkernel[0] = vkernel[6] =
+        WIENER_FILT_TAP0_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 2 - WIENER_FILT_TAP0_MINV);
+    vkernel[1] = vkernel[5] =
+        WIENER_FILT_TAP1_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
+    vkernel[2] = vkernel[4] =
+        WIENER_FILT_TAP2_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
+    vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
+    vkernel[7] = 0;
+  }
 }
 
 namespace AV1HiprecConvolve {
@@ -74,7 +91,7 @@
   const int w = 128, h = 128;
   const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
   const int num_iters = GET_PARAM(2);
-  int i, j;
+  int i, j, k, m;
   const ConvolveParams conv_params = get_conv_params_wiener(8);
 
   uint8_t *input_ = new uint8_t[h * w];
@@ -91,25 +108,25 @@
   DECLARE_ALIGNED(16, InterpKernel, hkernel);
   DECLARE_ALIGNED(16, InterpKernel, vkernel);
 
-  generate_kernels(&rnd_, hkernel, vkernel);
+  for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
+    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
+    for (i = 0; i < num_iters; ++i) {
+      for (k = 0; k < h; ++k)
+        for (m = 0; m < w; ++m) input[k * w + m] = rnd_.Rand8();
+      // Choose random locations within the source block
+      int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+      int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+      av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w, output,
+                                    out_w, hkernel, 16, vkernel, 16, out_w,
+                                    out_h, &conv_params);
+      test_impl(input + offset_r * w + offset_c, w, output2, out_w, hkernel, 16,
+                vkernel, 16, out_w, out_h, &conv_params);
 
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-
-  for (i = 0; i < num_iters; ++i) {
-    // Choose random locations within the source block
-    int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-    int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-    av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w, output,
-                                  out_w, hkernel, 16, vkernel, 16, out_w, out_h,
-                                  &conv_params);
-    test_impl(input + offset_r * w + offset_c, w, output2, out_w, hkernel, 16,
-              vkernel, 16, out_w, out_h, &conv_params);
-
-    for (j = 0; j < out_w * out_h; ++j)
-      ASSERT_EQ(output[j], output2[j])
-          << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
-          << (j / out_w) << ") on iteration " << i;
+      for (j = 0; j < out_w * out_h; ++j)
+        ASSERT_EQ(output[j], output2[j])
+            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+            << (j / out_w) << ") on iteration " << i;
+    }
   }
   delete[] input_;
   delete[] output;
@@ -183,6 +200,7 @@
 }
 }  // namespace AV1HiprecConvolve
 
+#if CONFIG_AV1_HIGHBITDEPTH
 namespace AV1HighbdHiprecConvolve {
 
 ::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
@@ -228,29 +246,29 @@
   DECLARE_ALIGNED(16, InterpKernel, hkernel);
   DECLARE_ALIGNED(16, InterpKernel, vkernel);
 
-  generate_kernels(&rnd_, hkernel, vkernel);
-
   for (i = 0; i < h; ++i)
     for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
 
   uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input);
   uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output);
   uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2);
+  for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
+    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
+    for (i = 0; i < num_iters; ++i) {
+      // Choose random locations within the source block
+      int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+      int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+      av1_highbd_wiener_convolve_add_src_c(
+          input_ptr + offset_r * w + offset_c, w, output_ptr, out_w, hkernel,
+          16, vkernel, 16, out_w, out_h, &conv_params, bd);
+      test_impl(input_ptr + offset_r * w + offset_c, w, output2_ptr, out_w,
+                hkernel, 16, vkernel, 16, out_w, out_h, &conv_params, bd);
 
-  for (i = 0; i < num_iters; ++i) {
-    // Choose random locations within the source block
-    int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-    int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-    av1_highbd_wiener_convolve_add_src_c(
-        input_ptr + offset_r * w + offset_c, w, output_ptr, out_w, hkernel, 16,
-        vkernel, 16, out_w, out_h, &conv_params, bd);
-    test_impl(input_ptr + offset_r * w + offset_c, w, output2_ptr, out_w,
-              hkernel, 16, vkernel, 16, out_w, out_h, &conv_params, bd);
-
-    for (j = 0; j < out_w * out_h; ++j)
-      ASSERT_EQ(output[j], output2[j])
-          << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
-          << (j / out_w) << ") on iteration " << i;
+      for (j = 0; j < out_w * out_h; ++j)
+        ASSERT_EQ(output[j], output2[j])
+            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+            << (j / out_w) << ") on iteration " << i;
+    }
   }
   delete[] input;
   delete[] output;
@@ -328,4 +346,5 @@
   delete[] output2;
 }
 }  // namespace AV1HighbdHiprecConvolve
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace libaom_test
diff --git a/libaom/test/hiprec_convolve_test_util.h b/libaom/test/hiprec_convolve_test_util.h
index 2abe24b..6b6da4e 100644
--- a/libaom/test/hiprec_convolve_test_util.h
+++ b/libaom/test/hiprec_convolve_test_util.h
@@ -12,6 +12,8 @@
 #ifndef AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
 #define AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
 
+#include <tuple>
+
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
@@ -35,8 +37,7 @@
                                      int w, int h,
                                      const ConvolveParams *conv_params);
 
-typedef ::testing::tuple<int, int, int, hiprec_convolve_func>
-    HiprecConvolveParam;
+typedef std::tuple<int, int, int, hiprec_convolve_func> HiprecConvolveParam;
 
 ::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
     hiprec_convolve_func filter);
@@ -58,6 +59,7 @@
 
 }  // namespace AV1HiprecConvolve
 
+#if CONFIG_AV1_HIGHBITDEPTH
 namespace AV1HighbdHiprecConvolve {
 typedef void (*highbd_hiprec_convolve_func)(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
@@ -65,7 +67,7 @@
     const int16_t *filter_y, int y_step_q4, int w, int h,
     const ConvolveParams *conv_params, int bps);
 
-typedef ::testing::tuple<int, int, int, int, highbd_hiprec_convolve_func>
+typedef std::tuple<int, int, int, int, highbd_hiprec_convolve_func>
     HighbdHiprecConvolveParam;
 
 ::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
@@ -87,7 +89,7 @@
 };
 
 }  // namespace AV1HighbdHiprecConvolve
-
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace libaom_test
 
 #endif  // AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
diff --git a/libaom/test/horver_correlation_test.cc b/libaom/test/horver_correlation_test.cc
index 72fd46a..ccb8edd 100644
--- a/libaom/test/horver_correlation_test.cc
+++ b/libaom/test/horver_correlation_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "test/acm_random.h"
@@ -27,7 +29,7 @@
 typedef void (*HorverFunc)(const int16_t *diff, int stride, int w, int h,
                            float *hcorr, float *vcorr);
 
-typedef ::testing::tuple<const HorverFunc> HorverTestParam;
+typedef std::tuple<const HorverFunc> HorverTestParam;
 
 class HorverTest : public ::testing::TestWithParam<HorverTestParam> {
  public:
@@ -133,13 +135,13 @@
 TEST_P(HorverTest, DISABLED_Speed) { RunHorverSpeedTest(100000); }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, HorverTest,
     ::testing::Values(av1_get_horver_correlation_full_sse4_1));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, HorverTest, ::testing::Values(av1_get_horver_correlation_full_avx2));
 #endif  // HAVE_AVX2
 
diff --git a/libaom/test/horz_superres_test.cc b/libaom/test/horz_superres_test.cc
index f2c2115..938b0b1 100644
--- a/libaom/test/horz_superres_test.cc
+++ b/libaom/test/horz_superres_test.cc
@@ -10,6 +10,8 @@
  */
 
 #include <memory>
+#include <ostream>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -23,8 +25,8 @@
 
 namespace {
 
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 /* TESTING PARAMETERS */
 
@@ -40,9 +42,19 @@
   double psnr_threshold;
 } TestVideoParam;
 
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " limit:" << test_arg.limit
+            << " screen_content:" << test_arg.screen_content
+            << " psnr_threshold:" << test_arg.psnr_threshold << " }";
+}
+
 const TestVideoParam kTestVideoVectors[] = {
-  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 26.0 },
+  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5 },
+#if CONFIG_AV1_HIGHBITDEPTH
   { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 28.0 },
+#endif
   { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
   // Image coding (single frame).
   { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0 },
@@ -182,8 +194,8 @@
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
         superres_mode_(SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
     SuperresDenominatorPair denoms = GET_PARAM(2);
-    superres_denom_ = ::testing::get<0>(denoms);
-    superres_kf_denom_ = ::testing::get<1>(denoms);
+    superres_denom_ = std::get<0>(denoms);
+    superres_kf_denom_ = std::get<1>(denoms);
   }
 
   virtual ~HorzSuperresFixedEndToEndTest() {}
@@ -293,8 +305,8 @@
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
         superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
     SuperresQThresholdPair qthresholds = GET_PARAM(2);
-    superres_qthresh_ = ::testing::get<0>(qthresholds);
-    superres_kf_qthresh_ = ::testing::get<1>(qthresholds);
+    superres_qthresh_ = std::get<0>(qthresholds);
+    superres_kf_qthresh_ = std::get<1>(qthresholds);
   }
 
   virtual ~HorzSuperresQThreshEndToEndTest() {}
diff --git a/libaom/test/intra_edge_test.cc b/libaom/test/intra_edge_test.cc
index ce61402..f7702c9 100644
--- a/libaom/test/intra_edge_test.cc
+++ b/libaom/test/intra_edge_test.cc
@@ -100,7 +100,7 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, UpsampleTest8B,
     ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
                                 av1_upsample_intra_edge_sse4_1)));
@@ -152,7 +152,7 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, UpsampleTestHB,
     ::testing::Values(TestFuncsHBD(av1_upsample_intra_edge_high_c,
                                    av1_upsample_intra_edge_high_sse4_1)));
@@ -223,7 +223,7 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, FilterEdgeTest8B,
     ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
                                           av1_filter_intra_edge_sse4_1)));
@@ -268,10 +268,10 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, FilterEdgeTestHB,
-                        ::testing::Values(FilterEdgeTestFuncsHBD(
-                            av1_filter_intra_edge_high_c,
-                            av1_filter_intra_edge_high_sse4_1)));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, FilterEdgeTestHB,
+                         ::testing::Values(FilterEdgeTestFuncsHBD(
+                             av1_filter_intra_edge_high_c,
+                             av1_filter_intra_edge_high_sse4_1)));
 #endif  // HAVE_SSE4_1
 
 // Speed tests
diff --git a/libaom/test/intrabc_test.cc b/libaom/test/intrabc_test.cc
index 3ea4217..b57eb6f 100644
--- a/libaom/test/intrabc_test.cc
+++ b/libaom/test/intrabc_test.cc
@@ -13,11 +13,11 @@
 
 #include "config/aom_config.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
 #include "av1/common/mv.h"
 #include "av1/common/mvref_common.h"
-#include "av1/common/onyxc_int.h"
 #include "av1/common/tile_common.h"
 
 namespace {
@@ -156,13 +156,15 @@
   AV1_COMMON cm;
   memset(&cm, 0, sizeof(cm));
 
-  for (int i = 0; i < static_cast<int>(GTEST_ARRAY_SIZE_(kDvCases)); ++i) {
-    EXPECT_EQ(static_cast<int>(kDvCases[i].valid),
-              av1_is_dv_valid(kDvCases[i].dv, &cm, &xd,
-                              xd.tile.mi_row_start + kDvCases[i].mi_row_offset,
-                              xd.tile.mi_col_start + kDvCases[i].mi_col_offset,
-                              kDvCases[i].bsize, MAX_MIB_SIZE_LOG2))
-        << "DvCases[" << i << "]";
+  for (const DvTestCase &dv_case : kDvCases) {
+    const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset;
+    const int mi_col = xd.tile.mi_col_start + dv_case.mi_col_offset;
+    xd.is_chroma_ref = is_chroma_reference(mi_row, mi_col, dv_case.bsize,
+                                           xd.plane[1].subsampling_x,
+                                           xd.plane[1].subsampling_y);
+    EXPECT_EQ(static_cast<int>(dv_case.valid),
+              av1_is_dv_valid(dv_case.dv, &cm, &xd, mi_row, mi_col,
+                              dv_case.bsize, MAX_MIB_SIZE_LOG2));
   }
 }
 }  // namespace
diff --git a/libaom/test/intrapred_test.cc b/libaom/test/intrapred_test.cc
index 43ab773..779cf9a 100644
--- a/libaom/test/intrapred_test.cc
+++ b/libaom/test/intrapred_test.cc
@@ -133,6 +133,7 @@
   IntraPredFunc<FuncType> params_;
 };
 
+#if CONFIG_AV1_HIGHBITDEPTH
 class HighbdIntraPredTest : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
  protected:
   void Predict() {
@@ -142,6 +143,7 @@
         params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
   }
 };
+#endif
 
 class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
  protected:
@@ -152,6 +154,7 @@
   }
 };
 
+#if CONFIG_AV1_HIGHBITDEPTH
 // Suppress an unitialized warning. Once there are implementations to test then
 // this can be restored.
 TEST_P(HighbdIntraPredTest, Bitexact) {
@@ -164,6 +167,7 @@
   av1_zero(above_data);
   RunTest(left_col, above_data, dst, ref_dst);
 }
+#endif
 
 // Same issue as above but for arm.
 #if !HAVE_NEON
@@ -179,6 +183,7 @@
 }
 #endif  // !HAVE_NEON
 
+#if CONFIG_AV1_HIGHBITDEPTH
 // -----------------------------------------------------------------------------
 // High Bit Depth Tests
 #define highbd_entry(type, width, height, opt, bd)                          \
@@ -196,7 +201,7 @@
       highbd_entry(type, 16, 32, opt, bd),                                    \
       highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
 #endif
-
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 // ---------------------------------------------------------------------------
 // Low Bit Depth Tests
 
@@ -219,8 +224,8 @@
   lowbd_intrapred(v, sse2),       lowbd_intrapred(h, sse2),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, LowbdIntraPredTest,
-                        ::testing::ValuesIn(LowbdIntraPredTestVector));
+INSTANTIATE_TEST_SUITE_P(SSE2, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVector));
 
 #endif  // HAVE_SSE2
 
@@ -230,8 +235,8 @@
   lowbd_intrapred(smooth, ssse3),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3, LowbdIntraPredTest,
-                        ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
+INSTANTIATE_TEST_SUITE_P(SSSE3, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
 
 #endif  // HAVE_SSSE3
 
@@ -247,11 +252,12 @@
   lowbd_entry(paeth, 32, 16, avx2),   lowbd_entry(paeth, 32, 32, avx2),
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, LowbdIntraPredTest,
-                        ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
 
 #endif  // HAVE_AVX2
 
+#if CONFIG_AV1_HIGHBITDEPTH
 #if HAVE_NEON
 const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorNeon[] = {
   highbd_entry(dc, 4, 4, neon, 8),   highbd_entry(dc, 8, 8, neon, 8),
@@ -259,8 +265,9 @@
   highbd_entry(dc, 64, 64, neon, 8),
 };
 
-INSTANTIATE_TEST_CASE_P(NEON, HighbdIntraPredTest,
-                        ::testing::ValuesIn(HighbdIntraPredTestVectorNeon));
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
+                         ::testing::ValuesIn(HighbdIntraPredTestVectorNeon));
 
 #endif  // HAVE_NEON
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/invalid_file_test.cc b/libaom/test/invalid_file_test.cc
index 2a1c9a9..dd0956d 100644
--- a/libaom/test/invalid_file_test.cc
+++ b/libaom/test/invalid_file_test.cc
@@ -10,6 +10,7 @@
  */
 
 #include <cstdio>
+#include <ostream>
 #include <string>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -23,10 +24,20 @@
 struct DecodeParam {
   int threads;
   const char *filename;
+  const char *res_filename;  // If NULL, the result filename is
+                             // filename + ".res".
 };
 
+// Constructs result file name.
+std::string GetResFilename(const DecodeParam &param) {
+  if (param.res_filename != NULL) return param.res_filename;
+  const std::string filename = param.filename;
+  return filename + ".res";
+}
+
 std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
-  return os << "threads: " << dp.threads << " file: " << dp.filename;
+  return os << "threads: " << dp.threads << " file: " << dp.filename
+            << " result file: " << GetResFilename(dp);
 }
 
 class InvalidFileTest : public ::libaom_test::DecoderTest,
@@ -90,16 +101,16 @@
 
   void RunTest() {
     const DecodeParam input = GET_PARAM(1);
-    aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
     cfg.threads = input.threads;
     const std::string filename = input.filename;
     libaom_test::IVFVideoSource decode_video(filename);
     decode_video.Init();
 
-    // Construct result file name. The file holds a list of expected integer
-    // results, one for each decoded frame.  Any result that doesn't match
-    // the files list will cause a test failure.
-    const std::string res_filename = filename + ".res";
+    // The result file holds a list of expected integer results, one for each
+    // decoded frame.  Any result that doesn't match the file's list will
+    // cause a test failure.
+    const std::string res_filename = GetResFilename(input);
     OpenResFile(res_filename);
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
@@ -111,26 +122,35 @@
 
 TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
 
+// If res_filename (the third field) is NULL, then the result filename is
+// filename + ".res" by default. Set res_filename to a string if the result
+// filename differs from the default.
 const DecodeParam kAV1InvalidFileTests[] = {
-  { 1, "invalid-bug-1814.ivf" },
-  { 1, "invalid-chromium-906381.ivf" },
-  { 1, "invalid-google-142530197.ivf" },
-  { 1, "invalid-google-142530197-1.ivf" },
-  { 1, "invalid-oss-fuzz-9288.ivf" },
-  { 4, "invalid-oss-fuzz-9463.ivf" },
-  { 1, "invalid-oss-fuzz-9482.ivf" },
-  { 1, "invalid-oss-fuzz-9720.ivf" },
-  { 1, "invalid-oss-fuzz-10061.ivf" },
-  { 1, "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf" },
-  { 1, "invalid-oss-fuzz-10227.ivf" },
-  { 1, "invalid-oss-fuzz-10389.ivf" },
-  { 4, "invalid-oss-fuzz-10555.ivf" },
-  { 1, "invalid-oss-fuzz-10705.ivf" },
-  { 1, "invalid-oss-fuzz-10723.ivf" },
-  { 1, "invalid-oss-fuzz-10779.ivf" },
-  { 1, "invalid-oss-fuzz-11477.ivf" },
-  { 1, "invalid-oss-fuzz-11479.ivf" },
-  { 1, "invalid-oss-fuzz-11523.ivf" },
+  // { threads, filename, res_filename }
+  { 1, "invalid-bug-1814.ivf", NULL },
+  { 1, "invalid-chromium-906381.ivf", NULL },
+  { 1, "invalid-google-142530197.ivf", NULL },
+  { 1, "invalid-google-142530197-1.ivf", NULL },
+  { 4, "invalid-oss-fuzz-9463.ivf", "invalid-oss-fuzz-9463.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-9720.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-11523.ivf", "invalid-oss-fuzz-11523.ivf.res.2" },
+  { 4, "invalid-oss-fuzz-15363.ivf", NULL },
+  { 1, "invalid-oss-fuzz-16437.ivf", NULL },
+#if CONFIG_AV1_HIGHBITDEPTH
+  // These test vectors contain 10-bit or 12-bit video.
+  { 1, "invalid-oss-fuzz-9288.ivf", NULL },
+  { 1, "invalid-oss-fuzz-9482.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10061.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10227.ivf", NULL },
+  { 4, "invalid-oss-fuzz-10555.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10705.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10723.ivf", "invalid-oss-fuzz-10723.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-10779.ivf", NULL },
+  { 1, "invalid-oss-fuzz-11477.ivf", NULL },
+  { 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" },
+#endif
 };
 
 AV1_INSTANTIATE_TEST_CASE(InvalidFileTest,
diff --git a/libaom/test/level_test.cc b/libaom/test/level_test.cc
index e3b0ef1..a9613c5 100644
--- a/libaom/test/level_test.cc
+++ b/libaom/test/level_test.cc
@@ -14,11 +14,15 @@
 
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
 #include "test/yuv_video_source.h"
 
 namespace {
+const int kLevelMin = 0;
+const int kLevelMax = 31;
+const int kLevelKeepStats = 24;
 // Speed settings tested
 static const int kCpuUsedVectors[] = {
   1,
@@ -63,11 +67,16 @@
         encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
       }
     }
+
+    encoder->Control(AV1E_GET_SEQ_LEVEL_IDX, level_);
+    ASSERT_LE(level_[0], kLevelMax);
+    ASSERT_GE(level_[0], kLevelMin);
   }
 
   libaom_test::TestMode encoding_mode_;
   int cpu_used_;
   int target_level_;
+  int level_[32];
 };
 
 TEST_P(LevelTest, TestTargetLevelApi) {
@@ -79,14 +88,17 @@
   for (int operating_point = 0; operating_point <= 32; ++operating_point) {
     for (int level = 0; level <= 32; ++level) {
       const int target_level = operating_point * 100 + level;
-      if ((level >= 0 && level <= 23) || level == 31 || operating_point > 31) {
+      if ((level <= 24 && level != 2 && level != 3 && level != 6 &&
+           level != 7 && level != 10 && level != 11 && level != 20 &&
+           level != 21 && level != 22 && level != 23) ||
+          level == 31 || operating_point > 31) {
         EXPECT_EQ(AOM_CODEC_OK,
-                  aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX,
-                                    target_level));
+                  AOM_CODEC_CONTROL_TYPECHECKED(
+                      &enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
       } else {
         EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-                  aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX,
-                                    target_level));
+                  AOM_CODEC_CONTROL_TYPECHECKED(
+                      &enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
       }
     }
   }
@@ -102,6 +114,43 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
 
+TEST_P(LevelTest, TestLevelMonitoringLowBitrate) {
+  // To save run time, we only test speed 4.
+  if (cpu_used_ == 4) {
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+    target_level_ = kLevelKeepStats;
+    cfg_.rc_target_bitrate = 1000;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(level_[0], 0);
+  }
+}
+
+TEST_P(LevelTest, TestLevelMonitoringHighBitrate) {
+  // To save run time, we only test speed 4.
+  if (cpu_used_ == 4) {
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+    target_level_ = kLevelKeepStats;
+    cfg_.rc_target_bitrate = 4000;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(level_[0], 1);
+  }
+}
+
+TEST_P(LevelTest, TestTargetLevel0) {
+  // To save run time, we only test speed 4.
+  if (cpu_used_ == 4) {
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 50);
+    const int target_level = 0;
+    target_level_ = target_level;
+    cfg_.rc_target_bitrate = 4000;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(level_[0], target_level);
+  }
+}
+
 AV1_INSTANTIATE_TEST_CASE(LevelTest,
                           ::testing::Values(::libaom_test::kTwoPassGood),
                           ::testing::ValuesIn(kCpuUsedVectors));
diff --git a/libaom/test/lightfield_test.sh b/libaom/test/lightfield_test.sh
index 19b6934..3de88af 100755
--- a/libaom/test/lightfield_test.sh
+++ b/libaom/test/lightfield_test.sh
@@ -46,6 +46,22 @@
 
   [ -e "${lf_file}" ] || return 1
 
+  # Check to ensure all camera frames have the identical frame header. If not identical, this test fails.
+  for i in ./fh*; do
+    diff ./fh004 $i > /dev/null
+    if [ $? -eq 1 ]; then
+      return 1
+    fi
+  done
+
+  # Check to ensure all camera frames use the identical frame context. If not identical, this test fails.
+  for i in ./fc*; do
+    diff ./fc004 $i > /dev/null
+    if [ $? -eq 1 ]; then
+      return 1
+    fi
+  done
+
   # Parse lightfield bitstream to construct and output a new bitstream that can
   # be decoded by an AV1 decoder.
   local bs_decoder="${LIBAOM_BIN_PATH}/lightfield_bitstream_parsing${AOM_TEST_EXE_SUFFIX}"
diff --git a/libaom/test/lpf_test.cc b/libaom/test/lpf_test.cc
index 451bffd..e8eeceb 100644
--- a/libaom/test/lpf_test.cc
+++ b/libaom/test/lpf_test.cc
@@ -12,6 +12,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <string>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -50,11 +51,11 @@
 typedef void (*hbdloop_op_t)(uint16_t *s, LOOP_PARAM, int bd);
 typedef void (*hbddual_loop_op_t)(uint16_t *s, DUAL_LOOP_PARAM, int bd);
 
-typedef ::testing::tuple<hbdloop_op_t, hbdloop_op_t, int> hbdloop_param_t;
-typedef ::testing::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>
+typedef std::tuple<hbdloop_op_t, hbdloop_op_t, int> hbdloop_param_t;
+typedef std::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>
     hbddual_loop_param_t;
-typedef ::testing::tuple<loop_op_t, loop_op_t, int> loop_param_t;
-typedef ::testing::tuple<dual_loop_op_t, dual_loop_op_t, int> dual_loop_param_t;
+typedef std::tuple<loop_op_t, loop_op_t, int> loop_param_t;
+typedef std::tuple<dual_loop_op_t, dual_loop_op_t, int> dual_loop_param_t;
 
 template <typename Pixel_t, int PIXEL_WIDTH_t>
 void InitInput(Pixel_t *s, Pixel_t *ref_s, ACMRandom *rnd, const uint8_t limit,
@@ -72,9 +73,9 @@
         if (j < 1) {
           tmp_s[j] = rnd->Rand16();
         } else if (val & 0x20) {  // Increment by a value within the limit.
-          tmp_s[j] = tmp_s[j - 1] + (limit - 1);
+          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] + (limit - 1));
         } else {  // Decrement by a value within the limit.
-          tmp_s[j] = tmp_s[j - 1] - (limit - 1);
+          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] - (limit - 1));
         }
         j++;
       }
@@ -91,11 +92,11 @@
         if (j < 1) {
           tmp_s[j] = rnd->Rand16();
         } else if (val & 0x20) {  // Increment by a value within the limit.
-          tmp_s[(j % 32) * 32 + j / 32] =
-              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1);
+          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1));
         } else {  // Decrement by a value within the limit.
-          tmp_s[(j % 32) * 32 + j / 32] =
-              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1);
+          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1));
         }
         j++;
       }
@@ -129,9 +130,9 @@
  public:
   virtual ~LoopTestParam() {}
   virtual void SetUp() {
-    loopfilter_op_ = ::testing::get<0>(this->GetParam());
-    ref_loopfilter_op_ = ::testing::get<1>(this->GetParam());
-    bit_depth_ = ::testing::get<2>(this->GetParam());
+    loopfilter_op_ = std::get<0>(this->GetParam());
+    ref_loopfilter_op_ = std::get<1>(this->GetParam());
+    bit_depth_ = std::get<2>(this->GetParam());
     mask_ = (1 << bit_depth_) - 1;
   }
 
@@ -144,26 +145,30 @@
   func_type_t ref_loopfilter_op_;
 };
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void call_filter(uint16_t *s, LOOP_PARAM, int bd, hbdloop_op_t op) {
   op(s, p, blimit, limit, thresh, bd);
 }
-void call_filter(uint8_t *s, LOOP_PARAM, int bd, loop_op_t op) {
-  (void)bd;
-  op(s, p, blimit, limit, thresh);
-}
 void call_dualfilter(uint16_t *s, DUAL_LOOP_PARAM, int bd,
                      hbddual_loop_op_t op) {
   op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
 }
+#endif
+void call_filter(uint8_t *s, LOOP_PARAM, int bd, loop_op_t op) {
+  (void)bd;
+  op(s, p, blimit, limit, thresh);
+}
 void call_dualfilter(uint8_t *s, DUAL_LOOP_PARAM, int bd, dual_loop_op_t op) {
   (void)bd;
   op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
 };
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
-typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
 typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
     Loop8Test9Param_hbd;
+#endif
+typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
 typedef LoopTestParam<dual_loop_op_t, dual_loop_param_t> Loop8Test9Param_lbd;
 
 #define OPCHECK(a, b)                                                          \
@@ -206,7 +211,9 @@
          "loopfilter output. "                                                 \
       << "First failed at test case " << first_failure;
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
+#endif
 TEST_P(Loop8Test6Param_lbd, OperationCheck) { OPCHECK(uint8_t, 8); }
 
 #define VALCHECK(a, b)                                                         \
@@ -252,7 +259,9 @@
          "loopfilter output. "                                                 \
       << "First failed at test case " << first_failure;
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
+#endif
 TEST_P(Loop8Test6Param_lbd, ValueCheck) { VALCHECK(uint8_t, 8); }
 
 #define SPEEDCHECK(a, b)                                                      \
@@ -280,7 +289,9 @@
     call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, loopfilter_op_); \
   }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
+#endif
 TEST_P(Loop8Test6Param_lbd, DISABLED_Speed) { SPEEDCHECK(uint8_t, 8); }
 
 #define OPCHECKd(a, b)                                                         \
@@ -337,7 +348,9 @@
          "loopfilter output. "                                                 \
       << "First failed at test case " << first_failure;
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
+#endif
 TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); }
 
 #define VALCHECKd(a, b)                                                        \
@@ -396,7 +409,9 @@
          "loopfilter output. "                                                 \
       << "First failed at test case " << first_failure;
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
+#endif
 TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); }
 
 #define SPEEDCHECKd(a, b)                                                    \
@@ -436,13 +451,15 @@
                     limit1, thresh1, bit_depth_, loopfilter_op_);            \
   }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); }
+#endif
 TEST_P(Loop8Test9Param_lbd, DISABLED_Speed) { SPEEDCHECKd(uint8_t, 8); }
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSE2
-
+#if CONFIG_AV1_HIGHBITDEPTH
 const hbdloop_param_t kHbdLoop8Test6[] = {
   make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
              8),
@@ -486,8 +503,9 @@
   make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_hbd,
-                        ::testing::ValuesIn(kHbdLoop8Test6));
+INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_hbd,
+                         ::testing::ValuesIn(kHbdLoop8Test6));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 const loop_param_t kLoop8Test6[] = {
   make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
@@ -500,8 +518,8 @@
   make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,
-                        ::testing::ValuesIn(kLoop8Test6));
+INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_lbd,
+                         ::testing::ValuesIn(kLoop8Test6));
 
 const dual_loop_param_t kLoop8Test9[] = {
   make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
@@ -515,12 +533,12 @@
   make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_lbd,
-                        ::testing::ValuesIn(kLoop8Test9));
+INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_lbd,
+                         ::testing::ValuesIn(kLoop8Test9));
 
 #endif  // HAVE_SSE2
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH
 const hbddual_loop_param_t kHbdLoop8Test9[] = {
   make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
              &aom_highbd_lpf_horizontal_4_dual_c, 8),
@@ -572,10 +590,10 @@
              &aom_highbd_lpf_vertical_14_dual_c, 12),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_hbd,
-                        ::testing::ValuesIn(kHbdLoop8Test9));
+INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_hbd,
+                         ::testing::ValuesIn(kHbdLoop8Test9));
 
-#endif  // HAVE_SSE2
+#endif  // HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_NEON
 const loop_param_t kLoop8Test6[] = {
@@ -589,11 +607,11 @@
   make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8)
 };
 
-INSTANTIATE_TEST_CASE_P(NEON, Loop8Test6Param_lbd,
-                        ::testing::ValuesIn(kLoop8Test6));
+INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test6Param_lbd,
+                         ::testing::ValuesIn(kLoop8Test6));
 #endif  // HAVE_NEON
 
-#if HAVE_AVX2
+#if HAVE_AVX2 && CONFIG_AV1_HIGHBITDEPTH
 const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = {
   make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
              &aom_highbd_lpf_horizontal_4_dual_c, 8),
@@ -621,7 +639,7 @@
              &aom_highbd_lpf_vertical_8_dual_c, 12),
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test9Param_hbd,
-                        ::testing::ValuesIn(kHbdLoop8Test9Avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test9Param_hbd,
+                         ::testing::ValuesIn(kHbdLoop8Test9Avx2));
 #endif
 }  // namespace
diff --git a/libaom/test/masked_sad_test.cc b/libaom/test/masked_sad_test.cc
index 311f187..aa4dd83 100644
--- a/libaom/test/masked_sad_test.cc
+++ b/libaom/test/masked_sad_test.cc
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
@@ -33,9 +34,37 @@
                                       const uint8_t *second_pred,
                                       const uint8_t *msk, int msk_stride,
                                       int invert_mask);
-typedef ::testing::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
+typedef std::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
 
-class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
+typedef void (*MaskedSADx4Func)(const uint8_t *src, int src_stride,
+                                const uint8_t *ref[], int ref_stride,
+                                const uint8_t *second_pred, const uint8_t *msk,
+                                int msk_stride, int invert_mask,
+                                unsigned sads[]);
+
+typedef std::tuple<MaskedSADx4Func, MaskedSADx4Func> MaskedSADx4Param;
+
+class MaskedSADTestBase : public ::testing::Test {
+ public:
+  virtual ~MaskedSADTestBase() {}
+  virtual void SetUp() = 0;
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[],
+                      int times) = 0;
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times) = 0;
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  void runMaskedSADTest(int run_times);
+};
+
+class MaskedSADTest : public MaskedSADTestBase,
+                      public ::testing::WithParamInterface<MaskedSADParam> {
  public:
   virtual ~MaskedSADTest() {}
   virtual void SetUp() {
@@ -43,20 +72,113 @@
     ref_maskedSAD_op_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-  void runMaskedSADTest(int run_times);
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[], int times);
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times);
 
  protected:
   MaskedSADFunc maskedSAD_op_;
   MaskedSADFunc ref_maskedSAD_op_;
 };
-void MaskedSADTest::runMaskedSADTest(int run_times) {
-  unsigned int ref_ret = 0, ret = 1;
+
+class MaskedSADx4Test : public MaskedSADTestBase,
+                        public ::testing::WithParamInterface<MaskedSADx4Param> {
+ public:
+  virtual ~MaskedSADx4Test() {}
+  virtual void SetUp() {
+    maskedSAD_op_ = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[], int times);
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times);
+
+ protected:
+  MaskedSADx4Func maskedSAD_op_;
+  MaskedSADx4Func ref_maskedSAD_op_;
+};
+
+void MaskedSADTest::runRef(const uint8_t *src_ptr, int src_stride,
+                           const uint8_t *ref_ptr[], int ref_stride,
+                           const uint8_t *second_pred, const uint8_t *msk,
+                           int msk_stride, int invert_mask, unsigned sads[],
+                           int times) {
+  for (int repeat = 0; repeat < times; ++repeat) {
+    sads[0] = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
+                                second_pred, msk, msk_stride, invert_mask);
+  }
+}
+
+void MaskedSADTest::runTest(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *ref_ptr[], int ref_stride,
+                            const uint8_t *second_pred, const uint8_t *msk,
+                            int msk_stride, int invert_mask, unsigned sads[],
+                            int times) {
+  if (times == 1) {
+    sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
+                            second_pred, msk, msk_stride, invert_mask);
+  } else {
+    for (int repeat = 0; repeat < times; ++repeat) {
+      ASM_REGISTER_STATE_CHECK(
+          sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
+                                  second_pred, msk, msk_stride, invert_mask));
+    }
+  }
+}
+
+void MaskedSADx4Test::runRef(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr[], int ref_stride,
+                             const uint8_t *second_pred, const uint8_t *msk,
+                             int msk_stride, int invert_mask, unsigned sads[],
+                             int times) {
+  for (int repeat = 0; repeat < times; ++repeat) {
+    ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred,
+                      msk, msk_stride, invert_mask, sads);
+  }
+}
+
+void MaskedSADx4Test::runTest(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr[], int ref_stride,
+                              const uint8_t *second_pred, const uint8_t *msk,
+                              int msk_stride, int invert_mask, unsigned sads[],
+                              int times) {
+  if (times == 1) {
+    ASM_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr,
+                                           ref_stride, second_pred, msk,
+                                           msk_stride, invert_mask, sads));
+  } else {
+    for (int repeat = 0; repeat < times; ++repeat) {
+      maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, msk,
+                    msk_stride, invert_mask, sads);
+    }
+  }
+}
+
+void MaskedSADTestBase::runMaskedSADTest(int run_times) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const unsigned kBlockSize = MAX_SB_SIZE * MAX_SB_SIZE;
   DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE * 4]);
   DECLARE_ALIGNED(16, uint8_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+
+  const uint8_t *refs[] = { ref_ptr, ref_ptr + kBlockSize,
+                            ref_ptr + 2 * kBlockSize,
+                            ref_ptr + 3 * kBlockSize };
+  unsigned sads[] = { 0, 0, 0, 0 };
+  unsigned sads_ref[] = { 0, 0, 0, 0 };
   int err_count = 0;
   int first_failure = -1;
   int src_stride = MAX_SB_SIZE;
@@ -67,6 +189,9 @@
     for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
+      (ref_ptr + kBlockSize)[j] = rnd.Rand8();
+      (ref_ptr + 2 * kBlockSize)[j] = rnd.Rand8();
+      (ref_ptr + 3 * kBlockSize)[j] = rnd.Rand8();
       second_pred_ptr[j] = rnd.Rand8();
       msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
       assert(msk_ptr[j] <= 64);
@@ -75,33 +200,23 @@
     for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
       aom_usec_timer timer;
       aom_usec_timer_start(&timer);
-      for (int repeat = 0; repeat < run_times; ++repeat) {
-        ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
-                                    second_pred_ptr, msk_ptr, msk_stride,
-                                    invert_mask);
-      }
+      runRef(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr,
+             msk_stride, invert_mask, sads_ref, run_times);
       aom_usec_timer_mark(&timer);
       const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
       aom_usec_timer_start(&timer);
-      if (run_times == 1) {
-        ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride,
-                                                     ref_ptr, ref_stride,
-                                                     second_pred_ptr, msk_ptr,
-                                                     msk_stride, invert_mask));
-      } else {
-        for (int repeat = 0; repeat < run_times; ++repeat) {
-          ret =
-              maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
-                            second_pred_ptr, msk_ptr, msk_stride, invert_mask);
-        }
-      }
+      runTest(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr,
+              msk_stride, invert_mask, sads, run_times);
       aom_usec_timer_mark(&timer);
       const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
       if (run_times > 10) {
         printf("%7.2f/%7.2fns", time1, time2);
         printf("(%3.2f)\n", time1 / time2);
       }
-      if (ret != ref_ret) {
+      if (sads_ref[0] != sads[0] || sads_ref[1] != sads[1] ||
+          sads_ref[2] != sads[2] || sads_ref[3] != sads[3]) {
         err_count++;
         if (first_failure == -1) first_failure = i;
       }
@@ -115,12 +230,17 @@
 
 TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); }
 
+TEST_P(MaskedSADx4Test, OperationCheck) { runMaskedSADTest(1); }
+
+TEST_P(MaskedSADx4Test, DISABLED_Speed) { runMaskedSADTest(2000000); }
+
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             const uint8_t *second_pred,
                                             const uint8_t *msk, int msk_stride,
                                             int invert_mask);
-typedef ::testing::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
+typedef std::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
     HighbdMaskedSADParam;
 
 class HighbdMaskedSADTest
@@ -206,8 +326,9 @@
 TEST_P(HighbdMaskedSADTest, OperationCheck) { runHighbdMaskedSADTest(1); }
 
 TEST_P(HighbdMaskedSADTest, DISABLED_Speed) { runHighbdMaskedSADTest(1000000); }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSSE3
 const MaskedSADParam msad_test[] = {
@@ -235,8 +356,37 @@
   make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
+INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
 
+const MaskedSADx4Param msadx4_test[] = {
+  make_tuple(&aom_masked_sad4x4x4d_ssse3, &aom_masked_sad4x4x4d_c),
+  make_tuple(&aom_masked_sad4x8x4d_ssse3, &aom_masked_sad4x8x4d_c),
+  make_tuple(&aom_masked_sad8x4x4d_ssse3, &aom_masked_sad8x4x4d_c),
+  make_tuple(&aom_masked_sad8x8x4d_ssse3, &aom_masked_sad8x8x4d_c),
+  make_tuple(&aom_masked_sad8x16x4d_ssse3, &aom_masked_sad8x16x4d_c),
+  make_tuple(&aom_masked_sad16x8x4d_ssse3, &aom_masked_sad16x8x4d_c),
+  make_tuple(&aom_masked_sad16x16x4d_ssse3, &aom_masked_sad16x16x4d_c),
+  make_tuple(&aom_masked_sad16x32x4d_ssse3, &aom_masked_sad16x32x4d_c),
+  make_tuple(&aom_masked_sad32x16x4d_ssse3, &aom_masked_sad32x16x4d_c),
+  make_tuple(&aom_masked_sad32x32x4d_ssse3, &aom_masked_sad32x32x4d_c),
+  make_tuple(&aom_masked_sad32x64x4d_ssse3, &aom_masked_sad32x64x4d_c),
+  make_tuple(&aom_masked_sad64x32x4d_ssse3, &aom_masked_sad64x32x4d_c),
+  make_tuple(&aom_masked_sad64x64x4d_ssse3, &aom_masked_sad64x64x4d_c),
+  make_tuple(&aom_masked_sad64x128x4d_ssse3, &aom_masked_sad64x128x4d_c),
+  make_tuple(&aom_masked_sad128x64x4d_ssse3, &aom_masked_sad128x64x4d_c),
+  make_tuple(&aom_masked_sad128x128x4d_ssse3, &aom_masked_sad128x128x4d_c),
+  make_tuple(&aom_masked_sad4x16x4d_ssse3, &aom_masked_sad4x16x4d_c),
+  make_tuple(&aom_masked_sad16x4x4d_ssse3, &aom_masked_sad16x4x4d_c),
+  make_tuple(&aom_masked_sad8x32x4d_ssse3, &aom_masked_sad8x32x4d_c),
+  make_tuple(&aom_masked_sad32x8x4d_ssse3, &aom_masked_sad32x8x4d_c),
+  make_tuple(&aom_masked_sad16x64x4d_ssse3, &aom_masked_sad16x64x4d_c),
+  make_tuple(&aom_masked_sad64x16x4d_ssse3, &aom_masked_sad64x16x4d_c),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADx4Test,
+                         ::testing::ValuesIn(msadx4_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
 const HighbdMaskedSADParam hbd_msad_test[] = {
   make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c),
   make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c),
@@ -265,8 +415,9 @@
   make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3, HighbdMaskedSADTest,
-                        ::testing::ValuesIn(hbd_msad_test));
+INSTANTIATE_TEST_SUITE_P(SSSE3, HighbdMaskedSADTest,
+                         ::testing::ValuesIn(hbd_msad_test));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
@@ -295,9 +446,10 @@
   make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3)
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, MaskedSADTest,
-                        ::testing::ValuesIn(msad_avx2_test));
+INSTANTIATE_TEST_SUITE_P(AVX2, MaskedSADTest,
+                         ::testing::ValuesIn(msad_avx2_test));
 
+#if CONFIG_AV1_HIGHBITDEPTH
 const HighbdMaskedSADParam hbd_msad_avx2_test[] = {
   make_tuple(&aom_highbd_masked_sad4x4_avx2, &aom_highbd_masked_sad4x4_ssse3),
   make_tuple(&aom_highbd_masked_sad4x8_avx2, &aom_highbd_masked_sad4x8_ssse3),
@@ -335,8 +487,9 @@
              &aom_highbd_masked_sad64x16_ssse3)
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, HighbdMaskedSADTest,
-                        ::testing::ValuesIn(hbd_msad_avx2_test));
+INSTANTIATE_TEST_SUITE_P(AVX2, HighbdMaskedSADTest,
+                         ::testing::ValuesIn(hbd_msad_avx2_test));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
 }  // namespace
diff --git a/libaom/test/masked_variance_test.cc b/libaom/test/masked_variance_test.cc
index 275b9fe..bf814ce 100644
--- a/libaom/test/masked_variance_test.cc
+++ b/libaom/test/masked_variance_test.cc
@@ -12,6 +12,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
@@ -37,7 +38,7 @@
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
 
-typedef ::testing::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
+typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
     MaskedSubPixelVarianceParam;
 
 class MaskedSubPixelVarianceTest
@@ -170,8 +171,9 @@
                           << " y_offset = " << first_failure_y;
 }
 
-typedef ::testing::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
-                         aom_bit_depth_t>
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
+                   aom_bit_depth_t>
     HighbdMaskedSubPixelVarianceParam;
 
 class HighbdMaskedSubPixelVarianceTest
@@ -311,8 +313,9 @@
                           << " x_offset = " << first_failure_x
                           << " y_offset = " << first_failure_y;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSSE3
 
@@ -348,12 +351,26 @@
   make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3,
              &aom_masked_sub_pixel_variance4x8_c),
   make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
-             &aom_masked_sub_pixel_variance4x4_c)
+             &aom_masked_sub_pixel_variance4x4_c),
+
+  make_tuple(&aom_masked_sub_pixel_variance64x16_ssse3,
+             &aom_masked_sub_pixel_variance64x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x64_ssse3,
+             &aom_masked_sub_pixel_variance16x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x8_ssse3,
+             &aom_masked_sub_pixel_variance32x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x32_ssse3,
+             &aom_masked_sub_pixel_variance8x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x4_ssse3,
+             &aom_masked_sub_pixel_variance16x4_c),
+  make_tuple(&aom_masked_sub_pixel_variance4x16_ssse3,
+             &aom_masked_sub_pixel_variance4x16_c),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
-                        ::testing::ValuesIn(sub_pel_var_test));
+INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
+                         ::testing::ValuesIn(sub_pel_var_test));
 
+#if CONFIG_AV1_HIGHBITDEPTH
 const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
@@ -450,10 +467,48 @@
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3,
-             &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12)
+             &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12),
+
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_ssse3,
+             &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_ssse3,
+             &aom_highbd_8_masked_sub_pixel_variance16x64_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x8_ssse3,
+             &aom_highbd_8_masked_sub_pixel_variance32x8_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x32_ssse3,
+             &aom_highbd_8_masked_sub_pixel_variance8x32_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x4_ssse3,
+             &aom_highbd_8_masked_sub_pixel_variance16x4_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x16_ssse3,
+             &aom_highbd_8_masked_sub_pixel_variance4x16_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x16_ssse3,
+             &aom_highbd_10_masked_sub_pixel_variance64x16_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x64_ssse3,
+             &aom_highbd_10_masked_sub_pixel_variance16x64_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x8_ssse3,
+             &aom_highbd_10_masked_sub_pixel_variance32x8_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x32_ssse3,
+             &aom_highbd_10_masked_sub_pixel_variance8x32_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x4_ssse3,
+             &aom_highbd_10_masked_sub_pixel_variance16x4_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x16_ssse3,
+             &aom_highbd_10_masked_sub_pixel_variance4x16_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x16_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance64x16_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x64_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance16x64_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x8_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance32x8_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x32_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance8x32_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x4_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
-                        ::testing::ValuesIn(hbd_sub_pel_var_test));
+INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+                         ::testing::ValuesIn(hbd_sub_pel_var_test));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_SSSE3
 }  // namespace
diff --git a/libaom/test/metadata_test.cc b/libaom/test/metadata_test.cc
new file mode 100644
index 0000000..79e08a7
--- /dev/null
+++ b/libaom/test/metadata_test.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/bitstream.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+const size_t kMetadataPayloadSizeT35 = 24;
+// 0xB5 stands for the itut t35 metadata country code for the Unites States
+const uint8_t kMetadataPayloadT35[kMetadataPayloadSizeT35] = {
+  0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
+  0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+};
+
+const size_t kMetadataPayloadSizeCll = 4;
+const uint8_t kMetadataPayloadCll[kMetadataPayloadSizeCll] = { 0xB5, 0x01, 0x02,
+                                                               0x03 };
+
+#if CONFIG_AV1_ENCODER
+
+const size_t kMetadataObuSizeT35 = 28;
+const uint8_t kMetadataObuT35[kMetadataObuSizeT35] = {
+  0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+  0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x80
+};
+const size_t kMetadataObuSizeMdcv = 28;
+const uint8_t kMetadataObuMdcv[kMetadataObuSizeMdcv] = {
+  0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+  0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x80
+};
+const size_t kMetadataObuSizeCll = 8;
+const uint8_t kMetadataObuCll[kMetadataObuSizeCll] = { 0x2A, 0x06, 0x01, 0xB5,
+                                                       0x01, 0x02, 0x03, 0x80 };
+
+class MetadataEncodeTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MetadataEncodeTest() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual ~MetadataEncodeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video) {
+    aom_image_t *current_frame = video->img();
+    if (current_frame) {
+      if (current_frame->metadata) aom_img_remove_metadata(current_frame);
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     kMetadataPayloadT35, 0, AOM_MIF_ANY_FRAME),
+                -1);
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35, NULL,
+                               kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME),
+          -1);
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     NULL, 0, AOM_MIF_ANY_FRAME),
+                -1);
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                               kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                               AOM_MIF_ANY_FRAME),
+          0);
+
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_HDR_MDCV,
+                               kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                               AOM_MIF_KEY_FRAME),
+          0);
+
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_HDR_CLL,
+                               kMetadataPayloadCll, kMetadataPayloadSizeCll,
+                               AOM_MIF_KEY_FRAME),
+          0);
+    }
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const size_t bitstream_size = pkt->data.frame.sz;
+      const uint8_t *bitstream =
+          static_cast<const uint8_t *>(pkt->data.frame.buf);
+      // Look for valid metadata OBUs in the bitstream.
+      bool itut_t35_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeT35) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeT35; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuT35, kMetadataObuSizeT35) ==
+              0) {
+            itut_t35_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_EQ(itut_t35_metadata_found, 1u);
+
+      // Testing for HDR MDCV metadata
+      bool hdr_mdcv_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeMdcv) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeMdcv; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuMdcv, kMetadataObuSizeMdcv) ==
+              0) {
+            hdr_mdcv_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_TRUE(hdr_mdcv_metadata_found);
+
+      // Testing for HDR CLL metadata
+      bool hdr_cll_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeCll) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeCll; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuCll, kMetadataObuSizeCll) ==
+              0) {
+            hdr_cll_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_TRUE(hdr_cll_metadata_found);
+    }
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t /*pts*/) {
+    ASSERT_TRUE(img.metadata != nullptr);
+
+    ASSERT_EQ(img.metadata->sz, 3u);
+
+    for (size_t i = 0; i < img.metadata->sz - 1; ++i) {
+      ASSERT_EQ(kMetadataPayloadSizeT35, img.metadata->metadata_array[i]->sz);
+      EXPECT_EQ(
+          memcmp(kMetadataPayloadT35, img.metadata->metadata_array[i]->payload,
+                 kMetadataPayloadSizeT35),
+          0);
+    }
+
+    ASSERT_EQ(kMetadataPayloadSizeCll, img.metadata->metadata_array[2]->sz);
+    EXPECT_EQ(
+        memcmp(kMetadataPayloadCll, img.metadata->metadata_array[2]->payload,
+               kMetadataPayloadSizeCll),
+        0);
+  }
+};
+
+TEST_P(MetadataEncodeTest, TestMetadataEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 600;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.kf_mode = AOM_KF_AUTO;
+  cfg_.g_lag_in_frames = 1;
+  cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+  // Enable dropped frames.
+  cfg_.rc_dropframe_thresh = 1;
+  // Disable error_resilience mode.
+  cfg_.g_error_resilient = 0;
+  // Run at low bitrate.
+  cfg_.rc_target_bitrate = 40;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_CASE(MetadataEncodeTest,
+                          ::testing::Values(::libaom_test::kOnePassGood));
+
+#endif  // CONFIG_AV1_ENCODER
+}  // namespace
+
+TEST(MetadataTest, MetadataAllocation) {
+  aom_metadata_t *metadata =
+      aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+                             kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  ASSERT_NE(metadata, nullptr);
+  aom_img_metadata_free(metadata);
+}
+
+TEST(MetadataTest, MetadataArrayAllocation) {
+  aom_metadata_array_t *metadata_array = aom_img_metadata_array_alloc(2);
+  ASSERT_NE(metadata_array, nullptr);
+
+  metadata_array->metadata_array[0] =
+      aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+                             kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  metadata_array->metadata_array[1] =
+      aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+                             kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+
+  aom_img_metadata_array_free(metadata_array);
+}
+
+TEST(MetadataTest, AddMetadataToImage) {
+  aom_image_t image;
+  image.metadata = NULL;
+
+  ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35,
+                                 kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                                 AOM_MIF_ANY_FRAME),
+            0);
+  aom_img_metadata_array_free(image.metadata);
+  EXPECT_EQ(aom_img_add_metadata(NULL, OBU_METADATA_TYPE_ITUT_T35,
+                                 kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                                 AOM_MIF_ANY_FRAME),
+            -1);
+}
+
+TEST(MetadataTest, RemoveMetadataFromImage) {
+  aom_image_t image;
+  image.metadata = NULL;
+
+  ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35,
+                                 kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                                 AOM_MIF_ANY_FRAME),
+            0);
+  aom_img_remove_metadata(&image);
+  aom_img_remove_metadata(NULL);
+}
+
+TEST(MetadataTest, CopyMetadataToFrameBuffer) {
+  YV12_BUFFER_CONFIG yvBuf;
+  yvBuf.metadata = NULL;
+
+  aom_metadata_array_t *metadata_array = aom_img_metadata_array_alloc(1);
+  ASSERT_NE(metadata_array, nullptr);
+
+  metadata_array->metadata_array[0] =
+      aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+                             kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+
+  // Metadata_array
+  int status = aom_copy_metadata_to_frame_buffer(&yvBuf, metadata_array);
+  EXPECT_EQ(status, 0);
+  status = aom_copy_metadata_to_frame_buffer(NULL, metadata_array);
+  EXPECT_EQ(status, -1);
+  aom_img_metadata_array_free(metadata_array);
+
+  // Metadata_array_2
+  aom_metadata_array_t *metadata_array_2 = aom_img_metadata_array_alloc(0);
+  ASSERT_NE(metadata_array_2, nullptr);
+  status = aom_copy_metadata_to_frame_buffer(&yvBuf, metadata_array_2);
+  EXPECT_EQ(status, -1);
+  aom_img_metadata_array_free(metadata_array_2);
+
+  // YV12_BUFFER_CONFIG
+  status = aom_copy_metadata_to_frame_buffer(&yvBuf, NULL);
+  EXPECT_EQ(status, -1);
+  aom_remove_metadata_from_frame_buffer(&yvBuf);
+  aom_remove_metadata_from_frame_buffer(NULL);
+}
+
+TEST(MetadataTest, GetMetadataFromImage) {
+  aom_image_t image;
+  image.metadata = NULL;
+
+  ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35,
+                                 kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                                 AOM_MIF_ANY_FRAME),
+            0);
+
+  EXPECT_TRUE(aom_img_get_metadata(NULL, 0) == NULL);
+  EXPECT_TRUE(aom_img_get_metadata(&image, 1u) == NULL);
+  EXPECT_TRUE(aom_img_get_metadata(&image, 10u) == NULL);
+
+  const aom_metadata_t *metadata = aom_img_get_metadata(&image, 0);
+  ASSERT_TRUE(metadata != NULL);
+  ASSERT_EQ(metadata->sz, kMetadataPayloadSizeT35);
+  EXPECT_EQ(
+      memcmp(kMetadataPayloadT35, metadata->payload, kMetadataPayloadSizeT35),
+      0);
+
+  aom_img_metadata_array_free(image.metadata);
+}
+
+TEST(MetadataTest, ReadMetadatasFromImage) {
+  aom_image_t image;
+  image.metadata = NULL;
+
+  uint32_t types[3];
+  types[0] = OBU_METADATA_TYPE_ITUT_T35;
+  types[1] = OBU_METADATA_TYPE_HDR_CLL;
+  types[2] = OBU_METADATA_TYPE_HDR_MDCV;
+
+  ASSERT_EQ(aom_img_add_metadata(&image, types[0], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME),
+            0);
+  ASSERT_EQ(aom_img_add_metadata(&image, types[1], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_KEY_FRAME),
+            0);
+  ASSERT_EQ(aom_img_add_metadata(&image, types[2], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_KEY_FRAME),
+            0);
+
+  size_t number_metadata = aom_img_num_metadata(&image);
+  ASSERT_EQ(number_metadata, 3u);
+  for (size_t i = 0; i < number_metadata; ++i) {
+    const aom_metadata_t *metadata = aom_img_get_metadata(&image, i);
+    ASSERT_TRUE(metadata != NULL);
+    ASSERT_EQ(metadata->type, types[i]);
+    ASSERT_EQ(metadata->sz, kMetadataPayloadSizeT35);
+    EXPECT_EQ(
+        memcmp(kMetadataPayloadT35, metadata->payload, kMetadataPayloadSizeT35),
+        0);
+  }
+  aom_img_metadata_array_free(image.metadata);
+}
diff --git a/libaom/test/noise_model_test.cc b/libaom/test/noise_model_test.cc
index b5b387e..5b61236 100644
--- a/libaom/test/noise_model_test.cc
+++ b/libaom/test/noise_model_test.cc
@@ -343,7 +343,7 @@
   libaom_test::ACMRandom random_;
 };
 
-TYPED_TEST_CASE_P(FlatBlockEstimatorTest);
+TYPED_TEST_SUITE_P(FlatBlockEstimatorTest);
 
 TYPED_TEST_P(FlatBlockEstimatorTest, ExtractBlock) {
   const int kBlockSize = 16;
@@ -494,16 +494,16 @@
   aom_flat_block_finder_free(&flat_block_finder);
 }
 
-REGISTER_TYPED_TEST_CASE_P(FlatBlockEstimatorTest, ExtractBlock,
-                           FindFlatBlocks);
+REGISTER_TYPED_TEST_SUITE_P(FlatBlockEstimatorTest, ExtractBlock,
+                            FindFlatBlocks);
 
 typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>,   // lowbd
                          BitDepthParams<uint16_t, 8, true>,   // lowbd in 16-bit
                          BitDepthParams<uint16_t, 10, true>,  // highbd data
                          BitDepthParams<uint16_t, 12, true> >
     AllBitDepthParams;
-INSTANTIATE_TYPED_TEST_CASE_P(FlatBlockInstatiation, FlatBlockEstimatorTest,
-                              AllBitDepthParams);
+INSTANTIATE_TYPED_TEST_SUITE_P(FlatBlockInstatiation, FlatBlockEstimatorTest,
+                               AllBitDepthParams);
 
 template <typename T>
 class NoiseModelUpdateTest : public ::testing::Test, public T {
@@ -570,7 +570,7 @@
   uint8_t *denoised_ptr_raw_[3];
 };
 
-TYPED_TEST_CASE_P(NoiseModelUpdateTest);
+TYPED_TEST_SUITE_P(NoiseModelUpdateTest);
 
 TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks) {
   EXPECT_EQ(AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
@@ -929,17 +929,17 @@
   }
   EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
 }
-REGISTER_TYPED_TEST_CASE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks,
-                           UpdateSuccessForZeroNoiseAllFlat,
-                           UpdateFailsBlockSizeTooSmall,
-                           UpdateSuccessForWhiteRandomNoise,
-                           UpdateSuccessForScaledWhiteNoise,
-                           UpdateSuccessForCorrelatedNoise,
-                           NoiseStrengthChangeSignalsDifferentNoiseType,
-                           NoiseCoeffsSignalsDifferentNoiseType);
+REGISTER_TYPED_TEST_SUITE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks,
+                            UpdateSuccessForZeroNoiseAllFlat,
+                            UpdateFailsBlockSizeTooSmall,
+                            UpdateSuccessForWhiteRandomNoise,
+                            UpdateSuccessForScaledWhiteNoise,
+                            UpdateSuccessForCorrelatedNoise,
+                            NoiseStrengthChangeSignalsDifferentNoiseType,
+                            NoiseCoeffsSignalsDifferentNoiseType);
 
-INSTANTIATE_TYPED_TEST_CASE_P(NoiseModelUpdateTestInstatiation,
-                              NoiseModelUpdateTest, AllBitDepthParams);
+INSTANTIATE_TYPED_TEST_SUITE_P(NoiseModelUpdateTestInstatiation,
+                               NoiseModelUpdateTest, AllBitDepthParams);
 
 TEST(NoiseModelGetGrainParameters, TestLagSize) {
   aom_film_grain_t film_grain;
@@ -1229,7 +1229,7 @@
   int stride_[3];
 };
 
-TYPED_TEST_CASE_P(WienerDenoiseTest);
+TYPED_TEST_SUITE_P(WienerDenoiseTest);
 
 TYPED_TEST_P(WienerDenoiseTest, InvalidBlockSize) {
   const uint8_t *const data_ptrs[3] = {
@@ -1336,8 +1336,8 @@
   }
 }
 
-REGISTER_TYPED_TEST_CASE_P(WienerDenoiseTest, InvalidBlockSize,
-                           InvalidChromaSubsampling, GradientTest);
+REGISTER_TYPED_TEST_SUITE_P(WienerDenoiseTest, InvalidBlockSize,
+                            InvalidChromaSubsampling, GradientTest);
 
-INSTANTIATE_TYPED_TEST_CASE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest,
-                              AllBitDepthParams);
+INSTANTIATE_TYPED_TEST_SUITE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest,
+                               AllBitDepthParams);
diff --git a/libaom/test/obmc_sad_test.cc b/libaom/test/obmc_sad_test.cc
index 6cef869..6b4382c 100644
--- a/libaom/test/obmc_sad_test.cc
+++ b/libaom/test/obmc_sad_test.cc
@@ -101,11 +101,18 @@
   TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_sse4_1),
   TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_sse4_1),
   TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_sse4_1),
-  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_sse4_1)
+  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_sse4_1),
+
+  TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_sse4_1),
+  TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_sse4_1),
+  TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_sse4_1),
+  TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_sse4_1),
+  TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_sse4_1),
+  TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_sse4_1),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadTest,
-                        ::testing::ValuesIn(sse4_functions));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadTest,
+                         ::testing::ValuesIn(sse4_functions));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
@@ -125,12 +132,21 @@
   TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_avx2),
   TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_avx2),
   TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_avx2),
-  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2)
+  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2),
+
+  TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_avx2),
+  TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_avx2),
+  TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_avx2),
+  TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_avx2),
+  TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_avx2),
+  TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_avx2),
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, ObmcSadTest, ::testing::ValuesIn(avx2_functions));
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadTest,
+                         ::testing::ValuesIn(avx2_functions));
 #endif  // HAVE_AVX2
 
+#if CONFIG_AV1_HIGHBITDEPTH
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
@@ -204,11 +220,18 @@
   TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_sse4_1),
   TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_sse4_1),
   TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_sse4_1),
-  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_sse4_1)
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_sse4_1),
+
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_sse4_1),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadHBDTest,
-                        ::testing::ValuesIn(sse4_functions_hbd));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadHBDTest,
+                         ::testing::ValuesIn(sse4_functions_hbd));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
@@ -228,10 +251,18 @@
   TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_avx2),
   TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_avx2),
   TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_avx2),
-  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_avx2)
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_avx2),
+
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_avx2),
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, ObmcSadHBDTest,
-                        ::testing::ValuesIn(avx2_functions_hbd));
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadHBDTest,
+                         ::testing::ValuesIn(avx2_functions_hbd));
 #endif  // HAVE_AVX2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/obmc_variance_test.cc b/libaom/test/obmc_variance_test.cc
index 4563b96..fc281d7 100644
--- a/libaom/test/obmc_variance_test.cc
+++ b/libaom/test/obmc_variance_test.cc
@@ -147,11 +147,18 @@
   TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_sse4_1),
   TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_sse4_1),
   TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1),
-  TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1)
+  TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1),
+
+  TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_sse4_1),
+  TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_sse4_1),
+  TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_sse4_1),
+  TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_sse4_1),
+  TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_sse4_1),
+  TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_sse4_1),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcVarianceTest,
-                        ::testing::ValuesIn(sse4_functions));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceTest,
+                         ::testing::ValuesIn(sse4_functions));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
@@ -170,18 +177,25 @@
   TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_avx2),
   TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_avx2),
   TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_avx2),
-  TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1),
-  TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1)
+  TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_avx2),
+  TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_avx2),
+
+  TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_avx2),
+  TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_avx2),
+  TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_avx2),
+  TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_avx2),
+  TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_avx2),
+  TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_avx2),
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, ObmcVarianceTest,
-                        ::testing::ValuesIn(avx2_functions));
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcVarianceTest,
+                         ::testing::ValuesIn(avx2_functions));
 #endif  // HAVE_AVX2
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
-
+#if CONFIG_AV1_HIGHBITDEPTH
 class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
 
 TEST_P(ObmcVarianceHBDTest, RandomValues) {
@@ -336,10 +350,48 @@
   TestFuncs(aom_highbd_12_obmc_variance4x8_c,
             aom_highbd_12_obmc_variance4x8_sse4_1, 12),
   TestFuncs(aom_highbd_12_obmc_variance4x4_c,
-            aom_highbd_12_obmc_variance4x4_sse4_1, 12)
+            aom_highbd_12_obmc_variance4x4_sse4_1, 12),
+
+  TestFuncs(aom_highbd_obmc_variance64x16_c,
+            aom_highbd_obmc_variance64x16_sse4_1, 8),
+  TestFuncs(aom_highbd_obmc_variance16x64_c,
+            aom_highbd_obmc_variance16x64_sse4_1, 8),
+  TestFuncs(aom_highbd_obmc_variance32x8_c, aom_highbd_obmc_variance32x8_sse4_1,
+            8),
+  TestFuncs(aom_highbd_obmc_variance8x32_c, aom_highbd_obmc_variance8x32_sse4_1,
+            8),
+  TestFuncs(aom_highbd_obmc_variance16x4_c, aom_highbd_obmc_variance16x4_sse4_1,
+            8),
+  TestFuncs(aom_highbd_obmc_variance4x16_c, aom_highbd_obmc_variance4x16_sse4_1,
+            8),
+  TestFuncs(aom_highbd_10_obmc_variance64x16_c,
+            aom_highbd_10_obmc_variance64x16_sse4_1, 10),
+  TestFuncs(aom_highbd_10_obmc_variance16x64_c,
+            aom_highbd_10_obmc_variance16x64_sse4_1, 10),
+  TestFuncs(aom_highbd_10_obmc_variance32x8_c,
+            aom_highbd_10_obmc_variance32x8_sse4_1, 10),
+  TestFuncs(aom_highbd_10_obmc_variance8x32_c,
+            aom_highbd_10_obmc_variance8x32_sse4_1, 10),
+  TestFuncs(aom_highbd_10_obmc_variance16x4_c,
+            aom_highbd_10_obmc_variance16x4_sse4_1, 10),
+  TestFuncs(aom_highbd_10_obmc_variance4x16_c,
+            aom_highbd_10_obmc_variance4x16_sse4_1, 10),
+  TestFuncs(aom_highbd_12_obmc_variance64x16_c,
+            aom_highbd_12_obmc_variance64x16_sse4_1, 12),
+  TestFuncs(aom_highbd_12_obmc_variance16x64_c,
+            aom_highbd_12_obmc_variance16x64_sse4_1, 12),
+  TestFuncs(aom_highbd_12_obmc_variance32x8_c,
+            aom_highbd_12_obmc_variance32x8_sse4_1, 12),
+  TestFuncs(aom_highbd_12_obmc_variance8x32_c,
+            aom_highbd_12_obmc_variance8x32_sse4_1, 12),
+  TestFuncs(aom_highbd_12_obmc_variance16x4_c,
+            aom_highbd_12_obmc_variance16x4_sse4_1, 12),
+  TestFuncs(aom_highbd_12_obmc_variance4x16_c,
+            aom_highbd_12_obmc_variance4x16_sse4_1, 12),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcVarianceHBDTest,
-                        ::testing::ValuesIn(sse4_functions_hbd));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceHBDTest,
+                         ::testing::ValuesIn(sse4_functions_hbd));
 #endif  // HAVE_SSE4_1
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/pickrst_test.cc b/libaom/test/pickrst_test.cc
index 0aa49b6..9a2c5bc 100644
--- a/libaom/test/pickrst_test.cc
+++ b/libaom/test/pickrst_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "test/register_state_check.h"
@@ -36,8 +38,7 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////
 
-typedef ::testing::tuple<const lowbd_pixel_proj_error_func>
-    PixelProjErrorTestParam;
+typedef std::tuple<const lowbd_pixel_proj_error_func> PixelProjErrorTestParam;
 
 class PixelProjErrorTest
     : public ::testing::TestWithParam<PixelProjErrorTestParam> {
@@ -177,18 +178,19 @@
 TEST_P(PixelProjErrorTest, DISABLED_Speed) { RunPixelProjErrorTest(200000); }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, PixelProjErrorTest,
-                        ::testing::Values(av1_lowbd_pixel_proj_error_sse4_1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjErrorTest,
+                         ::testing::Values(av1_lowbd_pixel_proj_error_sse4_1));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
 
-INSTANTIATE_TEST_CASE_P(AVX2, PixelProjErrorTest,
-                        ::testing::Values(av1_lowbd_pixel_proj_error_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjErrorTest,
+                         ::testing::Values(av1_lowbd_pixel_proj_error_avx2));
 #endif  // HAVE_AVX2
 
 }  // namespace pickrst_test_lowbd
 
+#if CONFIG_AV1_HIGHBITDEPTH
 namespace pickrst_test_highbd {
 static const int kIterations = 100;
 
@@ -201,8 +203,7 @@
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-typedef ::testing::tuple<const highbd_pixel_proj_error_func>
-    PixelProjErrorTestParam;
+typedef std::tuple<const highbd_pixel_proj_error_func> PixelProjErrorTestParam;
 
 class PixelProjHighbdErrorTest
     : public ::testing::TestWithParam<PixelProjErrorTestParam> {
@@ -344,14 +345,190 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, PixelProjHighbdErrorTest,
-                        ::testing::Values(av1_highbd_pixel_proj_error_sse4_1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjHighbdErrorTest,
+                         ::testing::Values(av1_highbd_pixel_proj_error_sse4_1));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
 
-INSTANTIATE_TEST_CASE_P(AVX2, PixelProjHighbdErrorTest,
-                        ::testing::Values(av1_highbd_pixel_proj_error_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjHighbdErrorTest,
+                         ::testing::Values(av1_highbd_pixel_proj_error_avx2));
 #endif  // HAVE_AVX2
 
 }  // namespace pickrst_test_highbd
+
+////////////////////////////////////////////////////////////////////////////////
+// Get_proj_subspace_Test
+////////////////////////////////////////////////////////////////////////////////
+
+namespace get_proj_subspace_test_lowbd {
+static const int kIterations = 100;
+
+typedef void (*set_get_proj_subspace)(const uint8_t *src8, int width,
+                                      int height, int src_stride,
+                                      const uint8_t *dat8, int dat_stride,
+                                      int32_t *flt0, int flt0_stride,
+                                      int32_t *flt1, int flt1_stride,
+                                      int64_t H[2][2], int64_t C[2],
+                                      const sgr_params_type *params);
+
+typedef std::tuple<const set_get_proj_subspace> GetProjSubspaceTestParam;
+
+class GetProjSubspaceTest
+    : public ::testing::TestWithParam<GetProjSubspaceTestParam> {
+ public:
+  virtual void SetUp() {
+    target_func_ = GET_PARAM(0);
+    src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                  sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
+    dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                  sizeof(*dgd_)));
+    ASSERT_NE(dgd_, nullptr);
+    flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*flt0_)));
+    ASSERT_NE(flt0_, nullptr);
+    flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*flt1_)));
+    ASSERT_NE(flt1_, nullptr);
+  }
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dgd_);
+    aom_free(flt0_);
+    aom_free(flt1_);
+  }
+  void RunGetProjSubspaceTest(int32_t run_times);
+  void RunGetProjSubspaceTest_ExtremeValues();
+
+ private:
+  set_get_proj_subspace target_func_;
+  libaom_test::ACMRandom rng_;
+  uint8_t *src_;
+  uint8_t *dgd_;
+  int32_t *flt0_;
+  int32_t *flt1_;
+};
+
+void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) {
+  int h_end = run_times != 1
+                  ? 128
+                  : ((rng_.Rand16() % MAX_DATA_BLOCK) &
+                     2147483640);  // We test for widths divisible by 8.
+  int v_end =
+      run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640);
+  const int dgd_stride = MAX_DATA_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int flt0_stride = MAX_DATA_BLOCK;
+  const int flt1_stride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  const int iters = run_times == 1 ? kIterations : 4;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = rng_.Rand8();
+      src_[i] = rng_.Rand8();
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+
+    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+    params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+    params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    uint8_t *dgd = dgd_;
+    uint8_t *src = src_;
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_calc_proj_params_c(src, v_end, h_end, src_stride, dgd, dgd_stride,
+                             flt0_, flt0_stride, flt1_, flt1_stride, H_ref,
+                             C_ref, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(src, v_end, h_end, src_stride, dgd, dgd_stride, flt0_,
+                   flt0_stride, flt1_, flt1_stride, H_test, C_test, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0],
+             params.r[1], h_end, v_end, time1, time2, time1 / time2);
+    } else {
+      ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+      ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+      ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+      ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+      ASSERT_EQ(C_ref[0], C_test[0]);
+      ASSERT_EQ(C_ref[1], C_test[1]);
+    }
+  }
+}
+
+void GetProjSubspaceTest::RunGetProjSubspaceTest_ExtremeValues() {
+  const int h_start = 0;
+  int h_end = MAX_DATA_BLOCK;
+  const int v_start = 0;
+  int v_end = MAX_DATA_BLOCK;
+  const int dgd_stride = MAX_DATA_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int flt0_stride = MAX_DATA_BLOCK;
+  const int flt1_stride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  const int iters = kIterations;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = 0;
+      src_[i] = 255;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    params.r[0] = 1;
+    params.r[1] = 1;
+    params.s[0] = rng_.Rand8() % MAX_RADIUS;
+    params.s[1] = rng_.Rand8() % MAX_RADIUS;
+    uint8_t *dgd = dgd_;
+    uint8_t *src = src_;
+
+    av1_calc_proj_params_c(src, h_end - h_start, v_end - v_start, src_stride,
+                           dgd, dgd_stride, flt0_, flt0_stride, flt1_,
+                           flt1_stride, H_ref, C_ref, &params);
+
+    target_func_(src, h_end - h_start, v_end - v_start, src_stride, dgd,
+                 dgd_stride, flt0_, flt0_stride, flt1_, flt1_stride, H_test,
+                 C_test, &params);
+
+    ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+    ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+    ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+    ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+    ASSERT_EQ(C_ref[0], C_test[0]);
+    ASSERT_EQ(C_ref[1], C_test[1]);
+  }
+}
+
+TEST_P(GetProjSubspaceTest, RandomValues) { RunGetProjSubspaceTest(1); }
+
+TEST_P(GetProjSubspaceTest, ExtremeValues) {
+  RunGetProjSubspaceTest_ExtremeValues();
+}
+
+TEST_P(GetProjSubspaceTest, DISABLED_Speed) { RunGetProjSubspaceTest(200000); }
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest,
+                         ::testing::Values(av1_calc_proj_params_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace get_proj_subspace_test_lowbd
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/test/quantize_func_test.cc b/libaom/test/quantize_func_test.cc
index 067a981..b40b38d 100644
--- a/libaom/test/quantize_func_test.cc
+++ b/libaom/test/quantize_func_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/aom_config.h"
@@ -63,9 +65,9 @@
   HBD_QUAN_FUNC;
 }
 
-enum { TYPE_B, TYPE_DC, TYPE_FP } UENUM1BYTE(QuantType);
+enum QuantType { TYPE_B, TYPE_DC, TYPE_FP };
 
-using ::testing::tuple;
+using std::tuple;
 typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t>
     QuantizeParam;
 
@@ -296,6 +298,8 @@
   aom_usec_timer timer, simd_timer;
   int rows = tx_size_high[tx_size_];
   int cols = tx_size_wide[tx_size_];
+  rows = AOMMIN(32, rows);
+  cols = AOMMIN(32, cols);
   for (int cnt = 0; cnt <= rows; cnt++) {
     FillCoeffRandomRows(cnt * cols);
 
@@ -321,139 +325,223 @@
   }
 }
 
-using ::testing::make_tuple;
+using std::make_tuple;
 
 #if HAVE_AVX2
 const QuantizeParam kQParamArrayAvx2[] = {
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_16X16, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_4X16, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_16X4, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_32X8, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_8X32, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_32X32,
-             TYPE_FP, AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_16X64,
-             TYPE_FP, AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_64X16,
-             TYPE_FP, AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2, TX_64X64,
-             TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+             static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+             static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+             static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+             static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2,
+             static_cast<TX_SIZE>(TX_16X64), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2,
+             static_cast<TX_SIZE>(TX_64X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
-             TYPE_FP, AOM_BITS_8),
+             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
   make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
-             TYPE_FP, AOM_BITS_10),
+             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_10),
   make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
-             TYPE_FP, AOM_BITS_12),
+             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_12),
   make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, TX_32X32,
-             TYPE_FP, AOM_BITS_8),
+             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
   make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, TX_32X32,
-             TYPE_FP, AOM_BITS_10),
+             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_10),
   make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, TX_32X32,
-             TYPE_FP, AOM_BITS_12),
+             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_12),
   make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, TX_64X64,
-             TYPE_FP, AOM_BITS_8),
+             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8),
   make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, TX_64X64,
-             TYPE_FP, AOM_BITS_10),
+             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_10),
   make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
-             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, TX_64X64,
-             TYPE_FP, AOM_BITS_12),
-  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
-             TYPE_B, AOM_BITS_8),
-  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
-             TYPE_B, AOM_BITS_10),
-  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
-             TYPE_B, AOM_BITS_12),
+             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_adaptive_c,
+             &aom_highbd_quantize_b_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_adaptive_c,
+             &aom_highbd_quantize_b_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_adaptive_c,
+             &aom_highbd_quantize_b_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+             &aom_highbd_quantize_b_32x32_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+             &aom_highbd_quantize_b_32x32_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+             &aom_highbd_quantize_b_32x32_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+#endif
+  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2,
+             static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8)
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, QuantizeTest,
-                        ::testing::ValuesIn(kQParamArrayAvx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest,
+                         ::testing::ValuesIn(kQParamArrayAvx2));
 #endif  // HAVE_AVX2
 
 #if HAVE_SSE2
 const QuantizeParam kQParamArraySSE2[] = {
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_16X16, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_4X16, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_16X4, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_8X32, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_32X8, TYPE_FP,
-             AOM_BITS_8),
-  make_tuple(&aom_quantize_b_c, &aom_quantize_b_sse2, TX_16X16, TYPE_B,
-             AOM_BITS_8),
-  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
-             TYPE_B, AOM_BITS_8),
-  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
-             TYPE_B, AOM_BITS_10),
-  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
-             TYPE_B, AOM_BITS_12),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+             static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+             static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+             static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+             static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_c, &aom_quantize_b_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_adaptive_c,
+             &aom_highbd_quantize_b_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_adaptive_c,
+             &aom_highbd_quantize_b_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_adaptive_c,
+             &aom_highbd_quantize_b_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
   make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
-             TX_32X32, TYPE_B, AOM_BITS_8),
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
   make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
-             TX_32X32, TYPE_B, AOM_BITS_10),
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10),
   make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
-             TX_32X32, TYPE_B, AOM_BITS_12),
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+             &aom_highbd_quantize_b_32x32_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+             &aom_highbd_quantize_b_32x32_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+             &aom_highbd_quantize_b_32x32_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
   make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
-             TX_64X64, TYPE_B, AOM_BITS_8),
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
   make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
-             TX_64X64, TYPE_B, AOM_BITS_10),
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10),
   make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
-             TX_64X64, TYPE_B, AOM_BITS_12),
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c,
+             &aom_highbd_quantize_b_64x64_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c,
+             &aom_highbd_quantize_b_64x64_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c,
+             &aom_highbd_quantize_b_64x64_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
+#endif
   make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
-             TX_16X16, TYPE_B, AOM_BITS_8),
-  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_8X8,
-             TYPE_B, AOM_BITS_8),
-  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_4X4,
-             TYPE_B, AOM_BITS_8),
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8),
   make_tuple(&aom_quantize_b_32x32_adaptive_c,
-             &aom_quantize_b_32x32_adaptive_sse2, TX_32X16, TYPE_B, AOM_BITS_8),
+             &aom_quantize_b_32x32_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_32X16), TYPE_B, AOM_BITS_8),
   make_tuple(&aom_quantize_b_32x32_adaptive_c,
-             &aom_quantize_b_32x32_adaptive_sse2, TX_16X32, TYPE_B, AOM_BITS_8),
+             &aom_quantize_b_32x32_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_16X32), TYPE_B, AOM_BITS_8),
   make_tuple(&aom_quantize_b_32x32_adaptive_c,
-             &aom_quantize_b_32x32_adaptive_sse2, TX_32X32, TYPE_B, AOM_BITS_8)
+             &aom_quantize_b_32x32_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_64x64_adaptive_c,
+             &aom_quantize_b_64x64_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_32X64), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_64x64_adaptive_c,
+             &aom_quantize_b_64x64_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_64X32), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_64x64_adaptive_c,
+             &aom_quantize_b_64x64_adaptive_sse2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8)
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest,
-                        ::testing::ValuesIn(kQParamArraySSE2));
+INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest,
+                         ::testing::ValuesIn(kQParamArraySSE2));
+#endif
+
+#if HAVE_NEON
+const QuantizeParam kQParamArrayNEON[] = {
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+             static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+             static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+             static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+             static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest,
+                         ::testing::ValuesIn(kQParamArrayNEON));
 #endif
 
 #if HAVE_SSSE3 && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, QuantizeTest,
-    ::testing::Values(make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
-                                 TX_16X16, TYPE_B, AOM_BITS_8),
-                      make_tuple(&aom_quantize_b_32x32_c,
-                                 &aom_quantize_b_32x32_ssse3, TX_32X32, TYPE_B,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_quantize_b_64x64_c,
-                                 &aom_quantize_b_64x64_ssse3, TX_64X64, TYPE_B,
-                                 AOM_BITS_8)));
+    ::testing::Values(
+        make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
+                   static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+        make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_ssse3,
+                   static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+        make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_ssse3,
+                   static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8)));
 
 #endif  // HAVE_SSSE3 && ARCH_X86_64
 
 #if HAVE_AVX && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX, QuantizeTest,
-    ::testing::Values(make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx,
-                                 TX_16X16, TYPE_B, AOM_BITS_8),
-                      make_tuple(&aom_quantize_b_32x32_c,
-                                 &aom_quantize_b_32x32_avx, TX_32X32, TYPE_B,
-                                 AOM_BITS_8)));
+    ::testing::Values(
+        make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx,
+                   static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+        make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_avx,
+                   static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8)));
 
 #endif  // HAVE_AVX && ARCH_X86_64
 }  // namespace
diff --git a/libaom/test/reconinter_test.cc b/libaom/test/reconinter_test.cc
index a8536e5..51bec0e 100644
--- a/libaom/test/reconinter_test.cc
+++ b/libaom/test/reconinter_test.cc
@@ -12,6 +12,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
+#include <tuple>
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
@@ -34,7 +35,7 @@
                                            const uint8_t *src1, int src1_stride,
                                            int h, int w);
 
-typedef ::testing::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func>
+typedef std::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func>
     BuildCompDiffwtdMaskDParam;
 
 #if HAVE_SSE4_1
@@ -63,7 +64,7 @@
     int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
     ConvolveParams *conv_params, int bd);
 
-typedef ::testing::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
+typedef std::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
     BuildCompDiffwtdMaskD16Param;
 
 #if HAVE_SSE4_1 || HAVE_NEON
@@ -234,25 +235,25 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, BuildCompDiffwtdMaskTest,
-                        BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest,
+                         BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BuildCompDiffwtdMaskD16Test,
     BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, BuildCompDiffwtdMaskTest,
-                        BuildParams(av1_build_compound_diffwtd_mask_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest,
+                         BuildParams(av1_build_compound_diffwtd_mask_avx2));
 
-INSTANTIATE_TEST_CASE_P(AVX2, BuildCompDiffwtdMaskD16Test,
-                        BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test,
+                         BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, BuildCompDiffwtdMaskD16Test,
-                        BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskD16Test,
+                         BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
 #endif
 
 }  // namespace
diff --git a/libaom/test/resize_test.cc b/libaom/test/resize_test.cc
index 39e7d1b..bcf6794 100644
--- a/libaom/test/resize_test.cc
+++ b/libaom/test/resize_test.cc
@@ -12,6 +12,7 @@
 #include <climits>
 #include <vector>
 #include "aom_dsp/aom_dsp_common.h"
+#include "common/tools_common.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -47,7 +48,7 @@
   header[3] = 'F';
   mem_put_le16(header + 4, 0);                    /* version */
   mem_put_le16(header + 6, 32);                   /* headersize */
-  mem_put_le32(header + 8, 0x30395056);           /* fourcc (av1) */
+  mem_put_le32(header + 8, AV1_FOURCC);           /* fourcc (av1) */
   mem_put_le16(header + 12, cfg->g_w);            /* width */
   mem_put_le16(header + 14, cfg->g_h);            /* height */
   mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
@@ -617,12 +618,12 @@
 TEST_P(ResizeCspTest, TestResizeCspWorks) {
 #endif
   const aom_img_fmt_t image_formats[] = { AOM_IMG_FMT_I420, AOM_IMG_FMT_I444 };
-  for (size_t i = 0; i < GTEST_ARRAY_SIZE_(image_formats); ++i) {
-    ResizingCspVideoSource video(image_formats[i]);
+  for (const aom_img_fmt_t &img_format : image_formats) {
+    ResizingCspVideoSource video(img_format);
     init_flags_ = AOM_CODEC_USE_PSNR;
     cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_profile = (image_formats[i] == AOM_IMG_FMT_I420) ? 0 : 1;
+    cfg_.g_profile = (img_format == AOM_IMG_FMT_I420) ? 0 : 1;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
     // Check we decoded the same number of frames as we attempted to encode
diff --git a/libaom/test/rt_end_to_end_test.cc b/libaom/test/rt_end_to_end_test.cc
index 9c3e96b..f14d124 100644
--- a/libaom/test/rt_end_to_end_test.cc
+++ b/libaom/test/rt_end_to_end_test.cc
@@ -10,6 +10,9 @@
  */
 
 #include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -24,9 +27,25 @@
 const unsigned int kFrames = 10;
 const int kBitrate = 500;
 
-// List of psnr thresholds for speed settings 0-8
-const double kPsnrThreshold[9] = { 36.9, 36.9, 36.85, 36.8, 36.6,
-                                   36.4, 36.0, 35.5,  35.0 };
+// List of psnr thresholds for speed settings 6-8
+// keys: video, speed, aq mode.
+std::unordered_map<std::string,
+                   std::unordered_map<int, std::unordered_map<int, double>>>
+    kPsnrThreshold = { { "park_joy_90p_8_420.y4m",
+                         { { 5, { { 0, 35.4 }, { 3, 36.4 } } },
+                           { 6, { { 0, 35.3 }, { 3, 36.2 } } },
+                           { 7, { { 0, 34.9 }, { 3, 35.8 } } },
+                           { 8, { { 0, 35.0 }, { 3, 35.8 } } } } },
+                       { "paris_352_288_30.y4m",
+                         { { 5, { { 0, 36.2 }, { 3, 36.7 } } },
+                           { 6, { { 0, 36.1 }, { 3, 36.6 } } },
+                           { 7, { { 0, 35.5 }, { 3, 36.0 } } },
+                           { 8, { { 0, 36.0 }, { 3, 36.5 } } } } },
+                       { "niklas_1280_720_30.y4m",
+                         { { 5, { { 0, 34.6 }, { 3, 34.6 } } },
+                           { 6, { { 0, 34.2 }, { 3, 34.2 } } },
+                           { 7, { { 0, 33.7 }, { 3, 33.6 } } },
+                           { 8, { { 0, 33.6 }, { 3, 33.4 } } } } } };
 
 typedef struct {
   const char *filename;
@@ -40,24 +59,26 @@
   return os << "TestVideoParam { filename:" << test_arg.filename
             << " input_bit_depth:" << test_arg.input_bit_depth
             << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
-            << " profile:" << test_arg.profile << "}";
+            << " profile:" << test_arg.profile << " }";
 }
 
-// TODO(kyslov): Add more test vectors
 const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "paris_352_288_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "niklas_1280_720_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
 };
 
-// Speed settings tested
-const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
-
+// Params: test video, speed, aq mode, threads, tile columns.
 class RTEndToEndTest
-    : public ::libaom_test::CodecTestWith2Params<TestVideoParam, int>,
+    : public ::libaom_test::CodecTestWith5Params<TestVideoParam, int,
+                                                 unsigned int, int, int>,
       public ::libaom_test::EncoderTest {
  protected:
   RTEndToEndTest()
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
-        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0),
+        aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
+        tile_columns_(GET_PARAM(5)) {}
 
   virtual ~RTEndToEndTest() {}
 
@@ -65,8 +86,8 @@
     InitializeConfig();
     SetMode(::libaom_test::kRealTime);
 
-    cfg_.g_usage = 1;  // TODO(kyslov): Move it to encode_test_driver.cc
     cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_threads = threads_;
     cfg_.rc_buf_sz = 1000;
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 600;
@@ -86,9 +107,11 @@
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
-      encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_);
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_ROW_MT, 1);
     }
   }
 
@@ -97,7 +120,9 @@
     return 0.0;
   }
 
-  double GetPsnrThreshold() { return kPsnrThreshold[cpu_used_]; }
+  double GetPsnrThreshold() {
+    return kPsnrThreshold[test_video_param_.filename][cpu_used_][aq_mode_];
+  }
 
   void DoTest() {
     cfg_.rc_target_bitrate = kBitrate;
@@ -115,7 +140,8 @@
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
     const double psnr = GetAveragePsnr();
-    EXPECT_GT(psnr, GetPsnrThreshold()) << "cpu used = " << cpu_used_;
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "cpu used = " << cpu_used_ << " aq mode = " << aq_mode_;
   }
 
   TestVideoParam test_video_param_;
@@ -124,18 +150,25 @@
  private:
   double psnr_;
   unsigned int nframes_;
+  unsigned int aq_mode_;
+  int threads_;
+  int tile_columns_;
 };
 
-class RTEndToEndTestLarge : public RTEndToEndTest {};
-
-TEST_P(RTEndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
+class RTEndToEndTestThreaded : public RTEndToEndTest {};
 
 TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestLarge,
-                          ::testing::ValuesIn(kTestVectors),
-                          ::testing::ValuesIn(kCpuUsedVectors));
+TEST_P(RTEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::Values(kTestVectors[0]),
-                          ::testing::Values(kCpuUsedVectors[8]));
+AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors),
+                          ::testing::Range(5, 9),
+                          ::testing::Values<unsigned int>(0, 3),
+                          ::testing::Values(1), ::testing::Values(1));
+
+AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestThreaded,
+                          ::testing::ValuesIn(kTestVectors),
+                          ::testing::Range(5, 9),
+                          ::testing::Values<unsigned int>(0, 3),
+                          ::testing::Range(2, 5), ::testing::Range(2, 5));
 }  // namespace
diff --git a/libaom/test/sad_test.cc b/libaom/test/sad_test.cc
index 87dbb33..0bdbf37 100644
--- a/libaom/test/sad_test.cc
+++ b/libaom/test/sad_test.cc
@@ -12,6 +12,7 @@
 #include <string.h>
 #include <limits.h>
 #include <stdio.h>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -28,37 +29,42 @@
 
 typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride);
-typedef ::testing::tuple<int, int, SadMxNFunc, int> SadMxNParam;
+typedef std::tuple<int, int, SadMxNFunc, int> SadMxNParam;
 
 typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   const uint8_t *second_pred);
-typedef ::testing::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
+typedef std::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
 
 typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, const uint8_t *ref,
                                    int ref_stride,
                                    const DIST_WTD_COMP_PARAMS *jcp_param);
-typedef ::testing::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
+typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
 
 typedef unsigned int (*DistWtdSadMxhFunc)(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int width,
                                           int height);
-typedef ::testing::tuple<int, int, DistWtdSadMxhFunc, int> DistWtdSadMxhParam;
+typedef std::tuple<int, int, DistWtdSadMxhFunc, int> DistWtdSadMxhParam;
 
 typedef uint32_t (*DistWtdSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
                                          const uint8_t *ref_ptr, int ref_stride,
                                          const uint8_t *second_pred,
                                          const DIST_WTD_COMP_PARAMS *jcp_param);
-typedef ::testing::tuple<int, int, DistWtdSadMxNAvgFunc, int>
-    DistWtdSadMxNAvgParam;
+typedef std::tuple<int, int, DistWtdSadMxNAvgFunc, int> DistWtdSadMxNAvgParam;
 
 typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_ptr[], int ref_stride,
                              uint32_t *sad_array);
-typedef ::testing::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
+typedef std::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
+
+typedef void (*SadMxNx4AvgFunc)(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *const ref_ptr[], int ref_stride,
+                                const uint8_t *second_pred,
+                                uint32_t *sad_array);
+typedef std::tuple<int, int, SadMxNx4AvgFunc, int> SadMxNx4AvgParam;
 
 using libaom_test::ACMRandom;
 
@@ -339,6 +345,42 @@
   }
 };
 
+class SADx4AvgTest : public ::testing::WithParamInterface<SadMxNx4AvgParam>,
+                     public SADTestBase {
+ public:
+  SADx4AvgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  void SADs(unsigned int *results) {
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
+                                          references, reference_stride_,
+                                          second_pred_, results));
+  }
+
+  void CheckSADs() {
+    unsigned int reference_sad, exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 4; ++block) {
+      reference_sad = ReferenceSADavg(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+
+  void SpeedSAD() {
+    int test_count = 200000;
+    unsigned int exp_sad[4];
+    while (test_count > 0) {
+      SADs(exp_sad);
+      test_count -= 1;
+    }
+  }
+};
+
 class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
                 public SADTestBase {
  public:
@@ -812,7 +854,70 @@
   source_data_ = tmp_source_data;
 }
 
-using ::testing::make_tuple;
+using std::make_tuple;
+
+#if SPEED_TEST
+TEST_P(SADx4AvgTest, Speed) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  FillRandom(second_pred_, width_);
+  SpeedSAD();
+  reference_stride_ = tmp_stride;
+}
+#endif
+
+TEST_P(SADx4AvgTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADx4AvgTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  FillConstant(second_pred_, width_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADx4AvgTest, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4AvgTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
 
 //------------------------------------------------------------------------------
 // C functions
@@ -833,6 +938,7 @@
   make_tuple(8, 4, &aom_sad8x4_c, -1),
   make_tuple(4, 8, &aom_sad4x8_c, -1),
   make_tuple(4, 4, &aom_sad4x4_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad128x128_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128_c, 8),
@@ -881,8 +987,39 @@
   make_tuple(8, 4, &aom_highbd_sad8x4_c, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_c, 12),
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_sad64x16_c, -1),
+  make_tuple(16, 64, &aom_sad16x64_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64_c, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64_c, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64_c, 12),
+#endif
+  make_tuple(32, 8, &aom_sad32x8_c, -1),
+  make_tuple(8, 32, &aom_sad8x32_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(32, 8, &aom_highbd_sad32x8_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32_c, 12),
+#endif
+  make_tuple(16, 4, &aom_sad16x4_c, -1),
+  make_tuple(4, 16, &aom_sad4x16_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(16, 4, &aom_highbd_sad16x4_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16_c, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
+INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
 const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
@@ -901,6 +1038,7 @@
   make_tuple(8, 4, &aom_sad8x4_avg_c, -1),
   make_tuple(4, 8, &aom_sad4x8_avg_c, -1),
   make_tuple(4, 4, &aom_sad4x4_avg_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 8),
@@ -949,8 +1087,39 @@
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12),
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_sad64x16_avg_c, -1),
+  make_tuple(16, 64, &aom_sad16x64_avg_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 12),
+#endif
+  make_tuple(32, 8, &aom_sad32x8_avg_c, -1),
+  make_tuple(8, 32, &aom_sad8x32_avg_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 12),
+#endif
+  make_tuple(16, 4, &aom_sad16x4_avg_c, -1),
+  make_tuple(4, 16, &aom_sad4x16_avg_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
+INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
 // TODO(chengchen): add highbd tests
 const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
@@ -970,10 +1139,17 @@
   make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+
+  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
 };
 
-INSTANTIATE_TEST_CASE_P(C, DistWtdCompAvgTest,
-                        ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
+INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
+                         ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
 
 const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = {
   make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1),
@@ -992,9 +1168,17 @@
   make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_c, -1),
   make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1),
   make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1),
+
+  make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_c, -1),
+  make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_c, -1),
+  make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_c, -1),
+  make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_c, -1),
+  make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_c, -1),
+  make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_c, -1),
 };
-INSTANTIATE_TEST_CASE_P(C, DistWtdSADavgTest,
-                        ::testing::ValuesIn(dist_wtd_avg_c_tests));
+
+INSTANTIATE_TEST_SUITE_P(C, DistWtdSADavgTest,
+                         ::testing::ValuesIn(dist_wtd_avg_c_tests));
 
 const SadMxNx4Param x4d_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_c, -1),
@@ -1013,6 +1197,7 @@
   make_tuple(8, 4, &aom_sad8x4x4d_c, -1),
   make_tuple(4, 8, &aom_sad4x8x4d_c, -1),
   make_tuple(4, 4, &aom_sad4x4x4d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 8),
@@ -1061,8 +1246,65 @@
   make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12),
+#endif
+  make_tuple(64, 16, &aom_sad64x16x4d_c, -1),
+  make_tuple(16, 64, &aom_sad16x64x4d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 12),
+#endif
+  make_tuple(32, 8, &aom_sad32x8x4d_c, -1),
+  make_tuple(8, 32, &aom_sad8x32x4d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 12),
+#endif
+  make_tuple(16, 4, &aom_sad16x4x4d_c, -1),
+  make_tuple(4, 16, &aom_sad4x16x4d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+
+const SadMxNx4AvgParam x4d_avg_c_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128x4d_avg_c, -1),
+  make_tuple(128, 64, &aom_sad128x64x4d_avg_c, -1),
+  make_tuple(64, 128, &aom_sad64x128x4d_avg_c, -1),
+  make_tuple(64, 64, &aom_sad64x64x4d_avg_c, -1),
+  make_tuple(64, 32, &aom_sad64x32x4d_avg_c, -1),
+  make_tuple(32, 64, &aom_sad32x64x4d_avg_c, -1),
+  make_tuple(32, 32, &aom_sad32x32x4d_avg_c, -1),
+  make_tuple(32, 16, &aom_sad32x16x4d_avg_c, -1),
+  make_tuple(16, 32, &aom_sad16x32x4d_avg_c, -1),
+  make_tuple(16, 16, &aom_sad16x16x4d_avg_c, -1),
+  make_tuple(16, 8, &aom_sad16x8x4d_avg_c, -1),
+  make_tuple(8, 16, &aom_sad8x16x4d_avg_c, -1),
+  make_tuple(8, 8, &aom_sad8x8x4d_avg_c, -1),
+  make_tuple(8, 4, &aom_sad8x4x4d_avg_c, -1),
+  make_tuple(4, 8, &aom_sad4x8x4d_avg_c, -1),
+  make_tuple(4, 4, &aom_sad4x4x4d_avg_c, -1),
+  make_tuple(64, 16, &aom_sad64x16x4d_avg_c, -1),
+  make_tuple(16, 64, &aom_sad16x64x4d_avg_c, -1),
+  make_tuple(32, 8, &aom_sad32x8x4d_avg_c, -1),
+  make_tuple(8, 32, &aom_sad8x32x4d_avg_c, -1),
+  make_tuple(16, 4, &aom_sad16x4x4d_avg_c, -1),
+  make_tuple(4, 16, &aom_sad4x16x4d_avg_c, -1),
+};
+INSTANTIATE_TEST_SUITE_P(C, SADx4AvgTest, ::testing::ValuesIn(x4d_avg_c_tests));
 
 //------------------------------------------------------------------------------
 // ARM functions
@@ -1076,14 +1318,14 @@
   make_tuple(8, 8, &aom_sad8x8_neon, -1),
   make_tuple(4, 4, &aom_sad4x4_neon, -1),
 };
-INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
+INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
 
 const SadMxNx4Param x4d_neon_tests[] = {
   make_tuple(64, 64, &aom_sad64x64x4d_neon, -1),
   make_tuple(32, 32, &aom_sad32x32x4d_neon, -1),
   make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
 };
-INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 #endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
@@ -1106,6 +1348,7 @@
   make_tuple(8, 4, &aom_sad8x4_sse2, -1),
   make_tuple(4, 8, &aom_sad4x8_sse2, -1),
   make_tuple(4, 4, &aom_sad4x4_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 8),
@@ -1117,6 +1360,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 8),
   make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 8),
   make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 8),
+  make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 8),
   make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 10),
   make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 10),
   make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 10),
@@ -1128,6 +1373,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 10),
   make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 10),
   make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 10),
+  make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 10),
   make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 12),
   make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 12),
   make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 12),
@@ -1139,8 +1386,41 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 12),
   make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 12),
   make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12),
+  make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12),
+#endif
+  make_tuple(64, 16, &aom_sad64x16_sse2, -1),
+  make_tuple(16, 64, &aom_sad16x64_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 12),
+#endif
+  make_tuple(32, 8, &aom_sad32x8_sse2, -1),
+  make_tuple(8, 32, &aom_sad8x32_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 12),
+#endif
+  make_tuple(16, 4, &aom_sad16x4_sse2, -1),
+  make_tuple(4, 16, &aom_sad4x16_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
+INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
 const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
@@ -1159,6 +1439,7 @@
   make_tuple(8, 4, &aom_sad8x4_avg_sse2, -1),
   make_tuple(4, 8, &aom_sad4x8_avg_sse2, -1),
   make_tuple(4, 4, &aom_sad4x4_avg_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 8),
@@ -1170,6 +1451,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 8),
   make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 8),
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 8),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 8),
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 10),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 10),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 10),
@@ -1181,6 +1464,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 10),
   make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 10),
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 10),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 10),
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 12),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 12),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 12),
@@ -1192,8 +1477,41 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 12),
   make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 12),
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12),
+#endif
+  make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1),
+  make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 12),
+#endif
+  make_tuple(32, 8, &aom_sad32x8_avg_sse2, -1),
+  make_tuple(8, 32, &aom_sad8x32_avg_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 12),
+#endif
+  make_tuple(16, 4, &aom_sad16x4_avg_sse2, -1),
+  make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
+INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
 const SadMxNx4Param x4d_sse2_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_sse2, -1),
@@ -1212,6 +1530,7 @@
   make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1),
   make_tuple(4, 8, &aom_sad4x8x4d_sse2, -1),
   make_tuple(4, 4, &aom_sad4x4x4d_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 8),
@@ -1251,8 +1570,66 @@
   make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12),
+#endif
+  make_tuple(64, 16, &aom_sad64x16x4d_sse2, -1),
+  make_tuple(16, 64, &aom_sad16x64x4d_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 12),
+#endif
+  make_tuple(32, 8, &aom_sad32x8x4d_sse2, -1),
+  make_tuple(8, 32, &aom_sad8x32x4d_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 12),
+#endif
+  make_tuple(16, 4, &aom_sad16x4x4d_sse2, -1),
+  make_tuple(4, 16, &aom_sad4x16x4d_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
+INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
+
+const SadMxNx4AvgParam x4d_avg_sse2_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128x4d_avg_sse2, -1),
+  make_tuple(128, 64, &aom_sad128x64x4d_avg_sse2, -1),
+  make_tuple(64, 128, &aom_sad64x128x4d_avg_sse2, -1),
+  make_tuple(64, 64, &aom_sad64x64x4d_avg_sse2, -1),
+  make_tuple(64, 32, &aom_sad64x32x4d_avg_sse2, -1),
+  make_tuple(32, 64, &aom_sad32x64x4d_avg_sse2, -1),
+  make_tuple(32, 32, &aom_sad32x32x4d_avg_sse2, -1),
+  make_tuple(32, 16, &aom_sad32x16x4d_avg_sse2, -1),
+  make_tuple(16, 32, &aom_sad16x32x4d_avg_sse2, -1),
+  make_tuple(16, 16, &aom_sad16x16x4d_avg_sse2, -1),
+  make_tuple(16, 8, &aom_sad16x8x4d_avg_sse2, -1),
+  make_tuple(8, 16, &aom_sad8x16x4d_avg_sse2, -1),
+  make_tuple(8, 8, &aom_sad8x8x4d_avg_sse2, -1),
+  make_tuple(8, 4, &aom_sad8x4x4d_avg_sse2, -1),
+  make_tuple(4, 8, &aom_sad4x8x4d_avg_sse2, -1),
+  make_tuple(4, 4, &aom_sad4x4x4d_avg_sse2, -1),
+  make_tuple(64, 16, &aom_sad64x16x4d_avg_sse2, -1),
+  make_tuple(16, 64, &aom_sad16x64x4d_avg_sse2, -1),
+  make_tuple(32, 8, &aom_sad32x8x4d_avg_sse2, -1),
+  make_tuple(8, 32, &aom_sad8x32x4d_avg_sse2, -1),
+  make_tuple(16, 4, &aom_sad16x4x4d_avg_sse2, -1),
+  make_tuple(4, 16, &aom_sad4x16x4d_avg_sse2, -1),
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADx4AvgTest,
+                         ::testing::ValuesIn(x4d_avg_sse2_tests));
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -1281,9 +1658,16 @@
   make_tuple(32, 8, &aom_sad32xh_sse2, -1),
   make_tuple(16, 64, &aom_sad16xh_sse2, -1),
   make_tuple(64, 16, &aom_sad64xh_sse2, -1),
+
+  make_tuple(16, 64, &aom_sad16xh_sse2, -1),
+  make_tuple(64, 16, &aom_sad64xh_sse2, -1),
+  make_tuple(8, 32, &aom_sad8xh_sse2, -1),
+  make_tuple(32, 8, &aom_sad32xh_sse2, -1),
+  make_tuple(4, 16, &aom_sad4xh_sse2, -1),
+  make_tuple(16, 4, &aom_sad16xh_sse2, -1),
 };
-INSTANTIATE_TEST_CASE_P(SSE2, DistWtdSADTest,
-                        ::testing::ValuesIn(dist_wtd_sad_sse2_tests));
+INSTANTIATE_TEST_SUITE_P(SSE2, DistWtdSADTest,
+                         ::testing::ValuesIn(dist_wtd_sad_sse2_tests));
 
 #endif  // HAVE_SSSE3
 
@@ -1310,10 +1694,17 @@
   make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+
+  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3, DistWtdCompAvgTest,
-                        ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
+INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
+                         ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
 
 const DistWtdSadMxNAvgParam dist_wtd_avg_ssse3_tests[] = {
   make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_ssse3, -1),
@@ -1332,9 +1723,16 @@
   make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1),
   make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1),
   make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1),
+
+  make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_ssse3, -1),
+  make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_ssse3, -1),
+  make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_ssse3, -1),
+  make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_ssse3, -1),
+  make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_ssse3, -1),
+  make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_ssse3, -1),
 };
-INSTANTIATE_TEST_CASE_P(SSSE3, DistWtdSADavgTest,
-                        ::testing::ValuesIn(dist_wtd_avg_ssse3_tests));
+INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdSADavgTest,
+                         ::testing::ValuesIn(dist_wtd_avg_ssse3_tests));
 #endif  // HAVE_SSSE3
 
 #if HAVE_SSE4_1
@@ -1351,6 +1749,7 @@
   make_tuple(32, 64, &aom_sad32x64_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32_avx2, -1),
   make_tuple(32, 16, &aom_sad32x16_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 12),
@@ -1384,8 +1783,22 @@
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 8),
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12),
+
+  make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 12),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 12),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
 
 const SadMxNAvgParam avg_avx2_tests[] = {
   make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1),
@@ -1396,6 +1809,7 @@
   make_tuple(32, 64, &aom_sad32x64_avg_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32_avg_avx2, -1),
   make_tuple(32, 16, &aom_sad32x16_avg_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 12),
@@ -1429,17 +1843,35 @@
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 8),
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12),
+
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 12),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 12),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
+INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
 const SadMxNx4Param x4d_avx2_tests[] = {
-  make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1),
-  make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
-  make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1),
-  make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1),
   make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
-  make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
+  make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1),
+  make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
+  make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1),
+  make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1),
+  make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
+  make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
+  make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1),
+  make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 12),
@@ -1473,8 +1905,22 @@
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 8),
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12),
+
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 12),
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 12),
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 12),
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 12),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 #endif  // HAVE_AVX2
 
 //------------------------------------------------------------------------------
@@ -1495,7 +1941,7 @@
   make_tuple(4, 8, &aom_sad4x8_msa, -1),
   make_tuple(4, 4, &aom_sad4x4_msa, -1),
 };
-INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
+INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
 
 const SadMxNAvgParam avg_msa_tests[] = {
   make_tuple(64, 64, &aom_sad64x64_avg_msa, -1),
@@ -1512,7 +1958,7 @@
   make_tuple(4, 8, &aom_sad4x8_avg_msa, -1),
   make_tuple(4, 4, &aom_sad4x4_avg_msa, -1),
 };
-INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
+INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
 
 const SadMxNx4Param x4d_msa_tests[] = {
   make_tuple(64, 64, &aom_sad64x64x4d_msa, -1),
@@ -1529,7 +1975,7 @@
   make_tuple(4, 8, &aom_sad4x8x4d_msa, -1),
   make_tuple(4, 4, &aom_sad4x4x4d_msa, -1),
 };
-INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
+INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
 #endif  // HAVE_MSA
 
 }  // namespace
diff --git a/libaom/test/sb_multipass_test.cc b/libaom/test/sb_multipass_test.cc
new file mode 100644
index 0000000..0ca76ab
--- /dev/null
+++ b/libaom/test/sb_multipass_test.cc
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <initializer_list>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+class AV1SBMultipassTest
+    : public ::libaom_test::CodecTestWith2Params<int, bool>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AV1SBMultipassTest()
+      : EncoderTest(GET_PARAM(0)), set_cpu_used_(GET_PARAM(1)),
+        row_mt_(GET_PARAM(2)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    cfg.allow_lowbitdepth = 1;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+    if (decoder_->IsAV1()) {
+      decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+
+    size_enc_.clear();
+    md5_dec_.clear();
+    md5_enc_.clear();
+  }
+  virtual ~AV1SBMultipassTest() { delete decoder_; }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      SetTileSize(encoder);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, use_multipass_);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+    encoder->Control(AV1E_SET_TILE_ROWS, 1);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    size_enc_.push_back(pkt->data.frame.sz);
+
+    ::libaom_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+
+    const aom_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libaom_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_dec_.push_back(md5_res.Get());
+    }
+  }
+
+  void DoTest() {
+    ::libaom_test::YUVVideoSource video(
+        "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 0, 6);
+    cfg_.rc_target_bitrate = 1000;
+
+    // Encode while coding each sb once
+    use_multipass_ = false;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    std::vector<size_t> single_pass_size_enc;
+    std::vector<std::string> single_pass_md5_enc;
+    std::vector<std::string> single_pass_md5_dec;
+    single_pass_size_enc = size_enc_;
+    single_pass_md5_enc = md5_enc_;
+    single_pass_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Encode while coding each sb twice
+    use_multipass_ = true;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    std::vector<size_t> multi_pass_size_enc;
+    std::vector<std::string> multi_pass_md5_enc;
+    std::vector<std::string> multi_pass_md5_dec;
+    multi_pass_size_enc = size_enc_;
+    multi_pass_md5_enc = md5_enc_;
+    multi_pass_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Check that the vectors are equal.
+    ASSERT_EQ(single_pass_size_enc, multi_pass_size_enc);
+    ASSERT_EQ(single_pass_md5_enc, multi_pass_md5_enc);
+    ASSERT_EQ(single_pass_md5_dec, multi_pass_md5_dec);
+  }
+
+  bool use_multipass_;
+  int set_cpu_used_;
+  bool row_mt_;
+  ::libaom_test::Decoder *decoder_;
+  std::vector<size_t> size_enc_;
+  std::vector<std::string> md5_enc_;
+  std::vector<std::string> md5_dec_;
+};
+
+TEST_P(AV1SBMultipassTest, TwoPassMatchTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(AV1SBMultipassTest, ::testing::Range(0, 6),
+                          ::testing::Bool());
+
+}  // namespace
diff --git a/libaom/test/selfguided_filter_test.cc b/libaom/test/selfguided_filter_test.cc
index 464a58f..d65cce5 100644
--- a/libaom/test/selfguided_filter_test.cc
+++ b/libaom/test/selfguided_filter_test.cc
@@ -10,6 +10,7 @@
  */
 
 #include <ctime>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -27,8 +28,8 @@
 namespace {
 
 using libaom_test::ACMRandom;
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
                         int eps, const int *xqd, uint8_t *dst8, int dst_stride,
@@ -88,8 +89,9 @@
           int h = AOMMIN(pu_height, height - k);
           uint8_t *input_p = input + k * stride + j;
           uint8_t *output_p = output + k * out_stride + j;
-          apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
-                                         output_p, out_stride, tmpbuf, 8, 0);
+          av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+                                             output_p, out_stride, tmpbuf, 8,
+                                             0);
         }
     }
     aom_usec_timer_mark(&ref_timer);
@@ -175,8 +177,9 @@
           uint8_t *output2_p = output2 + k * out_stride + j;
           tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
                    tmpbuf, 8, 0);
-          apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
-                                         output2_p, out_stride, tmpbuf, 8, 0);
+          av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+                                             output2_p, out_stride, tmpbuf, 8,
+                                             0);
         }
 
       for (j = 0; j < test_h; ++j)
@@ -199,20 +202,24 @@
 TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest,
-                        ::testing::Values(apply_selfguided_restoration_sse4_1));
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1SelfguidedFilterTest,
+    ::testing::Values(av1_apply_selfguided_restoration_sse4_1));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, AV1SelfguidedFilterTest,
-                        ::testing::Values(apply_selfguided_restoration_avx2));
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1SelfguidedFilterTest,
+    ::testing::Values(av1_apply_selfguided_restoration_avx2));
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, AV1SelfguidedFilterTest,
-                        ::testing::Values(apply_selfguided_restoration_neon));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1SelfguidedFilterTest,
+    ::testing::Values(av1_apply_selfguided_restoration_neon));
 #endif
 
+#if CONFIG_AV1_HIGHBITDEPTH
 // Test parameter list:
 //  <tst_fun_, bit_depth>
 typedef tuple<SgrFunc, int> HighbdFilterTestParam;
@@ -269,7 +276,7 @@
           int h = AOMMIN(pu_height, height - k);
           uint16_t *input_p = input + k * stride + j;
           uint16_t *output_p = output + k * out_stride + j;
-          apply_selfguided_restoration_c(
+          av1_apply_selfguided_restoration_c(
               CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
               CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
         }
@@ -362,7 +369,7 @@
           tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
                    CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
                    1);
-          apply_selfguided_restoration_c(
+          av1_apply_selfguided_restoration_c(
               CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
               CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1);
         }
@@ -387,24 +394,27 @@
 
 #if HAVE_SSE4_1
 const int highbd_params_sse4_1[] = { 8, 10, 12 };
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1HighbdSelfguidedFilterTest,
-    ::testing::Combine(::testing::Values(apply_selfguided_restoration_sse4_1),
-                       ::testing::ValuesIn(highbd_params_sse4_1)));
+    ::testing::Combine(
+        ::testing::Values(av1_apply_selfguided_restoration_sse4_1),
+        ::testing::ValuesIn(highbd_params_sse4_1)));
 #endif
 
 #if HAVE_AVX2
 const int highbd_params_avx2[] = { 8, 10, 12 };
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1HighbdSelfguidedFilterTest,
-    ::testing::Combine(::testing::Values(apply_selfguided_restoration_avx2),
+    ::testing::Combine(::testing::Values(av1_apply_selfguided_restoration_avx2),
                        ::testing::ValuesIn(highbd_params_avx2)));
 #endif
+
 #if HAVE_NEON
 const int highbd_params_neon[] = { 8, 10, 12 };
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, AV1HighbdSelfguidedFilterTest,
-    ::testing::Combine(::testing::Values(apply_selfguided_restoration_neon),
+    ::testing::Combine(::testing::Values(av1_apply_selfguided_restoration_neon),
                        ::testing::ValuesIn(highbd_params_neon)));
 #endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/simd_cmp_impl.h b/libaom/test/simd_cmp_impl.h
index 2aa02c8..d3eb336 100644
--- a/libaom/test/simd_cmp_impl.h
+++ b/libaom/test/simd_cmp_impl.h
@@ -469,11 +469,8 @@
   fptr simd;
 } mapping;
 
-#define MAP(name)                                \
-  {                                              \
-#name, reinterpret_cast < fptr > (c_##name), \
-        reinterpret_cast < fptr > (name)         \
-  }
+#define MAP(name) \
+  { #name, reinterpret_cast < fptr>(c_##name), reinterpret_cast < fptr>(name) }
 
 const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v64_ssd_u8),
@@ -1478,8 +1475,8 @@
       (CArg1(*const)(const void *))c_load1;
   CArg2 (*const my_c_load2)(const void *) =
       (CArg2(*const)(const void *))c_load2;
-  CArg2 (*const my_c_load3)(const void *) =
-      (CArg2(*const)(const void *))c_load3;
+  CArg3 (*const my_c_load3)(const void *) =
+      (CArg3(*const)(const void *))c_load3;
   CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
       (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
 
@@ -1734,8 +1731,9 @@
   }
 
   EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
-                      << Print(s, sizeof(s)) << ") -> " << Print(d, sizeof(d))
-                      << " (simd), " << Print(ref_d, sizeof(ref_d)) << " (ref)";
+                      << Print(s, sizeof(CArg)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
 }
 
 template <typename CRet, typename CArg1, typename CArg2>
@@ -1990,9 +1988,10 @@
   }
 
   EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
-                      << Print(s1, sizeof(s1)) << ", " << Print(s2, sizeof(s2))
-                      << ") -> " << Print(d, sizeof(d)) << " (simd), "
-                      << Print(ref_d, sizeof(ref_d)) << " (ref)";
+                      << Print(s1, sizeof(CArg1)) << ", "
+                      << Print(s2, sizeof(CArg2)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
 }
 
 template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
@@ -2066,10 +2065,11 @@
   }
 
   EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
-                      << Print(s1, sizeof(s1)) << ", " << Print(s2, sizeof(s2))
-                      << ", " << Print(s3, sizeof(s3)) << ") -> "
-                      << Print(d, sizeof(d)) << " (simd), "
-                      << Print(ref_d, sizeof(ref_d)) << " (ref)";
+                      << Print(s1, sizeof(CArg1)) << ", "
+                      << Print(s2, sizeof(CArg2)) << ", "
+                      << Print(s3, sizeof(CArg3)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
 }
 
 // Instantiations to make the functions callable from another files
diff --git a/libaom/test/simd_impl.h b/libaom/test/simd_impl.h
index fd06f67..61fda00 100644
--- a/libaom/test/simd_impl.h
+++ b/libaom/test/simd_impl.h
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #define SIMD_CHECK 1
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/clear_system_state.h"
@@ -23,9 +25,9 @@
  public:
   virtual ~TestIntrinsic() {}
   virtual void SetUp() {
-    mask = ::testing::get<0>(this->GetParam());
-    maskwidth = ::testing::get<1>(this->GetParam());
-    name = ::testing::get<2>(this->GetParam());
+    mask = std::get<0>(this->GetParam());
+    maskwidth = std::get<1>(this->GetParam());
+    name = std::get<2>(this->GetParam());
   }
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
@@ -36,8 +38,8 @@
 };
 
 // Create one typedef for each function signature
-#define TYPEDEF_SIMD(name)                                                    \
-  typedef TestIntrinsic< ::testing::tuple<uint32_t, uint32_t, const char *> > \
+#define TYPEDEF_SIMD(name)                                             \
+  typedef TestIntrinsic<std::tuple<uint32_t, uint32_t, const char *> > \
       ARCH_POSTFIX(name)
 
 TYPEDEF_SIMD(V64_U8);
@@ -350,16 +352,16 @@
   TestSimd1Arg<c_v64, c_v256>(kIterations, mask, maskwidth, name);
 }
 
-// Add a macro layer since INSTANTIATE_TEST_CASE_P will quote the name
+// Add a macro layer since INSTANTIATE_TEST_SUITE_P will quote the name
 // so we need to expand it first with the prefix
 #define INSTANTIATE(name, type, ...) \
-  INSTANTIATE_TEST_CASE_P(name, type, ::testing::Values(__VA_ARGS__))
+  INSTANTIATE_TEST_SUITE_P(name, type, ::testing::Values(__VA_ARGS__))
 
 #define SIMD_TUPLE(name, mask, maskwidth) \
-  ::testing::make_tuple(mask, maskwidth, static_cast<const char *>(#name))
+  std::make_tuple(mask, maskwidth, static_cast<const char *>(#name))
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64V64),
-            (SIMD_TUPLE(v64_sad_u8, 0U, 0U), SIMD_TUPLE(v64_ssd_u8, 0U, 0U)));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64V64), SIMD_TUPLE(v64_sad_u8, 0U, 0U),
+            SIMD_TUPLE(v64_ssd_u8, 0U, 0U));
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V64_V64V64), SIMD_TUPLE(v64_add_8, 0U, 0U),
diff --git a/libaom/test/subtract_test.cc b/libaom/test/subtract_test.cc
index 7dcedf5..4001e8b 100644
--- a/libaom/test/subtract_test.cc
+++ b/libaom/test/subtract_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/aom_config.h"
@@ -88,30 +90,31 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(C, AV1SubtractBlockTest,
-                        ::testing::Values(aom_subtract_block_c));
+INSTANTIATE_TEST_SUITE_P(C, AV1SubtractBlockTest,
+                         ::testing::Values(aom_subtract_block_c));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AV1SubtractBlockTest,
-                        ::testing::Values(aom_subtract_block_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1SubtractBlockTest,
+                         ::testing::Values(aom_subtract_block_sse2));
 #endif
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, AV1SubtractBlockTest,
-                        ::testing::Values(aom_subtract_block_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, AV1SubtractBlockTest,
+                         ::testing::Values(aom_subtract_block_neon));
 #endif
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, AV1SubtractBlockTest,
-                        ::testing::Values(aom_subtract_block_msa));
+INSTANTIATE_TEST_SUITE_P(MSA, AV1SubtractBlockTest,
+                         ::testing::Values(aom_subtract_block_msa));
 #endif
 
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
                                 ptrdiff_t diff_stride, const uint8_t *src_ptr,
                                 ptrdiff_t src_stride, const uint8_t *pred_ptr,
                                 ptrdiff_t pred_stride, int bd);
 
-using ::testing::get;
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::get;
+using std::make_tuple;
+using std::tuple;
 
 // <width, height, bit_dpeth, subtract>
 typedef tuple<int, int, int, HBDSubtractFunc> Params;
@@ -207,7 +210,6 @@
 TEST_P(AV1HBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
 
 #if HAVE_SSE2
-
 const Params kAV1HBDSubtractBlock_sse2[] = {
   make_tuple(4, 4, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(4, 4, 12, &aom_highbd_subtract_block_c),
@@ -243,7 +245,8 @@
   make_tuple(128, 128, 12, &aom_highbd_subtract_block_c)
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HBDSubtractBlockTest,
-                        ::testing::ValuesIn(kAV1HBDSubtractBlock_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1HBDSubtractBlockTest,
+                         ::testing::ValuesIn(kAV1HBDSubtractBlock_sse2));
 #endif  // HAVE_SSE2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/libaom/test/sum_squares_test.cc b/libaom/test/sum_squares_test.cc
index f26a646..8845466 100644
--- a/libaom/test/sum_squares_test.cc
+++ b/libaom/test/sum_squares_test.cc
@@ -12,6 +12,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <string>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -157,7 +158,7 @@
 
 #if HAVE_SSE2
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, SumSquaresTest,
     ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
                                 &aom_sum_squares_2d_i16_sse2)));
@@ -165,7 +166,7 @@
 #endif  // HAVE_SSE2
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, SumSquaresTest,
     ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
                                 &aom_sum_squares_2d_i16_avx2)));
@@ -224,9 +225,9 @@
 }
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, SumSquares1DTest,
-                        ::testing::Values(TestFuncs1D(
-                            aom_sum_squares_i16_c, aom_sum_squares_i16_sse2)));
+INSTANTIATE_TEST_SUITE_P(SSE2, SumSquares1DTest,
+                         ::testing::Values(TestFuncs1D(
+                             aom_sum_squares_i16_c, aom_sum_squares_i16_sse2)));
 
 #endif  // HAVE_SSE2
 
@@ -234,7 +235,7 @@
                             int b_stride, int width, int height);
 typedef libaom_test::FuncParam<sse_func> TestSSEFuncs;
 
-typedef ::testing::tuple<TestSSEFuncs, int> SSETestParam;
+typedef std::tuple<TestSSEFuncs, int> SSETestParam;
 
 class SSETest : public ::testing::TestWithParam<SSETestParam> {
  public:
@@ -242,7 +243,12 @@
   virtual void SetUp() {
     params_ = GET_PARAM(0);
     width_ = GET_PARAM(1);
-    isHbd_ = params_.ref_func == aom_highbd_sse_c;
+    isHbd_ =
+#if CONFIG_AV1_HIGHBITDEPTH
+        params_.ref_func == aom_highbd_sse_c;
+#else
+        0;
+#endif
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
     ref_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
@@ -379,20 +385,455 @@
     RunTest(1, width_, height, 100);
   }
 }
+
+#if HAVE_NEON
+TestSSEFuncs sse_neon[] = {
+  TestSSEFuncs(&aom_sse_c, &aom_sse_neon),
+#if CONFIG_AV1_HIGHBITDEPTH
+  TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_neon)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SSETest,
+                         Combine(ValuesIn(sse_neon), Range(4, 129, 4)));
+#endif  // HAVE_NEON
+
 #if HAVE_SSE4_1
-TestSSEFuncs sse_sse4[] = { TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
-                            TestSSEFuncs(&aom_highbd_sse_c,
-                                         &aom_highbd_sse_sse4_1) };
-INSTANTIATE_TEST_CASE_P(SSE4_1, SSETest,
-                        Combine(ValuesIn(sse_sse4), Range(4, 129, 4)));
+TestSSEFuncs sse_sse4[] = {
+  TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_sse4_1)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest,
+                         Combine(ValuesIn(sse_sse4), Range(4, 129, 4)));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
 
-TestSSEFuncs sse_avx2[] = { TestSSEFuncs(&aom_sse_c, &aom_sse_avx2),
-                            TestSSEFuncs(&aom_highbd_sse_c,
-                                         &aom_highbd_sse_avx2) };
-INSTANTIATE_TEST_CASE_P(AVX2, SSETest,
-                        Combine(ValuesIn(sse_avx2), Range(4, 129, 4)));
+TestSSEFuncs sse_avx2[] = {
+  TestSSEFuncs(&aom_sse_c, &aom_sse_avx2),
+#if CONFIG_AV1_HIGHBITDEPTH
+  TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_avx2)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SSETest,
+                         Combine(ValuesIn(sse_avx2), Range(4, 129, 4)));
 #endif  // HAVE_AVX2
+
+//////////////////////////////////////////////////////////////////////////////
+// get_blk sum squares test functions
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*sse_sum_func)(const int16_t *data, int stride, int bw, int bh,
+                             int *x_sum, int64_t *x2_sum);
+typedef libaom_test::FuncParam<sse_sum_func> TestSSE_SumFuncs;
+
+typedef std::tuple<TestSSE_SumFuncs, int> SSE_SumTestParam;
+
+class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> {
+ public:
+  virtual ~SSE_Sum_Test() {}
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    width_ = GET_PARAM(1);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<int16_t *>(aom_memalign(32, 256 * 256 * 2));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src_);
+  }
+  void RunTest(int isRandom, int width, int height, int run_times);
+
+  void GenRandomData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = rnd_(limit);
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, int16_t *data,
+                      int16_t val) {
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        data[ii * stride + jj] = val;
+      }
+    }
+  }
+
+ protected:
+  int width_;
+  TestSSE_SumFuncs params_;
+  int16_t *src_;
+  ACMRandom rnd_;
+};
+
+void SSE_Sum_Test::RunTest(int isRandom, int width, int height, int run_times) {
+  aom_usec_timer ref_timer, test_timer;
+  for (int k = 0; k < 3; k++) {
+    int stride = 4 << rnd_(7);  // Up to 256 stride
+    while (stride < width) {    // Make sure it's valid
+      stride = 4 << rnd_(7);
+    }
+    if (isRandom) {
+      GenRandomData(width, height, stride);
+    } else {
+      const int msb = 12;  // Up to 12 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, src_, limit);
+      } else {
+        GenExtremeData(width, height, stride, src_, -limit);
+      }
+    }
+    int sum_c = 0;
+    int64_t sse_intr = 0;
+    int sum_intr = 0;
+    int64_t sse_c = 0;
+
+    params_.ref_func(src_, stride, width, height, &sum_c, &sse_c);
+    params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(src_, stride, width, height, &sum_c, &sse_c);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%f\t width=%d\t height=%d \n",
+          elapsed_time_c, elapsed_time_simd,
+          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+          height);
+
+    } else {
+      EXPECT_EQ(sum_c, sum_intr)
+          << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+          << "] C output does not match optimized output.";
+      EXPECT_EQ(sse_c, sse_intr)
+          << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+          << "] C output does not match optimized output.";
+    }
+  }
+}
+
+TEST_P(SSE_Sum_Test, OperationCheck) {
+  for (int height = 4; height <= 64; height = height * 2) {
+    RunTest(1, width_, height, 1);  // GenRandomData
+  }
+}
+
+TEST_P(SSE_Sum_Test, ExtremeValues) {
+  for (int height = 4; height <= 64; height = height * 2) {
+    RunTest(0, width_, height, 1);
+  }
+}
+
+TEST_P(SSE_Sum_Test, DISABLED_Speed) {
+  for (int height = 4; height <= 64; height = height * 2) {
+    RunTest(1, width_, height, 10000);
+  }
+}
+
+#if HAVE_SSE2
+TestSSE_SumFuncs sse_sum_sse2[] = { TestSSE_SumFuncs(
+    &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_sse2) };
+INSTANTIATE_TEST_SUITE_P(SSE2, SSE_Sum_Test,
+                         Combine(ValuesIn(sse_sum_sse2), Range(4, 65, 4)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+TestSSE_SumFuncs sse_sum_avx2[] = { TestSSE_SumFuncs(
+    &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, SSE_Sum_Test,
+                         Combine(ValuesIn(sse_sum_avx2), Range(4, 65, 4)));
+#endif  // HAVE_AVX2
+
+//////////////////////////////////////////////////////////////////////////////
+// 2D Variance test functions
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*Var2DFunc)(uint8_t *src, int stride, int width, int height);
+typedef libaom_test::FuncParam<Var2DFunc> TestFuncVar2D;
+
+const uint16_t test_block_size[2] = { 128, 256 };
+
+class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
+ public:
+  virtual ~Lowbd2dVarTest() {}
+  virtual void SetUp() {
+    params_ = this->GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(16, 512 * 512 * sizeof(uint8_t)));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src_);
+  }
+  void RunTest(int isRandom);
+  void RunSpeedTest();
+
+  void GenRandomData(int width, int height, int stride) {
+    const int msb = 7;  // Up to 8 bit input
+    const int limit = 1 << (msb + 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = rnd_(limit);
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride) {
+    const int msb = 7;  // Up to 8 bit input
+    const int limit = 1 << (msb + 1);
+    const int val = rnd_(2) ? limit - 1 : 0;
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = val;
+      }
+    }
+  }
+
+ protected:
+  TestFuncVar2D params_;
+  uint8_t *src_;
+  ACMRandom rnd_;
+};
+
+void Lowbd2dVarTest::RunTest(int isRandom) {
+  int failed = 0;
+  for (int k = 0; k < kNumIterations; k++) {
+    const int width = 4 * (rnd_(63) + 1);   // Up to 256x256
+    const int height = 4 * (rnd_(63) + 1);  // Up to 256x256
+    int stride = 4 << rnd_(8);              // Up to 512 stride
+    while (stride < width) {                // Make sure it's valid
+      stride = 4 << rnd_(8);
+    }
+    if (isRandom) {
+      GenRandomData(width, height, stride);
+    } else {
+      GenExtremeData(width, height, stride);
+    }
+
+    const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
+    uint64_t res_tst;
+    ASM_REGISTER_STATE_CHECK(res_tst =
+                                 params_.tst_func(src_, stride, width, height));
+
+    if (!failed) {
+      failed = res_ref != res_tst;
+      EXPECT_EQ(res_ref, res_tst)
+          << "Error: Sum Squares Test [" << width << "x" << height
+          << "] C output does not match optimized output.";
+    }
+  }
+}
+
+void Lowbd2dVarTest::RunSpeedTest() {
+  for (int block = 0; block < 2; block++) {
+    const int width = test_block_size[block];
+    const int height = test_block_size[block];
+    int stride = 4 << rnd_(8);  // Up to 512 stride
+    while (stride < width) {    // Make sure it's valid
+      stride = 4 << rnd_(8);
+    }
+    GenExtremeData(width, height, stride);
+    const int num_loops = 1000000000 / (width + height);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      params_.ref_func(src_, stride, width, height);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+    for (int i = 0; i < num_loops; ++i)
+      params_.tst_func(src_, stride, width, height);
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+    printf("%3dx%-3d: Scaling = %.2f\n", width, height,
+           (double)elapsed_time / elapsed_time1);
+  }
+}
+
+TEST_P(Lowbd2dVarTest, OperationCheck) {
+  RunTest(1);  // GenRandomData
+}
+
+TEST_P(Lowbd2dVarTest, ExtremeValues) {
+  RunTest(0);  // GenExtremeData
+}
+
+TEST_P(Lowbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Lowbd2dVarTest,
+                         ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
+                                                         &aom_var_2d_u8_sse2)));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, Lowbd2dVarTest,
+                         ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
+                                                         &aom_var_2d_u8_avx2)));
+
+#endif  // HAVE_SSE2
+
+class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
+ public:
+  virtual ~Highbd2dVarTest() {}
+  virtual void SetUp() {
+    params_ = this->GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, 512 * 512 * sizeof(uint16_t)));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src_);
+  }
+  void RunTest(int isRandom);
+  void RunSpeedTest();
+
+  void GenRandomData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = rnd_(limit);
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    const int val = rnd_(2) ? limit - 1 : 0;
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = val;
+      }
+    }
+  }
+
+ protected:
+  TestFuncVar2D params_;
+  uint16_t *src_;
+  ACMRandom rnd_;
+};
+
+void Highbd2dVarTest::RunTest(int isRandom) {
+  int failed = 0;
+  for (int k = 0; k < kNumIterations; k++) {
+    const int width = 4 * (rnd_(63) + 1);   // Up to 256x256
+    const int height = 4 * (rnd_(63) + 1);  // Up to 256x256
+    int stride = 4 << rnd_(8);              // Up to 512 stride
+    while (stride < width) {                // Make sure it's valid
+      stride = 4 << rnd_(8);
+    }
+    if (isRandom) {
+      GenRandomData(width, height, stride);
+    } else {
+      GenExtremeData(width, height, stride);
+    }
+
+    const uint64_t res_ref =
+        params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
+    uint64_t res_tst;
+    ASM_REGISTER_STATE_CHECK(
+        res_tst =
+            params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height));
+
+    if (!failed) {
+      failed = res_ref != res_tst;
+      EXPECT_EQ(res_ref, res_tst)
+          << "Error: Sum Squares Test [" << width << "x" << height
+          << "] C output does not match optimized output.";
+    }
+  }
+}
+
+void Highbd2dVarTest::RunSpeedTest() {
+  for (int block = 0; block < 2; block++) {
+    const int width = test_block_size[block];
+    const int height = test_block_size[block];
+    int stride = 4 << rnd_(8);  // Up to 512 stride
+    while (stride < width) {    // Make sure it's valid
+      stride = 4 << rnd_(8);
+    }
+    GenExtremeData(width, height, stride);
+    const int num_loops = 1000000000 / (width + height);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+    for (int i = 0; i < num_loops; ++i)
+      params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+    printf("%3dx%-3d: Scaling = %.2f\n", width, height,
+           (double)elapsed_time / elapsed_time1);
+  }
+}
+
+TEST_P(Highbd2dVarTest, OperationCheck) {
+  RunTest(1);  // GenRandomData
+}
+
+TEST_P(Highbd2dVarTest, ExtremeValues) {
+  RunTest(0);  // GenExtremeData
+}
+
+TEST_P(Highbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, Highbd2dVarTest,
+    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_sse2)));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, Highbd2dVarTest,
+    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_avx2)));
+
+#endif  // HAVE_SSE2
 }  // namespace
diff --git a/libaom/test/superframe_test.cc b/libaom/test/superframe_test.cc
index 2cec95a..024a18b 100644
--- a/libaom/test/superframe_test.cc
+++ b/libaom/test/superframe_test.cc
@@ -10,6 +10,7 @@
  */
 
 #include <climits>
+#include <tuple>
 #include <vector>
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
@@ -23,7 +24,7 @@
 const int kTileCols = 1;
 const int kTileRows = 2;
 
-typedef ::testing::tuple<libaom_test::TestMode, int, int> SuperframeTestParam;
+typedef std::tuple<libaom_test::TestMode, int, int> SuperframeTestParam;
 
 class SuperframeTest
     : public ::libaom_test::CodecTestWithParam<SuperframeTestParam>,
@@ -35,12 +36,12 @@
   virtual void SetUp() {
     InitializeConfig();
     const SuperframeTestParam input = GET_PARAM(1);
-    const libaom_test::TestMode mode = ::testing::get<kTestMode>(input);
+    const libaom_test::TestMode mode = std::get<kTestMode>(input);
     SetMode(mode);
     sf_count_ = 0;
     sf_count_max_ = INT_MAX;
-    n_tile_cols_ = ::testing::get<kTileCols>(input);
-    n_tile_rows_ = ::testing::get<kTileRows>(input);
+    n_tile_cols_ = std::get<kTileCols>(input);
+    n_tile_rows_ = std::get<kTileRows>(input);
   }
 
   virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
diff --git a/libaom/test/svc_datarate_test.cc b/libaom/test/svc_datarate_test.cc
new file mode 100644
index 0000000..28e517b
--- /dev/null
+++ b/libaom/test/svc_datarate_test.cc
@@ -0,0 +1,609 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/datarate_test.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+#include "av1/common/enums.h"
+
+namespace datarate_test {
+namespace {
+
+class DatarateTestSVC
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 unsigned int, int>,
+      public DatarateTest {
+ public:
+  DatarateTestSVC() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual int GetNumSpatialLayers() { return number_spatial_layers_; }
+
+  virtual void ResetModel() {
+    DatarateTest::ResetModel();
+    layer_frame_cnt_ = 0;
+    superframe_cnt_ = 0;
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 1;
+    for (int i = 0; i < AOM_MAX_LAYERS; i++) {
+      target_layer_bitrate_[i] = 0;
+      effective_datarate_tl[i] = 0.0;
+    }
+    memset(&layer_id_, 0, sizeof(aom_svc_layer_id_t));
+    memset(&svc_params_, 0, sizeof(aom_svc_params_t));
+    memset(&ref_frame_config_, 0, sizeof(aom_svc_ref_frame_config_t));
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    int spatial_layer_id = 0;
+    if (video->frame() == 0) {
+      initialize_svc(number_temporal_layers_, number_spatial_layers_,
+                     &svc_params_);
+      encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+      encoder->Control(AV1E_SET_ENABLE_ORDER_HINT, 0);
+      encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+      encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+    }
+    if (number_spatial_layers_ == 2) {
+      spatial_layer_id = (layer_frame_cnt_ % 2 == 0) ? 0 : 1;
+    } else if (number_spatial_layers_ == 3) {
+      spatial_layer_id = (layer_frame_cnt_ % 3 == 0)
+                             ? 0
+                             : ((layer_frame_cnt_ - 1) % 3 == 0) ? 1 : 2;
+    }
+    // Set the reference/update flags, layer_id, and reference_map
+    // buffer index.
+    frame_flags_ = set_layer_pattern(video->frame(), &layer_id_,
+                                     &ref_frame_config_, spatial_layer_id);
+    encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
+    encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    layer_frame_cnt_++;
+    DatarateTest::PreEncodeFrameHook(video, encoder);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    // Update the layer cumulative bitrate.
+    for (int i = layer_id_.temporal_layer_id; i < number_temporal_layers_;
+         i++) {
+      int layer = layer_id_.spatial_layer_id * number_temporal_layers_ + i;
+      effective_datarate_tl[layer] += 1.0 * frame_size_in_bits;
+    }
+    if (layer_id_.spatial_layer_id == number_spatial_layers_ - 1) {
+      last_pts_ = pkt->data.frame.pts;
+      superframe_cnt_++;
+    }
+  }
+
+  virtual void EndPassHook(void) {
+    duration_ = ((last_pts_ + 1) * timebase_);
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      effective_datarate_tl[i] = (effective_datarate_tl[i] / 1000) / duration_;
+    }
+  }
+
+  // Layer pattern configuration.
+  virtual int set_layer_pattern(int frame_cnt, aom_svc_layer_id_t *layer_id,
+                                aom_svc_ref_frame_config_t *ref_frame_config,
+                                int spatial_layer) {
+    layer_id->spatial_layer_id = spatial_layer;
+    // Set the reference map buffer idx for the 7 references:
+    // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+    // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+    for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+      ref_frame_config->ref_idx[i] = i;
+      ref_frame_config->reference[i] = 0;
+    }
+    for (int i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+    // Set layer_flags to 0 when using ref_frame_config->reference.
+    int layer_flags = 0;
+    // Always reference LAST.
+    ref_frame_config->reference[0] = 1;
+    if (number_temporal_layers_ == 3 && number_spatial_layers_ == 1) {
+      // 3-layer:
+      //   1    3   5    7
+      //     2        6
+      // 0        4        8
+      if (frame_cnt % 4 == 0) {
+        // Base layer.
+        layer_id->temporal_layer_id = 0;
+        // Update LAST on layer 0, reference LAST and GF.
+        ref_frame_config->refresh[0] = 1;
+        ref_frame_config->reference[3] = 1;
+      } else if ((frame_cnt - 1) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // First top layer: no updates, only reference LAST (TL0).
+      } else if ((frame_cnt - 2) % 4 == 0) {
+        layer_id->temporal_layer_id = 1;
+        // Middle layer (TL1): update LAST2, only reference LAST (TL0).
+        ref_frame_config->refresh[1] = 1;
+      } else if ((frame_cnt - 3) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // Second top layer: no updates, only reference LAST.
+        // Set buffer idx for LAST to slot 1, since that was the slot
+        // updated in previous frame. So LAST is TL1 frame.
+        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->ref_idx[1] = 0;
+      }
+    } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 2) {
+      layer_id->temporal_layer_id = 0;
+      if (layer_id->spatial_layer_id == 0) {
+        // Reference LAST, update LAST. Keep LAST and GOLDEN in slots 0 and 3.
+        ref_frame_config->ref_idx[0] = 0;
+        ref_frame_config->ref_idx[3] = 3;
+        ref_frame_config->refresh[0] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 3
+        // and GOLDEN to slot 0. Update slot 3 (LAST).
+        ref_frame_config->ref_idx[0] = 3;
+        ref_frame_config->ref_idx[3] = 0;
+        ref_frame_config->refresh[3] = 1;
+      }
+      // Reference GOLDEN.
+      if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
+    } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 3) {
+      // 3 spatial layers, 1 temporal.
+      // Note for this case, we set the buffer idx for all references to be
+      // either LAST or GOLDEN, which are always valid references, since decoder
+      // will check if any of the 7 references is valid scale in
+      // valid_ref_frame_size().
+      layer_id->temporal_layer_id = 0;
+      if (layer_id->spatial_layer_id == 0) {
+        // Reference LAST, update LAST. Set all other buffer_idx to 0.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->refresh[0] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
+        // and GOLDEN (and all other refs) to slot 0.
+        // Update slot 1 (LAST).
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->refresh[1] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2
+        // and GOLDEN (and all other refs) to slot 1.
+        // Update slot 2 (LAST).
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
+        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->refresh[2] = 1;
+      }
+      // Reference GOLDEN.
+      if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
+    } else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 3) {
+      // 3 spatial and 3 temporal layer.
+      if (superframe_cnt_ % 4 == 0) {
+        // Base temporal layer.
+        layer_id->temporal_layer_id = 0;
+        if (layer_id->spatial_layer_id == 0) {
+          // Reference LAST, update LAST.
+          // Set all buffer_idx to 0.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->refresh[0] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+          // GOLDEN (and all other refs) to slot 0.
+          // Update slot 1 (LAST).
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[0] = 1;
+          ref_frame_config->refresh[1] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+          // GOLDEN (and all other refs) to slot 1.
+          // Update slot 2 (LAST).
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
+          ref_frame_config->ref_idx[0] = 2;
+          ref_frame_config->refresh[2] = 1;
+        }
+      } else if ((superframe_cnt_ - 1) % 4 == 0) {
+        // First top temporal enhancement layer.
+        layer_id->temporal_layer_id = 2;
+        if (layer_id->spatial_layer_id == 0) {
+          // Reference LAST (slot 0).
+          // Set GOLDEN to slot 3 and update slot 3.
+          // Set all other buffer_idx to slot 0.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->refresh[3] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+          // GOLDEN (and all other refs) to slot 3.
+          // Set LAST2 to slot 4 and Update slot 4.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
+          ref_frame_config->ref_idx[0] = 1;
+          ref_frame_config->ref_idx[1] = 4;
+          ref_frame_config->refresh[4] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+          // GOLDEN (and all other refs) to slot 4.
+          // No update.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
+          ref_frame_config->ref_idx[0] = 2;
+        }
+      } else if ((superframe_cnt_ - 2) % 4 == 0) {
+        // Middle temporal enhancement layer.
+        layer_id->temporal_layer_id = 1;
+        if (layer_id->spatial_layer_id == 0) {
+          // Reference LAST.
+          // Set all buffer_idx to 0.
+          // Set GOLDEN to slot 5 and update slot 5.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[3] = 5;
+          ref_frame_config->refresh[5] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+          // GOLDEN (and all other refs) to slot 5.
+          // Set LAST2 to slot 6 and update slot 6.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 5;
+          ref_frame_config->ref_idx[0] = 1;
+          ref_frame_config->ref_idx[2] = 6;
+          ref_frame_config->refresh[6] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+          // GOLDEN (and all other refs) to slot 6.
+          // Set LAST2 to slot 7 and update slot 7.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 6;
+          ref_frame_config->ref_idx[0] = 2;
+          ref_frame_config->ref_idx[2] = 7;
+          ref_frame_config->refresh[7] = 1;
+        }
+      } else if ((superframe_cnt_ - 3) % 4 == 0) {
+        // Second top temporal enhancement layer.
+        layer_id->temporal_layer_id = 2;
+        if (layer_id->spatial_layer_id == 0) {
+          // Set LAST to slot 5 and reference LAST.
+          // Set GOLDEN to slot 3 and update slot 3.
+          // Set all other buffer_idx to 0.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[0] = 5;
+          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->refresh[3] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+          // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[0] = 6;
+          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->ref_idx[1] = 4;
+          ref_frame_config->refresh[4] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+          // GOLDEN to slot 4. No update.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[0] = 7;
+          ref_frame_config->ref_idx[3] = 4;
+        }
+      }
+      // Reference GOLDEN.
+      if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
+    }
+    return layer_flags;
+  }
+
+  virtual void initialize_svc(int number_temporal_layers,
+                              int number_spatial_layers,
+                              aom_svc_params *svc_params) {
+    svc_params->number_spatial_layers = number_spatial_layers;
+    svc_params->number_temporal_layers = number_temporal_layers;
+    for (int i = 0; i < number_temporal_layers * number_spatial_layers; ++i) {
+      svc_params->max_quantizers[i] = 60;
+      svc_params->min_quantizers[i] = 2;
+      svc_params->layer_target_bitrate[i] = target_layer_bitrate_[i];
+    }
+    // Do at most 3 spatial or temporal layers here.
+    svc_params->framerate_factor[0] = 1;
+    if (number_temporal_layers == 2) {
+      svc_params->framerate_factor[0] = 2;
+      svc_params->framerate_factor[1] = 1;
+    } else if (number_temporal_layers == 3) {
+      svc_params->framerate_factor[0] = 4;
+      svc_params->framerate_factor[1] = 2;
+      svc_params->framerate_factor[2] = 1;
+    }
+    svc_params->scaling_factor_num[0] = 1;
+    svc_params->scaling_factor_den[0] = 1;
+    if (number_spatial_layers == 2) {
+      svc_params->scaling_factor_num[0] = 1;
+      svc_params->scaling_factor_den[0] = 2;
+      svc_params->scaling_factor_num[1] = 1;
+      svc_params->scaling_factor_den[1] = 1;
+    } else if (number_spatial_layers == 3) {
+      svc_params->scaling_factor_num[0] = 1;
+      svc_params->scaling_factor_den[0] = 4;
+      svc_params->scaling_factor_num[1] = 1;
+      svc_params->scaling_factor_den[1] = 2;
+      svc_params->scaling_factor_num[2] = 1;
+      svc_params->scaling_factor_den[2] = 1;
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC1TL2SLTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 300, 600 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 2;
+    target_layer_bitrate_[0] = 2 * cfg_.rc_target_bitrate / 4;
+    target_layer_bitrate_[1] = 2 * cfg_.rc_target_bitrate / 4;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC1TL3SLTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 500, 1000 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 3;
+    target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLHDTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLKfTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.kf_mode = AOM_KF_AUTO;
+    cfg_.kf_min_dist = cfg_.kf_max_dist = 100;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.75)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  int layer_frame_cnt_;
+  int superframe_cnt_;
+  int number_temporal_layers_;
+  int number_spatial_layers_;
+  // Allow for up to 3 spatial and 3 temporal layers (AOM_MAX_LAYERS entries).
+  int target_layer_bitrate_[AOM_MAX_LAYERS];
+  aom_svc_params_t svc_params_;
+  aom_svc_ref_frame_config_t ref_frame_config_;
+  aom_svc_layer_id_t layer_id_;
+  double effective_datarate_tl[AOM_MAX_LAYERS];
+};
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SL) {
+  BasicRateTargetingSVC3TL1SLTest();
+}
+
+// Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SL) {
+  BasicRateTargetingSVC1TL2SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SL) {
+  BasicRateTargetingSVC1TL3SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SL) {
+  BasicRateTargetingSVC3TL3SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHD) {
+  BasicRateTargetingSVC3TL3SLHDTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for auto key frame mode with short key frame period.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLKf) {
+  BasicRateTargetingSVC3TL3SLKfTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestSVC,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(7, 9),
+                          ::testing::Range<unsigned int>(0, 4),
+                          ::testing::Values(0, 1));
+
+}  // namespace
+}  // namespace datarate_test
diff --git a/libaom/test/temporal_filter_planewise_test.cc b/libaom/test/temporal_filter_planewise_test.cc
new file mode 100644
index 0000000..c3f3e9e
--- /dev/null
+++ b/libaom/test/temporal_filter_planewise_test.cc
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if !CONFIG_REALTIME_ONLY
+namespace {
+
+typedef void (*TemporalFilterPlanewiseFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_level, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
+    TemporalFilterPlanewiseFuncParam;
+
+typedef std::tuple<TemporalFilterPlanewiseFuncParam, int>
+    TemporalFilterPlanewiseWithParam;
+
+class TemporalFilterPlanewiseTest
+    : public ::testing::TestWithParam<TemporalFilterPlanewiseWithParam> {
+ public:
+  virtual ~TemporalFilterPlanewiseTest() {}
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src1_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
+    src2_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
+
+    ASSERT_TRUE(src1_ != NULL);
+    ASSERT_TRUE(src2_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src1_);
+    aom_free(src2_);
+  }
+  void RunTest(int isRandom, int width, int height, int run_times);
+
+  void GenRandomData(int width, int height, int stride, int stride2) {
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src1_[ii * stride + jj] = rnd_.Rand8();
+        src2_[ii * stride2 + jj] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, uint8_t *data,
+                      int stride2, uint8_t *data2, uint8_t val) {
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        data[ii * stride + jj] = val;
+        data2[ii * stride2 + jj] = (255 - val);
+      }
+    }
+  }
+
+ protected:
+  TemporalFilterPlanewiseFuncParam params_;
+  uint8_t *src1_;
+  uint8_t *src2_;
+  ACMRandom rnd_;
+};
+
+void TemporalFilterPlanewiseTest::RunTest(int isRandom, int width, int height,
+                                          int run_times) {
+  aom_usec_timer ref_timer, test_timer;
+  for (int k = 0; k < 3; k++) {
+    const int stride = width;
+    const int stride2 = width;
+    if (isRandom) {
+      GenRandomData(width, height, stride, stride2);
+    } else {
+      const int msb = 8;  // Up to 8 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, src1_, stride2, src2_, limit);
+      } else {
+        GenExtremeData(width, height, stride, src1_, stride2, src2_, 0);
+      }
+    }
+    double sigma[1] = { 2.1002103677063437 };
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+    assert(width == 32 && height == 32);
+    const BLOCK_SIZE block_size = BLOCK_32X32;
+    const int use_subblock = 0;
+    const int block_mse = 20;
+    const int subblock_mses[4] = { 15, 16, 17, 18 };
+    const int q_factor = 12;
+    const int mb_row = 0;
+    const int mb_col = 0;
+    const int num_planes = 1;
+    YV12_BUFFER_CONFIG *ref_frame =
+        (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
+    ref_frame->heights[0] = height;
+    ref_frame->strides[0] = stride;
+    DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
+    ref_frame->buffer_alloc = src;
+    ref_frame->buffers[0] = ref_frame->buffer_alloc;
+    ref_frame->flags = 0;  // Only support low bit-depth test.
+    memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
+
+    MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
+    mbd->plane[0].subsampling_y = 0;
+    mbd->plane[0].subsampling_x = 0;
+    mbd->bd = 8;
+
+    params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
+                     src2_, accumulator_ref, count_ref);
+    params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
+                     src2_, accumulator_mod, count_mod);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                         sigma, use_subblock, block_mse, subblock_mses,
+                         q_factor, src2_, accumulator_ref, count_ref);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                         sigma, use_subblock, block_mse, subblock_mses,
+                         q_factor, src2_, accumulator_mod, count_mod);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%f\t width=%d\t height=%d \n",
+          elapsed_time_c, elapsed_time_simd,
+          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+          height);
+
+    } else {
+      for (int i = 0, l = 0; i < height; i++) {
+        for (int j = 0; j < width; j++, l++) {
+          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] C accumulator does not match optimized accumulator.";
+          EXPECT_EQ(count_ref[l], count_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] C count does not match optimized count.";
+        }
+      }
+    }
+
+    free(ref_frame);
+    free(mbd);
+  }
+}
+
+TEST_P(TemporalFilterPlanewiseTest, OperationCheck) {
+  for (int height = 32; height <= 32; height = height * 2) {
+    RunTest(1, height, height, 1);  // GenRandomData
+  }
+}
+
+TEST_P(TemporalFilterPlanewiseTest, ExtremeValues) {
+  for (int height = 32; height <= 32; height = height * 2) {
+    RunTest(0, height, height, 1);
+  }
+}
+
+TEST_P(TemporalFilterPlanewiseTest, DISABLED_Speed) {
+  for (int height = 32; height <= 32; height = height * 2) {
+    RunTest(1, height, height, 100000);
+  }
+}
+
+#if HAVE_AVX2
+TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_avx2[] = {
+  TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
+                                   &av1_apply_temporal_filter_planewise_avx2)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterPlanewiseTest,
+                         Combine(ValuesIn(temporal_filter_planewise_test_avx2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_AVX2
+
+#if HAVE_SSE2
+TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_sse2[] = {
+  TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
+                                   &av1_apply_temporal_filter_planewise_sse2)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterPlanewiseTest,
+                         Combine(ValuesIn(temporal_filter_planewise_test_sse2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_SSE2
+
+}  // namespace
+#endif
diff --git a/libaom/test/yuv_temporal_filter_test.cc b/libaom/test/temporal_filter_yuv_test.cc
similarity index 77%
rename from libaom/test/yuv_temporal_filter_test.cc
rename to libaom/test/temporal_filter_yuv_test.cc
index fcaf0df..dc17aaa 100644
--- a/libaom/test/yuv_temporal_filter_test.cc
+++ b/libaom/test/temporal_filter_yuv_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <ostream>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/av1_rtcd.h"
@@ -24,20 +26,17 @@
 const int MAX_WIDTH = 32;
 const int MAX_HEIGHT = 32;
 
-typedef void (*YUVTemporalFilterFunc)(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
-    uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
-    uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+typedef void (*TemporalFilterYUVFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *blk_fw, const uint8_t *pred, uint32_t *accum, uint16_t *count);
 
 struct TemporalFilterWithBd {
-  TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth)
+  TemporalFilterWithBd(TemporalFilterYUVFunc func, int bitdepth)
       : temporal_filter(func), bd(bitdepth) {}
 
-  YUVTemporalFilterFunc temporal_filter;
+  TemporalFilterYUVFunc temporal_filter;
   int bd;
 };
 
@@ -74,9 +73,9 @@
 template <>
 int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
                          int filter_weight) {
-  unsigned int index_mult[14] = {
-    0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
-  };
+  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
+                                  39322, 32768, 28087, 24576, 21846,
+                                  19661, 17874, 0,     15124 };
 
   assert(index >= 0 && index <= 13);
   assert(index_mult[index] != 0);
@@ -354,7 +353,7 @@
   }
 }
 
-class YUVTemporalFilterTest
+class TemporalFilterYUVTest
     : public ::testing::TestWithParam<TemporalFilterWithBd> {
  public:
   virtual void SetUp() {
@@ -390,7 +389,7 @@
                        uint32_t *u_accumu, uint16_t *u_count, uint32_t *v_accum,
                        uint16_t *v_count);
 
-  YUVTemporalFilterFunc filter_func_;
+  TemporalFilterYUVFunc filter_func_;
   ACMRandom rnd_;
   int saturate_test_;
   int num_repeats_;
@@ -399,7 +398,7 @@
 };
 
 template <>
-void YUVTemporalFilterTest::ApplyTestFilter<uint8_t>(
+void TemporalFilterYUVTest::ApplyTestFilter<uint8_t>(
     const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
     int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
     int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
@@ -407,15 +406,73 @@
     int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
     uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
     uint32_t *v_accum, uint16_t *v_count) {
+  (void)block_width;
+  (void)block_height;
+  (void)y_src_stride;
+  (void)uv_src_stride;
+
+  assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
+  assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
+  const BLOCK_SIZE block_size = BLOCK_32X32;
+  const int num_planes = 3;
+  const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
+  const int mb_row = 0;
+  const int mb_col = 0;
+  const int use_subblock = !(use_32x32);
+
+  YV12_BUFFER_CONFIG *ref_frame =
+      (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
+  ref_frame->strides[0] = y_pre_stride;
+  ref_frame->strides[1] = uv_pre_stride;
+  const int alloc_size = MAX_MB_PLANE * mb_pels;
+  DECLARE_ALIGNED(16, uint8_t, src[alloc_size]);
+  ref_frame->buffer_alloc = src;
+  ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels;
+  ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels;
+  ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels;
+  ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0;
+
+  MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
+  mbd->plane[0].subsampling_y = 0;
+  mbd->plane[0].subsampling_x = 0;
+  mbd->plane[1].subsampling_y = ss_y;
+  mbd->plane[1].subsampling_x = ss_x;
+  mbd->plane[2].subsampling_y = ss_y;
+  mbd->plane[2].subsampling_x = ss_x;
+
+  DECLARE_ALIGNED(16, uint8_t, pred[alloc_size]);
+  DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]);
+  DECLARE_ALIGNED(16, uint16_t, count[alloc_size]);
+  memcpy(src + 0 * mb_pels, y_src, mb_pels * sizeof(uint8_t));
+  memcpy(src + 1 * mb_pels, u_src, mb_pels * sizeof(uint8_t));
+  memcpy(src + 2 * mb_pels, v_src, mb_pels * sizeof(uint8_t));
+  memcpy(pred + 0 * mb_pels, y_pre, mb_pels * sizeof(uint8_t));
+  memcpy(pred + 1 * mb_pels, u_pre, mb_pels * sizeof(uint8_t));
+  memcpy(pred + 2 * mb_pels, v_pre, mb_pels * sizeof(uint8_t));
+  memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t));
+  memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t));
+  memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t));
+  memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t));
+  memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
+  memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
+
   ASM_REGISTER_STATE_CHECK(
-      filter_func_(y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src,
-                   uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width,
-                   block_height, ss_x, ss_y, strength, blk_fw, use_32x32,
-                   y_accum, y_count, u_accum, u_count, v_accum, v_count));
+      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                   strength, use_subblock, blk_fw, pred, accum, count));
+
+  memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
+  memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
+  memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t));
+  memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t));
+  memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t));
+  memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t));
+
+  free(ref_frame);
+  free(mbd);
 }
 
 template <>
-void YUVTemporalFilterTest::ApplyTestFilter<uint16_t>(
+void TemporalFilterYUVTest::ApplyTestFilter<uint16_t>(
     const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
     int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
     int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
@@ -423,16 +480,74 @@
     int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
     uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
     uint32_t *v_accum, uint16_t *v_count) {
-  ASM_REGISTER_STATE_CHECK(filter_func_(
-      CONVERT_TO_BYTEPTR(y_src), y_src_stride, CONVERT_TO_BYTEPTR(y_pre),
-      y_pre_stride, CONVERT_TO_BYTEPTR(u_src), CONVERT_TO_BYTEPTR(v_src),
-      uv_src_stride, CONVERT_TO_BYTEPTR(u_pre), CONVERT_TO_BYTEPTR(v_pre),
-      uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw,
-      use_32x32, y_accum, y_count, u_accum, u_count, v_accum, v_count));
+  (void)block_width;
+  (void)block_height;
+  (void)y_src_stride;
+  (void)uv_src_stride;
+
+  assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
+  assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
+  const BLOCK_SIZE block_size = BLOCK_32X32;
+  const int num_planes = 3;
+  const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
+  const int mb_row = 0;
+  const int mb_col = 0;
+  const int use_subblock = !(use_32x32);
+
+  YV12_BUFFER_CONFIG *ref_frame =
+      (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
+  ref_frame->strides[0] = y_pre_stride;
+  ref_frame->strides[1] = uv_pre_stride;
+  const int alloc_size = MAX_MB_PLANE * mb_pels;
+  DECLARE_ALIGNED(16, uint16_t, src16[alloc_size]);
+  ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src16);
+  ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels;
+  ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels;
+  ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels;
+  ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0;
+
+  MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
+  mbd->plane[0].subsampling_y = 0;
+  mbd->plane[0].subsampling_x = 0;
+  mbd->plane[1].subsampling_y = ss_y;
+  mbd->plane[1].subsampling_x = ss_x;
+  mbd->plane[2].subsampling_y = ss_y;
+  mbd->plane[2].subsampling_x = ss_x;
+
+  DECLARE_ALIGNED(16, uint16_t, pred16[alloc_size]);
+  DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]);
+  DECLARE_ALIGNED(16, uint16_t, count[alloc_size]);
+  memcpy(src16 + 0 * mb_pels, y_src, mb_pels * sizeof(uint16_t));
+  memcpy(src16 + 1 * mb_pels, u_src, mb_pels * sizeof(uint16_t));
+  memcpy(src16 + 2 * mb_pels, v_src, mb_pels * sizeof(uint16_t));
+  memcpy(pred16 + 0 * mb_pels, y_pre, mb_pels * sizeof(uint16_t));
+  memcpy(pred16 + 1 * mb_pels, u_pre, mb_pels * sizeof(uint16_t));
+  memcpy(pred16 + 2 * mb_pels, v_pre, mb_pels * sizeof(uint16_t));
+  memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t));
+  memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t));
+  memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t));
+  memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t));
+  memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
+  memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
+  const uint8_t *pred = CONVERT_TO_BYTEPTR(pred16);
+
+  ASM_REGISTER_STATE_CHECK(
+      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                   strength, use_subblock, blk_fw, pred, accum, count));
+
+  memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
+  memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
+  memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t));
+  memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t));
+  memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t));
+  memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t));
+
+  free(ref_frame);
+  free(mbd);
 }
 
 template <typename PixelType>
-void YUVTemporalFilterTest::CompareTestWithParam(int width, int height,
+void TemporalFilterYUVTest::CompareTestWithParam(int width, int height,
                                                  int ss_x, int ss_y,
                                                  int filter_strength,
                                                  int use_32x32,
@@ -533,7 +648,7 @@
 }
 
 template <typename PixelType>
-void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height,
+void TemporalFilterYUVTest::RunTestFilterWithParam(int width, int height,
                                                    int ss_x, int ss_y,
                                                    int filter_strength,
                                                    int use_32x32,
@@ -568,7 +683,7 @@
   }
 }
 
-TEST_P(YUVTemporalFilterTest, Use32x32) {
+TEST_P(TemporalFilterYUVTest, Use32x32) {
   const int width = 32, height = 32;
   const int use_32x32 = 1;
 
@@ -594,7 +709,7 @@
   }
 }
 
-TEST_P(YUVTemporalFilterTest, Use16x16) {
+TEST_P(TemporalFilterYUVTest, Use16x16) {
   const int width = 32, height = 32;
   const int use_32x32 = 0;
 
@@ -630,7 +745,7 @@
   }
 }
 
-TEST_P(YUVTemporalFilterTest, SaturationTest) {
+TEST_P(TemporalFilterYUVTest, SaturationTest) {
   const int width = 32, height = 32;
   const int use_32x32 = 1;
   const int filter_weight = 1;
@@ -657,7 +772,7 @@
   }
 }
 
-TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
+TEST_P(TemporalFilterYUVTest, DISABLED_Speed) {
   const int width = 32, height = 32;
   num_repeats_ = 1000;
 
@@ -707,20 +822,20 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(
-    C, YUVTemporalFilterTest,
+INSTANTIATE_TEST_SUITE_P(
+    C, TemporalFilterYUVTest,
     ::testing::Values(
-        TemporalFilterWithBd(&av1_apply_temporal_filter_c, 8),
-        TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_c, 10),
-        TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_c, 12)));
+        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 8),
+        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 10),
+        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 12)));
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, YUVTemporalFilterTest,
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, TemporalFilterYUVTest,
     ::testing::Values(
-        TemporalFilterWithBd(&av1_apply_temporal_filter_sse4_1, 8),
-        TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_sse4_1, 10),
-        TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_sse4_1, 12)));
+        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 8),
+        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 10),
+        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 12)));
 #endif  // HAVE_SSE4_1
 
 }  // namespace
diff --git a/libaom/test/test-data.sha1 b/libaom/test/test-data.sha1
index 91487e4..383ae79 100644
--- a/libaom/test/test-data.sha1
+++ b/libaom/test/test-data.sha1
@@ -16,24 +16,33 @@
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10227.ivf.res
 b2d0a29a65879436bf483d04865faca7d11cc2ee *invalid-oss-fuzz-10389.ivf
 9655e6275888547ecd1f14e20e08ce4891372e76 *invalid-oss-fuzz-10389.ivf.res
+e5fe0e8984c42d53d4ff734c3fbfd57d5c5c25cf *invalid-oss-fuzz-10389.ivf.res.2
 11df8e9a068669c678097d460b63609d3da73828 *invalid-oss-fuzz-10555.ivf
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10555.ivf.res
 cf5945085fe85456a1f74bf4cc7998b88b3f4b62 *invalid-oss-fuzz-10705.ivf
 758671858368ffd2a2c0727898de5661f7cf7d68 *invalid-oss-fuzz-10705.ivf.res
 88e29851122cca3f336824f7fa4d9f757f91110c *invalid-oss-fuzz-10723.ivf
 1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-10723.ivf.res
+64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-10723.ivf.res.2
 0784acc8931090ec24eba752d6c27e359e68fe7d *invalid-oss-fuzz-10779.ivf
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-10779.ivf.res
 7d37be9357f89a100ced694aee1ca5a6fad35ba9 *invalid-oss-fuzz-11477.ivf
 15932651aacfc4622f0910f728f3f95e08e1753d *invalid-oss-fuzz-11477.ivf.res
 1674787c38ddf82a2e5c804203f04f56a304e8e0 *invalid-oss-fuzz-11479.ivf
 1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-11479.ivf.res
+64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-11479.ivf.res.2
 b1a45514f0c59be03c9991cd04882426b9b930fa *invalid-oss-fuzz-11523.ivf
 7c44ac1723c14d98bcb888fbf118c959511519ba *invalid-oss-fuzz-11523.ivf.res
+3198c7af55a7d50173ce3c369c0cf2d9cdfface6 *invalid-oss-fuzz-11523.ivf.res.2
+cb445173be760c3554f1740ce4d119f57a7be043 *invalid-oss-fuzz-15363.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-15363.ivf.res
+5b697360bf0f02de31bae9b8da78e93570958fa4 *invalid-oss-fuzz-16437.ivf
+09d2af8dd22201dd8d48e5dcfcaed281ff9422c7 *invalid-oss-fuzz-16437.ivf.res
 ccbe4081557eb44820a0e6337c4a094421826b9a *invalid-oss-fuzz-9288.ivf
 67c54283fe1a26ccf02cc991e4f9a1eea3ac5e78 *invalid-oss-fuzz-9288.ivf.res
 c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9463.ivf.res
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-9463.ivf.res.2
 f448caf378e250b7eea4fa2d1c3cd7ef4a3211ce *invalid-oss-fuzz-9482.ivf
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-9482.ivf.res
 a686989de79af89136f631fd630df639c7861851 *invalid-oss-fuzz-9720.ivf
@@ -65,6 +74,7 @@
 9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
 5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
 36ddab9b99eb7545aa0bf362d6f498212d596516 *vase10x10.yuv
+c542890ac929749000f7b3883174f2202070d834 *pixel_capture_w320h240.yuv
 c2e1ec9936b95254187a359e94aa32a9f3dad1b7 *av1-1-b8-00-quantizer-00.ivf
 26cd2a0321d01d9db5f6dace8b43a40cd5b9d58d *av1-1-b8-00-quantizer-00.ivf.md5
 a56dd02c0258d4afea1ee358a22b54e99e39d5e1 *av1-1-b8-00-quantizer-01.ivf
@@ -542,3 +552,8 @@
 bff316e63ded5559116bdc2fa4aa97ad7b1a1761 *av1-1-b8-05-mv.ivf.md5
 b48a717c7c003b8dd23c3c2caed1ac673380fdb3 *av1-1-b8-06-mfmv.ivf
 1424e3cb53e00eb56b94f4c725826274212c42b6 *av1-1-b8-06-mfmv.ivf.md5
+f8724ed96272ddbc35776908f2df7cb9955766a9 *paris_352_288_30.y4m
+11bb40026103182c23a88133edafca369e5575e2 *av1-1-b8-23-film_grain-50.ivf
+c58ccf7ff04711acc559c06f0bfce3c5b14800c3 *av1-1-b8-23-film_grain-50.ivf.md5
+2f883c7e11c21a31f79bd9c809541be90b0c7c4a *av1-1-b10-23-film_grain-50.ivf
+83f2094fca597ad38b4fd623b807de1774c53ffb *av1-1-b10-23-film_grain-50.ivf.md5
diff --git a/libaom/test/test.cmake b/libaom/test/test.cmake
index a44737a..d4d3b29 100644
--- a/libaom/test/test.cmake
+++ b/libaom/test/test.cmake
@@ -35,6 +35,7 @@
             "${AOM_ROOT}/test/function_equivalence_test.h"
             "${AOM_ROOT}/test/log2_test.cc"
             "${AOM_ROOT}/test/md5_helper.h"
+            "${AOM_ROOT}/test/metadata_test.cc"
             "${AOM_ROOT}/test/register_state_check.h"
             "${AOM_ROOT}/test/test_vectors.cc"
             "${AOM_ROOT}/test/test_vectors.h"
@@ -60,12 +61,14 @@
             "${AOM_ROOT}/test/borders_test.cc"
             "${AOM_ROOT}/test/cpu_speed_test.cc"
             "${AOM_ROOT}/test/datarate_test.cc"
+            "${AOM_ROOT}/test/datarate_test.h"
+            "${AOM_ROOT}/test/svc_datarate_test.cc"
             "${AOM_ROOT}/test/encode_api_test.cc"
             "${AOM_ROOT}/test/encode_test_driver.cc"
             "${AOM_ROOT}/test/encode_test_driver.h"
             "${AOM_ROOT}/test/end_to_end_test.cc"
             "${AOM_ROOT}/test/fwd_kf_test.cc"
-            "${AOM_ROOT}/test/gf_max_pyr_height_test.cc"
+            "${AOM_ROOT}/test/gf_pyr_height_test.cc"
             "${AOM_ROOT}/test/rt_end_to_end_test.cc"
             "${AOM_ROOT}/test/error_resilience_test.cc"
             "${AOM_ROOT}/test/frame_size_tests.cc"
@@ -79,7 +82,8 @@
             "${AOM_ROOT}/test/scalability_test.cc"
             "${AOM_ROOT}/test/y4m_test.cc"
             "${AOM_ROOT}/test/y4m_video_source.h"
-            "${AOM_ROOT}/test/yuv_video_source.h")
+            "${AOM_ROOT}/test/yuv_video_source.h"
+            "${AOM_ROOT}/test/time_stamp_test.cc")
 
 list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
 list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc")
@@ -89,6 +93,7 @@
 
 if(NOT BUILD_SHARED_LIBS)
   list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+              "${AOM_ROOT}/test/av1_common_int_test.cc"
               "${AOM_ROOT}/test/cdef_test.cc"
               "${AOM_ROOT}/test/cfl_test.cc"
               "${AOM_ROOT}/test/convolve_test.cc"
@@ -98,7 +103,6 @@
               "${AOM_ROOT}/test/intrabc_test.cc"
               "${AOM_ROOT}/test/intrapred_test.cc"
               "${AOM_ROOT}/test/lpf_test.cc"
-              "${AOM_ROOT}/test/onyxc_int_test.cc"
               "${AOM_ROOT}/test/scan_test.cc"
               "${AOM_ROOT}/test/selfguided_filter_test.cc"
               "${AOM_ROOT}/test/simd_cmp_impl.h"
@@ -115,6 +119,7 @@
                 "${AOM_ROOT}/test/av1_ext_tile_test.cc"
                 "${AOM_ROOT}/test/binary_codes_test.cc"
                 "${AOM_ROOT}/test/boolcoder_test.cc"
+                "${AOM_ROOT}/test/cnn_test.cc"
                 "${AOM_ROOT}/test/coding_path_sync.cc"
                 "${AOM_ROOT}/test/decode_multithreaded_test.cc"
                 "${AOM_ROOT}/test/divu_small_test.cc"
@@ -122,10 +127,21 @@
                 "${AOM_ROOT}/test/ec_test.cc"
                 "${AOM_ROOT}/test/ethread_test.cc"
                 "${AOM_ROOT}/test/film_grain_table_test.cc"
+                "${AOM_ROOT}/test/sb_multipass_test.cc"
                 "${AOM_ROOT}/test/segment_binarization_sync.cc"
                 "${AOM_ROOT}/test/superframe_test.cc"
                 "${AOM_ROOT}/test/tile_independence_test.cc"
-                "${AOM_ROOT}/test/yuv_temporal_filter_test.cc")
+                "${AOM_ROOT}/test/temporal_filter_planewise_test.cc"
+                "${AOM_ROOT}/test/temporal_filter_yuv_test.cc")
+    if(CONFIG_REALTIME_ONLY)
+      list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
+                       "${AOM_ROOT}/test/cnn_test.cc"
+                       "${AOM_ROOT}/test/temporal_filter_yuv_test.cc")
+    endif()
+    if(NOT CONFIG_AV1_HIGHBITDEPTH)
+      list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
+                       "${AOM_ROOT}/test/coding_path_sync.cc")
+    endif()
   endif()
 
   list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
@@ -180,6 +196,7 @@
               "${AOM_ROOT}/test/av1_txfm_test.cc"
               "${AOM_ROOT}/test/av1_txfm_test.h"
               "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
+              "${AOM_ROOT}/test/avg_test.cc"
               "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
               "${AOM_ROOT}/test/blend_a64_mask_test.cc"
               "${AOM_ROOT}/test/comp_avg_pred_test.cc"
@@ -190,6 +207,8 @@
               "${AOM_ROOT}/test/error_block_test.cc"
               "${AOM_ROOT}/test/fft_test.cc"
               "${AOM_ROOT}/test/fwht4x4_test.cc"
+              "${AOM_ROOT}/test/fdct4x4_test.cc"
+              "${AOM_ROOT}/test/hadamard_test.cc"
               "${AOM_ROOT}/test/horver_correlation_test.cc"
               "${AOM_ROOT}/test/masked_sad_test.cc"
               "${AOM_ROOT}/test/masked_variance_test.cc"
@@ -198,12 +217,14 @@
               "${AOM_ROOT}/test/obmc_sad_test.cc"
               "${AOM_ROOT}/test/obmc_variance_test.cc"
               "${AOM_ROOT}/test/pickrst_test.cc"
+              "${AOM_ROOT}/test/quantize_func_test.cc"
               "${AOM_ROOT}/test/sad_test.cc"
               "${AOM_ROOT}/test/subtract_test.cc"
               "${AOM_ROOT}/test/reconinter_test.cc"
               "${AOM_ROOT}/test/sum_squares_test.cc"
               "${AOM_ROOT}/test/variance_test.cc"
               "${AOM_ROOT}/test/wiener_test.cc"
+              "${AOM_ROOT}/test/frame_error_test.cc"
               "${AOM_ROOT}/test/warp_filter_test.cc"
               "${AOM_ROOT}/test/warp_filter_test_util.cc"
               "${AOM_ROOT}/test/warp_filter_test_util.h")
@@ -212,9 +233,18 @@
               "${AOM_ROOT}/test/av1_highbd_iht_test.cc"
               "${AOM_ROOT}/test/av1_quantize_test.cc"
               "${AOM_ROOT}/test/corner_match_test.cc"
-              "${AOM_ROOT}/test/quantize_func_test.cc"
               "${AOM_ROOT}/test/simd_cmp_sse4.cc")
 
+  if(NOT CONFIG_AV1_HIGHBITDEPTH)
+    list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
+                     "${AOM_ROOT}/test/av1_quantize_test.cc")
+  endif()
+
+  if(NOT (HAVE_SSE2 OR HAVE_NEON))
+    list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+                     "${AOM_ROOT}/test/quantize_func_test.cc")
+  endif()
+
   if(HAVE_SSE4_1)
     list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
                 "${AOM_ROOT}/test/av1_convolve_scale_test.cc"
@@ -232,10 +262,10 @@
 if(ENABLE_TESTS)
   find_package(PythonInterp)
   if(NOT PYTHONINTERP_FOUND)
-    message(FATAL_ERROR
-              "--- Unit tests require Python, rerun cmake with "
-              "-DENABLE_TESTS=0 to avoid this error, or install Python and "
-              "make sure it's in your PATH.")
+    message(
+      FATAL_ERROR "--- Unit tests require Python, rerun cmake with "
+                  "-DENABLE_TESTS=0 to avoid this error, or install Python and "
+                  "make sure it's in your PATH.")
   endif()
 
   if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning.
@@ -247,8 +277,8 @@
 
   include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
   add_library(
-    aom_gtest
-    STATIC "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+    aom_gtest STATIC
+    "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
   if(MSVC OR WIN32)
     target_compile_definitions(aom_gtest PRIVATE GTEST_OS_WINDOWS=1)
   elseif(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT)
@@ -281,8 +311,8 @@
   endif()
 
   add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:test_aom_common>)
+                             $<TARGET_OBJECTS:aom_common_app_util>
+                             $<TARGET_OBJECTS:test_aom_common>)
   list(APPEND AOM_APP_TARGETS test_libaom)
 
   if(CONFIG_AV1_DECODER)
@@ -303,7 +333,8 @@
     endif()
 
     if(NOT BUILD_SHARED_LIBS)
-      add_executable(test_intra_pred_speed ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
+      add_executable(test_intra_pred_speed
+                     ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
                      $<TARGET_OBJECTS:aom_common_app_util>)
       target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
                             aom_gtest)
@@ -356,13 +387,13 @@
     foreach(test_index RANGE ${max_file_index})
       list(GET test_files ${test_index} test_file)
       list(GET test_file_checksums ${test_index} test_file_checksum)
-      add_custom_target(testdata_${test_index}
-                        COMMAND
-                          ${CMAKE_COMMAND} -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
-                          -DAOM_ROOT="${AOM_ROOT}"
-                          -DAOM_TEST_FILE="${test_file}"
-                          -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
-                          "${AOM_ROOT}/test/test_data_download_worker.cmake")
+      add_custom_target(
+        testdata_${test_index}
+        COMMAND ${CMAKE_COMMAND}
+                -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}" -DAOM_ROOT="${AOM_ROOT}"
+                -DAOM_TEST_FILE="${test_file}"
+                -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
+                "${AOM_ROOT}/test/test_data_download_worker.cmake")
       list(APPEND testdata_targets testdata_${test_index})
     endforeach()
 
@@ -406,7 +437,7 @@
   foreach(var ${all_cmake_vars})
 
     # https://github.com/cheshirekow/cmake_format/issues/34
-# cmake-format: off
+    # cmake-format: off
     if (("${var}" MATCHES "_TEST_" AND NOT
          "${var}" MATCHES
          "_DATA_\|_CMAKE_\|INTRA_PRED\|_COMPILED\|_HOSTING\|_PERF_\|CODER_")
@@ -424,7 +455,7 @@
   # Libaom_test_srcs.txt generation.
   set(libaom_test_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_test_srcs.txt")
   file(WRITE "${libaom_test_srcs_txt_file}"
-             "# This file is generated. DO NOT EDIT.\n")
+       "# This file is generated. DO NOT EDIT.\n")
 
   # Static source file list first.
   foreach(aom_test_source_var ${aom_test_source_vars})
diff --git a/libaom/test/test_data_download_worker.cmake b/libaom/test/test_data_download_worker.cmake
index dc80349..a490388 100644
--- a/libaom/test/test_data_download_worker.cmake
+++ b/libaom/test/test_data_download_worker.cmake
@@ -20,15 +20,15 @@
 endif ()
 # cmake-format: on
 
-set(AOM_TEST_DATA_URL "http://storage.googleapis.com/aom-test-data")
+set(AOM_TEST_DATA_URL "https://storage.googleapis.com/aom-test-data")
 
 if(NOT AOM_TEST_DATA_PATH)
   set(AOM_TEST_DATA_PATH "$ENV{LIBAOM_TEST_DATA_PATH}")
 endif()
 
 if("${AOM_TEST_DATA_PATH}" STREQUAL "")
-  message(WARNING
-            "Writing test data to ${AOM_CONFIG_DIR}, set "
+  message(
+    WARNING "Writing test data to ${AOM_CONFIG_DIR}, set "
             "$LIBAOM_TEST_DATA_PATH in your environment to avoid this warning.")
   set(AOM_TEST_DATA_PATH "${AOM_CONFIG_DIR}")
 endif()
diff --git a/libaom/test/test_data_util.cmake b/libaom/test/test_data_util.cmake
index 69703d6..050600e 100644
--- a/libaom/test/test_data_util.cmake
+++ b/libaom/test/test_data_util.cmake
@@ -12,6 +12,7 @@
 list(APPEND AOM_TEST_DATA_FILE_NAMES
             "hantro_collage_w352h288.yuv"
             "hantro_odd.yuv"
+            "paris_352_288_30.y4m"
             "park_joy_90p_10_420.y4m"
             "park_joy_90p_10_422.y4m"
             "park_joy_90p_10_444.y4m"
@@ -24,6 +25,7 @@
             "park_joy_90p_8_420_vertical_csp.y4m"
             "park_joy_90p_8_422.y4m"
             "park_joy_90p_8_444.y4m"
+            "pixel_capture_w320h240.yuv"
             "desktop_credits.y4m"
             "niklas_1280_720_30.y4m"
             "rush_hour_444.y4m"
@@ -294,6 +296,8 @@
               "av1-1-b10-00-quantizer-62.ivf.md5"
               "av1-1-b10-00-quantizer-63.ivf"
               "av1-1-b10-00-quantizer-63.ivf.md5"
+              "av1-1-b10-23-film_grain-50.ivf"
+              "av1-1-b10-23-film_grain-50.ivf.md5"
               "av1-1-b8-01-size-16x16.ivf"
               "av1-1-b8-01-size-16x16.ivf.md5"
               "av1-1-b8-01-size-16x18.ivf"
@@ -512,6 +516,8 @@
               "av1-1-b8-22-svc-L1T2.ivf.md5"
               "av1-1-b8-22-svc-L2T2.ivf"
               "av1-1-b8-22-svc-L2T2.ivf.md5"
+              "av1-1-b8-23-film_grain-50.ivf"
+              "av1-1-b8-23-film_grain-50.ivf.md5"
               "invalid-bug-1814.ivf"
               "invalid-bug-1814.ivf.res"
               "invalid-chromium-906381.ivf"
@@ -528,24 +534,33 @@
               "invalid-oss-fuzz-10227.ivf.res"
               "invalid-oss-fuzz-10389.ivf"
               "invalid-oss-fuzz-10389.ivf.res"
+              "invalid-oss-fuzz-10389.ivf.res.2"
               "invalid-oss-fuzz-10555.ivf"
               "invalid-oss-fuzz-10555.ivf.res"
               "invalid-oss-fuzz-10705.ivf"
               "invalid-oss-fuzz-10705.ivf.res"
               "invalid-oss-fuzz-10723.ivf"
               "invalid-oss-fuzz-10723.ivf.res"
+              "invalid-oss-fuzz-10723.ivf.res.2"
               "invalid-oss-fuzz-10779.ivf"
               "invalid-oss-fuzz-10779.ivf.res"
               "invalid-oss-fuzz-11477.ivf"
               "invalid-oss-fuzz-11477.ivf.res"
               "invalid-oss-fuzz-11479.ivf"
               "invalid-oss-fuzz-11479.ivf.res"
+              "invalid-oss-fuzz-11479.ivf.res.2"
               "invalid-oss-fuzz-11523.ivf"
               "invalid-oss-fuzz-11523.ivf.res"
+              "invalid-oss-fuzz-11523.ivf.res.2"
+              "invalid-oss-fuzz-15363.ivf"
+              "invalid-oss-fuzz-15363.ivf.res"
+              "invalid-oss-fuzz-16437.ivf"
+              "invalid-oss-fuzz-16437.ivf.res"
               "invalid-oss-fuzz-9288.ivf"
               "invalid-oss-fuzz-9288.ivf.res"
               "invalid-oss-fuzz-9463.ivf"
               "invalid-oss-fuzz-9463.ivf.res"
+              "invalid-oss-fuzz-9463.ivf.res.2"
               "invalid-oss-fuzz-9482.ivf"
               "invalid-oss-fuzz-9482.ivf.res"
               "invalid-oss-fuzz-9720.ivf"
@@ -629,7 +644,7 @@
 # writes it to $local_path.
 function(download_test_file file_url file_checksum local_path)
   message("Downloading ${file_url} ...")
-  file(DOWNLOAD "${file_url}" "${local_path}" SHOW_PROGRESS
-       EXPECTED_HASH SHA1=${file_checksum})
+  file(DOWNLOAD "${file_url}" "${local_path}" SHOW_PROGRESS EXPECTED_HASH
+                                              SHA1=${file_checksum})
   message("Download of ${file_url} complete.")
 endfunction()
diff --git a/libaom/test/test_intra_pred_speed.cc b/libaom/test/test_intra_pred_speed.cc
index b72ac11..25c50d0 100644
--- a/libaom/test/test_intra_pred_speed.cc
+++ b/libaom/test/test_intra_pred_speed.cc
@@ -878,6 +878,8 @@
                 aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
                 NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
 #endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
 // -----------------------------------------------------------------------------
 // High Bitdepth
 namespace {
@@ -1460,5 +1462,6 @@
     aom_highbd_smooth_h_predictor_64x16_c)
 
 // -----------------------------------------------------------------------------
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #include "test/test_libaom.cc"
diff --git a/libaom/test/test_runner.cmake b/libaom/test/test_runner.cmake
index d3747b1..f0648d1 100644
--- a/libaom/test/test_runner.cmake
+++ b/libaom/test/test_runner.cmake
@@ -8,13 +8,13 @@
 # License 1.0 was not distributed with this source code in the PATENTS file, you
 # can obtain it at www.aomedia.org/license/patent.
 #
-if(NOT GTEST_TOTAL_SHARDS OR "${GTEST_SHARD_INDEX}" STREQUAL "" OR NOT
-   TEST_LIBAOM)
+if(NOT GTEST_TOTAL_SHARDS
+   OR "${GTEST_SHARD_INDEX}" STREQUAL ""
+   OR NOT TEST_LIBAOM)
   message(
     FATAL_ERROR
       "The variables GTEST_SHARD_INDEX, GTEST_TOTAL_SHARDS and TEST_LIBAOM
-          must be defined."
-    )
+          must be defined.")
 endif()
 
 set($ENV{GTEST_SHARD_INDEX} ${GTEST_SHARD_INDEX})
diff --git a/libaom/test/test_vector_test.cc b/libaom/test/test_vector_test.cc
index 9f070b9..eab92b6 100644
--- a/libaom/test/test_vector_test.cc
+++ b/libaom/test/test_vector_test.cc
@@ -14,6 +14,7 @@
 #include <memory>
 #include <set>
 #include <string>
+#include <tuple>
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "common/tools_common.h"
 #include "config/aom_config.h"
@@ -33,7 +34,7 @@
 const int kFileName = 1;
 const int kRowMT = 2;
 
-typedef ::testing::tuple<int, const char *, int> DecodeParam;
+typedef std::tuple<int, const char *, int> DecodeParam;
 
 class TestVectorTest : public ::libaom_test::DecoderTest,
                        public ::libaom_test::CodecTestWithParam<DecodeParam> {
@@ -68,7 +69,7 @@
     expected_md5[32] = '\0';
 
     ::libaom_test::MD5 md5_res;
-#if !CONFIG_LOWBITDEPTH
+#if FORCE_HIGHBITDEPTH_DECODING
     const aom_img_fmt_t shifted_fmt =
         (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
     if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
@@ -82,7 +83,7 @@
     } else {
 #endif
       md5_res.Add(&img);
-#if !CONFIG_LOWBITDEPTH
+#if FORCE_HIGHBITDEPTH_DECODING
     }
 #endif
 
@@ -104,13 +105,13 @@
 // the test failed.
 TEST_P(TestVectorTest, MD5Match) {
   const DecodeParam input = GET_PARAM(1);
-  const std::string filename = ::testing::get<kFileName>(input);
+  const std::string filename = std::get<kFileName>(input);
   aom_codec_flags_t flags = 0;
   aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
   char str[256];
 
-  cfg.threads = ::testing::get<kThreads>(input);
-  row_mt_ = ::testing::get<kRowMT>(input);
+  cfg.threads = std::get<kThreads>(input);
+  row_mt_ = std::get<kRowMT>(input);
 
   snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
            filename.c_str(), cfg.threads);
@@ -138,7 +139,7 @@
   OpenMD5File(md5_filename);
 
   // Set decode config and flags.
-  cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH;
+  cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
   set_cfg(cfg);
   set_flags(flags);
 
@@ -156,7 +157,7 @@
                        ::testing::Values(0)));
 
 // Test AV1 decode in with different numbers of threads.
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AV1MultiThreaded, TestVectorTest,
     ::testing::Combine(
         ::testing::Values(
diff --git a/libaom/test/test_vectors.cc b/libaom/test/test_vectors.cc
index d2cd901..991667a 100644
--- a/libaom/test/test_vectors.cc
+++ b/libaom/test/test_vectors.cc
@@ -80,6 +80,7 @@
                                         "av1-1-b8-00-quantizer-61.ivf",
                                         "av1-1-b8-00-quantizer-62.ivf",
                                         "av1-1-b8-00-quantizer-63.ivf",
+#if CONFIG_AV1_HIGHBITDEPTH
                                         "av1-1-b10-00-quantizer-00.ivf",
                                         "av1-1-b10-00-quantizer-01.ivf",
                                         "av1-1-b10-00-quantizer-02.ivf",
@@ -144,6 +145,8 @@
                                         "av1-1-b10-00-quantizer-61.ivf",
                                         "av1-1-b10-00-quantizer-62.ivf",
                                         "av1-1-b10-00-quantizer-63.ivf",
+                                        "av1-1-b10-23-film_grain-50.ivf",
+#endif  // CONFIG_AV1_HIGHBITDEPTH
                                         "av1-1-b8-01-size-16x16.ivf",
                                         "av1-1-b8-01-size-16x18.ivf",
                                         "av1-1-b8-01-size-16x32.ivf",
@@ -252,7 +255,8 @@
                                         "av1-1-b8-06-mfmv.ivf",
                                         "av1-1-b8-22-svc-L1T2.ivf",
                                         "av1-1-b8-22-svc-L2T1.ivf",
-                                        "av1-1-b8-22-svc-L2T2.ivf" };
+                                        "av1-1-b8-22-svc-L2T2.ivf",
+                                        "av1-1-b8-23-film_grain-50.ivf" };
 const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
 #endif  // CONFIG_AV1_DECODER
 
diff --git a/libaom/test/time_stamp_test.cc b/libaom/test/time_stamp_test.cc
new file mode 100644
index 0000000..679e4da
--- /dev/null
+++ b/libaom/test/time_stamp_test.cc
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+//  Test AOM timestamp handling
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 3;
+
+// A video source that exposes functions to set the timebase, framerate and
+// starting pts.
+class DummyTimebaseVideoSource : public ::libaom_test::DummyVideoSource {
+ public:
+  // Parameters num and den set the timebase for the video source.
+  DummyTimebaseVideoSource(int num, int den)
+      : framerate_numerator_(30), framerate_denominator_(1), starting_pts_(0) {
+    SetSize(kVideoSourceWidth, kVideoSourceHeight);
+    set_limit(kFramesToEncode);
+    timebase_.num = num;
+    timebase_.den = den;
+  }
+
+  void SetFramerate(int numerator, int denominator) {
+    framerate_numerator_ = numerator;
+    framerate_denominator_ = denominator;
+  }
+
+  // Returns one frames duration in timebase units as a double.
+  double FrameDuration() const {
+    return (static_cast<double>(timebase_.den) / timebase_.num) /
+           (static_cast<double>(framerate_numerator_) / framerate_denominator_);
+  }
+
+  virtual aom_codec_pts_t pts() const {
+    return static_cast<aom_codec_pts_t>(frame_ * FrameDuration() +
+                                        starting_pts_ + 0.5);
+  }
+
+  virtual unsigned long duration() const {
+    return static_cast<unsigned long>(FrameDuration() + 0.5);
+  }
+
+  virtual aom_rational_t timebase() const { return timebase_; }
+
+  void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; }
+
+ private:
+  aom_rational_t timebase_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  int64_t starting_pts_;
+};
+
+class TimestampTest
+    : public ::libaom_test::EncoderTest,
+      public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
+ protected:
+  TimestampTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~TimestampTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+};
+
+// Tests encoding in millisecond timebase.
+TEST_P(TimestampTest, EncodeFrames) {
+  DummyTimebaseVideoSource video(1, 1000);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(TimestampTest, TestMicrosecondTimebase) {
+  // Set the timebase to microseconds.
+  DummyTimebaseVideoSource video(1, 1000000);
+  video.set_limit(1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(TimestampTest, TestAv1Rollover) {
+  DummyTimebaseVideoSource video(1, 1000);
+  video.set_starting_pts(922337170351ll);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_CASE(TimestampTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood));
+
+}  // namespace
diff --git a/libaom/test/transform_test_base.h b/libaom/test/transform_test_base.h
index 8ebcf5f..68f5cc7 100644
--- a/libaom/test/transform_test_base.h
+++ b/libaom/test/transform_test_base.h
@@ -29,20 +29,23 @@
 //   to a aom header file.
 const int kDctMaxValue = 16384;
 
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        TxfmParam *txfm_param);
+template <typename OutputType>
+using FhtFunc = void (*)(const int16_t *in, OutputType *out, int stride,
+                         TxfmParam *txfm_param);
 
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
+template <typename OutputType>
+using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride,
+                         const TxfmParam *txfm_param);
 
+template <typename OutType>
 class TransformTestBase {
  public:
   virtual ~TransformTestBase() {}
 
  protected:
-  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
+  virtual void RunFwdTxfm(const int16_t *in, OutType *out, int stride) = 0;
 
-  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunInvTxfm(const OutType *out, uint8_t *dst, int stride) = 0;
 
   void RunAccuracyCheck(uint32_t ref_max_error, double ref_avg_error) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -52,8 +55,8 @@
 
     int16_t *test_input_block = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * num_coeffs_));
-    tran_low_t *test_temp_block = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    OutType *test_temp_block = reinterpret_cast<OutType *>(
+        aom_memalign(16, sizeof(test_temp_block[0]) * num_coeffs_));
     uint8_t *dst = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
     uint8_t *src = reinterpret_cast<uint8_t *>(
@@ -123,10 +126,10 @@
 
     int16_t *input_block = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * stride * height_));
-    tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
-    tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    OutType *output_ref_block = reinterpret_cast<OutType *>(
+        aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+    OutType *output_block = reinterpret_cast<OutType *>(
+        aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
 
     for (int i = 0; i < count_test_block; ++i) {
       int j, k;
@@ -172,8 +175,8 @@
 
     int16_t *input_block = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * num_coeffs_));
-    tran_low_t *trans_block = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    OutType *trans_block = reinterpret_cast<OutType *>(
+        aom_memalign(16, sizeof(trans_block[0]) * num_coeffs_));
     uint8_t *output_block = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * stride * height_));
     uint8_t *output_ref_block = reinterpret_cast<uint8_t *>(
@@ -218,10 +221,10 @@
 
     int16_t *input_extreme_block = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * num_coeffs_));
-    tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
-    tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    OutType *output_ref_block = reinterpret_cast<OutType *>(
+        aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+    OutType *output_block = reinterpret_cast<OutType *>(
+        aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-mask_, mask_].
@@ -260,8 +263,8 @@
 
     int16_t *in = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * num_coeffs_));
-    tran_low_t *coeff = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    OutType *coeff = reinterpret_cast<OutType *>(
+        aom_memalign(16, sizeof(coeff[0]) * num_coeffs_));
     uint8_t *dst = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
     uint8_t *src = reinterpret_cast<uint8_t *>(
@@ -313,8 +316,8 @@
 
   int pitch_;
   int height_;
-  FhtFunc fwd_txfm_ref;
-  IhtFunc inv_txfm_ref;
+  FhtFunc<OutType> fwd_txfm_ref;
+  IhtFunc<OutType> inv_txfm_ref;
   aom_bit_depth_t bit_depth_;
   int mask_;
   int num_coeffs_;
diff --git a/libaom/test/util.h b/libaom/test/util.h
index c3f4e44..aa4b106 100644
--- a/libaom/test/util.h
+++ b/libaom/test/util.h
@@ -20,7 +20,7 @@
 #include "aom_ports/aom_timer.h"
 
 // Macros
-#define GET_PARAM(k) ::testing::get<k>(GetParam())
+#define GET_PARAM(k) std::get<k>(GetParam())
 
 inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) {
   assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) &&
diff --git a/libaom/test/variance_test.cc b/libaom/test/variance_test.cc
index 1942de0..1458ece 100644
--- a/libaom/test/variance_test.cc
+++ b/libaom/test/variance_test.cc
@@ -11,6 +11,8 @@
 
 #include <cstdlib>
 #include <new>
+#include <ostream>
+#include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -115,8 +117,7 @@
 /* The subpel reference functions differ from the codec version in one aspect:
  * they calculate the bilinear factors directly instead of using a lookup table
  * and therefore upshift xoff and yoff by 1. Only every other calculated value
- * is used so the codec version shrinks the table to save space and maintain
- * compatibility with vp8.
+ * is used so the codec version shrinks the table to save space.
  */
 static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
                                     int l2w, int l2h, int xoff, int yoff,
@@ -657,9 +658,9 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
-using ::testing::get;
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::get;
+using std::make_tuple;
+using std::tuple;
 
 template <typename FunctionType>
 class SubpelVarianceTest
@@ -804,7 +805,7 @@
     }
   }
 
-  unsigned int sse1;
+  unsigned int sse1, sse2;
   int run_time = 1000000000 / block_size();
   aom_usec_timer timer;
 
@@ -817,8 +818,24 @@
   aom_usec_timer_mark(&timer);
 
   const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
-         params_.bit_depth, elapsed_time);
+
+  aom_usec_timer timer_c;
+
+  aom_usec_timer_start(&timer_c);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    subpel_variance_ref(ref_, src_, params_.log2width, params_.log2height, x, y,
+                        &sse2, use_high_bit_depth(), params_.bit_depth);
+  }
+  aom_usec_timer_mark(&timer_c);
+
+  const int elapsed_time_c = static_cast<int>(aom_usec_timer_elapsed(&timer_c));
+
+  printf(
+      "sub_pixel_variance_%dx%d_%d: ref_time=%d us opt_time=%d us gain=%d \n",
+      width(), height(), params_.bit_depth, elapsed_time_c, elapsed_time,
+      elapsed_time_c / elapsed_time);
 }
 
 template <>
@@ -1075,29 +1092,30 @@
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 
-INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
-                        ::testing::Values(aom_get_mb_ss_c));
+INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
+                         ::testing::Values(aom_get_mb_ss_c));
 
 typedef TestParams<Get4x4SseFunc> SseParams;
-INSTANTIATE_TEST_CASE_P(C, AvxSseTest,
-                        ::testing::Values(SseParams(2, 2,
-                                                    &aom_get4x4sse_cs_c)));
+INSTANTIATE_TEST_SUITE_P(C, AvxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &aom_get4x4sse_cs_c)));
 
 typedef TestParams<VarianceMxNFunc> MseParams;
-INSTANTIATE_TEST_CASE_P(C, AvxMseTest,
-                        ::testing::Values(MseParams(4, 4, &aom_mse16x16_c),
-                                          MseParams(4, 3, &aom_mse16x8_c),
-                                          MseParams(3, 4, &aom_mse8x16_c),
-                                          MseParams(3, 3, &aom_mse8x8_c)));
+INSTANTIATE_TEST_SUITE_P(C, AvxMseTest,
+                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_c),
+                                           MseParams(4, 3, &aom_mse16x8_c),
+                                           MseParams(3, 4, &aom_mse8x16_c),
+                                           MseParams(3, 3, &aom_mse8x8_c)));
 
 typedef TestParams<VarianceMxNFunc> VarianceParams;
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, AvxVarianceTest,
     ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
                       VarianceParams(7, 6, &aom_variance128x64_c),
@@ -1114,10 +1132,17 @@
                       VarianceParams(3, 3, &aom_variance8x8_c),
                       VarianceParams(3, 2, &aom_variance8x4_c),
                       VarianceParams(2, 3, &aom_variance4x8_c),
-                      VarianceParams(2, 2, &aom_variance4x4_c)));
+                      VarianceParams(2, 2, &aom_variance4x4_c),
+
+                      VarianceParams(6, 4, &aom_variance64x16_c),
+                      VarianceParams(4, 6, &aom_variance16x64_c),
+                      VarianceParams(5, 3, &aom_variance32x8_c),
+                      VarianceParams(3, 5, &aom_variance8x32_c),
+                      VarianceParams(4, 2, &aom_variance16x4_c),
+                      VarianceParams(2, 4, &aom_variance4x16_c)));
 
 typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, AvxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
@@ -1135,10 +1160,17 @@
         SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
         SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
         SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0)));
+        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
+
+        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
+        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
+        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
+        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
+        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
+        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0)));
 
 typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, AvxSubpelAvgVarianceTest,
     ::testing::Values(
         SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
@@ -1156,10 +1188,17 @@
         SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
         SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
         SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0)));
+        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
+
+        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
+        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
+        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
+        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
+        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
+        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0)));
 
 typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, AvxDistWtdSubpelAvgVarianceTest,
     ::testing::Values(DistWtdSubpelAvgVarianceParams(
                           6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
@@ -1186,9 +1225,23 @@
                       DistWtdSubpelAvgVarianceParams(
                           2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
                       DistWtdSubpelAvgVarianceParams(
-                          2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0)));
+                          2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
 
-INSTANTIATE_TEST_CASE_P(
+                      DistWtdSubpelAvgVarianceParams(
+                          6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_c,
+                          0)));
+
+INSTANTIATE_TEST_SUITE_P(
     C, AvxObmcSubpelVarianceTest,
     ::testing::Values(
         ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_c,
@@ -1207,8 +1260,16 @@
         ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_c, 0),
         ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_c, 0),
         ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_c, 0),
-        ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0)));
+        ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0),
 
+        ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_c, 0),
+        ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_c, 0),
+        ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_c, 0),
+        ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_c, 0),
+        ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_c, 0),
+        ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_c, 0)));
+
+#if CONFIG_AV1_HIGHBITDEPTH
 typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
@@ -1228,7 +1289,7 @@
 TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
 
 /* TODO(debargha): This test does not support the highbd version
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, AvxHBDMseTest,
     ::testing::Values(make_tuple(4, 4, &aom_highbd_12_mse16x16_c),
                       make_tuple(4, 4, &aom_highbd_12_mse16x8_c),
@@ -1292,13 +1353,32 @@
   VarianceParams(3, 3, &aom_highbd_8_variance8x8_c, 8),
   VarianceParams(3, 2, &aom_highbd_8_variance8x4_c, 8),
   VarianceParams(2, 3, &aom_highbd_8_variance4x8_c, 8),
-  VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8)
+  VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8),
+
+  VarianceParams(6, 4, &aom_highbd_12_variance64x16_c, 12),
+  VarianceParams(4, 6, &aom_highbd_12_variance16x64_c, 12),
+  VarianceParams(5, 3, &aom_highbd_12_variance32x8_c, 12),
+  VarianceParams(3, 5, &aom_highbd_12_variance8x32_c, 12),
+  VarianceParams(4, 2, &aom_highbd_12_variance16x4_c, 12),
+  VarianceParams(2, 4, &aom_highbd_12_variance4x16_c, 12),
+  VarianceParams(6, 4, &aom_highbd_10_variance64x16_c, 10),
+  VarianceParams(4, 6, &aom_highbd_10_variance16x64_c, 10),
+  VarianceParams(5, 3, &aom_highbd_10_variance32x8_c, 10),
+  VarianceParams(3, 5, &aom_highbd_10_variance8x32_c, 10),
+  VarianceParams(4, 2, &aom_highbd_10_variance16x4_c, 10),
+  VarianceParams(2, 4, &aom_highbd_10_variance4x16_c, 10),
+  VarianceParams(6, 4, &aom_highbd_8_variance64x16_c, 8),
+  VarianceParams(4, 6, &aom_highbd_8_variance16x64_c, 8),
+  VarianceParams(5, 3, &aom_highbd_8_variance32x8_c, 8),
+  VarianceParams(3, 5, &aom_highbd_8_variance8x32_c, 8),
+  VarianceParams(4, 2, &aom_highbd_8_variance16x4_c, 8),
+  VarianceParams(2, 4, &aom_highbd_8_variance4x16_c, 8),
 };
-INSTANTIATE_TEST_CASE_P(C, AvxHBDVarianceTest,
-                        ::testing::ValuesIn(kArrayHBDVariance_c));
+INSTANTIATE_TEST_SUITE_P(C, AvxHBDVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDVariance_c));
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxHBDVarianceTest,
     ::testing::Values(
         VarianceParams(2, 2, &aom_highbd_8_variance4x4_sse4_1, 8),
@@ -1355,9 +1435,28 @@
   SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12),
   SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12),
   SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12),
+
+  SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_c, 8),
+  SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_c, 8),
+  SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_c, 8),
+  SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_c, 8),
+  SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_c, 8),
+  SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_c, 8),
+  SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_c, 10),
+  SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_c, 10),
+  SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_c, 10),
+  SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_c, 10),
+  SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_c, 10),
+  SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_c, 10),
+  SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_c, 12),
+  SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_c, 12),
+  SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_c, 12),
+  SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_c, 12),
+  SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_c, 12),
+  SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_c, 12),
 };
-INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelVarianceTest,
-                        ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
+INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
 
 const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
   SubpelAvgVarianceParams(7, 7, &aom_highbd_8_sub_pixel_avg_variance128x128_c,
@@ -1434,10 +1533,41 @@
   SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_c, 12),
   SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_c, 12),
   SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12),
-  SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12)
+  SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12),
+
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_c, 8),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_c, 8),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_c, 8),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_c, 8),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_c, 8),
+  SubpelAvgVarianceParams(2, 4, &aom_highbd_8_sub_pixel_avg_variance4x16_c, 8),
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_c,
+                          10),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_c,
+                          10),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_c,
+                          10),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_c,
+                          10),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_c,
+                          10),
+  SubpelAvgVarianceParams(2, 4, &aom_highbd_10_sub_pixel_avg_variance4x16_c,
+                          10),
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_c,
+                          12),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_c,
+                          12),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_c,
+                          12),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_c,
+                          12),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_c,
+                          12),
+  SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_c,
+                          12),
 };
-INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelAvgVarianceTest,
-                        ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
+INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
 
 const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
   ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c,
@@ -1522,22 +1652,54 @@
   ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_c,
                            12),
   ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c,
-                           12)
+                           12),
+
+  ObmcSubpelVarianceParams(6, 4, &aom_highbd_obmc_sub_pixel_variance64x16_c, 8),
+  ObmcSubpelVarianceParams(4, 6, &aom_highbd_obmc_sub_pixel_variance16x64_c, 8),
+  ObmcSubpelVarianceParams(5, 3, &aom_highbd_obmc_sub_pixel_variance32x8_c, 8),
+  ObmcSubpelVarianceParams(3, 5, &aom_highbd_obmc_sub_pixel_variance8x32_c, 8),
+  ObmcSubpelVarianceParams(4, 2, &aom_highbd_obmc_sub_pixel_variance16x4_c, 8),
+  ObmcSubpelVarianceParams(2, 4, &aom_highbd_obmc_sub_pixel_variance4x16_c, 8),
+  ObmcSubpelVarianceParams(6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_c,
+                           10),
+  ObmcSubpelVarianceParams(4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_c,
+                           10),
+  ObmcSubpelVarianceParams(5, 3, &aom_highbd_10_obmc_sub_pixel_variance32x8_c,
+                           10),
+  ObmcSubpelVarianceParams(3, 5, &aom_highbd_10_obmc_sub_pixel_variance8x32_c,
+                           10),
+  ObmcSubpelVarianceParams(4, 2, &aom_highbd_10_obmc_sub_pixel_variance16x4_c,
+                           10),
+  ObmcSubpelVarianceParams(2, 4, &aom_highbd_10_obmc_sub_pixel_variance4x16_c,
+                           10),
+  ObmcSubpelVarianceParams(6, 4, &aom_highbd_12_obmc_sub_pixel_variance64x16_c,
+                           12),
+  ObmcSubpelVarianceParams(4, 6, &aom_highbd_12_obmc_sub_pixel_variance16x64_c,
+                           12),
+  ObmcSubpelVarianceParams(5, 3, &aom_highbd_12_obmc_sub_pixel_variance32x8_c,
+                           12),
+  ObmcSubpelVarianceParams(3, 5, &aom_highbd_12_obmc_sub_pixel_variance8x32_c,
+                           12),
+  ObmcSubpelVarianceParams(4, 2, &aom_highbd_12_obmc_sub_pixel_variance16x4_c,
+                           12),
+  ObmcSubpelVarianceParams(2, 4, &aom_highbd_12_obmc_sub_pixel_variance4x16_c,
+                           12),
 };
-INSTANTIATE_TEST_CASE_P(C, AvxHBDObmcSubpelVarianceTest,
-                        ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c));
+INSTANTIATE_TEST_SUITE_P(C, AvxHBDObmcSubpelVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
-                        ::testing::Values(aom_get_mb_ss_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest,
+                         ::testing::Values(aom_get_mb_ss_sse2));
 
-INSTANTIATE_TEST_CASE_P(SSE2, AvxMseTest,
-                        ::testing::Values(MseParams(4, 4, &aom_mse16x16_sse2),
-                                          MseParams(4, 3, &aom_mse16x8_sse2),
-                                          MseParams(3, 4, &aom_mse8x16_sse2),
-                                          MseParams(3, 3, &aom_mse8x8_sse2)));
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxMseTest,
+                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_sse2),
+                                           MseParams(4, 3, &aom_mse16x8_sse2),
+                                           MseParams(3, 4, &aom_mse8x16_sse2),
+                                           MseParams(3, 3, &aom_mse8x8_sse2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, AvxVarianceTest,
     ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2),
                       VarianceParams(7, 6, &aom_variance128x64_sse2),
@@ -1562,7 +1724,7 @@
                       VarianceParams(2, 3, &aom_variance4x8_sse2),
                       VarianceParams(2, 2, &aom_variance4x4_sse2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, AvxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
@@ -1580,9 +1742,16 @@
         SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
         SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
         SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0)));
+        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0),
 
-INSTANTIATE_TEST_CASE_P(
+        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0),
+        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0),
+        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0),
+        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0),
+        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0),
+        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0)));
+
+INSTANTIATE_TEST_SUITE_P(
     SSE2, AvxSubpelAvgVarianceTest,
     ::testing::Values(
         SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2,
@@ -1603,10 +1772,19 @@
         SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
         SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
         SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0)));
+        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0),
 
+        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0),
+        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0),
+        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0),
+        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0),
+        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0),
+        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2,
+                                0)));
+
+#if CONFIG_AV1_HIGHBITDEPTH
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_sse4_1,
@@ -1616,7 +1794,7 @@
         SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_sse4_1,
                              12)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxSubpelAvgVarianceTest,
     ::testing::Values(
         SubpelAvgVarianceParams(2, 2,
@@ -1631,7 +1809,7 @@
 #endif  // HAVE_SSE4_1
 
 /* TODO(debargha): This test does not support the highbd version
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, AvxHBDMseTest,
     ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2),
                       MseParams(4, 3, &aom_highbd_12_mse16x8_sse2),
@@ -1686,10 +1864,29 @@
   VarianceParams(4, 4, &aom_highbd_8_variance16x16_sse2, 8),
   VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
   VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
-  VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8)
+  VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8),
+
+  VarianceParams(6, 4, &aom_highbd_12_variance64x16_sse2, 12),
+  VarianceParams(4, 6, &aom_highbd_12_variance16x64_sse2, 12),
+  VarianceParams(5, 3, &aom_highbd_12_variance32x8_sse2, 12),
+  VarianceParams(3, 5, &aom_highbd_12_variance8x32_sse2, 12),
+  // VarianceParams(4, 2, &aom_highbd_12_variance16x4_sse2, 12),
+  // VarianceParams(2, 4, &aom_highbd_12_variance4x16_sse2, 12),
+  VarianceParams(6, 4, &aom_highbd_10_variance64x16_sse2, 10),
+  VarianceParams(4, 6, &aom_highbd_10_variance16x64_sse2, 10),
+  VarianceParams(5, 3, &aom_highbd_10_variance32x8_sse2, 10),
+  VarianceParams(3, 5, &aom_highbd_10_variance8x32_sse2, 10),
+  // VarianceParams(4, 2, &aom_highbd_10_variance16x4_sse2, 10),
+  // VarianceParams(2, 4, &aom_highbd_10_variance4x16_sse2, 10),
+  VarianceParams(6, 4, &aom_highbd_8_variance64x16_sse2, 8),
+  VarianceParams(4, 6, &aom_highbd_8_variance16x64_sse2, 8),
+  VarianceParams(5, 3, &aom_highbd_8_variance32x8_sse2, 8),
+  VarianceParams(3, 5, &aom_highbd_8_variance8x32_sse2, 8),
+  // VarianceParams(4, 2, &aom_highbd_8_variance16x4_sse2, 8),
+  // VarianceParams(2, 4, &aom_highbd_8_variance4x16_sse2, 8),
 };
-INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDVarianceTest,
-                        ::testing::ValuesIn(kArrayHBDVariance_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDVariance_sse2));
 
 #if HAVE_AVX2
 
@@ -1706,11 +1903,11 @@
   VarianceParams(4, 4, &aom_highbd_10_variance16x16_avx2, 10),
   VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10),
   VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10),
-  VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10)
+  VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10),
 };
 
-INSTANTIATE_TEST_CASE_P(AVX2, AvxHBDVarianceTest,
-                        ::testing::ValuesIn(kArrayHBDVariance_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDVariance_avx2));
 #endif  // HAVE_AVX2
 
 const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
@@ -1755,10 +1952,29 @@
   SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_sse2, 8),
   SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_sse2, 8),
   SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8),
-  SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8)
+  SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8),
+
+  SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_sse2, 12),
+  SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_sse2, 12),
+  SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_sse2, 12),
+  SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_sse2, 12),
+  SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_sse2, 12),
+  // SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_sse2, 12),
+  SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_sse2, 10),
+  SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_sse2, 10),
+  SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_sse2, 10),
+  SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_sse2, 10),
+  SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_sse2, 10),
+  // SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_sse2, 10),
+  SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_sse2, 8),
+  SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_sse2, 8),
+  SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_sse2, 8),
+  SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_sse2, 8),
+  SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_sse2, 8),
+  // SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_sse2, 8),
 };
-INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelVarianceTest,
-                        ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));
 
 const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_sse2[] = {
   SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_sse2,
@@ -1825,15 +2041,54 @@
                           8),
   SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_sse2,
                           8),
-  SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)
+  SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_sse2,
+                          8),
+
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_sse2,
+                          12),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_sse2,
+                          12),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_sse2,
+                          12),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_sse2,
+                          12),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_sse2,
+                          12),
+  // SubpelAvgVarianceParams(2, 4,
+  // &aom_highbd_12_sub_pixel_avg_variance4x16_sse2, 12),
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_sse2,
+                          10),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_sse2,
+                          10),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_sse2,
+                          10),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_sse2,
+                          10),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_sse2,
+                          10),
+  // SubpelAvgVarianceParams(2, 4,
+  // &aom_highbd_10_sub_pixel_avg_variance4x16_sse2, 10),
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_sse2,
+                          8),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_sse2,
+                          8),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_sse2,
+                          8),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_sse2,
+                          8),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_sse2,
+                          8),
+  // SubpelAvgVarianceParams(2, 4,
+  // &aom_highbd_8_sub_pixel_avg_variance4x16_sse2, 8),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelAvgVarianceTest,
-                        ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_sse2));
 #endif  // HAVE_SSE2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, AvxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
@@ -1851,9 +2106,16 @@
         SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0),
         SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0),
         SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0)));
+        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0),
 
-INSTANTIATE_TEST_CASE_P(
+        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0),
+        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0),
+        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0),
+        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0),
+        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0),
+        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0)));
+
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, AvxSubpelAvgVarianceTest,
     ::testing::Values(
         SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3,
@@ -1881,13 +2143,28 @@
         SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0),
         SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0),
         SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3,
+        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0),
+
+        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3,
+                                0),
+        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3,
+                                0),
+        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0),
+        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0),
+        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0),
+        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3,
                                 0)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, AvxDistWtdSubpelAvgVarianceTest,
     ::testing::Values(
         DistWtdSubpelAvgVarianceParams(
+            7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            6, 7, &aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
             6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
         DistWtdSubpelAvgVarianceParams(
             6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
@@ -1912,11 +2189,24 @@
         DistWtdSubpelAvgVarianceParams(
             2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
         DistWtdSubpelAvgVarianceParams(
-            2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0)));
+            2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0),
+
+        DistWtdSubpelAvgVarianceParams(
+            6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0)));
 #endif  // HAVE_SSSE3
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxObmcSubpelVarianceTest,
     ::testing::Values(
         ObmcSubpelVarianceParams(7, 7,
@@ -1950,14 +2240,28 @@
         ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_sse4_1,
                                  0),
         ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1,
+                                 0),
+
+        ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_sse4_1,
                                  0)));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, AvxMseTest,
-                        ::testing::Values(MseParams(4, 4, &aom_mse16x16_avx2)));
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxMseTest,
+                         ::testing::Values(MseParams(4, 4,
+                                                     &aom_mse16x16_avx2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AvxVarianceTest,
     ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2),
                       VarianceParams(7, 6, &aom_variance128x64_avx2),
@@ -1975,7 +2279,7 @@
                       VarianceParams(4, 3, &aom_variance16x8_avx2),
                       VarianceParams(4, 2, &aom_variance16x4_avx2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AvxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
@@ -1985,9 +2289,14 @@
         SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
         SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
         SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0)));
+        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
+        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
+        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
+        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
+        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
+        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, AvxSubpelAvgVarianceTest,
     ::testing::Values(
         SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_avx2,
@@ -2005,16 +2314,18 @@
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, AvxSseTest,
-                        ::testing::Values(SseParams(2, 2,
-                                                    &aom_get4x4sse_cs_neon)));
+INSTANTIATE_TEST_SUITE_P(NEON, AvxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &aom_get4x4sse_cs_neon)));
 
-INSTANTIATE_TEST_CASE_P(NEON, AvxMseTest,
-                        ::testing::Values(MseParams(4, 4, &aom_mse16x16_neon)));
+INSTANTIATE_TEST_SUITE_P(NEON, AvxMseTest,
+                         ::testing::Values(MseParams(4, 4,
+                                                     &aom_mse16x16_neon)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, AvxVarianceTest,
-    ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_neon),
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_neon),
+                      VarianceParams(6, 6, &aom_variance64x64_neon),
                       VarianceParams(6, 5, &aom_variance64x32_neon),
                       VarianceParams(5, 6, &aom_variance32x64_neon),
                       VarianceParams(5, 5, &aom_variance32x32_neon),
@@ -2023,7 +2334,7 @@
                       VarianceParams(3, 4, &aom_variance8x16_neon),
                       VarianceParams(3, 3, &aom_variance8x8_neon)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, AvxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0),
@@ -2033,20 +2344,20 @@
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
-                        ::testing::Values(aom_get_mb_ss_msa));
+INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest,
+                         ::testing::Values(aom_get_mb_ss_msa));
 
-INSTANTIATE_TEST_CASE_P(MSA, AvxSseTest,
-                        ::testing::Values(SseParams(2, 2,
-                                                    &aom_get4x4sse_cs_msa)));
+INSTANTIATE_TEST_SUITE_P(MSA, AvxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &aom_get4x4sse_cs_msa)));
 
-INSTANTIATE_TEST_CASE_P(MSA, AvxMseTest,
-                        ::testing::Values(MseParams(4, 4, &aom_mse16x16_msa),
-                                          MseParams(4, 3, &aom_mse16x8_msa),
-                                          MseParams(3, 4, &aom_mse8x16_msa),
-                                          MseParams(3, 3, &aom_mse8x8_msa)));
+INSTANTIATE_TEST_SUITE_P(MSA, AvxMseTest,
+                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_msa),
+                                           MseParams(4, 3, &aom_mse16x8_msa),
+                                           MseParams(3, 4, &aom_mse8x16_msa),
+                                           MseParams(3, 3, &aom_mse8x8_msa)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, AvxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_msa),
                       VarianceParams(6, 5, &aom_variance64x32_msa),
@@ -2062,7 +2373,7 @@
                       VarianceParams(2, 3, &aom_variance4x8_msa),
                       VarianceParams(2, 2, &aom_variance4x4_msa)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, AvxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_msa, 0),
@@ -2079,7 +2390,7 @@
         SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_msa, 0),
         SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_msa, 0)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, AvxSubpelAvgVarianceTest,
     ::testing::Values(
         SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_msa, 0),
diff --git a/libaom/test/warp_filter_test.cc b/libaom/test/warp_filter_test.cc
index d7b3ec9..c5e87f0 100644
--- a/libaom/test/warp_filter_test.cc
+++ b/libaom/test/warp_filter_test.cc
@@ -8,47 +8,58 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include <tuple>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/warp_filter_test_util.h"
 using libaom_test::ACMRandom;
+#if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
+#endif
 using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 namespace {
 
 TEST_P(AV1WarpFilterTest, CheckOutput) {
-  RunCheckOutput(::testing::get<3>(GET_PARAM(0)));
+  RunCheckOutput(std::get<3>(GET_PARAM(0)));
 }
 TEST_P(AV1WarpFilterTest, DISABLED_Speed) {
-  RunSpeedTest(::testing::get<3>(GET_PARAM(0)));
+  RunSpeedTest(std::get<3>(GET_PARAM(0)));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, AV1WarpFilterTest,
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c));
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1WarpFilterTest,
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
 
+#if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(AV1HighbdWarpFilterTest, CheckOutput) {
-  RunCheckOutput(::testing::get<4>(GET_PARAM(0)));
+  RunCheckOutput(std::get<4>(GET_PARAM(0)));
 }
 TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) {
-  RunSpeedTest(::testing::get<4>(GET_PARAM(0)));
+  RunSpeedTest(std::get<4>(GET_PARAM(0)));
 }
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdWarpFilterTest,
-                        libaom_test::AV1HighbdWarpFilter::BuildParams(
-                            av1_highbd_warp_affine_sse4_1));
-
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdWarpFilterTest,
+                         libaom_test::AV1HighbdWarpFilter::BuildParams(
+                             av1_highbd_warp_affine_sse4_1));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_SSE4_1
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1WarpFilterTest,
+    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2));
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, AV1WarpFilterTest,
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon));
 #endif  // HAVE_NEON
diff --git a/libaom/test/warp_filter_test_util.cc b/libaom/test/warp_filter_test_util.cc
index 9208af8..bcb0c18 100644
--- a/libaom/test/warp_filter_test_util.cc
+++ b/libaom/test/warp_filter_test_util.cc
@@ -11,8 +11,8 @@
 #include "aom_ports/aom_timer.h"
 #include "test/warp_filter_test_util.h"
 
-using ::testing::make_tuple;
-using ::testing::tuple;
+using std::make_tuple;
+using std::tuple;
 
 namespace libaom_test {
 
@@ -55,8 +55,9 @@
       if (is_beta_zero == 1) mat[3] = 0;
       if (is_gamma_zero == 1) mat[4] = 0;
       if (is_delta_zero == 1)
-        mat[5] = (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) +
-                 (1 << WARPEDMODEL_PREC_BITS);
+        mat[5] = static_cast<int32_t>(
+            ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) +
+            (1 << WARPEDMODEL_PREC_BITS));
     }
 
     // Calculate the derived parameters and check that they are suitable
@@ -65,12 +66,14 @@
 
     *alpha = clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
     *beta = clamp(mat[3], INT16_MIN, INT16_MAX);
-    *gamma = clamp(((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) / mat[2],
-                   INT16_MIN, INT16_MAX);
-    *delta =
-        clamp(mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) -
-                  (1 << WARPEDMODEL_PREC_BITS),
-              INT16_MIN, INT16_MAX);
+    *gamma = static_cast<int16_t>(clamp64(
+        (static_cast<int64_t>(mat[4]) * (1 << WARPEDMODEL_PREC_BITS)) / mat[2],
+        INT16_MIN, INT16_MAX));
+    *delta = static_cast<int16_t>(clamp64(
+        mat[5] -
+            ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) -
+            (1 << WARPEDMODEL_PREC_BITS),
+        INT16_MIN, INT16_MAX));
 
     if ((4 * abs(*alpha) + 7 * abs(*beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
         (4 * abs(*gamma) + 4 * abs(*delta) >= (1 << WARPEDMODEL_PREC_BITS)))
@@ -113,8 +116,7 @@
   const int border = 16;
   const int stride = w + 2 * border;
   WarpTestParam params = GET_PARAM(0);
-  const int out_w = ::testing::get<0>(params),
-            out_h = ::testing::get<1>(params);
+  const int out_w = std::get<0>(params), out_h = std::get<1>(params);
   const int is_alpha_zero = GET_PARAM(1);
   const int is_beta_zero = GET_PARAM(2);
   const int is_gamma_zero = GET_PARAM(3);
@@ -177,9 +179,8 @@
   const int is_beta_zero = GET_PARAM(2);
   const int is_gamma_zero = GET_PARAM(3);
   const int is_delta_zero = GET_PARAM(4);
-  const int out_w = ::testing::get<0>(params),
-            out_h = ::testing::get<1>(params);
-  const int num_iters = ::testing::get<2>(params);
+  const int out_w = std::get<0>(params), out_h = std::get<1>(params);
+  const int num_iters = std::get<2>(params);
   int i, j, sub_x, sub_y;
   const int bd = 8;
 
@@ -276,6 +277,7 @@
 }
 }  // namespace AV1WarpFilter
 
+#if CONFIG_AV1_HIGHBITDEPTH
 namespace AV1HighbdWarpFilter {
 ::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
     highbd_warp_affine_func filter) {
@@ -310,8 +312,8 @@
   const int is_beta_zero = GET_PARAM(2);
   const int is_gamma_zero = GET_PARAM(3);
   const int is_delta_zero = GET_PARAM(4);
-  const int out_w = ::testing::get<0>(param), out_h = ::testing::get<1>(param);
-  const int bd = ::testing::get<3>(param);
+  const int out_w = std::get<0>(param), out_h = std::get<1>(param);
+  const int bd = std::get<3>(param);
   const int mask = (1 << bd) - 1;
   int sub_x, sub_y;
 
@@ -373,9 +375,9 @@
   const int is_beta_zero = GET_PARAM(2);
   const int is_gamma_zero = GET_PARAM(3);
   const int is_delta_zero = GET_PARAM(4);
-  const int out_w = ::testing::get<0>(param), out_h = ::testing::get<1>(param);
-  const int bd = ::testing::get<3>(param);
-  const int num_iters = ::testing::get<2>(param);
+  const int out_w = std::get<0>(param), out_h = std::get<1>(param);
+  const int bd = std::get<3>(param);
+  const int num_iters = std::get<2>(param);
   const int mask = (1 << bd) - 1;
   int i, j, sub_x, sub_y;
 
@@ -477,4 +479,5 @@
   delete[] dstb;
 }
 }  // namespace AV1HighbdWarpFilter
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace libaom_test
diff --git a/libaom/test/warp_filter_test_util.h b/libaom/test/warp_filter_test_util.h
index b8998e5..66a6e24 100644
--- a/libaom/test/warp_filter_test_util.h
+++ b/libaom/test/warp_filter_test_util.h
@@ -12,6 +12,8 @@
 #ifndef AOM_TEST_WARP_FILTER_TEST_UTIL_H_
 #define AOM_TEST_WARP_FILTER_TEST_UTIL_H_
 
+#include <tuple>
+
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 
@@ -41,8 +43,8 @@
                                  ConvolveParams *conv_params, int16_t alpha,
                                  int16_t beta, int16_t gamma, int16_t delta);
 
-typedef ::testing::tuple<int, int, int, warp_affine_func> WarpTestParam;
-typedef ::testing::tuple<WarpTestParam, int, int, int, int> WarpTestParams;
+typedef std::tuple<int, int, int, warp_affine_func> WarpTestParam;
+typedef std::tuple<WarpTestParam, int, int, int, int> WarpTestParams;
 
 ::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
     warp_affine_func filter);
@@ -63,6 +65,7 @@
 
 }  // namespace AV1WarpFilter
 
+#if CONFIG_AV1_HIGHBITDEPTH
 namespace AV1HighbdWarpFilter {
 typedef void (*highbd_warp_affine_func)(const int32_t *mat, const uint16_t *ref,
                                         int width, int height, int stride,
@@ -73,9 +76,9 @@
                                         int16_t alpha, int16_t beta,
                                         int16_t gamma, int16_t delta);
 
-typedef ::testing::tuple<int, int, int, int, highbd_warp_affine_func>
+typedef std::tuple<int, int, int, int, highbd_warp_affine_func>
     HighbdWarpTestParam;
-typedef ::testing::tuple<HighbdWarpTestParam, int, int, int, int>
+typedef std::tuple<HighbdWarpTestParam, int, int, int, int>
     HighbdWarpTestParams;
 
 ::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
@@ -97,6 +100,7 @@
 };
 
 }  // namespace AV1HighbdWarpFilter
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 }  // namespace libaom_test
 
diff --git a/libaom/test/wiener_test.cc b/libaom/test/wiener_test.cc
index 8f49af6..81839fd 100644
--- a/libaom/test/wiener_test.cc
+++ b/libaom/test/wiener_test.cc
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tuple>
 #include <vector>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -115,7 +116,7 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////
 
-typedef ::testing::tuple<const compute_stats_Func> WienerTestParam;
+typedef std::tuple<const compute_stats_Func> WienerTestParam;
 
 class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
  public:
@@ -269,21 +270,22 @@
   RunWienerTest(WIENER_WIN_CHROMA, 200);
 }
 
-INSTANTIATE_TEST_CASE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
+INSTANTIATE_TEST_SUITE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, WienerTest,
-                        ::testing::Values(av1_compute_stats_sse4_1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTest,
+                         ::testing::Values(av1_compute_stats_sse4_1));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
 
-INSTANTIATE_TEST_CASE_P(AVX2, WienerTest,
-                        ::testing::Values(av1_compute_stats_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerTest,
+                         ::testing::Values(av1_compute_stats_avx2));
 #endif  // HAVE_AVX2
 
 }  // namespace wiener_lowbd
 
+#if CONFIG_AV1_HIGHBITDEPTH
 // High bit-depth tests:
 namespace wiener_highbd {
 
@@ -389,7 +391,7 @@
                                    int src_stride, int64_t *M, int64_t *H,
                                    aom_bit_depth_t bit_depth);
 
-typedef ::testing::tuple<const compute_stats_Func> WienerTestParam;
+typedef std::tuple<const compute_stats_Func> WienerTestParam;
 
 class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> {
  public:
@@ -568,17 +570,18 @@
   RunWienerTest(WIENER_WIN_CHROMA, 200, AOM_BITS_12);
 }
 
-INSTANTIATE_TEST_CASE_P(C, WienerTestHighbd,
-                        ::testing::Values(compute_stats_highbd_opt_c));
+INSTANTIATE_TEST_SUITE_P(C, WienerTestHighbd,
+                         ::testing::Values(compute_stats_highbd_opt_c));
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, WienerTestHighbd,
-                        ::testing::Values(av1_compute_stats_highbd_sse4_1));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTestHighbd,
+                         ::testing::Values(av1_compute_stats_highbd_sse4_1));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, WienerTestHighbd,
-                        ::testing::Values(av1_compute_stats_highbd_avx2));
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerTestHighbd,
+                         ::testing::Values(av1_compute_stats_highbd_avx2));
 #endif  // HAVE_AVX2
 
 }  // namespace wiener_highbd
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/libaom/test/y4m_test.cc b/libaom/test/y4m_test.cc
index 6cc75ef..5d795fa 100644
--- a/libaom/test/y4m_test.cc
+++ b/libaom/test/y4m_test.cc
@@ -121,8 +121,8 @@
   Md5Check(t.md5raw);
 }
 
-INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest,
-                        ::testing::ValuesIn(kY4mTestVectors));
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest,
+                         ::testing::ValuesIn(kY4mTestVectors));
 
 class Y4mVideoWriteTest : public Y4mVideoSourceTest {
  protected:
@@ -175,6 +175,6 @@
   Md5Check(t.md5raw);
 }
 
-INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest,
-                        ::testing::ValuesIn(kY4mTestVectors));
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest,
+                         ::testing::ValuesIn(kY4mTestVectors));
 }  // namespace
diff --git a/libaom/third_party/fastfeat/README.libvpx b/libaom/third_party/fastfeat/README.libaom
similarity index 96%
rename from libaom/third_party/fastfeat/README.libvpx
rename to libaom/third_party/fastfeat/README.libaom
index 1e58a30..a732b0d 100644
--- a/libaom/third_party/fastfeat/README.libvpx
+++ b/libaom/third_party/fastfeat/README.libaom
@@ -37,3 +37,4 @@
 Add lines to turn off clang formatting for these files
 Remove Fast 10, 11 and 12
 Convert tabs to spaces
+Prefix global functions with "aom_"
diff --git a/libaom/third_party/fastfeat/fast.c b/libaom/third_party/fastfeat/fast.c
index 0d7efc1..f29ac8f 100644
--- a/libaom/third_party/fastfeat/fast.c
+++ b/libaom/third_party/fastfeat/fast.c
@@ -3,16 +3,16 @@
 #include "fast.h"
 
 
-xy* fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
 {
 	xy* corners;
 	int num_corners;
 	int* scores;
 	xy* nonmax;
 
-	corners = fast9_detect(im, xsize, ysize, stride, b, &num_corners);
-	scores = fast9_score(im, stride, corners, num_corners, b);
-	nonmax = nonmax_suppression(corners, scores, num_corners, ret_num_corners);
+	corners = aom_fast9_detect(im, xsize, ysize, stride, b, &num_corners);
+	scores = aom_fast9_score(im, stride, corners, num_corners, b);
+	nonmax = aom_nonmax_suppression(corners, scores, num_corners, ret_num_corners);
 
 	free(corners);
 	free(scores);
diff --git a/libaom/third_party/fastfeat/fast.h b/libaom/third_party/fastfeat/fast.h
index a00730e..a65d5a5 100644
--- a/libaom/third_party/fastfeat/fast.h
+++ b/libaom/third_party/fastfeat/fast.h
@@ -5,15 +5,15 @@
 typedef struct { int x, y; } xy;
 typedef unsigned char byte;
 
-int fast9_corner_score(const byte* p, const int pixel[], int bstart);
+int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart);
 
-xy* fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
 
-int* fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b);
+int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b);
 
-xy* fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
 
-xy* nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax);
+xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax);
 
 
 #endif
diff --git a/libaom/third_party/fastfeat/fast_9.c b/libaom/third_party/fastfeat/fast_9.c
index ec167a9..61c654c 100644
--- a/libaom/third_party/fastfeat/fast_9.c
+++ b/libaom/third_party/fastfeat/fast_9.c
@@ -5,7 +5,7 @@
 typedef struct { int x, y; } xy;
 typedef unsigned char byte;
 
-int fast9_corner_score(const byte* p, const int pixel[], int bstart)
+int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart)
 {
   int bmin = bstart;
   int bmax = 255;
@@ -2958,7 +2958,7 @@
 
 
 
-int* fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b)
+int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b)
 {
   int* scores = (int*)malloc(sizeof(int)* num_corners);
   int n;
@@ -2967,13 +2967,13 @@
   make_offsets(pixel, stride);
 
   for(n=0; n < num_corners; n++)
-    scores[n] = fast9_corner_score(i + corners[n].y*stride + corners[n].x, pixel, b);
+    scores[n] = aom_fast9_corner_score(i + corners[n].y*stride + corners[n].x, pixel, b);
 
   return scores;
 }
 
 
-xy* fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
 {
   int num_corners=0;
   xy* ret_corners;
diff --git a/libaom/third_party/fastfeat/nonmax.c b/libaom/third_party/fastfeat/nonmax.c
index 0438c4d..0dbc660 100644
--- a/libaom/third_party/fastfeat/nonmax.c
+++ b/libaom/third_party/fastfeat/nonmax.c
@@ -5,7 +5,7 @@
 
 #define Compare(X, Y) ((X)>=(Y))
 
-xy* nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax)
+xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax)
 {
   int num_nonmax=0;
   int last_row;
diff --git a/libaom/third_party/googletest/README.libaom b/libaom/third_party/googletest/README.libaom
index 9784dd5..9b8a863 100644
--- a/libaom/third_party/googletest/README.libaom
+++ b/libaom/third_party/googletest/README.libaom
@@ -1,5 +1,5 @@
 URL: https://github.com/google/googletest
-Version: 1.8.0
+Version: 1.10.x
 License: BSD
 License File: LICENSE
 
@@ -12,15 +12,6 @@
 generation.
 
 Local Modifications:
-- Remove everything but:
-  googletest-release-1.8.0/googletest/
-    cmake/
-    include/
-    src/
-    CHANGES
-    CMakelists.txt
-    CONTRIBUTORS
-    LICENSE
-    README.md
-- Suppress unsigned overflow instrumentation in the LCG
-  https://github.com/google/googletest/pull/1066
+- Replace everything in:
+  third_party/googletest/src/googletest/src/
+  third_party/googletest/src/googletest/include/
diff --git a/libaom/third_party/googletest/gtest.mk b/libaom/third_party/googletest/gtest.mk
deleted file mode 100644
index fc4dbdc..0000000
--- a/libaom/third_party/googletest/gtest.mk
+++ /dev/null
@@ -1 +0,0 @@
-GTEST_SRCS-yes += googletest/src/googletest/src/gtest-all.cc
diff --git a/libaom/third_party/googletest/src/googletest/CMakeLists.txt b/libaom/third_party/googletest/src/googletest/CMakeLists.txt
index 621d0f0..9ee7940 100644
--- a/libaom/third_party/googletest/src/googletest/CMakeLists.txt
+++ b/libaom/third_party/googletest/src/googletest/CMakeLists.txt
@@ -5,10 +5,6 @@
 # ctest.  You can select which tests to run using 'ctest -R regex'.
 # For more options, run 'ctest --help'.
 
-# BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to
-# make it prominent in the GUI.
-option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF)
-
 # When other libraries are using a shared version of runtime libraries,
 # Google Test also has to use one.
 option(
@@ -44,13 +40,41 @@
 # as ${gtest_SOURCE_DIR} and to the root binary directory as
 # ${gtest_BINARY_DIR}.
 # Language "C" is required for find_package(Threads).
-project(gtest CXX C)
-cmake_minimum_required(VERSION 2.6.2)
+if (CMAKE_VERSION VERSION_LESS 3.0)
+  project(gtest CXX C)
+else()
+  cmake_policy(SET CMP0048 NEW)
+  project(gtest VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
+endif()
+cmake_minimum_required(VERSION 2.6.4)
+
+if (POLICY CMP0063) # Visibility
+  cmake_policy(SET CMP0063 NEW)
+endif (POLICY CMP0063)
 
 if (COMMAND set_up_hermetic_build)
   set_up_hermetic_build()
 endif()
 
+# These commands only run if this is the main project
+if(CMAKE_PROJECT_NAME STREQUAL "gtest" OR CMAKE_PROJECT_NAME STREQUAL "googletest-distribution")
+
+  # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to
+  # make it prominent in the GUI.
+  option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF)
+
+else()
+
+  mark_as_advanced(
+    gtest_force_shared_crt
+    gtest_build_tests
+    gtest_build_samples
+    gtest_disable_pthreads
+    gtest_hide_internal_symbols)
+
+endif()
+
+
 if (gtest_hide_internal_symbols)
   set(CMAKE_CXX_VISIBILITY_PRESET hidden)
   set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
@@ -61,13 +85,30 @@
 
 config_compiler_and_linker()  # Defined in internal_utils.cmake.
 
-# Where Google Test's .h files can be found.
-include_directories(
-  ${gtest_SOURCE_DIR}/include
-  ${gtest_SOURCE_DIR})
+# Create the CMake package file descriptors.
+if (INSTALL_GTEST)
+  include(CMakePackageConfigHelpers)
+  set(cmake_package_name GTest)
+  set(targets_export_name ${cmake_package_name}Targets CACHE INTERNAL "")
+  set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated" CACHE INTERNAL "")
+  set(cmake_files_install_dir "${CMAKE_INSTALL_LIBDIR}/cmake/${cmake_package_name}")
+  set(version_file "${generated_dir}/${cmake_package_name}ConfigVersion.cmake")
+  write_basic_package_version_file(${version_file} COMPATIBILITY AnyNewerVersion)
+  install(EXPORT ${targets_export_name}
+    NAMESPACE ${cmake_package_name}::
+    DESTINATION ${cmake_files_install_dir})
+  set(config_file "${generated_dir}/${cmake_package_name}Config.cmake")
+  configure_package_config_file("${gtest_SOURCE_DIR}/cmake/Config.cmake.in"
+    "${config_file}" INSTALL_DESTINATION ${cmake_files_install_dir})
+  install(FILES ${version_file} ${config_file}
+    DESTINATION ${cmake_files_install_dir})
+endif()
 
-# Where Google Test's libraries can be found.
-link_directories(${gtest_BINARY_DIR}/src)
+# Where Google Test's .h files can be found.
+set(gtest_build_include_dirs
+  "${gtest_SOURCE_DIR}/include"
+  "${gtest_SOURCE_DIR}")
+include_directories(${gtest_build_include_dirs})
 
 # Summary of tuple support for Microsoft Visual Studio:
 # Compiler    version(MS)  version(cmake)  Support
@@ -75,6 +116,8 @@
 # <= VS 2010  <= 10        <= 1600         Use Google Tests's own tuple.
 # VS 2012     11           1700            std::tr1::tuple + _VARIADIC_MAX=10
 # VS 2013     12           1800            std::tr1::tuple
+# VS 2015     14           1900            std::tuple
+# VS 2017     15           >= 1910         std::tuple
 if (MSVC AND MSVC_VERSION EQUAL 1700)
   add_definitions(/D _VARIADIC_MAX=10)
 endif()
@@ -89,23 +132,23 @@
 # aggressive about warnings.
 cxx_library(gtest "${cxx_strict}" src/gtest-all.cc)
 cxx_library(gtest_main "${cxx_strict}" src/gtest_main.cc)
-target_link_libraries(gtest_main gtest)
-
 # If the CMake version supports it, attach header directory information
 # to the targets for when we are part of a parent build (ie being pulled
 # in via add_subdirectory() rather than being a standalone build).
 if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
-  target_include_directories(gtest      INTERFACE "${gtest_SOURCE_DIR}/include")
-  target_include_directories(gtest_main INTERFACE "${gtest_SOURCE_DIR}/include")
+  target_include_directories(gtest SYSTEM INTERFACE
+    "$<BUILD_INTERFACE:${gtest_build_include_dirs}>"
+    "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
+  target_include_directories(gtest_main SYSTEM INTERFACE
+    "$<BUILD_INTERFACE:${gtest_build_include_dirs}>"
+    "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
 endif()
+target_link_libraries(gtest_main PUBLIC gtest)
 
 ########################################################################
 #
 # Install rules
-install(TARGETS gtest gtest_main
-  DESTINATION lib)
-install(DIRECTORY ${gtest_SOURCE_DIR}/include/gtest
-  DESTINATION include)
+install_project(gtest gtest_main)
 
 ########################################################################
 #
@@ -147,28 +190,28 @@
   ############################################################
   # C++ tests built with standard compiler flags.
 
-  cxx_test(gtest-death-test_test gtest_main)
+  cxx_test(googletest-death-test-test gtest_main)
   cxx_test(gtest_environment_test gtest)
-  cxx_test(gtest-filepath_test gtest_main)
-  cxx_test(gtest-linked_ptr_test gtest_main)
-  cxx_test(gtest-listener_test gtest_main)
+  cxx_test(googletest-filepath-test gtest_main)
+  cxx_test(googletest-linked-ptr-test gtest_main)
+  cxx_test(googletest-listener-test gtest_main)
   cxx_test(gtest_main_unittest gtest_main)
-  cxx_test(gtest-message_test gtest_main)
+  cxx_test(googletest-message-test gtest_main)
   cxx_test(gtest_no_test_unittest gtest)
-  cxx_test(gtest-options_test gtest_main)
-  cxx_test(gtest-param-test_test gtest
-    test/gtest-param-test2_test.cc)
-  cxx_test(gtest-port_test gtest_main)
+  cxx_test(googletest-options-test gtest_main)
+  cxx_test(googletest-param-test-test gtest
+    test/googletest-param-test2-test.cc)
+  cxx_test(googletest-port-test gtest_main)
   cxx_test(gtest_pred_impl_unittest gtest_main)
   cxx_test(gtest_premature_exit_test gtest
     test/gtest_premature_exit_test.cc)
-  cxx_test(gtest-printers_test gtest_main)
+  cxx_test(googletest-printers-test gtest_main)
   cxx_test(gtest_prod_test gtest_main
     test/production.cc)
   cxx_test(gtest_repeat_test gtest)
   cxx_test(gtest_sole_header_test gtest_main)
   cxx_test(gtest_stress_test gtest)
-  cxx_test(gtest-test-part_test gtest_main)
+  cxx_test(googletest-test-part-test gtest_main)
   cxx_test(gtest_throw_on_failure_ex_test gtest)
   cxx_test(gtest-typed-test_test gtest_main
     test/gtest-typed-test2_test.cc)
@@ -190,10 +233,10 @@
 
   cxx_test_with_flags(gtest-death-test_ex_nocatch_test
     "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=0"
-    gtest test/gtest-death-test_ex_test.cc)
+    gtest test/googletest-death-test_ex_test.cc)
   cxx_test_with_flags(gtest-death-test_ex_catch_test
     "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=1"
-    gtest test/gtest-death-test_ex_test.cc)
+    gtest test/googletest-death-test_ex_test.cc)
 
   cxx_test_with_flags(gtest_no_rtti_unittest "${cxx_no_rtti}"
     gtest_main_no_rtti test/gtest_unittest.cc)
@@ -214,73 +257,75 @@
     cxx_library(gtest_main_use_own_tuple "${cxx_use_own_tuple}"
       src/gtest-all.cc src/gtest_main.cc)
 
-    cxx_test_with_flags(gtest-tuple_test "${cxx_use_own_tuple}"
-      gtest_main_use_own_tuple test/gtest-tuple_test.cc)
+    cxx_test_with_flags(googletest-tuple-test "${cxx_use_own_tuple}"
+      gtest_main_use_own_tuple test/googletest-tuple-test.cc)
 
     cxx_test_with_flags(gtest_use_own_tuple_test "${cxx_use_own_tuple}"
       gtest_main_use_own_tuple
-      test/gtest-param-test_test.cc test/gtest-param-test2_test.cc)
+      test/googletest-param-test-test.cc test/googletest-param-test2-test.cc)
   endif()
 
   ############################################################
   # Python tests.
 
-  cxx_executable(gtest_break_on_failure_unittest_ test gtest)
-  py_test(gtest_break_on_failure_unittest)
+  cxx_executable(googletest-break-on-failure-unittest_ test gtest)
+  py_test(googletest-break-on-failure-unittest)
 
   # Visual Studio .NET 2003 does not support STL with exceptions disabled.
   if (NOT MSVC OR MSVC_VERSION GREATER 1310)  # 1310 is Visual Studio .NET 2003
     cxx_executable_with_flags(
-      gtest_catch_exceptions_no_ex_test_
+      googletest-catch-exceptions-no-ex-test_
       "${cxx_no_exception}"
       gtest_main_no_exception
-      test/gtest_catch_exceptions_test_.cc)
+      test/googletest-catch-exceptions-test_.cc)
   endif()
 
   cxx_executable_with_flags(
-    gtest_catch_exceptions_ex_test_
+    googletest-catch-exceptions-ex-test_
     "${cxx_exception}"
     gtest_main
-    test/gtest_catch_exceptions_test_.cc)
-  py_test(gtest_catch_exceptions_test)
+    test/googletest-catch-exceptions-test_.cc)
+  py_test(googletest-catch-exceptions-test)
 
-  cxx_executable(gtest_color_test_ test gtest)
-  py_test(gtest_color_test)
+  cxx_executable(googletest-color-test_ test gtest)
+  py_test(googletest-color-test)
 
-  cxx_executable(gtest_env_var_test_ test gtest)
-  py_test(gtest_env_var_test)
+  cxx_executable(googletest-env-var-test_ test gtest)
+  py_test(googletest-env-var-test)
 
-  cxx_executable(gtest_filter_unittest_ test gtest)
-  py_test(gtest_filter_unittest)
+  cxx_executable(googletest-filter-unittest_ test gtest)
+  py_test(googletest-filter-unittest)
 
   cxx_executable(gtest_help_test_ test gtest_main)
   py_test(gtest_help_test)
 
-  cxx_executable(gtest_list_tests_unittest_ test gtest)
-  py_test(gtest_list_tests_unittest)
+  cxx_executable(googletest-list-tests-unittest_ test gtest)
+  py_test(googletest-list-tests-unittest)
 
-  cxx_executable(gtest_output_test_ test gtest)
-  py_test(gtest_output_test)
+  cxx_executable(googletest-output-test_ test gtest)
+  py_test(googletest-output-test --no_stacktrace_support)
 
-  cxx_executable(gtest_shuffle_test_ test gtest)
-  py_test(gtest_shuffle_test)
+  cxx_executable(googletest-shuffle-test_ test gtest)
+  py_test(googletest-shuffle-test)
 
   # MSVC 7.1 does not support STL with exceptions disabled.
   if (NOT MSVC OR MSVC_VERSION GREATER 1310)
-    cxx_executable(gtest_throw_on_failure_test_ test gtest_no_exception)
-    set_target_properties(gtest_throw_on_failure_test_
+    cxx_executable(googletest-throw-on-failure-test_ test gtest_no_exception)
+    set_target_properties(googletest-throw-on-failure-test_
       PROPERTIES
       COMPILE_FLAGS "${cxx_no_exception}")
-    py_test(gtest_throw_on_failure_test)
+    py_test(googletest-throw-on-failure-test)
   endif()
 
-  cxx_executable(gtest_uninitialized_test_ test gtest)
-  py_test(gtest_uninitialized_test)
+  cxx_executable(googletest-uninitialized-test_ test gtest)
+  py_test(googletest-uninitialized-test)
 
   cxx_executable(gtest_xml_outfile1_test_ test gtest_main)
   cxx_executable(gtest_xml_outfile2_test_ test gtest_main)
   py_test(gtest_xml_outfiles_test)
+  py_test(googletest-json-outfiles-test)
 
   cxx_executable(gtest_xml_output_unittest_ test gtest)
-  py_test(gtest_xml_output_unittest)
+  py_test(gtest_xml_output_unittest --no_stacktrace_support)
+  py_test(googletest-json-output-unittest --no_stacktrace_support)
 endif()
diff --git a/libaom/third_party/googletest/src/googletest/README.md b/libaom/third_party/googletest/src/googletest/README.md
index edd4408..e30fe80 100644
--- a/libaom/third_party/googletest/src/googletest/README.md
+++ b/libaom/third_party/googletest/src/googletest/README.md
@@ -1,23 +1,21 @@
+### Generic Build Instructions
 
-### Generic Build Instructions ###
+#### Setup
 
-#### Setup ####
+To build Google Test and your tests that use it, you need to tell your build
+system where to find its headers and source files. The exact way to do it
+depends on which build system you use, and is usually straightforward.
 
-To build Google Test and your tests that use it, you need to tell your
-build system where to find its headers and source files.  The exact
-way to do it depends on which build system you use, and is usually
-straightforward.
+#### Build
 
-#### Build ####
-
-Suppose you put Google Test in directory `${GTEST_DIR}`.  To build it,
-create a library build target (or a project as called by Visual Studio
-and Xcode) to compile
+Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, create a
+library build target (or a project as called by Visual Studio and Xcode) to
+compile
 
     ${GTEST_DIR}/src/gtest-all.cc
 
 with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}`
-in the normal header search path.  Assuming a Linux-like system and gcc,
+in the normal header search path. Assuming a Linux-like system and gcc,
 something like the following will do:
 
     g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \
@@ -26,136 +24,239 @@
 
 (We need `-pthread` as Google Test uses threads.)
 
-Next, you should compile your test source file with
-`${GTEST_DIR}/include` in the system header search path, and link it
-with gtest and any other necessary libraries:
+Next, you should compile your test source file with `${GTEST_DIR}/include` in
+the system header search path, and link it with gtest and any other necessary
+libraries:
 
     g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \
         -o your_test
 
-As an example, the make/ directory contains a Makefile that you can
-use to build Google Test on systems where GNU make is available
-(e.g. Linux, Mac OS X, and Cygwin).  It doesn't try to build Google
-Test's own tests.  Instead, it just builds the Google Test library and
-a sample test.  You can use it as a starting point for your own build
-script.
+As an example, the make/ directory contains a Makefile that you can use to build
+Google Test on systems where GNU make is available (e.g. Linux, Mac OS X, and
+Cygwin). It doesn't try to build Google Test's own tests. Instead, it just
+builds the Google Test library and a sample test. You can use it as a starting
+point for your own build script.
 
-If the default settings are correct for your environment, the
-following commands should succeed:
+If the default settings are correct for your environment, the following commands
+should succeed:
 
     cd ${GTEST_DIR}/make
     make
     ./sample1_unittest
 
-If you see errors, try to tweak the contents of `make/Makefile` to make
-them go away.  There are instructions in `make/Makefile` on how to do
-it.
+If you see errors, try to tweak the contents of `make/Makefile` to make them go
+away. There are instructions in `make/Makefile` on how to do it.
 
-### Using CMake ###
+### Using CMake
 
 Google Test comes with a CMake build script (
-[CMakeLists.txt](CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for
-cross-platform.). If you don't have CMake installed already, you can
-download it for free from <http://www.cmake.org/>.
+[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
+that can be used on a wide range of platforms ("C" stands for cross-platform.).
+If you don't have CMake installed already, you can download it for free from
+<http://www.cmake.org/>.
 
-CMake works by generating native makefiles or build projects that can
-be used in the compiler environment of your choice.  The typical
-workflow starts with:
+CMake works by generating native makefiles or build projects that can be used in
+the compiler environment of your choice. You can either build Google Test as a
+standalone project or it can be incorporated into an existing CMake build for
+another project.
+
+#### Standalone CMake Project
+
+When building Google Test as a standalone project, the typical workflow starts
+with:
 
     mkdir mybuild       # Create a directory to hold the build output.
     cd mybuild
     cmake ${GTEST_DIR}  # Generate native build scripts.
 
-If you want to build Google Test's samples, you should replace the
-last command with
+If you want to build Google Test's samples, you should replace the last command
+with
 
     cmake -Dgtest_build_samples=ON ${GTEST_DIR}
 
-If you are on a \*nix system, you should now see a Makefile in the
-current directory.  Just type 'make' to build gtest.
+If you are on a \*nix system, you should now see a Makefile in the current
+directory. Just type 'make' to build gtest.
 
-If you use Windows and have Visual Studio installed, a `gtest.sln` file
-and several `.vcproj` files will be created.  You can then build them
-using Visual Studio.
+If you use Windows and have Visual Studio installed, a `gtest.sln` file and
+several `.vcproj` files will be created. You can then build them using Visual
+Studio.
 
 On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated.
 
-### Legacy Build Scripts ###
+#### Incorporating Into An Existing CMake Project
+
+If you want to use gtest in a project which already uses CMake, then a more
+robust and flexible approach is to build gtest as part of that project directly.
+This is done by making the GoogleTest source code available to the main build
+and adding it using CMake's `add_subdirectory()` command. This has the
+significant advantage that the same compiler and linker settings are used
+between gtest and the rest of your project, so issues associated with using
+incompatible libraries (eg debug/release), etc. are avoided. This is
+particularly useful on Windows. Making GoogleTest's source code available to the
+main build can be done a few different ways:
+
+*   Download the GoogleTest source code manually and place it at a known
+    location. This is the least flexible approach and can make it more difficult
+    to use with continuous integration systems, etc.
+*   Embed the GoogleTest source code as a direct copy in the main project's
+    source tree. This is often the simplest approach, but is also the hardest to
+    keep up to date. Some organizations may not permit this method.
+*   Add GoogleTest as a git submodule or equivalent. This may not always be
+    possible or appropriate. Git submodules, for example, have their own set of
+    advantages and drawbacks.
+*   Use CMake to download GoogleTest as part of the build's configure step. This
+    is just a little more complex, but doesn't have the limitations of the other
+    methods.
+
+The last of the above methods is implemented with a small piece of CMake code in
+a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and
+then invoked as a sub-build _during the CMake stage_. That directory is then
+pulled into the main build with `add_subdirectory()`. For example:
+
+New file `CMakeLists.txt.in`:
+
+    cmake_minimum_required(VERSION 2.8.2)
+
+    project(googletest-download NONE)
+
+    include(ExternalProject)
+    ExternalProject_Add(googletest
+      GIT_REPOSITORY    https://github.com/google/googletest.git
+      GIT_TAG           master
+      SOURCE_DIR        "${CMAKE_BINARY_DIR}/googletest-src"
+      BINARY_DIR        "${CMAKE_BINARY_DIR}/googletest-build"
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND     ""
+      INSTALL_COMMAND   ""
+      TEST_COMMAND      ""
+    )
+
+Existing build's `CMakeLists.txt`:
+
+    # Download and unpack googletest at configure time
+    configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
+    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+      RESULT_VARIABLE result
+      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download )
+    if(result)
+      message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+    endif()
+    execute_process(COMMAND ${CMAKE_COMMAND} --build .
+      RESULT_VARIABLE result
+      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download )
+    if(result)
+      message(FATAL_ERROR "Build step for googletest failed: ${result}")
+    endif()
+
+    # Prevent overriding the parent project's compiler/linker
+    # settings on Windows
+    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+    # Add googletest directly to our build. This defines
+    # the gtest and gtest_main targets.
+    add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src
+                     ${CMAKE_BINARY_DIR}/googletest-build
+                     EXCLUDE_FROM_ALL)
+
+    # The gtest/gtest_main targets carry header search path
+    # dependencies automatically when using CMake 2.8.11 or
+    # later. Otherwise we have to add them here ourselves.
+    if (CMAKE_VERSION VERSION_LESS 2.8.11)
+      include_directories("${gtest_SOURCE_DIR}/include")
+    endif()
+
+    # Now simply link against gtest or gtest_main as needed. Eg
+    add_executable(example example.cpp)
+    target_link_libraries(example gtest_main)
+    add_test(NAME example_test COMMAND example)
+
+Note that this approach requires CMake 2.8.2 or later due to its use of the
+`ExternalProject_Add()` command. The above technique is discussed in more detail
+in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which
+also contains a link to a fully generalized implementation of the technique.
+
+##### Visual Studio Dynamic vs Static Runtimes
+
+By default, new Visual Studio projects link the C runtimes dynamically but
+Google Test links them statically. This will generate an error that looks
+something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch
+detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value
+'MDd_DynamicDebug' in main.obj
+
+Google Test already has a CMake option for this: `gtest_force_shared_crt`
+
+Enabling this option will make gtest link the runtimes dynamically too, and
+match the project in which it is included.
+
+### Legacy Build Scripts
 
 Before settling on CMake, we have been providing hand-maintained build
-projects/scripts for Visual Studio, Xcode, and Autotools.  While we
-continue to provide them for convenience, they are not actively
-maintained any more.  We highly recommend that you follow the
-instructions in the previous two sections to integrate Google Test
-with your existing build system.
+projects/scripts for Visual Studio, Xcode, and Autotools. While we continue to
+provide them for convenience, they are not actively maintained any more. We
+highly recommend that you follow the instructions in the above sections to
+integrate Google Test with your existing build system.
 
 If you still need to use the legacy build scripts, here's how:
 
-The msvc\ folder contains two solutions with Visual C++ projects.
-Open the `gtest.sln` or `gtest-md.sln` file using Visual Studio, and you
-are ready to build Google Test the same way you build any Visual
-Studio project.  Files that have names ending with -md use DLL
-versions of Microsoft runtime libraries (the /MD or the /MDd compiler
-option).  Files without that suffix use static versions of the runtime
-libraries (the /MT or the /MTd option).  Please note that one must use
-the same option to compile both gtest and the test code.  If you use
-Visual Studio 2005 or above, we recommend the -md version as /MD is
-the default for new projects in these versions of Visual Studio.
+The msvc\ folder contains two solutions with Visual C++ projects. Open the
+`gtest.sln` or `gtest-md.sln` file using Visual Studio, and you are ready to
+build Google Test the same way you build any Visual Studio project. Files that
+have names ending with -md use DLL versions of Microsoft runtime libraries (the
+/MD or the /MDd compiler option). Files without that suffix use static versions
+of the runtime libraries (the /MT or the /MTd option). Please note that one must
+use the same option to compile both gtest and the test code. If you use Visual
+Studio 2005 or above, we recommend the -md version as /MD is the default for new
+projects in these versions of Visual Studio.
 
-On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using
-Xcode.  Build the "gtest" target.  The universal binary framework will
-end up in your selected build directory (selected in the Xcode
-"Preferences..." -> "Building" pane and defaults to xcode/build).
-Alternatively, at the command line, enter:
+On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using Xcode.
+Build the "gtest" target. The universal binary framework will end up in your
+selected build directory (selected in the Xcode "Preferences..." -> "Building"
+pane and defaults to xcode/build). Alternatively, at the command line, enter:
 
     xcodebuild
 
-This will build the "Release" configuration of gtest.framework in your
-default build location.  See the "xcodebuild" man page for more
-information about building different configurations and building in
-different locations.
+This will build the "Release" configuration of gtest.framework in your default
+build location. See the "xcodebuild" man page for more information about
+building different configurations and building in different locations.
 
-If you wish to use the Google Test Xcode project with Xcode 4.x and
-above, you need to either:
+If you wish to use the Google Test Xcode project with Xcode 4.x and above, you
+need to either:
 
- * update the SDK configuration options in xcode/Config/General.xconfig.
-   Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If
-   you choose this route you lose the ability to target earlier versions
-   of MacOS X.
- * Install an SDK for an earlier version. This doesn't appear to be
-   supported by Apple, but has been reported to work
-   (http://stackoverflow.com/questions/5378518).
+*   update the SDK configuration options in xcode/Config/General.xconfig.
+    Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If
+    you choose this route you lose the ability to target earlier versions of
+    MacOS X.
+*   Install an SDK for an earlier version. This doesn't appear to be supported
+    by Apple, but has been reported to work
+    (http://stackoverflow.com/questions/5378518).
 
-### Tweaking Google Test ###
+### Tweaking Google Test
 
-Google Test can be used in diverse environments.  The default
-configuration may not work (or may not work well) out of the box in
-some environments.  However, you can easily tweak Google Test by
-defining control macros on the compiler command line.  Generally,
-these macros are named like `GTEST_XYZ` and you define them to either 1
-or 0 to enable or disable a certain feature.
+Google Test can be used in diverse environments. The default configuration may
+not work (or may not work well) out of the box in some environments. However,
+you can easily tweak Google Test by defining control macros on the compiler
+command line. Generally, these macros are named like `GTEST_XYZ` and you define
+them to either 1 or 0 to enable or disable a certain feature.
 
-We list the most frequently used macros below.  For a complete list,
-see file [include/gtest/internal/gtest-port.h](include/gtest/internal/gtest-port.h).
+We list the most frequently used macros below. For a complete list, see file
+[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/include/gtest/internal/gtest-port.h).
 
-### Choosing a TR1 Tuple Library ###
+### Choosing a TR1 Tuple Library
 
-Some Google Test features require the C++ Technical Report 1 (TR1)
-tuple library, which is not yet available with all compilers.  The
-good news is that Google Test implements a subset of TR1 tuple that's
-enough for its own need, and will automatically use this when the
-compiler doesn't provide TR1 tuple.
+Some Google Test features require the C++ Technical Report 1 (TR1) tuple
+library, which is not yet available with all compilers. The good news is that
+Google Test implements a subset of TR1 tuple that's enough for its own need, and
+will automatically use this when the compiler doesn't provide TR1 tuple.
 
-Usually you don't need to care about which tuple library Google Test
-uses.  However, if your project already uses TR1 tuple, you need to
-tell Google Test to use the same TR1 tuple library the rest of your
-project uses, or the two tuple implementations will clash.  To do
-that, add
+Usually you don't need to care about which tuple library Google Test uses.
+However, if your project already uses TR1 tuple, you need to tell Google Test to
+use the same TR1 tuple library the rest of your project uses, or the two tuple
+implementations will clash. To do that, add
 
     -DGTEST_USE_OWN_TR1_TUPLE=0
 
-to the compiler flags while compiling Google Test and your tests.  If
-you want to force Google Test to use its own tuple library, just add
+to the compiler flags while compiling Google Test and your tests. If you want to
+force Google Test to use its own tuple library, just add
 
     -DGTEST_USE_OWN_TR1_TUPLE=1
 
@@ -167,15 +268,15 @@
 
 and all features using tuple will be disabled.
 
-### Multi-threaded Tests ###
+### Multi-threaded Tests
 
-Google Test is thread-safe where the pthread library is available.
-After `#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE`
-macro to see whether this is the case (yes if the macro is `#defined` to
-1, no if it's undefined.).
+Google Test is thread-safe where the pthread library is available. After
+`#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see
+whether this is the case (yes if the macro is `#defined` to 1, no if it's
+undefined.).
 
-If Google Test doesn't correctly detect whether pthread is available
-in your environment, you can force it with
+If Google Test doesn't correctly detect whether pthread is available in your
+environment, you can force it with
 
     -DGTEST_HAS_PTHREAD=1
 
@@ -183,26 +284,24 @@
 
     -DGTEST_HAS_PTHREAD=0
 
-When Google Test uses pthread, you may need to add flags to your
-compiler and/or linker to select the pthread library, or you'll get
-link errors.  If you use the CMake script or the deprecated Autotools
-script, this is taken care of for you.  If you use your own build
-script, you'll need to read your compiler and linker's manual to
-figure out what flags to add.
+When Google Test uses pthread, you may need to add flags to your compiler and/or
+linker to select the pthread library, or you'll get link errors. If you use the
+CMake script or the deprecated Autotools script, this is taken care of for you.
+If you use your own build script, you'll need to read your compiler and linker's
+manual to figure out what flags to add.
 
-### As a Shared Library (DLL) ###
+### As a Shared Library (DLL)
 
-Google Test is compact, so most users can build and link it as a
-static library for the simplicity.  You can choose to use Google Test
-as a shared library (known as a DLL on Windows) if you prefer.
+Google Test is compact, so most users can build and link it as a static library
+for the simplicity. You can choose to use Google Test as a shared library (known
+as a DLL on Windows) if you prefer.
 
 To compile *gtest* as a shared library, add
 
     -DGTEST_CREATE_SHARED_LIBRARY=1
 
-to the compiler flags.  You'll also need to tell the linker to produce
-a shared library instead - consult your linker's manual for how to do
-it.
+to the compiler flags. You'll also need to tell the linker to produce a shared
+library instead - consult your linker's manual for how to do it.
 
 To compile your *tests* that use the gtest shared library, add
 
@@ -210,31 +309,28 @@
 
 to the compiler flags.
 
-Note: while the above steps aren't technically necessary today when
-using some compilers (e.g. GCC), they may become necessary in the
-future, if we decide to improve the speed of loading the library (see
-<http://gcc.gnu.org/wiki/Visibility> for details).  Therefore you are
-recommended to always add the above flags when using Google Test as a
-shared library.  Otherwise a future release of Google Test may break
-your build script.
+Note: while the above steps aren't technically necessary today when using some
+compilers (e.g. GCC), they may become necessary in the future, if we decide to
+improve the speed of loading the library (see
+<http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are recommended
+to always add the above flags when using Google Test as a shared library.
+Otherwise a future release of Google Test may break your build script.
 
-### Avoiding Macro Name Clashes ###
+### Avoiding Macro Name Clashes
 
-In C++, macros don't obey namespaces.  Therefore two libraries that
-both define a macro of the same name will clash if you `#include` both
-definitions.  In case a Google Test macro clashes with another
-library, you can force Google Test to rename its macro to avoid the
-conflict.
+In C++, macros don't obey namespaces. Therefore two libraries that both define a
+macro of the same name will clash if you `#include` both definitions. In case a
+Google Test macro clashes with another library, you can force Google Test to
+rename its macro to avoid the conflict.
 
-Specifically, if both Google Test and some other code define macro
-FOO, you can add
+Specifically, if both Google Test and some other code define macro FOO, you can
+add
 
     -DGTEST_DONT_DEFINE_FOO=1
 
-to the compiler flags to tell Google Test to change the macro's name
-from `FOO` to `GTEST_FOO`.  Currently `FOO` can be `FAIL`, `SUCCEED`,
-or `TEST`.  For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll
-need to write
+to the compiler flags to tell Google Test to change the macro's name from `FOO`
+to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
+example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
 
     GTEST_TEST(SomeTest, DoesThis) { ... }
 
@@ -243,38 +339,3 @@
     TEST(SomeTest, DoesThis) { ... }
 
 in order to define a test.
-
-## Developing Google Test ##
-
-This section discusses how to make your own changes to Google Test.
-
-### Testing Google Test Itself ###
-
-To make sure your changes work as intended and don't break existing
-functionality, you'll want to compile and run Google Test's own tests.
-For that you can use CMake:
-
-    mkdir mybuild
-    cd mybuild
-    cmake -Dgtest_build_tests=ON ${GTEST_DIR}
-
-Make sure you have Python installed, as some of Google Test's tests
-are written in Python.  If the cmake command complains about not being
-able to find Python (`Could NOT find PythonInterp (missing:
-PYTHON_EXECUTABLE)`), try telling it explicitly where your Python
-executable can be found:
-
-    cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR}
-
-Next, you can build Google Test and all of its own tests.  On \*nix,
-this is usually done by 'make'.  To run the tests, do
-
-    make test
-
-All tests should pass.
-
-Normally you don't need to worry about regenerating the source files,
-unless you need to modify them.  In that case, you should modify the
-corresponding .pump files instead and run the pump.py Python script to
-regenerate them.  You can find pump.py in the [scripts/](scripts/) directory.
-Read the [Pump manual](docs/PumpManual.md) for how to use it.
diff --git a/libaom/third_party/googletest/src/googletest/cmake/Config.cmake.in b/libaom/third_party/googletest/src/googletest/cmake/Config.cmake.in
new file mode 100644
index 0000000..12be449
--- /dev/null
+++ b/libaom/third_party/googletest/src/googletest/cmake/Config.cmake.in
@@ -0,0 +1,9 @@
+@PACKAGE_INIT@
+include(CMakeFindDependencyMacro)
+if (@GTEST_HAS_PTHREAD@)
+  set(THREADS_PREFER_PTHREAD_FLAG @THREADS_PREFER_PTHREAD_FLAG@)
+  find_dependency(Threads)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
+check_required_components("@project_name@")
diff --git a/libaom/third_party/googletest/src/googletest/cmake/gtest.pc.in b/libaom/third_party/googletest/src/googletest/cmake/gtest.pc.in
new file mode 100644
index 0000000..e7967ad
--- /dev/null
+++ b/libaom/third_party/googletest/src/googletest/cmake/gtest.pc.in
@@ -0,0 +1,9 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gtest
+Description: GoogleTest (without main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Libs: -L${libdir} -lgtest @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
diff --git a/libaom/third_party/googletest/src/googletest/cmake/gtest_main.pc.in b/libaom/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
new file mode 100644
index 0000000..fe25d9c
--- /dev/null
+++ b/libaom/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gtest_main
+Description: GoogleTest (with main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Requires: gtest
+Libs: -L${libdir} -lgtest_main @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
diff --git a/libaom/third_party/googletest/src/googletest/cmake/internal_utils.cmake b/libaom/third_party/googletest/src/googletest/cmake/internal_utils.cmake
index 777b91e..8c1f9ba 100644
--- a/libaom/third_party/googletest/src/googletest/cmake/internal_utils.cmake
+++ b/libaom/third_party/googletest/src/googletest/cmake/internal_utils.cmake
@@ -20,7 +20,7 @@
   if (MSVC)
     # For MSVC, CMake sets certain flags to defaults we want to override.
     # This replacement code is taken from sample in the CMake Wiki at
-    # http://www.cmake.org/Wiki/CMake_FAQ#Dynamic_Replace.
+    # https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace.
     foreach (flag_var
              CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
              CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
@@ -38,6 +38,11 @@
       # We prefer more strict warning checking for building Google Test.
       # Replaces /W3 with /W4 in defaults.
       string(REPLACE "/W3" "/W4" ${flag_var} "${${flag_var}}")
+
+      # Prevent D9025 warning for targets that have exception handling
+      # turned off (/EHs-c- flag). Where required, exceptions are explicitly
+      # re-enabled using the cxx_exception_flags variable.
+      string(REPLACE "/EHsc" "" ${flag_var} "${${flag_var}}")
     endforeach()
   endif()
 endmacro()
@@ -46,9 +51,16 @@
 # Google Mock.  You can tweak these definitions to suit your need.  A
 # variable's value is empty before it's explicitly assigned to.
 macro(config_compiler_and_linker)
-  if (NOT gtest_disable_pthreads)
+  # Note: pthreads on MinGW is not supported, even if available
+  # instead, we use windows threading primitives
+  unset(GTEST_HAS_PTHREAD)
+  if (NOT gtest_disable_pthreads AND NOT MINGW)
     # Defines CMAKE_USE_PTHREADS_INIT and CMAKE_THREAD_LIBS_INIT.
+    set(THREADS_PREFER_PTHREAD_FLAG ON)
     find_package(Threads)
+    if (CMAKE_USE_PTHREADS_INIT)
+      set(GTEST_HAS_PTHREAD ON)
+    endif()
   endif()
 
   fix_default_compiler_settings_()
@@ -80,18 +92,17 @@
       # http://stackoverflow.com/questions/3232669 explains the issue.
       set(cxx_base_flags "${cxx_base_flags} -wd4702")
     endif()
-    if (NOT (MSVC_VERSION GREATER 1900))  # 1900 is Visual Studio 2015
-      # BigObj required for tests.
-      set(cxx_base_flags "${cxx_base_flags} -bigobj")
-    endif()
 
     set(cxx_base_flags "${cxx_base_flags} -D_UNICODE -DUNICODE -DWIN32 -D_WIN32")
     set(cxx_base_flags "${cxx_base_flags} -DSTRICT -DWIN32_LEAN_AND_MEAN")
     set(cxx_exception_flags "-EHsc -D_HAS_EXCEPTIONS=1")
-    set(cxx_no_exception_flags "-D_HAS_EXCEPTIONS=0")
+    set(cxx_no_exception_flags "-EHs-c- -D_HAS_EXCEPTIONS=0")
     set(cxx_no_rtti_flags "-GR-")
   elseif (CMAKE_COMPILER_IS_GNUCXX)
-    set(cxx_base_flags "-Wall -Wshadow")
+    set(cxx_base_flags "-Wall -Wshadow -Werror")
+    if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0)
+      set(cxx_base_flags "${cxx_base_flags} -Wno-error=dangling-else")
+    endif()
     set(cxx_exception_flags "-fexceptions")
     set(cxx_no_exception_flags "-fno-exceptions")
     # Until version 4.3.2, GCC doesn't define a macro to indicate
@@ -123,14 +134,16 @@
     set(cxx_no_rtti_flags "")
   endif()
 
-  if (CMAKE_USE_PTHREADS_INIT)  # The pthreads library is available and allowed.
-    set(cxx_base_flags "${cxx_base_flags} -DGTEST_HAS_PTHREAD=1")
+  # The pthreads library is available and allowed?
+  if (DEFINED GTEST_HAS_PTHREAD)
+    set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=1")
   else()
-    set(cxx_base_flags "${cxx_base_flags} -DGTEST_HAS_PTHREAD=0")
+    set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=0")
   endif()
+  set(cxx_base_flags "${cxx_base_flags} ${GTEST_HAS_PTHREAD_MACRO}")
 
   # For building gtest's own tests and samples.
-  set(cxx_exception "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_exception_flags}")
+  set(cxx_exception "${cxx_base_flags} ${cxx_exception_flags}")
   set(cxx_no_exception
     "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_no_exception_flags}")
   set(cxx_default "${cxx_exception}")
@@ -150,13 +163,26 @@
   set_target_properties(${name}
     PROPERTIES
     COMPILE_FLAGS "${cxx_flags}")
+  # Generate debug library name with a postfix.
+  set_target_properties(${name}
+    PROPERTIES
+    DEBUG_POSTFIX "d")
   if (BUILD_SHARED_LIBS OR type STREQUAL "SHARED")
     set_target_properties(${name}
       PROPERTIES
       COMPILE_DEFINITIONS "GTEST_CREATE_SHARED_LIBRARY=1")
+    if (NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+      target_compile_definitions(${name} INTERFACE
+        $<INSTALL_INTERFACE:GTEST_LINKED_AS_SHARED_LIBRARY=1>)
+    endif()
   endif()
-  if (CMAKE_USE_PTHREADS_INIT)
-    target_link_libraries(${name} ${CMAKE_THREAD_LIBS_INIT})
+  if (DEFINED GTEST_HAS_PTHREAD)
+    if ("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
+      set(threads_spec ${CMAKE_THREAD_LIBS_INIT})
+    else()
+      set(threads_spec Threads::Threads)
+    endif()
+    target_link_libraries(${name} PUBLIC ${threads_spec})
   endif()
 endfunction()
 
@@ -178,6 +204,10 @@
 # is built from the given source files with the given compiler flags.
 function(cxx_executable_with_flags name cxx_flags libs)
   add_executable(${name} ${ARGN})
+  if (MSVC AND (NOT (MSVC_VERSION LESS 1700)))  # 1700 is Visual Studio 2012.
+    # BigObj required for tests.
+    set(cxx_flags "${cxx_flags} -bigobj")
+  endif()
   if (cxx_flags)
     set_target_properties(${name}
       PROPERTIES
@@ -214,7 +244,7 @@
 # from the given source files with the given compiler flags.
 function(cxx_test_with_flags name cxx_flags libs)
   cxx_executable_with_flags(${name} "${cxx_flags}" "${libs}" ${ARGN})
-  add_test(${name} ${name})
+  add_test(NAME ${name} COMMAND ${name})
 endfunction()
 
 # cxx_test(name libs srcs...)
@@ -232,23 +262,57 @@
 # creates a Python test with the given name whose main module is in
 # test/name.py.  It does nothing if Python is not installed.
 function(py_test name)
-  # We are not supporting Python tests on Linux yet as they consider
-  # all Linux environments to be google3 and try to use google3 features.
   if (PYTHONINTERP_FOUND)
-    # ${CMAKE_BINARY_DIR} is known at configuration time, so we can
-    # directly bind it from cmake. ${CTEST_CONFIGURATION_TYPE} is known
-    # only at ctest runtime (by calling ctest -c <Configuration>), so
-    # we have to escape $ to delay variable substitution here.
     if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
-      add_test(
-        NAME ${name}
-        COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-            --build_dir=${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>)
+      if (CMAKE_CONFIGURATION_TYPES)
+	# Multi-configuration build generators as for Visual Studio save
+	# output in a subdirectory of CMAKE_CURRENT_BINARY_DIR (Debug,
+	# Release etc.), so we have to provide it here.
+        add_test(
+          NAME ${name}
+          COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+              --build_dir=${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG> ${ARGN})
+      else (CMAKE_CONFIGURATION_TYPES)
+	# Single-configuration build generators like Makefile generators
+	# don't have subdirs below CMAKE_CURRENT_BINARY_DIR.
+        add_test(
+          NAME ${name}
+          COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+              --build_dir=${CMAKE_CURRENT_BINARY_DIR} ${ARGN})
+      endif (CMAKE_CONFIGURATION_TYPES)
     else (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+      # ${CMAKE_CURRENT_BINARY_DIR} is known at configuration time, so we can
+      # directly bind it from cmake. ${CTEST_CONFIGURATION_TYPE} is known
+      # only at ctest runtime (by calling ctest -c <Configuration>), so
+      # we have to escape $ to delay variable substitution here.
       add_test(
         ${name}
         ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-          --build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE})
+          --build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE} ${ARGN})
     endif (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+  endif(PYTHONINTERP_FOUND)
+endfunction()
+
+# install_project(targets...)
+#
+# Installs the specified targets and configures the associated pkgconfig files.
+function(install_project)
+  if(INSTALL_GTEST)
+    install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/"
+      DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+    # Install the project targets.
+    install(TARGETS ${ARGN}
+      EXPORT ${targets_export_name}
+      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+      ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+      LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+    # Configure and install pkgconfig files.
+    foreach(t ${ARGN})
+      set(configured_pc "${generated_dir}/${t}.pc")
+      configure_file("${PROJECT_SOURCE_DIR}/cmake/${t}.pc.in"
+        "${configured_pc}" @ONLY)
+      install(FILES "${configured_pc}"
+        DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+    endforeach()
   endif()
 endfunction()
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
index 957a69c..39f0ded 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
@@ -26,14 +26,14 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 //
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the public API for death tests.  It is
 // #included by gtest.h so a user doesn't need to include this
 // directly.
+// GOOGLETEST_CM0001 DO NOT DELETE
 
 #ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
 #define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
@@ -99,10 +99,11 @@
 //
 // On the regular expressions used in death tests:
 //
+//   GOOGLETEST_CM0005 DO NOT DELETE
 //   On POSIX-compliant systems (*nix), we use the <regex.h> library,
 //   which uses the POSIX extended regex syntax.
 //
-//   On other platforms (e.g. Windows), we only support a simple regex
+//   On other platforms (e.g. Windows or Mac), we only support a simple regex
 //   syntax implemented as part of Google Test.  This limited
 //   implementation should be enough most of the time when writing
 //   death tests; though it lacks many features you can find in PCRE
@@ -160,29 +161,28 @@
 //   is rarely a problem as people usually don't put the test binary
 //   directory in PATH.
 //
-// TODO(wan@google.com): make thread-safe death tests search the PATH.
 
 // Asserts that a given statement causes the program to exit, with an
 // integer exit status that satisfies predicate, and emitting error output
 // that matches regex.
-# define ASSERT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+#define ASSERT_EXIT(statement, predicate, regex) \
+  GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
 
 // Like ASSERT_EXIT, but continues on to successive tests in the
-// test case, if any:
-# define EXPECT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+// test suite, if any:
+#define EXPECT_EXIT(statement, predicate, regex) \
+  GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
 
 // Asserts that a given statement causes the program to exit, either by
 // explicitly exiting with a nonzero exit code or being killed by a
 // signal, and emitting error output that matches regex.
-# define ASSERT_DEATH(statement, regex) \
-    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+#define ASSERT_DEATH(statement, regex) \
+  ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
 
 // Like ASSERT_DEATH, but continues on to successive tests in the
-// test case, if any:
-# define EXPECT_DEATH(statement, regex) \
-    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+// test suite, if any:
+#define EXPECT_DEATH(statement, regex) \
+  EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
 
 // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
 
@@ -191,24 +191,27 @@
  public:
   explicit ExitedWithCode(int exit_code);
   bool operator()(int exit_status) const;
+
  private:
   // No implementation - assignment is unsupported.
-  void operator=(const ExitedWithCode& other);
+  void operator=(const ExitedWithCode &other);
 
   const int exit_code_;
 };
 
-# if !GTEST_OS_WINDOWS
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // Tests that an exit code describes an exit due to termination by a
 // given signal.
+// GOOGLETEST_CM0006 DO NOT DELETE
 class GTEST_API_ KilledBySignal {
  public:
   explicit KilledBySignal(int signum);
   bool operator()(int exit_status) const;
+
  private:
   const int signum_;
 };
-# endif  // !GTEST_OS_WINDOWS
+#endif  // !GTEST_OS_WINDOWS
 
 // EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
 // The death testing framework causes this to have interesting semantics,
@@ -226,7 +229,7 @@
 //   return 12;
 // }
 //
-// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
+// TEST(TestSuite, TestDieOr12WorksInDgbAndOpt) {
 //   int sideeffect = 0;
 //   // Only asserts in dbg.
 //   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
@@ -253,40 +256,85 @@
 //   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
 // }, "death");
 //
-# ifdef NDEBUG
+#ifdef NDEBUG
 
-#  define EXPECT_DEBUG_DEATH(statement, regex) \
+#define EXPECT_DEBUG_DEATH(statement, regex) \
   GTEST_EXECUTE_STATEMENT_(statement, regex)
 
-#  define ASSERT_DEBUG_DEATH(statement, regex) \
+#define ASSERT_DEBUG_DEATH(statement, regex) \
   GTEST_EXECUTE_STATEMENT_(statement, regex)
 
-# else
+#else
 
-#  define EXPECT_DEBUG_DEATH(statement, regex) \
-  EXPECT_DEATH(statement, regex)
+#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex)
 
-#  define ASSERT_DEBUG_DEATH(statement, regex) \
-  ASSERT_DEATH(statement, regex)
+#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex)
 
-# endif  // NDEBUG for EXPECT_DEBUG_DEATH
+#endif  // NDEBUG for EXPECT_DEBUG_DEATH
 #endif  // GTEST_HAS_DEATH_TEST
 
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// if and only if EXPECT_DEATH and ASSERT_DEATH compile with the same parameters
+// on systems that support death tests. This allows one to write such a macro on
+// a system that does not support death tests and be sure that it will compile
+// on a death-test supporting system. It is exposed publicly so that systems
+// that have death-tests with stricter requirements than GTEST_HAS_DEATH_TEST
+// can write their own equivalent of EXPECT_DEATH_IF_SUPPORTED and
+// ASSERT_DEATH_IF_SUPPORTED.
+//
+// Parameters:
+//   statement -  A statement that a macro such as EXPECT_DEATH would test
+//                for program termination. This macro has to make sure this
+//                statement is compiled but not executed, to ensure that
+//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+//                parameter if and only if EXPECT_DEATH compiles with it.
+//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
+//                the output of statement.  This parameter has to be
+//                compiled but not evaluated by this macro, to ensure that
+//                this macro only accepts expressions that a macro such as
+//                EXPECT_DEATH would accept.
+//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+//                compile inside functions where ASSERT_DEATH doesn't
+//                compile.
+//
+//  The branch that has an always false condition is used to ensure that
+//  statement and regex are compiled (and thus syntactically correct) but
+//  never executed. The unreachable code macro protects the terminator
+//  statement from generating an 'unreachable code' warning in case
+//  statement unconditionally returns or throws. The Message constructor at
+//  the end allows the syntax of streaming additional messages into the
+//  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
+#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator)             \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
+    GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \
+                        << "Statement '" #statement "' cannot be verified.";   \
+  } else if (::testing::internal::AlwaysFalse()) {                             \
+    ::testing::internal::RE::PartialMatch(".*", (regex));                      \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);                 \
+    terminator;                                                                \
+  } else                                                                       \
+    ::testing::Message()
+
 // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
 // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
 // death tests are supported; otherwise they just issue a warning.  This is
 // useful when you are combining death test assertions with normal test
 // assertions in one test.
 #if GTEST_HAS_DEATH_TEST
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-    EXPECT_DEATH(statement, regex)
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-    ASSERT_DEATH(statement, regex)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  ASSERT_DEATH(statement, regex)
 #else
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return )
 #endif
 
 }  // namespace testing
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
new file mode 100644
index 0000000..20be24f
--- /dev/null
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
@@ -0,0 +1,769 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+// IWYU pragma: private, include "testing/base/public/gunit.h"
+// IWYU pragma: friend third_party/googletest/googlemock/.*
+// IWYU pragma: friend third_party/googletest/googletest/.*
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-printers.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+// MSVC warning C5046 is new as of VS2017 version 15.8.
+#if defined(_MSC_VER) && _MSC_VER >= 1915
+#define GTEST_MAYBE_5046_ 5046
+#else
+#define GTEST_MAYBE_5046_
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by
+                              clients of class B */
+    /* Symbol involving type with internal linkage not defined */)
+
+namespace testing {
+
+// To implement a matcher Foo for type T, define:
+//   1. a class FooMatcherImpl that implements the
+//      MatcherInterface<T> interface, and
+//   2. a factory function that creates a Matcher<T> object from a
+//      FooMatcherImpl*.
+//
+// The two-level delegation design makes it possible to allow a user
+// to write "v" instead of "Eq(v)" where a Matcher is expected, which
+// is impossible if we pass matchers by pointers.  It also eases
+// ownership management as Matcher objects can now be copied like
+// plain values.
+
+// MatchResultListener is an abstract class.  Its << operator can be
+// used by a matcher to explain why a value matches or doesn't match.
+//
+class MatchResultListener {
+ public:
+  // Creates a listener object with the given underlying ostream.  The
+  // listener does not own the ostream, and does not dereference it
+  // in the constructor or destructor.
+  explicit MatchResultListener(::std::ostream *os) : stream_(os) {}
+  virtual ~MatchResultListener() = 0;  // Makes this class abstract.
+
+  // Streams x to the underlying ostream; does nothing if the ostream
+  // is NULL.
+  template <typename T>
+  MatchResultListener &operator<<(const T &x) {
+    if (stream_ != nullptr) *stream_ << x;
+    return *this;
+  }
+
+  // Returns the underlying ostream.
+  ::std::ostream *stream() { return stream_; }
+
+  // Returns true if and only if the listener is interested in an explanation
+  // of the match result.  A matcher's MatchAndExplain() method can use
+  // this information to avoid generating the explanation when no one
+  // intends to hear it.
+  bool IsInterested() const { return stream_ != nullptr; }
+
+ private:
+  ::std::ostream *const stream_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
+};
+
+inline MatchResultListener::~MatchResultListener() {}
+
+// An instance of a subclass of this knows how to describe itself as a
+// matcher.
+class MatcherDescriberInterface {
+ public:
+  virtual ~MatcherDescriberInterface() {}
+
+  // Describes this matcher to an ostream.  The function should print
+  // a verb phrase that describes the property a value matching this
+  // matcher should have.  The subject of the verb phrase is the value
+  // being matched.  For example, the DescribeTo() method of the Gt(7)
+  // matcher prints "is greater than 7".
+  virtual void DescribeTo(::std::ostream *os) const = 0;
+
+  // Describes the negation of this matcher to an ostream.  For
+  // example, if the description of this matcher is "is greater than
+  // 7", the negated description could be "is not greater than 7".
+  // You are not required to override this when implementing
+  // MatcherInterface, but it is highly advised so that your matcher
+  // can produce good error messages.
+  virtual void DescribeNegationTo(::std::ostream *os) const {
+    *os << "not (";
+    DescribeTo(os);
+    *os << ")";
+  }
+};
+
+// The implementation of a matcher.
+template <typename T>
+class MatcherInterface : public MatcherDescriberInterface {
+ public:
+  // Returns true if and only if the matcher matches x; also explains the
+  // match result to 'listener' if necessary (see the next paragraph), in
+  // the form of a non-restrictive relative clause ("which ...",
+  // "whose ...", etc) that describes x.  For example, the
+  // MatchAndExplain() method of the Pointee(...) matcher should
+  // generate an explanation like "which points to ...".
+  //
+  // Implementations of MatchAndExplain() should add an explanation of
+  // the match result *if and only if* they can provide additional
+  // information that's not already present (or not obvious) in the
+  // print-out of x and the matcher's description.  Whether the match
+  // succeeds is not a factor in deciding whether an explanation is
+  // needed, as sometimes the caller needs to print a failure message
+  // when the match succeeds (e.g. when the matcher is used inside
+  // Not()).
+  //
+  // For example, a "has at least 10 elements" matcher should explain
+  // what the actual element count is, regardless of the match result,
+  // as it is useful information to the reader; on the other hand, an
+  // "is empty" matcher probably only needs to explain what the actual
+  // size is when the match fails, as it's redundant to say that the
+  // size is 0 when the value is already known to be empty.
+  //
+  // You should override this method when defining a new matcher.
+  //
+  // It's the responsibility of the caller (Google Test) to guarantee
+  // that 'listener' is not NULL.  This helps to simplify a matcher's
+  // implementation when it doesn't care about the performance, as it
+  // can talk to 'listener' without checking its validity first.
+  // However, in order to implement dummy listeners efficiently,
+  // listener->stream() may be NULL.
+  virtual bool MatchAndExplain(T x, MatchResultListener *listener) const = 0;
+
+  // Inherits these methods from MatcherDescriberInterface:
+  //   virtual void DescribeTo(::std::ostream* os) const = 0;
+  //   virtual void DescribeNegationTo(::std::ostream* os) const;
+};
+
+namespace internal {
+
+// Converts a MatcherInterface<T> to a MatcherInterface<const T&>.
+template <typename T>
+class MatcherInterfaceAdapter : public MatcherInterface<const T &> {
+ public:
+  explicit MatcherInterfaceAdapter(const MatcherInterface<T> *impl)
+      : impl_(impl) {}
+  ~MatcherInterfaceAdapter() override { delete impl_; }
+
+  void DescribeTo(::std::ostream *os) const override { impl_->DescribeTo(os); }
+
+  void DescribeNegationTo(::std::ostream *os) const override {
+    impl_->DescribeNegationTo(os);
+  }
+
+  bool MatchAndExplain(const T &x,
+                       MatchResultListener *listener) const override {
+    return impl_->MatchAndExplain(x, listener);
+  }
+
+ private:
+  const MatcherInterface<T> *const impl_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter);
+};
+
+struct AnyEq {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a == b;
+  }
+};
+struct AnyNe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a != b;
+  }
+};
+struct AnyLt {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a < b;
+  }
+};
+struct AnyGt {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a > b;
+  }
+};
+struct AnyLe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a <= b;
+  }
+};
+struct AnyGe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a >= b;
+  }
+};
+
+// A match result listener that ignores the explanation.
+class DummyMatchResultListener : public MatchResultListener {
+ public:
+  DummyMatchResultListener() : MatchResultListener(nullptr) {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener);
+};
+
+// A match result listener that forwards the explanation to a given
+// ostream.  The difference between this and MatchResultListener is
+// that the former is concrete.
+class StreamMatchResultListener : public MatchResultListener {
+ public:
+  explicit StreamMatchResultListener(::std::ostream *os)
+      : MatchResultListener(os) {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
+};
+
+// An internal class for implementing Matcher<T>, which will derive
+// from it.  We put functionalities common to all Matcher<T>
+// specializations here to avoid code duplication.
+template <typename T>
+class MatcherBase {
+ public:
+  // Returns true if and only if the matcher matches x; also explains the
+  // match result to 'listener'.
+  bool MatchAndExplain(const T &x, MatchResultListener *listener) const {
+    return impl_->MatchAndExplain(x, listener);
+  }
+
+  // Returns true if and only if this matcher matches x.
+  bool Matches(const T &x) const {
+    DummyMatchResultListener dummy;
+    return MatchAndExplain(x, &dummy);
+  }
+
+  // Describes this matcher to an ostream.
+  void DescribeTo(::std::ostream *os) const { impl_->DescribeTo(os); }
+
+  // Describes the negation of this matcher to an ostream.
+  void DescribeNegationTo(::std::ostream *os) const {
+    impl_->DescribeNegationTo(os);
+  }
+
+  // Explains why x matches, or doesn't match, the matcher.
+  void ExplainMatchResultTo(const T &x, ::std::ostream *os) const {
+    StreamMatchResultListener listener(os);
+    MatchAndExplain(x, &listener);
+  }
+
+  // Returns the describer for this matcher object; retains ownership
+  // of the describer, which is only guaranteed to be alive when
+  // this matcher object is alive.
+  const MatcherDescriberInterface *GetDescriber() const { return impl_.get(); }
+
+ protected:
+  MatcherBase() {}
+
+  // Constructs a matcher from its implementation.
+  explicit MatcherBase(const MatcherInterface<const T &> *impl) : impl_(impl) {}
+
+  template <typename U>
+  explicit MatcherBase(
+      const MatcherInterface<U> *impl,
+      typename std::enable_if<!std::is_same<U, const U &>::value>::type * =
+          nullptr)
+      : impl_(new internal::MatcherInterfaceAdapter<U>(impl)) {}
+
+  MatcherBase(const MatcherBase &) = default;
+  MatcherBase &operator=(const MatcherBase &) = default;
+  MatcherBase(MatcherBase &&) = default;
+  MatcherBase &operator=(MatcherBase &&) = default;
+
+  virtual ~MatcherBase() {}
+
+ private:
+  std::shared_ptr<const MatcherInterface<const T &>> impl_;
+};
+
+}  // namespace internal
+
+// A Matcher<T> is a copyable and IMMUTABLE (except by assignment)
+// object that can check whether a value of type T matches.  The
+// implementation of Matcher<T> is just a std::shared_ptr to const
+// MatcherInterface<T>.  Don't inherit from Matcher!
+template <typename T>
+class Matcher : public internal::MatcherBase<T> {
+ public:
+  // Constructs a null matcher.  Needed for storing Matcher objects in STL
+  // containers.  A default-constructed matcher is not yet initialized.  You
+  // cannot use it until a valid value has been assigned to it.
+  explicit Matcher() {}  // NOLINT
+
+  // Constructs a matcher from its implementation.
+  explicit Matcher(const MatcherInterface<const T &> *impl)
+      : internal::MatcherBase<T>(impl) {}
+
+  template <typename U>
+  explicit Matcher(
+      const MatcherInterface<U> *impl,
+      typename std::enable_if<!std::is_same<U, const U &>::value>::type * =
+          nullptr)
+      : internal::MatcherBase<T>(impl) {}
+
+  // Implicit constructor here allows people to write
+  // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
+  Matcher(T value);  // NOLINT
+};
+
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const std::string &>
+    : public internal::MatcherBase<const std::string &> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const std::string &> *impl)
+      : internal::MatcherBase<const std::string &>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<std::string>
+    : public internal::MatcherBase<std::string> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const std::string &> *impl)
+      : internal::MatcherBase<std::string>(impl) {}
+  explicit Matcher(const MatcherInterface<std::string> *impl)
+      : internal::MatcherBase<std::string>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+};
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const internal::StringView &>
+    : public internal::MatcherBase<const internal::StringView &> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const internal::StringView &> *impl)
+      : internal::MatcherBase<const internal::StringView &>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<internal::StringView>
+    : public internal::MatcherBase<internal::StringView> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const internal::StringView &> *impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+  explicit Matcher(const MatcherInterface<internal::StringView> *impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
+};
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+// Prints a matcher in a human-readable format.
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const Matcher<T> &matcher) {
+  matcher.DescribeTo(&os);
+  return os;
+}
+
+// The PolymorphicMatcher class template makes it easy to implement a
+// polymorphic matcher (i.e. a matcher that can match values of more
+// than one type, e.g. Eq(n) and NotNull()).
+//
+// To define a polymorphic matcher, a user should provide an Impl
+// class that has a DescribeTo() method and a DescribeNegationTo()
+// method, and define a member function (or member function template)
+//
+//   bool MatchAndExplain(const Value& value,
+//                        MatchResultListener* listener) const;
+//
+// See the definition of NotNull() for a complete example.
+template <class Impl>
+class PolymorphicMatcher {
+ public:
+  explicit PolymorphicMatcher(const Impl &an_impl) : impl_(an_impl) {}
+
+  // Returns a mutable reference to the underlying matcher
+  // implementation object.
+  Impl &mutable_impl() { return impl_; }
+
+  // Returns an immutable reference to the underlying matcher
+  // implementation object.
+  const Impl &impl() const { return impl_; }
+
+  template <typename T>
+  operator Matcher<T>() const {
+    return Matcher<T>(new MonomorphicImpl<const T &>(impl_));
+  }
+
+ private:
+  template <typename T>
+  class MonomorphicImpl : public MatcherInterface<T> {
+   public:
+    explicit MonomorphicImpl(const Impl &impl) : impl_(impl) {}
+
+    void DescribeTo(::std::ostream *os) const override { impl_.DescribeTo(os); }
+
+    void DescribeNegationTo(::std::ostream *os) const override {
+      impl_.DescribeNegationTo(os);
+    }
+
+    bool MatchAndExplain(T x, MatchResultListener *listener) const override {
+      return impl_.MatchAndExplain(x, listener);
+    }
+
+   private:
+    const Impl impl_;
+  };
+
+  Impl impl_;
+};
+
+// Creates a matcher from its implementation.
+// DEPRECATED: Especially in the generic code, prefer:
+//   Matcher<T>(new MyMatcherImpl<const T&>(...));
+//
+// MakeMatcher may create a Matcher that accepts its argument by value, which
+// leads to unnecessary copies & lack of support for non-copyable types.
+template <typename T>
+inline Matcher<T> MakeMatcher(const MatcherInterface<T> *impl) {
+  return Matcher<T>(impl);
+}
+
+// Creates a polymorphic matcher from its implementation.  This is
+// easier to use than the PolymorphicMatcher<Impl> constructor as it
+// doesn't require you to explicitly write the template argument, e.g.
+//
+//   MakePolymorphicMatcher(foo);
+// vs
+//   PolymorphicMatcher<TypeOfFoo>(foo);
+template <class Impl>
+inline PolymorphicMatcher<Impl> MakePolymorphicMatcher(const Impl &impl) {
+  return PolymorphicMatcher<Impl>(impl);
+}
+
+namespace internal {
+// Implements a matcher that compares a given value with a
+// pre-supplied value using one of the ==, <=, <, etc, operators.  The
+// two values being compared don't have to have the same type.
+//
+// The matcher defined here is polymorphic (for example, Eq(5) can be
+// used to match an int, a short, a double, etc).  Therefore we use
+// a template type conversion operator in the implementation.
+//
+// The following template definition assumes that the Rhs parameter is
+// a "bare" type (i.e. neither 'const T' nor 'T&').
+template <typename D, typename Rhs, typename Op>
+class ComparisonBase {
+ public:
+  explicit ComparisonBase(const Rhs &rhs) : rhs_(rhs) {}
+  template <typename Lhs>
+  operator Matcher<Lhs>() const {
+    return Matcher<Lhs>(new Impl<const Lhs &>(rhs_));
+  }
+
+ private:
+  template <typename T>
+  static const T &Unwrap(const T &v) {
+    return v;
+  }
+  template <typename T>
+  static const T &Unwrap(std::reference_wrapper<T> v) {
+    return v;
+  }
+
+  template <typename Lhs, typename = Rhs>
+  class Impl : public MatcherInterface<Lhs> {
+   public:
+    explicit Impl(const Rhs &rhs) : rhs_(rhs) {}
+    bool MatchAndExplain(Lhs lhs,
+                         MatchResultListener * /* listener */) const override {
+      return Op()(lhs, Unwrap(rhs_));
+    }
+    void DescribeTo(::std::ostream *os) const override {
+      *os << D::Desc() << " ";
+      UniversalPrint(Unwrap(rhs_), os);
+    }
+    void DescribeNegationTo(::std::ostream *os) const override {
+      *os << D::NegatedDesc() << " ";
+      UniversalPrint(Unwrap(rhs_), os);
+    }
+
+   private:
+    Rhs rhs_;
+  };
+  Rhs rhs_;
+};
+
+template <typename Rhs>
+class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
+ public:
+  explicit EqMatcher(const Rhs &rhs)
+      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
+  static const char *Desc() { return "is equal to"; }
+  static const char *NegatedDesc() { return "isn't equal to"; }
+};
+template <typename Rhs>
+class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
+ public:
+  explicit NeMatcher(const Rhs &rhs)
+      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
+  static const char *Desc() { return "isn't equal to"; }
+  static const char *NegatedDesc() { return "is equal to"; }
+};
+template <typename Rhs>
+class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
+ public:
+  explicit LtMatcher(const Rhs &rhs)
+      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
+  static const char *Desc() { return "is <"; }
+  static const char *NegatedDesc() { return "isn't <"; }
+};
+template <typename Rhs>
+class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
+ public:
+  explicit GtMatcher(const Rhs &rhs)
+      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
+  static const char *Desc() { return "is >"; }
+  static const char *NegatedDesc() { return "isn't >"; }
+};
+template <typename Rhs>
+class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
+ public:
+  explicit LeMatcher(const Rhs &rhs)
+      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
+  static const char *Desc() { return "is <="; }
+  static const char *NegatedDesc() { return "isn't <="; }
+};
+template <typename Rhs>
+class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
+ public:
+  explicit GeMatcher(const Rhs &rhs)
+      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
+  static const char *Desc() { return "is >="; }
+  static const char *NegatedDesc() { return "isn't >="; }
+};
+
+// Implements polymorphic matchers MatchesRegex(regex) and
+// ContainsRegex(regex), which can be used as a Matcher<T> as long as
+// T can be converted to a string.
+class MatchesRegexMatcher {
+ public:
+  MatchesRegexMatcher(const RE *regex, bool full_match)
+      : regex_(regex), full_match_(full_match) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView &s,
+                       MatchResultListener *listener) const {
+    return MatchAndExplain(std::string(s), listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  template <typename CharType>
+  bool MatchAndExplain(CharType *s, MatchResultListener *listener) const {
+    return s != nullptr && MatchAndExplain(std::string(s), listener);
+  }
+
+  // Matches anything that can convert to std::string.
+  //
+  // This is a template, not just a plain function with const std::string&,
+  // because absl::string_view has some interfering non-explicit constructors.
+  template <class MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType &s,
+                       MatchResultListener * /* listener */) const {
+    const std::string &s2(s);
+    return full_match_ ? RE::FullMatch(s2, *regex_)
+                       : RE::PartialMatch(s2, *regex_);
+  }
+
+  void DescribeTo(::std::ostream *os) const {
+    *os << (full_match_ ? "matches" : "contains") << " regular expression ";
+    UniversalPrinter<std::string>::Print(regex_->pattern(), os);
+  }
+
+  void DescribeNegationTo(::std::ostream *os) const {
+    *os << "doesn't " << (full_match_ ? "match" : "contain")
+        << " regular expression ";
+    UniversalPrinter<std::string>::Print(regex_->pattern(), os);
+  }
+
+ private:
+  const std::shared_ptr<const RE> regex_;
+  const bool full_match_;
+};
+}  // namespace internal
+
+// Matches a string that fully matches regular expression 'regex'.
+// The matcher takes ownership of 'regex'.
+inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+    const internal::RE *regex) {
+  return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true));
+}
+inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+    const std::string &regex) {
+  return MatchesRegex(new internal::RE(regex));
+}
+
+// Matches a string that contains regular expression 'regex'.
+// The matcher takes ownership of 'regex'.
+inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+    const internal::RE *regex) {
+  return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false));
+}
+inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+    const std::string &regex) {
+  return ContainsRegex(new internal::RE(regex));
+}
+
+// Creates a polymorphic matcher that matches anything equal to x.
+// Note: if the parameter of Eq() were declared as const T&, Eq("foo")
+// wouldn't compile.
+template <typename T>
+inline internal::EqMatcher<T> Eq(T x) {
+  return internal::EqMatcher<T>(x);
+}
+
+// Constructs a Matcher<T> from a 'value' of type T.  The constructed
+// matcher matches any value that's equal to 'value'.
+template <typename T>
+Matcher<T>::Matcher(T value) {
+  *this = Eq(value);
+}
+
+// Creates a monomorphic matcher that matches anything with type Lhs
+// and equal to rhs.  A user may need to use this instead of Eq(...)
+// in order to resolve an overloading ambiguity.
+//
+// TypedEq<T>(x) is just a convenient short-hand for Matcher<T>(Eq(x))
+// or Matcher<T>(x), but more readable than the latter.
+//
+// We could define similar monomorphic matchers for other comparison
+// operations (e.g. TypedLt, TypedGe, and etc), but decided not to do
+// it yet as those are used much less than Eq() in practice.  A user
+// can always write Matcher<T>(Lt(5)) to be explicit about the type,
+// for example.
+template <typename Lhs, typename Rhs>
+inline Matcher<Lhs> TypedEq(const Rhs &rhs) {
+  return Eq(rhs);
+}
+
+// Creates a polymorphic matcher that matches anything >= x.
+template <typename Rhs>
+inline internal::GeMatcher<Rhs> Ge(Rhs x) {
+  return internal::GeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything > x.
+template <typename Rhs>
+inline internal::GtMatcher<Rhs> Gt(Rhs x) {
+  return internal::GtMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything <= x.
+template <typename Rhs>
+inline internal::LeMatcher<Rhs> Le(Rhs x) {
+  return internal::LeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything < x.
+template <typename Rhs>
+inline internal::LtMatcher<Rhs> Lt(Rhs x) {
+  return internal::LtMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything != x.
+template <typename Rhs>
+inline internal::NeMatcher<Rhs> Ne(Rhs x) {
+  return internal::NeMatcher<Rhs>(x);
+}
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251 5046
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-message.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-message.h
index fe879bc..713faca 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-message.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-message.h
@@ -26,10 +26,9 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 //
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the Message class.
 //
@@ -43,16 +42,23 @@
 // to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
 // program!
 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
 #define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
 
 #include <limits>
+#include <memory>
+#include <sstream>
 
 #include "gtest/internal/gtest-port.h"
 
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
 // Ensures that there is at least one operator<< in the global namespace.
 // See Message& operator<<(...) below for why.
-void operator<<(const testing::internal::Secret&, int);
+void operator<<(const testing::internal::Secret &, int);
 
 namespace testing {
 
@@ -86,33 +92,25 @@
  private:
   // The type of basic IO manipulators (endl, ends, and flush) for
   // narrow streams.
-  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+  typedef std::ostream &(*BasicNarrowIoManip)(std::ostream &);
 
  public:
   // Constructs an empty Message.
   Message();
 
   // Copy constructor.
-  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
+  Message(const Message &msg) : ss_(new ::std::stringstream) {  // NOLINT
     *ss_ << msg.GetString();
   }
 
   // Constructs a Message from a C-string.
-  explicit Message(const char* str) : ss_(new ::std::stringstream) {
+  explicit Message(const char *str) : ss_(new ::std::stringstream) {
     *ss_ << str;
   }
 
-#if GTEST_OS_SYMBIAN
-  // Streams a value (either a pointer or not) to this object.
-  template <typename T>
-  inline Message& operator <<(const T& value) {
-    StreamHelper(typename internal::is_pointer<T>::type(), value);
-    return *this;
-  }
-#else
   // Streams a non-pointer value to this object.
   template <typename T>
-  inline Message& operator <<(const T& val) {
+  inline Message &operator<<(const T &val) {
     // Some libraries overload << for STL containers.  These
     // overloads are defined in the global namespace instead of ::std.
     //
@@ -127,7 +125,7 @@
     // from the global namespace.  With this using declaration,
     // overloads of << defined in the global namespace and those
     // visible via Koenig lookup are both exposed in this function.
-    using ::operator <<;
+    using ::operator<<;
     *ss_ << val;
     return *this;
   }
@@ -146,15 +144,14 @@
   // ensure consistent result across compilers, we always treat NULL
   // as "(null)".
   template <typename T>
-  inline Message& operator <<(T* const& pointer) {  // NOLINT
-    if (pointer == NULL) {
+  inline Message &operator<<(T *const &pointer) {  // NOLINT
+    if (pointer == nullptr) {
       *ss_ << "(null)";
     } else {
       *ss_ << pointer;
     }
     return *this;
   }
-#endif  // GTEST_OS_SYMBIAN
 
   // Since the basic IO manipulators are overloaded for both narrow
   // and wide streams, we have to provide this specialized definition
@@ -162,33 +159,25 @@
   // templatized version above.  Without this definition, streaming
   // endl or other basic IO manipulators to Message will confuse the
   // compiler.
-  Message& operator <<(BasicNarrowIoManip val) {
+  Message &operator<<(BasicNarrowIoManip val) {
     *ss_ << val;
     return *this;
   }
 
   // Instead of 1/0, we want to see true/false for bool values.
-  Message& operator <<(bool b) {
-    return *this << (b ? "true" : "false");
-  }
+  Message &operator<<(bool b) { return *this << (b ? "true" : "false"); }
 
   // These two overloads allow streaming a wide C string to a Message
   // using the UTF-8 encoding.
-  Message& operator <<(const wchar_t* wide_c_str);
-  Message& operator <<(wchar_t* wide_c_str);
+  Message &operator<<(const wchar_t *wide_c_str);
+  Message &operator<<(wchar_t *wide_c_str);
 
 #if GTEST_HAS_STD_WSTRING
   // Converts the given wide string to a narrow string using the UTF-8
   // encoding, and streams the result to this Message object.
-  Message& operator <<(const ::std::wstring& wstr);
+  Message &operator<<(const ::std::wstring &wstr);
 #endif  // GTEST_HAS_STD_WSTRING
 
-#if GTEST_HAS_GLOBAL_WSTRING
-  // Converts the given wide string to a narrow string using the UTF-8
-  // encoding, and streams the result to this Message object.
-  Message& operator <<(const ::wstring& wstr);
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
   // Gets the text streamed to this object so far as an std::string.
   // Each '\0' character in the buffer is replaced with "\\0".
   //
@@ -196,40 +185,16 @@
   std::string GetString() const;
 
  private:
-
-#if GTEST_OS_SYMBIAN
-  // These are needed as the Nokia Symbian Compiler cannot decide between
-  // const T& and const T* in a function template. The Nokia compiler _can_
-  // decide between class template specializations for T and T*, so a
-  // tr1::type_traits-like is_pointer works, and we can overload on that.
-  template <typename T>
-  inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) {
-    if (pointer == NULL) {
-      *ss_ << "(null)";
-    } else {
-      *ss_ << pointer;
-    }
-  }
-  template <typename T>
-  inline void StreamHelper(internal::false_type /*is_pointer*/,
-                           const T& value) {
-    // See the comments in Message& operator <<(const T&) above for why
-    // we need this using statement.
-    using ::operator <<;
-    *ss_ << value;
-  }
-#endif  // GTEST_OS_SYMBIAN
-
   // We'll hold the text streamed to this object here.
-  const internal::scoped_ptr< ::std::stringstream> ss_;
+  const std::unique_ptr< ::std::stringstream> ss_;
 
   // We declare (but don't implement) this to prevent the compiler
   // from implementing the assignment operator.
-  void operator=(const Message&);
+  void operator=(const Message &);
 };
 
 // Streams a Message to an ostream.
-inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+inline std::ostream &operator<<(std::ostream &os, const Message &sb) {
   return os << sb.GetString();
 }
 
@@ -240,11 +205,13 @@
 // ::std::string, ::wstring, or ::std::wstring object, each NUL
 // character in it is replaced with "\\0".
 template <typename T>
-std::string StreamableToString(const T& streamable) {
+std::string StreamableToString(const T &streamable) {
   return (Message() << streamable).GetString();
 }
 
 }  // namespace internal
 }  // namespace testing
 
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 #endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
index 038f9ba..8d01df5 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
@@ -1,7 +1,3 @@
-// This file was GENERATED by command:
-//     pump.py gtest-param-test.h.pump
-// DO NOT EDIT BY HAND!!!
-
 // Copyright 2008, Google Inc.
 // All rights reserved.
 //
@@ -31,17 +27,15 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Authors: vladl@google.com (Vlad Losev)
-//
 // Macros and functions for implementing parameterized tests
-// in Google C++ Testing Framework (Google Test)
+// in Google C++ Testing and Mocking Framework (Google Test)
 //
 // This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
 //
+// GOOGLETEST_CM0001 DO NOT DELETE
 #ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
 #define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
 
-
 // Value-parameterized tests allow you to test your code with different
 // parameters without writing multiple copies of the same test.
 //
@@ -76,10 +70,10 @@
   ...
 }
 
-// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test
 // case with any set of parameters you want. Google Test defines a number
 // of functions for generating test parameters. They return what we call
-// (surprise!) parameter generators. Here is a  summary of them, which
+// (surprise!) parameter generators. Here is a summary of them, which
 // are all in the testing namespace:
 //
 //
@@ -97,17 +91,17 @@
 // For more details, see comments at the definitions of these functions below
 // in this file.
 //
-// The following statement will instantiate tests from the FooTest test case
+// The following statement will instantiate tests from the FooTest test suite
 // each with parameter values "meeny", "miny", and "moe".
 
-INSTANTIATE_TEST_CASE_P(InstantiationName,
-                        FooTest,
-                        Values("meeny", "miny", "moe"));
+INSTANTIATE_TEST_SUITE_P(InstantiationName,
+                         FooTest,
+                         Values("meeny", "miny", "moe"));
 
 // To distinguish different instances of the pattern, (yes, you
-// can instantiate it more then once) the first argument to the
-// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
-// actual test case name. Remember to pick unique prefixes for different
+// can instantiate it more than once) the first argument to the
+// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the
+// actual test suite name. Remember to pick unique prefixes for different
 // instantiations. The tests from the instantiation above will have
 // these names:
 //
@@ -124,7 +118,7 @@
 // with parameter values "cat" and "dog":
 
 const char* pets[] = {"cat", "dog"};
-INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
 
 // The tests from the instantiation above will have these names:
 //
@@ -133,9 +127,9 @@
 //    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
 //    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
 //
-// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
-// in the given test case, whether their definitions come before or
-// AFTER the INSTANTIATE_TEST_CASE_P statement.
+// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests
+// in the given test suite, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_SUITE_P statement.
 //
 // Please also note that generator expressions (including parameters to the
 // generators) are evaluated in InitGoogleTest(), after main() has started.
@@ -179,31 +173,23 @@
 
 #endif  // 0
 
-#include "gtest/internal/gtest-port.h"
+#include <iterator>
+#include <utility>
 
-#if !GTEST_OS_SYMBIAN
-# include <utility>
-#endif
-
-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-param-util.h"
-#include "gtest/internal/gtest-param-util-generated.h"
-
-#if GTEST_HAS_PARAM_TEST
+#include "gtest/internal/gtest-port.h"
 
 namespace testing {
 
 // Functions producing parameter generators.
 //
 // Google Test uses these generators to produce parameters for value-
-// parameterized tests. When a parameterized test case is instantiated
+// parameterized tests. When a parameterized test suite is instantiated
 // with a particular generator, Google Test creates and runs tests
 // for each element in the sequence produced by the generator.
 //
-// In the following sample, tests from test case FooTest are instantiated
+// In the following sample, tests from test suite FooTest are instantiated
 // each three times with parameter values 3, 5, and 8:
 //
 // class FooTest : public TestWithParam<int> { ... };
@@ -212,7 +198,7 @@
 // }
 // TEST_P(FooTest, TestThat) {
 // }
-// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8));
 //
 
 // Range() returns generators providing sequences of values in a range.
@@ -269,13 +255,13 @@
 //
 // Examples:
 //
-// This instantiates tests from test case StringTest
+// This instantiates tests from test suite StringTest
 // each with C-string values of "foo", "bar", and "baz":
 //
 // const char* strings[] = {"foo", "bar", "baz"};
-// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings));
+// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings));
 //
-// This instantiates tests from test case StlStringTest
+// This instantiates tests from test suite StlStringTest
 // each with STL strings with values "a" and "b":
 //
 // ::std::vector< ::std::string> GetParameterStrings() {
@@ -285,9 +271,9 @@
 //   return v;
 // }
 //
-// INSTANTIATE_TEST_CASE_P(CharSequence,
-//                         StlStringTest,
-//                         ValuesIn(GetParameterStrings()));
+// INSTANTIATE_TEST_SUITE_P(CharSequence,
+//                          StlStringTest,
+//                          ValuesIn(GetParameterStrings()));
 //
 //
 // This will also instantiate tests from CharTest
@@ -300,16 +286,15 @@
 //   return list;
 // }
 // ::std::list<char> l = GetParameterChars();
-// INSTANTIATE_TEST_CASE_P(CharSequence2,
-//                         CharTest,
-//                         ValuesIn(l.begin(), l.end()));
+// INSTANTIATE_TEST_SUITE_P(CharSequence2,
+//                          CharTest,
+//                          ValuesIn(l.begin(), l.end()));
 //
 template <typename ForwardIterator>
 internal::ParamGenerator<
-  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+    typename std::iterator_traits<ForwardIterator>::value_type>
 ValuesIn(ForwardIterator begin, ForwardIterator end) {
-  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
-      ::value_type ParamType;
+  typedef typename std::iterator_traits<ForwardIterator>::value_type ParamType;
   return internal::ParamGenerator<ParamType>(
       new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
 }
@@ -321,7 +306,7 @@
 
 template <class Container>
 internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container& container) {
+    const Container &container) {
   return ValuesIn(container.begin(), container.end());
 }
 
@@ -332,869 +317,22 @@
 // Values(T v1, T v2, ..., T vN)
 //   - returns a generator producing sequences with elements v1, v2, ..., vN.
 //
-// For example, this instantiates tests from test case BarTest each
+// For example, this instantiates tests from test suite BarTest each
 // with values "one", "two", and "three":
 //
-// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+// INSTANTIATE_TEST_SUITE_P(NumSequence,
+//                          BarTest,
+//                          Values("one", "two", "three"));
 //
-// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
+// This instantiates tests from test suite BazTest each with values 1, 2, 3.5.
 // The exact type of values will depend on the type of parameter in BazTest.
 //
-// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
 //
-// Currently, Values() supports from 1 to 50 parameters.
 //
-template <typename T1>
-internal::ValueArray1<T1> Values(T1 v1) {
-  return internal::ValueArray1<T1>(v1);
-}
-
-template <typename T1, typename T2>
-internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) {
-  return internal::ValueArray2<T1, T2>(v1, v2);
-}
-
-template <typename T1, typename T2, typename T3>
-internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) {
-  return internal::ValueArray3<T1, T2, T3>(v1, v2, v3);
-}
-
-template <typename T1, typename T2, typename T3, typename T4>
-internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) {
-  return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5) {
-  return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3,
-    T4 v4, T5 v5, T6 v6) {
-  return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3,
-    T4 v4, T5 v5, T6 v6, T7 v7) {
-  return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5,
-      v6, v7);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) {
-  return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4,
-      v5, v6, v7, v8);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) {
-  return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3,
-      v4, v5, v6, v7, v8, v9);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1,
-    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) {
-  return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1,
-      v2, v3, v4, v5, v6, v7, v8, v9, v10);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
-    T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11) {
-  return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
-      T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-    T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12) {
-  return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-    T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13) {
-  return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) {
-  return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-      v14);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
-    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) {
-  return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-      v13, v14, v15);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16) {
-  return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
-      v12, v13, v14, v15, v16);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17) {
-  return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
-      v11, v12, v13, v14, v15, v16, v17);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
-    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18) {
-  return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
-      v10, v11, v12, v13, v14, v15, v16, v17, v18);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
-    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
-    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) {
-  return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8,
-      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) {
-  return internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20>(v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) {
-  return internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21>(v1, v2, v3, v4, v5, v6,
-      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22> Values(T1 v1, T2 v2, T3 v3,
-    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22) {
-  return internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22>(v1, v2, v3, v4,
-      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23) {
-  return internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23>(v1, v2, v3,
-      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22, v23);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23, T24 v24) {
-  return internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24>(v1, v2,
-      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
-      v19, v20, v21, v22, v23, v24);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Values(T1 v1,
-    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
-    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
-    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) {
-  return internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25>(v1,
-      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
-      v18, v19, v20, v21, v22, v23, v24, v25);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-    T26> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26) {
-  return internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
-      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-    T27> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27) {
-  return internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
-      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-    T28> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28) {
-  return internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
-      v28);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29) {
-  return internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
-      v27, v28, v29);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
-    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
-    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
-    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) {
-  return internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
-      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
-      v26, v27, v28, v29, v30);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) {
-  return internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
-      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
-      v25, v26, v27, v28, v29, v30, v31);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32) {
-  return internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
-      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
-      v24, v25, v26, v27, v28, v29, v30, v31, v32);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
-    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32, T33 v33) {
-  return internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33>(v1, v2, v3, v4, v5, v6, v7, v8,
-      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
-      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
-    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
-    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
-    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
-    T31 v31, T32 v32, T33 v33, T34 v34) {
-  return internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34>(v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
-      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
-    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
-    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) {
-  return internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35>(v1, v2, v3, v4, v5, v6,
-      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
-      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
-    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
-    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) {
-  return internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36>(v1, v2, v3, v4,
-      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
-      v34, v35, v36);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37> Values(T1 v1, T2 v2, T3 v3,
-    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
-    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
-    T37 v37) {
-  return internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37>(v1, v2, v3,
-      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
-      v34, v35, v36, v37);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
-    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
-    T37 v37, T38 v38) {
-  return internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38>(v1, v2,
-      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
-      v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32,
-      v33, v34, v35, v36, v37, v38);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
-    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
-    T37 v37, T38 v38, T39 v39) {
-  return internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39>(v1,
-      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
-      v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
-      v32, v33, v34, v35, v36, v37, v38, v39);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Values(T1 v1,
-    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
-    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
-    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27,
-    T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
-    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) {
-  return internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
-      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
-      v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-    T41> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) {
-  return internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
-      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28,
-      v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-    T42> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-    T42 v42) {
-  return internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
-      v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41,
-      v42);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-    T43> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-    T42 v42, T43 v43) {
-  return internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
-      v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
-      v41, v42, v43);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-    T42 v42, T43 v43, T44 v44) {
-  return internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
-      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
-      v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39,
-      v40, v41, v42, v43, v44);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
-    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
-    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
-    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
-    T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
-    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) {
-  return internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
-      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
-      v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38,
-      v39, v40, v41, v42, v43, v44, v45);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
-    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) {
-  return internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
-      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
-      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
-      v38, v39, v40, v41, v42, v43, v44, v45, v46);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
-    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) {
-  return internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46, T47>(v1, v2, v3, v4, v5, v6, v7, v8,
-      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
-      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
-      v38, v39, v40, v41, v42, v43, v44, v45, v46, v47);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
-    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
-    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47,
-    T48 v48) {
-  return internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46, T47, T48>(v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
-      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36,
-      v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48, T49> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
-    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
-    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
-    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
-    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38,
-    T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46,
-    T47 v47, T48 v48, T49 v49) {
-  return internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49>(v1, v2, v3, v4, v5, v6,
-      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
-      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35,
-      v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48, T49, T50> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
-    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
-    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37,
-    T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
-    T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) {
-  return internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>(v1, v2, v3, v4,
-      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
-      v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
-      v48, v49, v50);
+template <typename... T>
+internal::ValueArray<T...> Values(T... v) {
+  return internal::ValueArray<T...>(std::move(v)...);
 }
 
 // Bool() allows generating tests with parameters in a set of (false, true).
@@ -1207,7 +345,7 @@
 // of multiple flags can be tested when several Bool()'s are combined using
 // Combine() function.
 //
-// In the following example all tests in the test case FlagDependentTest
+// In the following example all tests in the test suite FlagDependentTest
 // will be instantiated twice with parameters false and true.
 //
 // class FlagDependentTest : public testing::TestWithParam<bool> {
@@ -1215,13 +353,10 @@
 //     external_flag = GetParam();
 //   }
 // }
-// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
 //
-inline internal::ParamGenerator<bool> Bool() {
-  return Values(false, true);
-}
+inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
 
-# if GTEST_HAS_COMBINE
 // Combine() allows the user to combine two or more sequences to produce
 // values of a Cartesian product of those sequences' elements.
 //
@@ -1230,215 +365,143 @@
 //   - returns a generator producing sequences with elements coming from
 //     the Cartesian product of elements from the sequences generated by
 //     gen1, gen2, ..., genN. The sequence elements will have a type of
-//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//     std::tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
 //     of elements from sequences produces by gen1, gen2, ..., genN.
 //
-// Combine can have up to 10 arguments. This number is currently limited
-// by the maximum number of elements in the tuple implementation used by Google
-// Test.
+// Combine can have up to 10 arguments.
 //
 // Example:
 //
-// This will instantiate tests in test case AnimalTest each one with
+// This will instantiate tests in test suite AnimalTest each one with
 // the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
 // tuple("dog", BLACK), and tuple("dog", WHITE):
 //
 // enum Color { BLACK, GRAY, WHITE };
 // class AnimalTest
-//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//     : public testing::TestWithParam<std::tuple<const char*, Color> > {...};
 //
 // TEST_P(AnimalTest, AnimalLooksNice) {...}
 //
-// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
-//                         Combine(Values("cat", "dog"),
-//                                 Values(BLACK, WHITE)));
+// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest,
+//                          Combine(Values("cat", "dog"),
+//                                  Values(BLACK, WHITE)));
 //
 // This will instantiate tests in FlagDependentTest with all variations of two
 // Boolean flags:
 //
 // class FlagDependentTest
-//     : public testing::TestWithParam<tuple<bool, bool> > {
+//     : public testing::TestWithParam<std::tuple<bool, bool> > {
 //   virtual void SetUp() {
 //     // Assigns external_flag_1 and external_flag_2 values from the tuple.
-//     tie(external_flag_1, external_flag_2) = GetParam();
+//     std::tie(external_flag_1, external_flag_2) = GetParam();
 //   }
 // };
 //
 // TEST_P(FlagDependentTest, TestFeature1) {
 //   // Test your code using external_flag_1 and external_flag_2 here.
 // }
-// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
-//                         Combine(Bool(), Bool()));
+// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest,
+//                          Combine(Bool(), Bool()));
 //
-template <typename Generator1, typename Generator2>
-internal::CartesianProductHolder2<Generator1, Generator2> Combine(
-    const Generator1& g1, const Generator2& g2) {
-  return internal::CartesianProductHolder2<Generator1, Generator2>(
-      g1, g2);
+template <typename... Generator>
+internal::CartesianProductHolder<Generator...> Combine(const Generator &... g) {
+  return internal::CartesianProductHolder<Generator...>(g...);
 }
 
-template <typename Generator1, typename Generator2, typename Generator3>
-internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3) {
-  return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>(
-      g1, g2, g3);
-}
+#define TEST_P(test_suite_name, test_name)                                     \
+  class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                     \
+      : public test_suite_name {                                               \
+   public:                                                                     \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                    \
+    void TestBody() override;                                                  \
+                                                                               \
+   private:                                                                    \
+    static int AddToRegistry() {                                               \
+      ::testing::UnitTest::GetInstance()                                       \
+          ->parameterized_test_registry()                                      \
+          .GetTestSuitePatternHolder<test_suite_name>(                         \
+              GTEST_STRINGIFY_(test_suite_name),                               \
+              ::testing::internal::CodeLocation(__FILE__, __LINE__))           \
+          ->AddTestPattern(                                                    \
+              GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name),  \
+              new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
+                  test_suite_name, test_name)>());                             \
+      return 0;                                                                \
+    }                                                                          \
+    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;               \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,    \
+                                                           test_name));        \
+  };                                                                           \
+  int GTEST_TEST_CLASS_NAME_(test_suite_name,                                  \
+                             test_name)::gtest_registering_dummy_ =            \
+      GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry();     \
+  void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
 
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4>
-internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
-    Generator4> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4) {
-  return internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
-      Generator4>(
-      g1, g2, g3, g4);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5>
-internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
-    Generator4, Generator5> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5) {
-  return internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
-      Generator4, Generator5>(
-      g1, g2, g3, g4, g5);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6>
-internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6) {
-  return internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6>(
-      g1, g2, g3, g4, g5, g6);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6,
-    typename Generator7>
-internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6, Generator7> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6,
-        const Generator7& g7) {
-  return internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6, Generator7>(
-      g1, g2, g3, g4, g5, g6, g7);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6,
-    typename Generator7, typename Generator8>
-internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6, Generator7, Generator8> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6,
-        const Generator7& g7, const Generator8& g8) {
-  return internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6, Generator7, Generator8>(
-      g1, g2, g3, g4, g5, g6, g7, g8);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6,
-    typename Generator7, typename Generator8, typename Generator9>
-internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6, Generator7, Generator8,
-    Generator9> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6,
-        const Generator7& g7, const Generator8& g8, const Generator9& g9) {
-  return internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>(
-      g1, g2, g3, g4, g5, g6, g7, g8, g9);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6,
-    typename Generator7, typename Generator8, typename Generator9,
-    typename Generator10>
-internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
-    Generator10> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6,
-        const Generator7& g7, const Generator8& g8, const Generator9& g9,
-        const Generator10& g10) {
-  return internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
-      Generator10>(
-      g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);
-}
-# endif  // GTEST_HAS_COMBINE
-
-
-
-# define TEST_P(test_case_name, test_name) \
-  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
-      : public test_case_name { \
-   public: \
-    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
-    virtual void TestBody(); \
-   private: \
-    static int AddToRegistry() { \
-      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, \
-              ::testing::internal::CodeLocation(\
-                  __FILE__, __LINE__))->AddTestPattern(\
-                      #test_case_name, \
-                      #test_name, \
-                      new ::testing::internal::TestMetaFactory< \
-                          GTEST_TEST_CLASS_NAME_(\
-                              test_case_name, test_name)>()); \
-      return 0; \
-    } \
-    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
-        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
-  }; \
-  int GTEST_TEST_CLASS_NAME_(test_case_name, \
-                             test_name)::gtest_registering_dummy_ = \
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
-  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
-
-// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user
-// to specify a function or functor that generates custom test name suffixes
-// based on the test parameters. The function should accept one argument of
-// type testing::TestParamInfo<class ParamType>, and return std::string.
+// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify
+// generator and an optional function or functor that generates custom test name
+// suffixes based on the test parameters. Such a function or functor should
+// accept one argument of type testing::TestParamInfo<class ParamType>, and
+// return std::string.
 //
 // testing::PrintToStringParamName is a builtin test suffix generator that
-// returns the value of testing::PrintToString(GetParam()). It does not work
-// for std::string or C strings.
+// returns the value of testing::PrintToString(GetParam()).
 //
 // Note: test names must be non-empty, unique, and may only contain ASCII
-// alphanumeric characters or underscore.
+// alphanumeric characters or underscore. Because PrintToString adds quotes
+// to std::string and C strings, it won't work for these types.
 
-# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \
-  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
-      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
-  ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \
-      const ::testing::TestParamInfo<test_case_name::ParamType>& info) { \
-    return ::testing::internal::GetParamNameGen<test_case_name::ParamType> \
-        (__VA_ARGS__)(info); \
-  } \
-  int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, \
-              ::testing::internal::CodeLocation(\
-                  __FILE__, __LINE__))->AddTestCaseInstantiation(\
-                      #prefix, \
-                      &gtest_##prefix##test_case_name##_EvalGenerator_, \
-                      &gtest_##prefix##test_case_name##_EvalGenerateName_, \
-                      __FILE__, __LINE__)
+#define GTEST_EXPAND_(arg) arg
+#define GTEST_GET_FIRST_(first, ...) first
+#define GTEST_GET_SECOND_(first, second, ...) second
+
+#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...)               \
+  static ::testing::internal::ParamGenerator<test_suite_name::ParamType>     \
+      gtest_##prefix##test_suite_name##_EvalGenerator_() {                   \
+    return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_));       \
+  }                                                                          \
+  static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_(  \
+      const ::testing::TestParamInfo<test_suite_name::ParamType> &info) {    \
+    if (::testing::internal::AlwaysFalse()) {                                \
+      ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_(     \
+          __VA_ARGS__,                                                       \
+          ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+          DUMMY_PARAM_)));                                                   \
+      auto t = std::make_tuple(__VA_ARGS__);                                 \
+      static_assert(std::tuple_size<decltype(t)>::value <= 2,                \
+                    "Too Many Args!");                                       \
+    }                                                                        \
+    return ((GTEST_EXPAND_(GTEST_GET_SECOND_(                                \
+        __VA_ARGS__,                                                         \
+        ::testing::internal::DefaultParamName<test_suite_name::ParamType>,   \
+        DUMMY_PARAM_))))(info);                                              \
+  }                                                                          \
+  static int gtest_##prefix##test_suite_name##_dummy_                        \
+      GTEST_ATTRIBUTE_UNUSED_ =                                              \
+          ::testing::UnitTest::GetInstance()                                 \
+              ->parameterized_test_registry()                                \
+              .GetTestSuitePatternHolder<test_suite_name>(                   \
+                  GTEST_STRINGIFY_(test_suite_name),                         \
+                  ::testing::internal::CodeLocation(__FILE__, __LINE__))     \
+              ->AddTestSuiteInstantiation(                                   \
+                  GTEST_STRINGIFY_(prefix),                                  \
+                  &gtest_##prefix##test_suite_name##_EvalGenerator_,         \
+                  &gtest_##prefix##test_suite_name##_EvalGenerateName_,      \
+                  __FILE__, __LINE__)
+
+// Allow Marking a Parameterized test class as not needing to be instantiated.
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T)                  \
+  namespace gtest_do_not_use_outside_namespace_scope {}                   \
+  static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \
+      GTEST_STRINGIFY_(T))
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define INSTANTIATE_TEST_CASE_P                                            \
+  static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \
+                "");                                                       \
+  INSTANTIATE_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 }  // namespace testing
 
-#endif  // GTEST_HAS_PARAM_TEST
-
 #endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h.pump b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h.pump
deleted file mode 100644
index 3078d6d..0000000
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h.pump
+++ /dev/null
@@ -1,510 +0,0 @@
-$$ -*- mode: c++; -*-
-$var n = 50  $$ Maximum length of Values arguments we want to support.
-$var maxtuple = 10  $$ Maximum number of Combine arguments we want to support.
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: vladl@google.com (Vlad Losev)
-//
-// Macros and functions for implementing parameterized tests
-// in Google C++ Testing Framework (Google Test)
-//
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-
-
-// Value-parameterized tests allow you to test your code with different
-// parameters without writing multiple copies of the same test.
-//
-// Here is how you use value-parameterized tests:
-
-#if 0
-
-// To write value-parameterized tests, first you should define a fixture
-// class. It is usually derived from testing::TestWithParam<T> (see below for
-// another inheritance scheme that's sometimes useful in more complicated
-// class hierarchies), where the type of your parameter values.
-// TestWithParam<T> is itself derived from testing::Test. T can be any
-// copyable type. If it's a raw pointer, you are responsible for managing the
-// lifespan of the pointed values.
-
-class FooTest : public ::testing::TestWithParam<const char*> {
-  // You can implement all the usual class fixture members here.
-};
-
-// Then, use the TEST_P macro to define as many parameterized tests
-// for this fixture as you want. The _P suffix is for "parameterized"
-// or "pattern", whichever you prefer to think.
-
-TEST_P(FooTest, DoesBlah) {
-  // Inside a test, access the test parameter with the GetParam() method
-  // of the TestWithParam<T> class:
-  EXPECT_TRUE(foo.Blah(GetParam()));
-  ...
-}
-
-TEST_P(FooTest, HasBlahBlah) {
-  ...
-}
-
-// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
-// case with any set of parameters you want. Google Test defines a number
-// of functions for generating test parameters. They return what we call
-// (surprise!) parameter generators. Here is a  summary of them, which
-// are all in the testing namespace:
-//
-//
-//  Range(begin, end [, step]) - Yields values {begin, begin+step,
-//                               begin+step+step, ...}. The values do not
-//                               include end. step defaults to 1.
-//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
-//  ValuesIn(container)        - Yields values from a C-style array, an STL
-//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
-//  Bool()                     - Yields sequence {false, true}.
-//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
-//                               for the math savvy) of the values generated
-//                               by the N generators.
-//
-// For more details, see comments at the definitions of these functions below
-// in this file.
-//
-// The following statement will instantiate tests from the FooTest test case
-// each with parameter values "meeny", "miny", and "moe".
-
-INSTANTIATE_TEST_CASE_P(InstantiationName,
-                        FooTest,
-                        Values("meeny", "miny", "moe"));
-
-// To distinguish different instances of the pattern, (yes, you
-// can instantiate it more then once) the first argument to the
-// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
-// actual test case name. Remember to pick unique prefixes for different
-// instantiations. The tests from the instantiation above will have
-// these names:
-//
-//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
-//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
-//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
-//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
-//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
-//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
-//
-// You can use these names in --gtest_filter.
-//
-// This statement will instantiate all tests from FooTest again, each
-// with parameter values "cat" and "dog":
-
-const char* pets[] = {"cat", "dog"};
-INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
-
-// The tests from the instantiation above will have these names:
-//
-//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
-//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
-//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
-//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
-//
-// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
-// in the given test case, whether their definitions come before or
-// AFTER the INSTANTIATE_TEST_CASE_P statement.
-//
-// Please also note that generator expressions (including parameters to the
-// generators) are evaluated in InitGoogleTest(), after main() has started.
-// This allows the user on one hand, to adjust generator parameters in order
-// to dynamically determine a set of tests to run and on the other hand,
-// give the user a chance to inspect the generated tests with Google Test
-// reflection API before RUN_ALL_TESTS() is executed.
-//
-// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
-// for more examples.
-//
-// In the future, we plan to publish the API for defining new parameter
-// generators. But for now this interface remains part of the internal
-// implementation and is subject to change.
-//
-//
-// A parameterized test fixture must be derived from testing::Test and from
-// testing::WithParamInterface<T>, where T is the type of the parameter
-// values. Inheriting from TestWithParam<T> satisfies that requirement because
-// TestWithParam<T> inherits from both Test and WithParamInterface. In more
-// complicated hierarchies, however, it is occasionally useful to inherit
-// separately from Test and WithParamInterface. For example:
-
-class BaseTest : public ::testing::Test {
-  // You can inherit all the usual members for a non-parameterized test
-  // fixture here.
-};
-
-class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
-  // The usual test fixture members go here too.
-};
-
-TEST_F(BaseTest, HasFoo) {
-  // This is an ordinary non-parameterized test.
-}
-
-TEST_P(DerivedTest, DoesBlah) {
-  // GetParam works just the same here as if you inherit from TestWithParam.
-  EXPECT_TRUE(foo.Blah(GetParam()));
-}
-
-#endif  // 0
-
-#include "gtest/internal/gtest-port.h"
-
-#if !GTEST_OS_SYMBIAN
-# include <utility>
-#endif
-
-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-param-util.h"
-#include "gtest/internal/gtest-param-util-generated.h"
-
-#if GTEST_HAS_PARAM_TEST
-
-namespace testing {
-
-// Functions producing parameter generators.
-//
-// Google Test uses these generators to produce parameters for value-
-// parameterized tests. When a parameterized test case is instantiated
-// with a particular generator, Google Test creates and runs tests
-// for each element in the sequence produced by the generator.
-//
-// In the following sample, tests from test case FooTest are instantiated
-// each three times with parameter values 3, 5, and 8:
-//
-// class FooTest : public TestWithParam<int> { ... };
-//
-// TEST_P(FooTest, TestThis) {
-// }
-// TEST_P(FooTest, TestThat) {
-// }
-// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
-//
-
-// Range() returns generators providing sequences of values in a range.
-//
-// Synopsis:
-// Range(start, end)
-//   - returns a generator producing a sequence of values {start, start+1,
-//     start+2, ..., }.
-// Range(start, end, step)
-//   - returns a generator producing a sequence of values {start, start+step,
-//     start+step+step, ..., }.
-// Notes:
-//   * The generated sequences never include end. For example, Range(1, 5)
-//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
-//     returns a generator producing {1, 3, 5, 7}.
-//   * start and end must have the same type. That type may be any integral or
-//     floating-point type or a user defined type satisfying these conditions:
-//     * It must be assignable (have operator=() defined).
-//     * It must have operator+() (operator+(int-compatible type) for
-//       two-operand version).
-//     * It must have operator<() defined.
-//     Elements in the resulting sequences will also have that type.
-//   * Condition start < end must be satisfied in order for resulting sequences
-//     to contain any elements.
-//
-template <typename T, typename IncrementT>
-internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
-  return internal::ParamGenerator<T>(
-      new internal::RangeGenerator<T, IncrementT>(start, end, step));
-}
-
-template <typename T>
-internal::ParamGenerator<T> Range(T start, T end) {
-  return Range(start, end, 1);
-}
-
-// ValuesIn() function allows generation of tests with parameters coming from
-// a container.
-//
-// Synopsis:
-// ValuesIn(const T (&array)[N])
-//   - returns a generator producing sequences with elements from
-//     a C-style array.
-// ValuesIn(const Container& container)
-//   - returns a generator producing sequences with elements from
-//     an STL-style container.
-// ValuesIn(Iterator begin, Iterator end)
-//   - returns a generator producing sequences with elements from
-//     a range [begin, end) defined by a pair of STL-style iterators. These
-//     iterators can also be plain C pointers.
-//
-// Please note that ValuesIn copies the values from the containers
-// passed in and keeps them to generate tests in RUN_ALL_TESTS().
-//
-// Examples:
-//
-// This instantiates tests from test case StringTest
-// each with C-string values of "foo", "bar", and "baz":
-//
-// const char* strings[] = {"foo", "bar", "baz"};
-// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings));
-//
-// This instantiates tests from test case StlStringTest
-// each with STL strings with values "a" and "b":
-//
-// ::std::vector< ::std::string> GetParameterStrings() {
-//   ::std::vector< ::std::string> v;
-//   v.push_back("a");
-//   v.push_back("b");
-//   return v;
-// }
-//
-// INSTANTIATE_TEST_CASE_P(CharSequence,
-//                         StlStringTest,
-//                         ValuesIn(GetParameterStrings()));
-//
-//
-// This will also instantiate tests from CharTest
-// each with parameter values 'a' and 'b':
-//
-// ::std::list<char> GetParameterChars() {
-//   ::std::list<char> list;
-//   list.push_back('a');
-//   list.push_back('b');
-//   return list;
-// }
-// ::std::list<char> l = GetParameterChars();
-// INSTANTIATE_TEST_CASE_P(CharSequence2,
-//                         CharTest,
-//                         ValuesIn(l.begin(), l.end()));
-//
-template <typename ForwardIterator>
-internal::ParamGenerator<
-  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
-ValuesIn(ForwardIterator begin, ForwardIterator end) {
-  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
-      ::value_type ParamType;
-  return internal::ParamGenerator<ParamType>(
-      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
-}
-
-template <typename T, size_t N>
-internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
-  return ValuesIn(array, array + N);
-}
-
-template <class Container>
-internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container& container) {
-  return ValuesIn(container.begin(), container.end());
-}
-
-// Values() allows generating tests from explicitly specified list of
-// parameters.
-//
-// Synopsis:
-// Values(T v1, T v2, ..., T vN)
-//   - returns a generator producing sequences with elements v1, v2, ..., vN.
-//
-// For example, this instantiates tests from test case BarTest each
-// with values "one", "two", and "three":
-//
-// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
-//
-// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
-// The exact type of values will depend on the type of parameter in BazTest.
-//
-// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
-//
-// Currently, Values() supports from 1 to $n parameters.
-//
-$range i 1..n
-$for i [[
-$range j 1..i
-
-template <$for j, [[typename T$j]]>
-internal::ValueArray$i<$for j, [[T$j]]> Values($for j, [[T$j v$j]]) {
-  return internal::ValueArray$i<$for j, [[T$j]]>($for j, [[v$j]]);
-}
-
-]]
-
-// Bool() allows generating tests with parameters in a set of (false, true).
-//
-// Synopsis:
-// Bool()
-//   - returns a generator producing sequences with elements {false, true}.
-//
-// It is useful when testing code that depends on Boolean flags. Combinations
-// of multiple flags can be tested when several Bool()'s are combined using
-// Combine() function.
-//
-// In the following example all tests in the test case FlagDependentTest
-// will be instantiated twice with parameters false and true.
-//
-// class FlagDependentTest : public testing::TestWithParam<bool> {
-//   virtual void SetUp() {
-//     external_flag = GetParam();
-//   }
-// }
-// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
-//
-inline internal::ParamGenerator<bool> Bool() {
-  return Values(false, true);
-}
-
-# if GTEST_HAS_COMBINE
-// Combine() allows the user to combine two or more sequences to produce
-// values of a Cartesian product of those sequences' elements.
-//
-// Synopsis:
-// Combine(gen1, gen2, ..., genN)
-//   - returns a generator producing sequences with elements coming from
-//     the Cartesian product of elements from the sequences generated by
-//     gen1, gen2, ..., genN. The sequence elements will have a type of
-//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
-//     of elements from sequences produces by gen1, gen2, ..., genN.
-//
-// Combine can have up to $maxtuple arguments. This number is currently limited
-// by the maximum number of elements in the tuple implementation used by Google
-// Test.
-//
-// Example:
-//
-// This will instantiate tests in test case AnimalTest each one with
-// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
-// tuple("dog", BLACK), and tuple("dog", WHITE):
-//
-// enum Color { BLACK, GRAY, WHITE };
-// class AnimalTest
-//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
-//
-// TEST_P(AnimalTest, AnimalLooksNice) {...}
-//
-// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
-//                         Combine(Values("cat", "dog"),
-//                                 Values(BLACK, WHITE)));
-//
-// This will instantiate tests in FlagDependentTest with all variations of two
-// Boolean flags:
-//
-// class FlagDependentTest
-//     : public testing::TestWithParam<tuple<bool, bool> > {
-//   virtual void SetUp() {
-//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
-//     tie(external_flag_1, external_flag_2) = GetParam();
-//   }
-// };
-//
-// TEST_P(FlagDependentTest, TestFeature1) {
-//   // Test your code using external_flag_1 and external_flag_2 here.
-// }
-// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
-//                         Combine(Bool(), Bool()));
-//
-$range i 2..maxtuple
-$for i [[
-$range j 1..i
-
-template <$for j, [[typename Generator$j]]>
-internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine(
-    $for j, [[const Generator$j& g$j]]) {
-  return internal::CartesianProductHolder$i<$for j, [[Generator$j]]>(
-      $for j, [[g$j]]);
-}
-
-]]
-# endif  // GTEST_HAS_COMBINE
-
-
-
-# define TEST_P(test_case_name, test_name) \
-  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
-      : public test_case_name { \
-   public: \
-    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
-    virtual void TestBody(); \
-   private: \
-    static int AddToRegistry() { \
-      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, \
-              ::testing::internal::CodeLocation(\
-                  __FILE__, __LINE__))->AddTestPattern(\
-                      #test_case_name, \
-                      #test_name, \
-                      new ::testing::internal::TestMetaFactory< \
-                          GTEST_TEST_CLASS_NAME_(\
-                              test_case_name, test_name)>()); \
-      return 0; \
-    } \
-    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
-        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
-  }; \
-  int GTEST_TEST_CLASS_NAME_(test_case_name, \
-                             test_name)::gtest_registering_dummy_ = \
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
-  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
-
-// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user
-// to specify a function or functor that generates custom test name suffixes
-// based on the test parameters. The function should accept one argument of
-// type testing::TestParamInfo<class ParamType>, and return std::string.
-//
-// testing::PrintToStringParamName is a builtin test suffix generator that
-// returns the value of testing::PrintToString(GetParam()).
-//
-// Note: test names must be non-empty, unique, and may only contain ASCII
-// alphanumeric characters or underscore. Because PrintToString adds quotes
-// to std::string and C strings, it won't work for these types.
-
-# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \
-  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
-      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
-  ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \
-      const ::testing::TestParamInfo<test_case_name::ParamType>& info) { \
-    return ::testing::internal::GetParamNameGen<test_case_name::ParamType> \
-        (__VA_ARGS__)(info); \
-  } \
-  int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, \
-              ::testing::internal::CodeLocation(\
-                  __FILE__, __LINE__))->AddTestCaseInstantiation(\
-                      #prefix, \
-                      &gtest_##prefix##test_case_name##_EvalGenerator_, \
-                      &gtest_##prefix##test_case_name##_EvalGenerateName_, \
-                      __FILE__, __LINE__)
-
-}  // namespace testing
-
-#endif  // GTEST_HAS_PARAM_TEST
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
index 8a33164..950247c 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
@@ -26,10 +26,8 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
 
-// Google Test - The Google C++ Testing Framework
+// Google Test - The Google C++ Testing and Mocking Framework
 //
 // This file implements a universal value printer that can print a
 // value of any type T:
@@ -46,6 +44,10 @@
 //   2. operator<<(ostream&, const T&) defined in either foo or the
 //      global namespace.
 //
+// However if T is an STL-style container then it is printed element-wise
+// unless foo::PrintTo(const T&, ostream*) is defined. Note that
+// operator<<() is ignored for container types.
+//
 // If none of the above is defined, it will print the debug string of
 // the value if it is a protocol buffer, or print the raw bytes in the
 // value otherwise.
@@ -92,20 +94,27 @@
 // being defined as many user-defined container types don't have
 // value_type.
 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
 #define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
 
+#include <functional>
 #include <ostream>  // NOLINT
 #include <sstream>
 #include <string>
+#include <tuple>
+#include <type_traits>
 #include <utility>
 #include <vector>
-#include "gtest/internal/gtest-port.h"
 #include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
 
-#if GTEST_HAS_STD_TUPLE_
-# include <tuple>
-#endif
+#if GTEST_HAS_ABSL
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/variant.h"
+#endif  // GTEST_HAS_ABSL
 
 namespace testing {
 
@@ -115,9 +124,8 @@
 
 // Prints the given number of bytes in the given object to the given
 // ostream.
-GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
-                                     size_t count,
-                                     ::std::ostream* os);
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char *obj_bytes,
+                                     size_t count, ::std::ostream *os);
 
 // For selecting which printer to use when a given type has neither <<
 // nor PrintTo().
@@ -125,7 +133,11 @@
   kProtobuf,              // a protobuf type
   kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
                           // (e.g. a named or unnamed enum type)
-  kOtherType              // anything else
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  kConvertibleToStringView,  // a type implicitly convertible to
+                             // absl::string_view or std::string_view
+#endif
+  kOtherType  // anything else
 };
 
 // TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
@@ -136,9 +148,11 @@
 class TypeWithoutFormatter {
  public:
   // This default version is called when kTypeKind is kOtherType.
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    PrintBytesInObjectTo(reinterpret_cast<const unsigned char*>(&value),
-                         sizeof(value), os);
+  static void PrintValue(const T &value, ::std::ostream *os) {
+    PrintBytesInObjectTo(
+        static_cast<const unsigned char *>(
+            reinterpret_cast<const void *>(std::addressof(value))),
+        sizeof(value), os);
   }
 };
 
@@ -150,11 +164,11 @@
 template <typename T>
 class TypeWithoutFormatter<T, kProtobuf> {
  public:
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    const ::testing::internal::string short_str = value.ShortDebugString();
-    const ::testing::internal::string pretty_str =
-        short_str.length() <= kProtobufOneLinerMaxLength ?
-        short_str : ("\n" + value.DebugString());
+  static void PrintValue(const T &value, ::std::ostream *os) {
+    std::string pretty_str = value.ShortDebugString();
+    if (pretty_str.length() > kProtobufOneLinerMaxLength) {
+      pretty_str = "\n" + value.DebugString();
+    }
     *os << ("<" + pretty_str + ">");
   }
 };
@@ -169,12 +183,26 @@
   // case printing it as an integer is the desired behavior.  In case
   // T is not an enum, printing it as an integer is the best we can do
   // given that it has no user-defined printer.
-  static void PrintValue(const T& value, ::std::ostream* os) {
+  static void PrintValue(const T &value, ::std::ostream *os) {
     const internal::BiggestInt kBigInt = value;
     *os << kBigInt;
   }
 };
 
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToStringView> {
+ public:
+  // Since T has neither operator<< nor PrintTo() but can be implicitly
+  // converted to absl::string_view, we print it as a absl::string_view
+  // (or std::string_view).
+  //
+  // Note: the implementation is further below, as it depends on
+  // internal::PrintTo symbol which is defined later in the file.
+  static void PrintValue(const T &value, ::std::ostream *os);
+};
+#endif
+
 // Prints the given value to the given ostream.  If the value is a
 // protocol message, its debug string is printed; if it's an enum or
 // of a type implicitly convertible to BiggestInt, it's printed as an
@@ -200,12 +228,20 @@
 // operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
 // specific.
 template <typename Char, typename CharTraits, typename T>
-::std::basic_ostream<Char, CharTraits>& operator<<(
-    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
-  TypeWithoutFormatter<T,
-      (internal::IsAProtocolMessage<T>::value ? kProtobuf :
-       internal::ImplicitlyConvertible<const T&, internal::BiggestInt>::value ?
-       kConvertibleToInteger : kOtherType)>::PrintValue(x, &os);
+::std::basic_ostream<Char, CharTraits> &operator<<(
+    ::std::basic_ostream<Char, CharTraits> &os, const T &x) {
+  TypeWithoutFormatter<
+      T, (internal::IsAProtocolMessage<T>::value
+              ? kProtobuf
+              : std::is_convertible<const T &, internal::BiggestInt>::value
+                    ? kConvertibleToInteger
+                    :
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+                    std::is_convertible<const T &, internal::StringView>::value
+                        ? kConvertibleToStringView
+                        :
+#endif
+                        kOtherType)>::PrintValue(x, &os);
   return os;
 }
 
@@ -219,7 +255,7 @@
 // Used to print a value that is not an STL-style container when the
 // user doesn't define PrintTo() for it.
 template <typename T>
-void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
+void DefaultPrintNonContainerTo(const T &value, ::std::ostream *os) {
   // With the following statement, during unqualified name lookup,
   // testing::internal2::operator<< appears as if it was declared in
   // the nearest enclosing namespace that contains both
@@ -228,10 +264,8 @@
   // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
   // testing::internal2::operator<< in case T doesn't come with a <<
   // operator.
-  //
-  // We cannot write 'using ::testing::internal2::operator<<;', which
-  // gcc 3.3 fails to compile due to a compiler bug.
-  using namespace ::testing::internal2;  // NOLINT
+
+  using ::testing::internal2::operator<<;
 
   // Assuming T is defined in namespace foo, in the next statement,
   // the compiler will consider all of:
@@ -272,7 +306,7 @@
 template <typename ToPrint, typename OtherOperand>
 class FormatForComparison {
  public:
-  static ::std::string Format(const ToPrint& value) {
+  static ::std::string Format(const ToPrint &value) {
     return ::testing::PrintToString(value);
   }
 };
@@ -281,21 +315,21 @@
 template <typename ToPrint, size_t N, typename OtherOperand>
 class FormatForComparison<ToPrint[N], OtherOperand> {
  public:
-  static ::std::string Format(const ToPrint* value) {
-    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+  static ::std::string Format(const ToPrint *value) {
+    return FormatForComparison<const ToPrint *, OtherOperand>::Format(value);
   }
 };
 
 // By default, print C string as pointers to be safe, as we don't know
 // whether they actually point to a NUL-terminated string.
 
-#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
-  template <typename OtherOperand>                                      \
-  class FormatForComparison<CharType*, OtherOperand> {                  \
-   public:                                                              \
-    static ::std::string Format(CharType* value) {                      \
-      return ::testing::PrintToString(static_cast<const void*>(value)); \
-    }                                                                   \
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                 \
+  template <typename OtherOperand>                                       \
+  class FormatForComparison<CharType *, OtherOperand> {                  \
+   public:                                                               \
+    static ::std::string Format(CharType *value) {                       \
+      return ::testing::PrintToString(static_cast<const void *>(value)); \
+    }                                                                    \
   }
 
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
@@ -309,27 +343,17 @@
 // to point to a NUL-terminated string, and thus can print it as a string.
 
 #define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
-  template <>                                                           \
-  class FormatForComparison<CharType*, OtherStringType> {               \
-   public:                                                              \
-    static ::std::string Format(CharType* value) {                      \
-      return ::testing::PrintToString(value);                           \
-    }                                                                   \
+  template <>                                                            \
+  class FormatForComparison<CharType *, OtherStringType> {               \
+   public:                                                               \
+    static ::std::string Format(CharType *value) {                       \
+      return ::testing::PrintToString(value);                            \
+    }                                                                    \
   }
 
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
 
-#if GTEST_HAS_GLOBAL_STRING
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
-#endif
-
-#if GTEST_HAS_GLOBAL_WSTRING
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
-#endif
-
 #if GTEST_HAS_STD_WSTRING
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
@@ -346,8 +370,8 @@
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 template <typename T1, typename T2>
-std::string FormatForComparisonFailureMessage(
-    const T1& value, const T2& /* other_operand */) {
+std::string FormatForComparisonFailureMessage(const T1 &value,
+                                              const T2 & /* other_operand */) {
   return FormatForComparison<T1, T2>::Format(value);
 }
 
@@ -362,19 +386,27 @@
 class UniversalPrinter;
 
 template <typename T>
-void UniversalPrint(const T& value, ::std::ostream* os);
+void UniversalPrint(const T &value, ::std::ostream *os);
+
+enum DefaultPrinterType {
+  kPrintContainer,
+  kPrintPointer,
+  kPrintFunctionPointer,
+  kPrintOther,
+};
+template <DefaultPrinterType type>
+struct WrapPrinterType {};
 
 // Used to print an STL-style container when the user doesn't define
 // a PrintTo() for it.
 template <typename C>
-void DefaultPrintTo(IsContainer /* dummy */,
-                    false_type /* is not a pointer */,
-                    const C& container, ::std::ostream* os) {
+void DefaultPrintTo(WrapPrinterType<kPrintContainer> /* dummy */,
+                    const C &container, ::std::ostream *os) {
   const size_t kMaxCount = 32;  // The maximum number of elements to print.
   *os << '{';
   size_t count = 0;
-  for (typename C::const_iterator it = container.begin();
-       it != container.end(); ++it, ++count) {
+  for (typename C::const_iterator it = container.begin(); it != container.end();
+       ++it, ++count) {
     if (count > 0) {
       *os << ',';
       if (count == kMaxCount) {  // Enough has been printed.
@@ -401,41 +433,35 @@
 // implementation-defined.  Therefore they will be printed as raw
 // bytes.)
 template <typename T>
-void DefaultPrintTo(IsNotContainer /* dummy */,
-                    true_type /* is a pointer */,
-                    T* p, ::std::ostream* os) {
-  if (p == NULL) {
+void DefaultPrintTo(WrapPrinterType<kPrintPointer> /* dummy */, T *p,
+                    ::std::ostream *os) {
+  if (p == nullptr) {
     *os << "NULL";
   } else {
-    // C++ doesn't allow casting from a function pointer to any object
-    // pointer.
-    //
-    // IsTrue() silences warnings: "Condition is always true",
-    // "unreachable code".
-    if (IsTrue(ImplicitlyConvertible<T*, const void*>::value)) {
-      // T is not a function type.  We just call << to print p,
-      // relying on ADL to pick up user-defined << for their pointer
-      // types, if any.
-      *os << p;
-    } else {
-      // T is a function type, so '*os << p' doesn't do what we want
-      // (it just prints p as bool).  We want to print p as a const
-      // void*.  However, we cannot cast it to const void* directly,
-      // even using reinterpret_cast, as earlier versions of gcc
-      // (e.g. 3.4.5) cannot compile the cast when p is a function
-      // pointer.  Casting to UInt64 first solves the problem.
-      *os << reinterpret_cast<const void*>(
-          reinterpret_cast<internal::UInt64>(p));
-    }
+    // T is not a function type.  We just call << to print p,
+    // relying on ADL to pick up user-defined << for their pointer
+    // types, if any.
+    *os << p;
+  }
+}
+template <typename T>
+void DefaultPrintTo(WrapPrinterType<kPrintFunctionPointer> /* dummy */, T *p,
+                    ::std::ostream *os) {
+  if (p == nullptr) {
+    *os << "NULL";
+  } else {
+    // T is a function type, so '*os << p' doesn't do what we want
+    // (it just prints p as bool).  We want to print p as a const
+    // void*.
+    *os << reinterpret_cast<const void *>(p);
   }
 }
 
 // Used to print a non-container, non-pointer value when the user
 // doesn't define PrintTo() for it.
 template <typename T>
-void DefaultPrintTo(IsNotContainer /* dummy */,
-                    false_type /* is not a pointer */,
-                    const T& value, ::std::ostream* os) {
+void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */, const T &value,
+                    ::std::ostream *os) {
   ::testing_internal::DefaultPrintNonContainerTo(value, os);
 }
 
@@ -451,12 +477,9 @@
 // or there is already a << operator but it doesn't do what the user
 // wants).
 template <typename T>
-void PrintTo(const T& value, ::std::ostream* os) {
-  // DefaultPrintTo() is overloaded.  The type of its first two
-  // arguments determine which version will be picked.  If T is an
-  // STL-style container, the version for container will be called; if
-  // T is a pointer, the pointer version will be called; otherwise the
-  // generic version will be called.
+void PrintTo(const T &value, ::std::ostream *os) {
+  // DefaultPrintTo() is overloaded.  The type of its first argument
+  // determines which version will be picked.
   //
   // Note that we check for container types here, prior to we check
   // for protocol message types in our operator<<.  The rationale is:
@@ -468,13 +491,23 @@
   // elements; therefore we check for container types here to ensure
   // that our format is used.
   //
-  // The second argument of DefaultPrintTo() is needed to bypass a bug
-  // in Symbian's C++ compiler that prevents it from picking the right
-  // overload between:
-  //
-  //   PrintTo(const T& x, ...);
-  //   PrintTo(T* x, ...);
-  DefaultPrintTo(IsContainerTest<T>(0), is_pointer<T>(), value, os);
+  // Note that MSVC and clang-cl do allow an implicit conversion from
+  // pointer-to-function to pointer-to-object, but clang-cl warns on it.
+  // So don't use ImplicitlyConvertible if it can be helped since it will
+  // cause this warning, and use a separate overload of DefaultPrintTo for
+  // function pointers so that the `*os << p` in the object pointer overload
+  // doesn't cause that warning either.
+  DefaultPrintTo(
+      WrapPrinterType <
+                  (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
+              !IsRecursiveContainer<T>::value
+          ? kPrintContainer
+          : !std::is_pointer<T>::value
+                ? kPrintOther
+                : std::is_function<typename std::remove_pointer<T>::type>::value
+                      ? kPrintFunctionPointer
+                      : kPrintPointer > (),
+      value, os);
 }
 
 // The following list of PrintTo() overloads tells
@@ -482,9 +515,9 @@
 // types, strings, plain arrays, and pointers).
 
 // Overloads for various char types.
-GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
-GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
-inline void PrintTo(char c, ::std::ostream* os) {
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream *os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream *os);
+inline void PrintTo(char c, ::std::ostream *os) {
   // When printing a plain char, we always treat it as unsigned.  This
   // way, the output won't be affected by whether the compiler thinks
   // char is signed or not.
@@ -492,7 +525,7 @@
 }
 
 // Overloads for other simple built-in types.
-inline void PrintTo(bool x, ::std::ostream* os) {
+inline void PrintTo(bool x, ::std::ostream *os) {
   *os << (x ? "true" : "false");
 }
 
@@ -503,27 +536,27 @@
 // as signed integer when wchar_t is implemented by the compiler
 // as a signed type and is printed as an unsigned integer when wchar_t
 // is implemented as an unsigned type.
-GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream *os);
 
 // Overloads for C strings.
-GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
-inline void PrintTo(char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const char*>(s), os);
+GTEST_API_ void PrintTo(const char *s, ::std::ostream *os);
+inline void PrintTo(char *s, ::std::ostream *os) {
+  PrintTo(ImplicitCast_<const char *>(s), os);
 }
 
 // signed/unsigned char is often used for representing binary data, so
 // we print pointers to it as void* to be safe.
-inline void PrintTo(const signed char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
+inline void PrintTo(const signed char *s, ::std::ostream *os) {
+  PrintTo(ImplicitCast_<const void *>(s), os);
 }
-inline void PrintTo(signed char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
+inline void PrintTo(signed char *s, ::std::ostream *os) {
+  PrintTo(ImplicitCast_<const void *>(s), os);
 }
-inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
+inline void PrintTo(const unsigned char *s, ::std::ostream *os) {
+  PrintTo(ImplicitCast_<const void *>(s), os);
 }
-inline void PrintTo(unsigned char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
+inline void PrintTo(unsigned char *s, ::std::ostream *os) {
+  PrintTo(ImplicitCast_<const void *>(s), os);
 }
 
 // MSVC can be configured to define wchar_t as a typedef of unsigned
@@ -533,9 +566,9 @@
 // possibly causing invalid memory accesses.
 #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
 // Overloads for wide C strings
-GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
-inline void PrintTo(wchar_t* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
+GTEST_API_ void PrintTo(const wchar_t *s, ::std::ostream *os);
+inline void PrintTo(wchar_t *s, ::std::ostream *os) {
+  PrintTo(ImplicitCast_<const wchar_t *>(s), os);
 }
 #endif
 
@@ -545,7 +578,7 @@
 // Prints the given number of elements in an array, without printing
 // the curly braces.
 template <typename T>
-void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream *os) {
   UniversalPrint(a[0], os);
   for (size_t i = 1; i != count; i++) {
     *os << ", ";
@@ -553,127 +586,63 @@
   }
 }
 
-// Overloads for ::string and ::std::string.
-#if GTEST_HAS_GLOBAL_STRING
-GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os);
-inline void PrintTo(const ::string& s, ::std::ostream* os) {
-  PrintStringTo(s, os);
-}
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
-inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+// Overloads for ::std::string.
+GTEST_API_ void PrintStringTo(const ::std::string &s, ::std::ostream *os);
+inline void PrintTo(const ::std::string &s, ::std::ostream *os) {
   PrintStringTo(s, os);
 }
 
-// Overloads for ::wstring and ::std::wstring.
-#if GTEST_HAS_GLOBAL_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os);
-inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
-  PrintWideStringTo(s, os);
-}
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
+// Overloads for ::std::wstring.
 #if GTEST_HAS_STD_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
-inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+GTEST_API_ void PrintWideStringTo(const ::std::wstring &s, ::std::ostream *os);
+inline void PrintTo(const ::std::wstring &s, ::std::ostream *os) {
   PrintWideStringTo(s, os);
 }
 #endif  // GTEST_HAS_STD_WSTRING
 
-#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Overload for internal::StringView.
+inline void PrintTo(internal::StringView sp, ::std::ostream *os) {
+  PrintTo(::std::string(sp), os);
+}
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+inline void PrintTo(std::nullptr_t, ::std::ostream *os) { *os << "(nullptr)"; }
+
+template <typename T>
+void PrintTo(std::reference_wrapper<T> ref, ::std::ostream *os) {
+  UniversalPrinter<T &>::Print(ref.get(), os);
+}
+
 // Helper function for printing a tuple.  T must be instantiated with
 // a tuple type.
 template <typename T>
-void PrintTupleTo(const T& t, ::std::ostream* os);
-#endif  // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+void PrintTupleTo(const T &, std::integral_constant<size_t, 0>,
+                  ::std::ostream *) {}
 
-#if GTEST_HAS_TR1_TUPLE
-// Overload for ::std::tr1::tuple.  Needed for printing function arguments,
-// which are packed as tuples.
-
-// Overloaded PrintTo() for tuples of various arities.  We support
-// tuples of up-to 10 fields.  The following implementation works
-// regardless of whether tr1::tuple is implemented using the
-// non-standard variadic template feature or not.
-
-inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
+template <typename T, size_t I>
+void PrintTupleTo(const T &t, std::integral_constant<size_t, I>,
+                  ::std::ostream *os) {
+  PrintTupleTo(t, std::integral_constant<size_t, I - 1>(), os);
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (I > 1) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    *os << ", ";
+  }
+  UniversalPrinter<typename std::tuple_element<I - 1, T>::type>::Print(
+      std::get<I - 1>(t), os);
 }
 
-template <typename T1>
-void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2>
-void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6, typename T7>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6, typename T7, typename T8>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6, typename T7, typename T8, typename T9>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6, typename T7, typename T8, typename T9, typename T10>
-void PrintTo(
-    const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
-    ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-#endif  // GTEST_HAS_TR1_TUPLE
-
-#if GTEST_HAS_STD_TUPLE_
 template <typename... Types>
-void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
+void PrintTo(const ::std::tuple<Types...> &t, ::std::ostream *os) {
+  *os << "(";
+  PrintTupleTo(t, std::integral_constant<size_t, sizeof...(Types)>(), os);
+  *os << ")";
 }
-#endif  // GTEST_HAS_STD_TUPLE_
 
 // Overload for std::pair.
 template <typename T1, typename T2>
-void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+void PrintTo(const ::std::pair<T1, T2> &value, ::std::ostream *os) {
   *os << '(';
   // We cannot use UniversalPrint(value.first, os) here, as T1 may be
   // a reference type.  The same for printing value.second.
@@ -695,7 +664,7 @@
   // Note: we deliberately don't call this PrintTo(), as that name
   // conflicts with ::testing::internal::PrintTo in the body of the
   // function.
-  static void Print(const T& value, ::std::ostream* os) {
+  static void Print(const T &value, ::std::ostream *os) {
     // By default, ::testing::internal::PrintTo() is used for printing
     // the value.
     //
@@ -710,10 +679,52 @@
   GTEST_DISABLE_MSC_WARNINGS_POP_()
 };
 
+#if GTEST_HAS_ABSL
+
+// Printer for absl::optional
+
+template <typename T>
+class UniversalPrinter<::absl::optional<T>> {
+ public:
+  static void Print(const ::absl::optional<T> &value, ::std::ostream *os) {
+    *os << '(';
+    if (!value) {
+      *os << "nullopt";
+    } else {
+      UniversalPrint(*value, os);
+    }
+    *os << ')';
+  }
+};
+
+// Printer for absl::variant
+
+template <typename... T>
+class UniversalPrinter<::absl::variant<T...>> {
+ public:
+  static void Print(const ::absl::variant<T...> &value, ::std::ostream *os) {
+    *os << '(';
+    absl::visit(Visitor{ os }, value);
+    *os << ')';
+  }
+
+ private:
+  struct Visitor {
+    template <typename U>
+    void operator()(const U &u) const {
+      *os << "'" << GetTypeName<U>() << "' with value ";
+      UniversalPrint(u, os);
+    }
+    ::std::ostream *os;
+  };
+};
+
+#endif  // GTEST_HAS_ABSL
+
 // UniversalPrintArray(begin, len, os) prints an array of 'len'
 // elements, starting at address 'begin'.
 template <typename T>
-void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+void UniversalPrintArray(const T *begin, size_t len, ::std::ostream *os) {
   if (len == 0) {
     *os << "{}";
   } else {
@@ -723,7 +734,6 @@
     // If the array has more than kThreshold elements, we'll have to
     // omit some details by printing only the first and the last
     // kChunkSize elements.
-    // TODO(wan@google.com): let the user control the threshold using a flag.
     if (len <= kThreshold) {
       PrintRawArrayTo(begin, len, os);
     } else {
@@ -735,12 +745,12 @@
   }
 }
 // This overload prints a (const) char array compactly.
-GTEST_API_ void UniversalPrintArray(
-    const char* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const char *begin, size_t len,
+                                    ::std::ostream *os);
 
 // This overload prints a (const) wchar_t array compactly.
-GTEST_API_ void UniversalPrintArray(
-    const wchar_t* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const wchar_t *begin, size_t len,
+                                    ::std::ostream *os);
 
 // Implements printing an array type T[N].
 template <typename T, size_t N>
@@ -748,23 +758,23 @@
  public:
   // Prints the given array, omitting some elements when there are too
   // many.
-  static void Print(const T (&a)[N], ::std::ostream* os) {
+  static void Print(const T (&a)[N], ::std::ostream *os) {
     UniversalPrintArray(a, N, os);
   }
 };
 
 // Implements printing a reference type T&.
 template <typename T>
-class UniversalPrinter<T&> {
+class UniversalPrinter<T &> {
  public:
   // MSVC warns about adding const to a function type, so we want to
   // disable the warning.
   GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
 
-  static void Print(const T& value, ::std::ostream* os) {
+  static void Print(const T &value, ::std::ostream *os) {
     // Prints the address of the value.  We use reinterpret_cast here
     // as static_cast doesn't compile when T is a function type.
-    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
+    *os << "@" << reinterpret_cast<const void *>(&value) << " ";
 
     // Then prints the value itself.
     UniversalPrint(value, os);
@@ -780,49 +790,49 @@
 template <typename T>
 class UniversalTersePrinter {
  public:
-  static void Print(const T& value, ::std::ostream* os) {
+  static void Print(const T &value, ::std::ostream *os) {
     UniversalPrint(value, os);
   }
 };
 template <typename T>
-class UniversalTersePrinter<T&> {
+class UniversalTersePrinter<T &> {
  public:
-  static void Print(const T& value, ::std::ostream* os) {
+  static void Print(const T &value, ::std::ostream *os) {
     UniversalPrint(value, os);
   }
 };
 template <typename T, size_t N>
 class UniversalTersePrinter<T[N]> {
  public:
-  static void Print(const T (&value)[N], ::std::ostream* os) {
+  static void Print(const T (&value)[N], ::std::ostream *os) {
     UniversalPrinter<T[N]>::Print(value, os);
   }
 };
 template <>
-class UniversalTersePrinter<const char*> {
+class UniversalTersePrinter<const char *> {
  public:
-  static void Print(const char* str, ::std::ostream* os) {
-    if (str == NULL) {
+  static void Print(const char *str, ::std::ostream *os) {
+    if (str == nullptr) {
       *os << "NULL";
     } else {
-      UniversalPrint(string(str), os);
+      UniversalPrint(std::string(str), os);
     }
   }
 };
 template <>
-class UniversalTersePrinter<char*> {
+class UniversalTersePrinter<char *> {
  public:
-  static void Print(char* str, ::std::ostream* os) {
-    UniversalTersePrinter<const char*>::Print(str, os);
+  static void Print(char *str, ::std::ostream *os) {
+    UniversalTersePrinter<const char *>::Print(str, os);
   }
 };
 
 #if GTEST_HAS_STD_WSTRING
 template <>
-class UniversalTersePrinter<const wchar_t*> {
+class UniversalTersePrinter<const wchar_t *> {
  public:
-  static void Print(const wchar_t* str, ::std::ostream* os) {
-    if (str == NULL) {
+  static void Print(const wchar_t *str, ::std::ostream *os) {
+    if (str == nullptr) {
       *os << "NULL";
     } else {
       UniversalPrint(::std::wstring(str), os);
@@ -832,15 +842,15 @@
 #endif
 
 template <>
-class UniversalTersePrinter<wchar_t*> {
+class UniversalTersePrinter<wchar_t *> {
  public:
-  static void Print(wchar_t* str, ::std::ostream* os) {
-    UniversalTersePrinter<const wchar_t*>::Print(str, os);
+  static void Print(wchar_t *str, ::std::ostream *os) {
+    UniversalTersePrinter<const wchar_t *>::Print(str, os);
   }
 };
 
 template <typename T>
-void UniversalTersePrint(const T& value, ::std::ostream* os) {
+void UniversalTersePrint(const T &value, ::std::ostream *os) {
   UniversalTersePrinter<T>::Print(value, os);
 }
 
@@ -849,135 +859,57 @@
 // (const) char pointer, this prints both the pointer and the
 // NUL-terminated string.
 template <typename T>
-void UniversalPrint(const T& value, ::std::ostream* os) {
+void UniversalPrint(const T &value, ::std::ostream *os) {
   // A workarond for the bug in VC++ 7.1 that prevents us from instantiating
   // UniversalPrinter with T directly.
   typedef T T1;
   UniversalPrinter<T1>::Print(value, os);
 }
 
-typedef ::std::vector<string> Strings;
+typedef ::std::vector<::std::string> Strings;
 
-// TuplePolicy<TupleT> must provide:
-// - tuple_size
-//     size of tuple TupleT.
-// - get<size_t I>(const TupleT& t)
-//     static function extracting element I of tuple TupleT.
-// - tuple_element<size_t I>::type
-//     type of element I of tuple TupleT.
-template <typename TupleT>
-struct TuplePolicy;
-
-#if GTEST_HAS_TR1_TUPLE
-template <typename TupleT>
-struct TuplePolicy {
-  typedef TupleT Tuple;
-  static const size_t tuple_size = ::std::tr1::tuple_size<Tuple>::value;
-
-  template <size_t I>
-  struct tuple_element : ::std::tr1::tuple_element<I, Tuple> {};
-
-  template <size_t I>
-  static typename AddReference<
-      const typename ::std::tr1::tuple_element<I, Tuple>::type>::type get(
-      const Tuple& tuple) {
-    return ::std::tr1::get<I>(tuple);
-  }
-};
-template <typename TupleT>
-const size_t TuplePolicy<TupleT>::tuple_size;
-#endif  // GTEST_HAS_TR1_TUPLE
-
-#if GTEST_HAS_STD_TUPLE_
-template <typename... Types>
-struct TuplePolicy< ::std::tuple<Types...> > {
-  typedef ::std::tuple<Types...> Tuple;
-  static const size_t tuple_size = ::std::tuple_size<Tuple>::value;
-
-  template <size_t I>
-  struct tuple_element : ::std::tuple_element<I, Tuple> {};
-
-  template <size_t I>
-  static const typename ::std::tuple_element<I, Tuple>::type& get(
-      const Tuple& tuple) {
-    return ::std::get<I>(tuple);
-  }
-};
-template <typename... Types>
-const size_t TuplePolicy< ::std::tuple<Types...> >::tuple_size;
-#endif  // GTEST_HAS_STD_TUPLE_
-
-#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
-// This helper template allows PrintTo() for tuples and
-// UniversalTersePrintTupleFieldsToStrings() to be defined by
-// induction on the number of tuple fields.  The idea is that
-// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
-// fields in tuple t, and can be defined in terms of
-// TuplePrefixPrinter<N - 1>.
-//
-// The inductive case.
-template <size_t N>
-struct TuplePrefixPrinter {
-  // Prints the first N fields of a tuple.
-  template <typename Tuple>
-  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
-    TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
-    GTEST_INTENTIONAL_CONST_COND_PUSH_()
-    if (N > 1) {
-    GTEST_INTENTIONAL_CONST_COND_POP_()
-      *os << ", ";
-    }
-    UniversalPrinter<
-        typename TuplePolicy<Tuple>::template tuple_element<N - 1>::type>
-        ::Print(TuplePolicy<Tuple>::template get<N - 1>(t), os);
-  }
-
-  // Tersely prints the first N fields of a tuple to a string vector,
-  // one element for each field.
-  template <typename Tuple>
-  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
-    TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
-    ::std::stringstream ss;
-    UniversalTersePrint(TuplePolicy<Tuple>::template get<N - 1>(t), &ss);
-    strings->push_back(ss.str());
-  }
-};
-
-// Base case.
-template <>
-struct TuplePrefixPrinter<0> {
-  template <typename Tuple>
-  static void PrintPrefixTo(const Tuple&, ::std::ostream*) {}
-
-  template <typename Tuple>
-  static void TersePrintPrefixToStrings(const Tuple&, Strings*) {}
-};
-
-// Helper function for printing a tuple.
-// Tuple must be either std::tr1::tuple or std::tuple type.
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
 template <typename Tuple>
-void PrintTupleTo(const Tuple& t, ::std::ostream* os) {
-  *os << "(";
-  TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::PrintPrefixTo(t, os);
-  *os << ")";
+void TersePrintPrefixToStrings(const Tuple &, std::integral_constant<size_t, 0>,
+                               Strings *) {}
+template <typename Tuple, size_t I>
+void TersePrintPrefixToStrings(const Tuple &t,
+                               std::integral_constant<size_t, I>,
+                               Strings *strings) {
+  TersePrintPrefixToStrings(t, std::integral_constant<size_t, I - 1>(),
+                            strings);
+  ::std::stringstream ss;
+  UniversalTersePrint(std::get<I - 1>(t), &ss);
+  strings->push_back(ss.str());
 }
 
 // Prints the fields of a tuple tersely to a string vector, one
 // element for each field.  See the comment before
 // UniversalTersePrint() for how we define "tersely".
 template <typename Tuple>
-Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple &value) {
   Strings result;
-  TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::
-      TersePrintPrefixToStrings(value, &result);
+  TersePrintPrefixToStrings(
+      value, std::integral_constant<size_t, std::tuple_size<Tuple>::value>(),
+      &result);
   return result;
 }
-#endif  // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
 
 }  // namespace internal
 
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+namespace internal2 {
 template <typename T>
-::std::string PrintToString(const T& value) {
+void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
+    const T &value, ::std::ostream *os) {
+  internal::PrintTo(internal::StringView(value), os);
+}
+}  // namespace internal2
+#endif
+
+template <typename T>
+::std::string PrintToString(const T &value) {
   ::std::stringstream ss;
   internal::UniversalTersePrinter<T>::Print(value, &ss);
   return ss.str();
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
index f63fa9a..e263b10 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
@@ -26,17 +26,21 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
+
 //
 // Utilities for testing Google Test itself and code that uses Google Test
 // (e.g. frameworks built on top of Google Test).
 
+// GOOGLETEST_CM0004 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
 #define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
 
 #include "gtest/gtest.h"
 
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
 namespace testing {
 
 // This helper class can be used to mock out Google Test failure reporting
@@ -61,27 +65,28 @@
   // by Google Test.  The 'result' parameter specifies where to report the
   // results. This reporter will only catch failures generated in the current
   // thread. DEPRECATED
-  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray *result);
 
   // Same as above, but you can choose the interception scope of this object.
   ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
-                                   TestPartResultArray* result);
+                                   TestPartResultArray *result);
 
   // The d'tor restores the previous test part result reporter.
-  virtual ~ScopedFakeTestPartResultReporter();
+  ~ScopedFakeTestPartResultReporter() override;
 
   // Appends the TestPartResult object to the TestPartResultArray
   // received in the constructor.
   //
   // This method is from the TestPartResultReporterInterface
   // interface.
-  virtual void ReportTestPartResult(const TestPartResult& result);
+  void ReportTestPartResult(const TestPartResult &result) override;
+
  private:
   void Init();
 
   const InterceptMode intercept_mode_;
-  TestPartResultReporterInterface* old_reporter_;
-  TestPartResultArray* const result_;
+  TestPartResultReporterInterface *old_reporter_;
+  TestPartResultArray *const result_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
 };
@@ -96,14 +101,14 @@
 class GTEST_API_ SingleFailureChecker {
  public:
   // The constructor remembers the arguments.
-  SingleFailureChecker(const TestPartResultArray* results,
-                       TestPartResult::Type type,
-                       const string& substr);
+  SingleFailureChecker(const TestPartResultArray *results,
+                       TestPartResult::Type type, const std::string &substr);
   ~SingleFailureChecker();
+
  private:
-  const TestPartResultArray* const results_;
+  const TestPartResultArray *const results_;
   const TestPartResult::Type type_;
-  const string substr_;
+  const std::string substr_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
 };
@@ -112,6 +117,8 @@
 
 }  // namespace testing
 
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 // A set of macros for testing Google Test assertions or code that's expected
 // to generate Google Test fatal failures.  It verifies that the given
 // statement will cause exactly one fatal Google Test failure with 'substr'
@@ -135,38 +142,39 @@
 // helper macro, due to some peculiarity in how the preprocessor
 // works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
 // gtest_unittest.cc will fail to compile if we do that.
-#define EXPECT_FATAL_FAILURE(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
+#define EXPECT_FATAL_FAILURE(statement, substr)                               \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::                       \
+              INTERCEPT_ONLY_CURRENT_THREAD,                                  \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
   } while (::testing::internal::AlwaysFalse())
 
-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ALL_THREADS, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr)                \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
   } while (::testing::internal::AlwaysFalse())
 
 // A macro for testing Google Test assertions or code that's expected to
@@ -201,32 +209,37 @@
 // instead of
 //   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
 // to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
+#define EXPECT_NONFATAL_FAILURE(statement, substr)                    \
+  do {                                                                \
+    ::testing::TestPartResultArray gtest_failures;                    \
+    ::testing::internal::SingleFailureChecker gtest_checker(          \
         &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
+        (substr));                                                    \
+    {                                                                 \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(     \
+          ::testing::ScopedFakeTestPartResultReporter::               \
+              INTERCEPT_ONLY_CURRENT_THREAD,                          \
+          &gtest_failures);                                           \
+      if (::testing::internal::AlwaysTrue()) {                        \
+        statement;                                                    \
+      }                                                               \
+    }                                                                 \
   } while (::testing::internal::AlwaysFalse())
 
-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr)             \
+  do {                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure,         \
+        (substr));                                                            \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
           ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
-          &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
+          &gtest_failures);                                                   \
+      if (::testing::internal::AlwaysTrue()) {                                \
+        statement;                                                            \
+      }                                                                       \
+    }                                                                         \
   } while (::testing::internal::AlwaysFalse())
 
 #endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
index 77eb844..a28afb3 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
@@ -27,8 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Author: mheule@google.com (Markus Heule)
-//
+// GOOGLETEST_CM0001 DO NOT DELETE
 
 #ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
 #define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
@@ -38,6 +37,9 @@
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-string.h"
 
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
 namespace testing {
 
 // A copyable object representing the result of a test part (i.e. an
@@ -51,30 +53,26 @@
   enum Type {
     kSuccess,          // Succeeded.
     kNonFatalFailure,  // Failed but the test can continue.
-    kFatalFailure      // Failed and the test should be terminated.
+    kFatalFailure,     // Failed and the test should be terminated.
+    kSkip              // Skipped.
   };
 
   // C'tor.  TestPartResult does NOT have a default constructor.
   // Always use this constructor (with parameters) to create a
   // TestPartResult object.
-  TestPartResult(Type a_type,
-                 const char* a_file_name,
-                 int a_line_number,
-                 const char* a_message)
-      : type_(a_type),
-        file_name_(a_file_name == NULL ? "" : a_file_name),
-        line_number_(a_line_number),
-        summary_(ExtractSummary(a_message)),
-        message_(a_message) {
-  }
+  TestPartResult(Type a_type, const char *a_file_name, int a_line_number,
+                 const char *a_message)
+      : type_(a_type), file_name_(a_file_name == nullptr ? "" : a_file_name),
+        line_number_(a_line_number), summary_(ExtractSummary(a_message)),
+        message_(a_message) {}
 
   // Gets the outcome of the test part.
   Type type() const { return type_; }
 
   // Gets the name of the source file where the test part took place, or
   // NULL if it's unknown.
-  const char* file_name() const {
-    return file_name_.empty() ? NULL : file_name_.c_str();
+  const char *file_name() const {
+    return file_name_.empty() ? nullptr : file_name_.c_str();
   }
 
   // Gets the line in the source file where the test part took place,
@@ -82,29 +80,32 @@
   int line_number() const { return line_number_; }
 
   // Gets the summary of the failure message.
-  const char* summary() const { return summary_.c_str(); }
+  const char *summary() const { return summary_.c_str(); }
 
   // Gets the message associated with the test part.
-  const char* message() const { return message_.c_str(); }
+  const char *message() const { return message_.c_str(); }
 
-  // Returns true iff the test part passed.
+  // Returns true if and only if the test part was skipped.
+  bool skipped() const { return type_ == kSkip; }
+
+  // Returns true if and only if the test part passed.
   bool passed() const { return type_ == kSuccess; }
 
-  // Returns true iff the test part failed.
-  bool failed() const { return type_ != kSuccess; }
-
-  // Returns true iff the test part non-fatally failed.
+  // Returns true if and only if the test part non-fatally failed.
   bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
 
-  // Returns true iff the test part fatally failed.
+  // Returns true if and only if the test part fatally failed.
   bool fatally_failed() const { return type_ == kFatalFailure; }
 
+  // Returns true if and only if the test part failed.
+  bool failed() const { return fatally_failed() || nonfatally_failed(); }
+
  private:
   Type type_;
 
   // Gets the summary of the failure message by omitting the stack
   // trace in it.
-  static std::string ExtractSummary(const char* message);
+  static std::string ExtractSummary(const char *message);
 
   // The name of the source file where the test part took place, or
   // "" if the source file is unknown.
@@ -117,7 +118,7 @@
 };
 
 // Prints a TestPartResult object.
-std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+std::ostream &operator<<(std::ostream &os, const TestPartResult &result);
 
 // An array of TestPartResult objects.
 //
@@ -128,10 +129,10 @@
   TestPartResultArray() {}
 
   // Appends the given TestPartResult to the array.
-  void Append(const TestPartResult& result);
+  void Append(const TestPartResult &result);
 
   // Returns the TestPartResult at the given index (0-based).
-  const TestPartResult& GetTestPartResult(int index) const;
+  const TestPartResult &GetTestPartResult(int index) const;
 
   // Returns the number of TestPartResult objects in the array.
   int size() const;
@@ -143,11 +144,11 @@
 };
 
 // This interface knows how to report a test part result.
-class TestPartResultReporterInterface {
+class GTEST_API_ TestPartResultReporterInterface {
  public:
   virtual ~TestPartResultReporterInterface() {}
 
-  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+  virtual void ReportTestPartResult(const TestPartResult &result) = 0;
 };
 
 namespace internal {
@@ -162,12 +163,13 @@
     : public TestPartResultReporterInterface {
  public:
   HasNewFatalFailureHelper();
-  virtual ~HasNewFatalFailureHelper();
-  virtual void ReportTestPartResult(const TestPartResult& result);
+  ~HasNewFatalFailureHelper() override;
+  void ReportTestPartResult(const TestPartResult &result) override;
   bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+
  private:
   bool has_new_fatal_failure_;
-  TestPartResultReporterInterface* original_reporter_;
+  TestPartResultReporterInterface *original_reporter_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
 };
@@ -176,4 +178,6 @@
 
 }  // namespace testing
 
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 #endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
index 5f69d56..f5afc4d 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
@@ -26,8 +26,8 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
+
+// GOOGLETEST_CM0001 DO NOT DELETE
 
 #ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
 #define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
@@ -51,22 +51,22 @@
   T value_;
 };
 
-// Next, associate a list of types with the test case, which will be
+// Next, associate a list of types with the test suite, which will be
 // repeated for each type in the list.  The typedef is necessary for
 // the macro to parse correctly.
 typedef testing::Types<char, int, unsigned int> MyTypes;
-TYPED_TEST_CASE(FooTest, MyTypes);
+TYPED_TEST_SUITE(FooTest, MyTypes);
 
 // If the type list contains only one type, you can write that type
 // directly without Types<...>:
-//   TYPED_TEST_CASE(FooTest, int);
+//   TYPED_TEST_SUITE(FooTest, int);
 
 // Then, use TYPED_TEST() instead of TEST_F() to define as many typed
-// tests for this test case as you want.
+// tests for this test suite as you want.
 TYPED_TEST(FooTest, DoesBlah) {
-  // Inside a test, refer to TypeParam to get the type parameter.
-  // Since we are inside a derived class template, C++ requires use to
-  // visit the members of FooTest via 'this'.
+  // Inside a test, refer to the special name TypeParam to get the type
+  // parameter.  Since we are inside a derived class template, C++ requires
+  // us to visit the members of FooTest via 'this'.
   TypeParam n = this->value_;
 
   // To visit static members of the fixture, add the TestFixture::
@@ -82,6 +82,24 @@
 
 TYPED_TEST(FooTest, HasPropertyA) { ... }
 
+// TYPED_TEST_SUITE takes an optional third argument which allows to specify a
+// class that generates custom test name suffixes based on the type. This should
+// be a class which has a static template function GetName(int index) returning
+// a string for each type. The provided integer index equals the index of the
+// type in the provided type list. In many cases the index can be ignored.
+//
+// For example:
+//   class MyTypeNames {
+//    public:
+//     template <typename T>
+//     static std::string GetName(int) {
+//       if (std::is_same<T, char>()) return "char";
+//       if (std::is_same<T, int>()) return "int";
+//       if (std::is_same<T, unsigned int>()) return "unsignedInt";
+//     }
+//   };
+//   TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames);
+
 #endif  // 0
 
 // Type-parameterized tests are abstract test patterns parameterized
@@ -107,13 +125,13 @@
   ...
 };
 
-// Next, declare that you will define a type-parameterized test case
+// Next, declare that you will define a type-parameterized test suite
 // (the _P suffix is for "parameterized" or "pattern", whichever you
 // prefer):
-TYPED_TEST_CASE_P(FooTest);
+TYPED_TEST_SUITE_P(FooTest);
 
 // Then, use TYPED_TEST_P() to define as many type-parameterized tests
-// for this type-parameterized test case as you want.
+// for this type-parameterized test suite as you want.
 TYPED_TEST_P(FooTest, DoesBlah) {
   // Inside a test, refer to TypeParam to get the type parameter.
   TypeParam n = 0;
@@ -124,10 +142,10 @@
 
 // Now the tricky part: you need to register all test patterns before
 // you can instantiate them.  The first argument of the macro is the
-// test case name; the rest are the names of the tests in this test
+// test suite name; the rest are the names of the tests in this test
 // case.
-REGISTER_TYPED_TEST_CASE_P(FooTest,
-                           DoesBlah, HasPropertyA);
+REGISTER_TYPED_TEST_SUITE_P(FooTest,
+                            DoesBlah, HasPropertyA);
 
 // Finally, you are free to instantiate the pattern with the types you
 // want.  If you put the above code in a header file, you can #include
@@ -135,17 +153,23 @@
 //
 // To distinguish different instances of the pattern, the first
 // argument to the INSTANTIATE_* macro is a prefix that will be added
-// to the actual test case name.  Remember to pick unique prefixes for
+// to the actual test suite name.  Remember to pick unique prefixes for
 // different instances.
 typedef testing::Types<char, int, unsigned int> MyTypes;
-INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes);
+INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
 
 // If the type list contains only one type, you can write that type
 // directly without Types<...>:
-//   INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int);
+//   INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int);
+//
+// Similar to the optional argument of TYPED_TEST_SUITE above,
+// INSTANTIATE_TEST_SUITE_P takes an optional fourth argument which allows to
+// generate custom names.
+//   INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames);
 
 #endif  // 0
 
+#include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-port.h"
 #include "gtest/internal/gtest-type-util.h"
 
@@ -156,35 +180,55 @@
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the name of the typedef for the type parameters of the
-// given test case.
-# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_
+// given test suite.
+#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_
 
-// The 'Types' template argument below must have spaces around it
-// since some compilers may choke on '>>' when passing a template
-// instance (e.g. Types<int>)
-# define TYPED_TEST_CASE(CaseName, Types) \
-  typedef ::testing::internal::TypeList< Types >::type \
-      GTEST_TYPE_PARAMS_(CaseName)
+// Expands to the name of the typedef for the NameGenerator, responsible for
+// creating the suffixes of the name.
+#define GTEST_NAME_GENERATOR_(TestSuiteName) \
+  gtest_type_params_##TestSuiteName##_NameGenerator
 
-# define TYPED_TEST(CaseName, TestName) \
-  template <typename gtest_TypeParam_> \
-  class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \
-      : public CaseName<gtest_TypeParam_> { \
-   private: \
-    typedef CaseName<gtest_TypeParam_> TestFixture; \
-    typedef gtest_TypeParam_ TypeParam; \
-    virtual void TestBody(); \
-  }; \
-  bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::internal::TypeParameterizedTest< \
-          CaseName, \
-          ::testing::internal::TemplateSel< \
-              GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \
-          GTEST_TYPE_PARAMS_(CaseName)>::Register(\
-              "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \
-              #CaseName, #TestName, 0); \
-  template <typename gtest_TypeParam_> \
-  void GTEST_TEST_CLASS_NAME_(CaseName, TestName)<gtest_TypeParam_>::TestBody()
+#define TYPED_TEST_SUITE(CaseName, Types, ...)                          \
+  typedef ::testing::internal::GenerateTypeList<Types>::type            \
+      GTEST_TYPE_PARAMS_(CaseName);                                     \
+  typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
+      GTEST_NAME_GENERATOR_(CaseName)
+
+#define TYPED_TEST(CaseName, TestName)                                        \
+  static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1,                       \
+                "test-name must not be empty");                               \
+  template <typename gtest_TypeParam_>                                        \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName)                            \
+      : public CaseName<gtest_TypeParam_> {                                   \
+   private:                                                                   \
+    typedef CaseName<gtest_TypeParam_> TestFixture;                           \
+    typedef gtest_TypeParam_ TypeParam;                                       \
+    void TestBody() override;                                                 \
+  };                                                                          \
+  static bool gtest_##CaseName##_##TestName##_registered_                     \
+      GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest<   \
+          CaseName,                                                           \
+          ::testing::internal::TemplateSel<GTEST_TEST_CLASS_NAME_(CaseName,   \
+                                                                  TestName)>, \
+          GTEST_TYPE_PARAMS_(                                                 \
+              CaseName)>::Register("",                                        \
+                                   ::testing::internal::CodeLocation(         \
+                                       __FILE__, __LINE__),                   \
+                                   GTEST_STRINGIFY_(CaseName),                \
+                                   GTEST_STRINGIFY_(TestName), 0,             \
+                                   ::testing::internal::GenerateNames<        \
+                                       GTEST_NAME_GENERATOR_(CaseName),       \
+                                       GTEST_TYPE_PARAMS_(CaseName)>());      \
+  template <typename gtest_TypeParam_>                                        \
+  void GTEST_TEST_CLASS_NAME_(CaseName,                                       \
+                              TestName)<gtest_TypeParam_>::TestBody()
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE                                                \
+  static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \
+  TYPED_TEST_SUITE
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 #endif  // GTEST_HAS_TYPED_TEST
 
@@ -195,68 +239,98 @@
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the namespace name that the type-parameterized tests for
-// the given type-parameterized test case are defined in.  The exact
+// the given type-parameterized test suite are defined in.  The exact
 // name of the namespace is subject to change without notice.
-# define GTEST_CASE_NAMESPACE_(TestCaseName) \
-  gtest_case_##TestCaseName##_
+#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the name of the variable used to remember the names of
-// the defined tests in the given test case.
-# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \
-  gtest_typed_test_case_p_state_##TestCaseName##_
+// the defined tests in the given test suite.
+#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \
+  gtest_typed_test_suite_p_state_##TestSuiteName##_
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
 //
 // Expands to the name of the variable used to remember the names of
-// the registered tests in the given test case.
-# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \
-  gtest_registered_test_names_##TestCaseName##_
+// the registered tests in the given test suite.
+#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \
+  gtest_registered_test_names_##TestSuiteName##_
 
 // The variables defined in the type-parameterized test macros are
 // static as typically these macros are used in a .h file that can be
 // #included in multiple translation units linked together.
-# define TYPED_TEST_CASE_P(CaseName) \
-  static ::testing::internal::TypedTestCasePState \
-      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName)
+#define TYPED_TEST_SUITE_P(SuiteName)              \
+  static ::testing::internal::TypedTestSuitePState \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
 
-# define TYPED_TEST_P(CaseName, TestName) \
-  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
-  template <typename gtest_TypeParam_> \
-  class TestName : public CaseName<gtest_TypeParam_> { \
-   private: \
-    typedef CaseName<gtest_TypeParam_> TestFixture; \
-    typedef gtest_TypeParam_ TypeParam; \
-    virtual void TestBody(); \
-  }; \
-  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
-      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
-          __FILE__, __LINE__, #CaseName, #TestName); \
-  } \
-  template <typename gtest_TypeParam_> \
-  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE_P                                                 \
+  static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \
+  TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
-  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
-  typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
-  } \
-  static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \
-      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\
-          __FILE__, __LINE__, #__VA_ARGS__)
+#define TYPED_TEST_P(SuiteName, TestName)                             \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                       \
+    template <typename gtest_TypeParam_>                              \
+    class TestName : public SuiteName<gtest_TypeParam_> {             \
+     private:                                                         \
+      typedef SuiteName<gtest_TypeParam_> TestFixture;                \
+      typedef gtest_TypeParam_ TypeParam;                             \
+      void TestBody() override;                                       \
+    };                                                                \
+    static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+        GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName(       \
+            __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName),          \
+            GTEST_STRINGIFY_(TestName));                              \
+  }                                                                   \
+  template <typename gtest_TypeParam_>                                \
+  void GTEST_SUITE_NAMESPACE_(                                        \
+      SuiteName)::TestName<gtest_TypeParam_>::TestBody()
 
-// The 'Types' template argument below must have spaces around it
-// since some compilers may choke on '>>' when passing a template
-// instance (e.g. Types<int>)
-# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \
-  bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::internal::TypeParameterizedTestCase<CaseName, \
-          GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \
-          ::testing::internal::TypeList< Types >::type>::Register(\
-              #Prefix, \
-              ::testing::internal::CodeLocation(__FILE__, __LINE__), \
-              &GTEST_TYPED_TEST_CASE_P_STATE_(CaseName), \
-              #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName))
+// Note: this won't work correctly if the trailing arguments are macros.
+#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...)                         \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                             \
+    typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_;    \
+  }                                                                         \
+  static const char *const GTEST_REGISTERED_TEST_NAMES_(                    \
+      SuiteName) GTEST_ATTRIBUTE_UNUSED_ =                                  \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \
+          GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define REGISTER_TYPED_TEST_CASE_P                                           \
+  static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \
+                "");                                                         \
+  REGISTER_TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)     \
+  static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1,                     \
+                "test-suit-prefix must not be empty");                    \
+  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =      \
+      ::testing::internal::TypeParameterizedTestSuite<                    \
+          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,  \
+          ::testing::internal::GenerateTypeList<Types>::type>::           \
+          Register(GTEST_STRINGIFY_(Prefix),                              \
+                   ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName),           \
+                   GTEST_STRINGIFY_(SuiteName),                           \
+                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),               \
+                   ::testing::internal::GenerateNames<                    \
+                       ::testing::internal::NameGeneratorSelector<        \
+                           __VA_ARGS__>::type,                            \
+                       ::testing::internal::GenerateTypeList<Types>::type>())
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define INSTANTIATE_TYPED_TEST_CASE_P                                      \
+  static_assert(                                                           \
+      ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \
+  INSTANTIATE_TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 #endif  // GTEST_HAS_TYPED_TEST_P
 
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest.h
index f846c5b..8fd7eea 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest.h
@@ -26,10 +26,9 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 //
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the public API for Google Test.  It should be
 // included by any test program that uses Google Test.
@@ -48,16 +47,22 @@
 // registration from Barthelemy Dagenais' (barthelemy@prologique.com)
 // easyUnit framework.
 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_GTEST_H_
 #define GTEST_INCLUDE_GTEST_GTEST_H_
 
+#include <cstddef>
 #include <limits>
+#include <memory>
 #include <ostream>
+#include <type_traits>
 #include <vector>
 
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-string.h"
 #include "gtest/gtest-death-test.h"
+#include "gtest/gtest-matchers.h"
 #include "gtest/gtest-message.h"
 #include "gtest/gtest-param-test.h"
 #include "gtest/gtest-printers.h"
@@ -65,23 +70,19 @@
 #include "gtest/gtest-test-part.h"
 #include "gtest/gtest-typed-test.h"
 
-// Depending on the platform, different string classes are available.
-// On Linux, in addition to ::std::string, Google also makes use of
-// class ::string, which has the same interface as ::std::string, but
-// has a different implementation.
-//
-// You can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
-// ::string is available AND is a distinct type to ::std::string, or
-// define it to 0 to indicate otherwise.
-//
-// If ::std::string and ::string are the same class on your platform
-// due to aliasing, you should define GTEST_HAS_GLOBAL_STRING to 0.
-//
-// If you do not define GTEST_HAS_GLOBAL_STRING, it is defined
-// heuristically.
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
 
 namespace testing {
 
+// Silence C4100 (unreferenced formal parameter) and 4805
+// unsafe mix of type 'const int' and type 'const bool'
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4805)
+#pragma warning(disable : 4100)
+#endif
+
 // Declares the flags.
 
 // This flag temporary enables the disabled tests.
@@ -103,6 +104,10 @@
 // the tests to run. If the filter is not given all tests are executed.
 GTEST_DECLARE_string_(filter);
 
+// This flag controls whether Google Test installs a signal handler that dumps
+// debugging information when fatal signals are raised.
+GTEST_DECLARE_bool_(install_failure_signal_handler);
+
 // This flag causes the Google Test to list tests. None of the tests listed
 // are actually run if the flag is provided.
 GTEST_DECLARE_bool_(list_tests);
@@ -115,6 +120,9 @@
 // test.
 GTEST_DECLARE_bool_(print_time);
 
+// This flag controls whether Google Test prints UTF8 characters as text.
+GTEST_DECLARE_bool_(print_utf8);
+
 // This flag specifies the random number seed.
 GTEST_DECLARE_int32_(random_seed);
 
@@ -135,7 +143,7 @@
 
 // When this flag is specified, a failed assertion will throw an
 // exception if exceptions are enabled, or exit the program with a
-// non-zero code otherwise.
+// non-zero code otherwise. For use with an external test framework.
 GTEST_DECLARE_bool_(throw_on_failure);
 
 // When this flag is set with a "host:port" string, on supported
@@ -143,6 +151,10 @@
 // the specified host machine.
 GTEST_DECLARE_string_(stream_result_to);
 
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+GTEST_DECLARE_string_(flagfile);
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
 // The upper limit for valid stack trace depths.
 const int kMaxStackTraceDepth = 100;
 
@@ -160,9 +172,11 @@
 class TestEventRepeater;
 class UnitTestRecordPropertyTestHelper;
 class WindowsDeathTest;
-class UnitTestImpl* GetUnitTestImpl();
+class FuchsiaDeathTest;
+class UnitTestImpl *GetUnitTestImpl();
 void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
-                                    const std::string& message);
+                                    const std::string &message);
+std::set<std::string> *GetIgnoredParameterizedTestSuites();
 
 }  // namespace internal
 
@@ -170,7 +184,12 @@
 // If we don't forward declare them the compiler might confuse the classes
 // in friendship clauses with same named classes on the scope.
 class Test;
-class TestCase;
+class TestSuite;
+
+// Old API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+using TestCase = TestSuite;
+#endif
 class TestInfo;
 class UnitTest;
 
@@ -257,9 +276,15 @@
  public:
   // Copy constructor.
   // Used in EXPECT_TRUE/FALSE(assertion_result).
-  AssertionResult(const AssertionResult& other);
+  AssertionResult(const AssertionResult &other);
 
+// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
+// This warning is not emitted in Visual Studio 2017.
+// This warning is off by default starting in Visual Studio 2019 but can be
+// enabled with command-line options.
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
   GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+#endif
 
   // Used in the EXPECT_TRUE/FALSE(bool_expression).
   //
@@ -270,21 +295,24 @@
   // we want AssertionResult's copy constructor to be used.
   template <typename T>
   explicit AssertionResult(
-      const T& success,
-      typename internal::EnableIf<
-          !internal::ImplicitlyConvertible<T, AssertionResult>::value>::type*
-          /*enabler*/ = NULL)
+      const T &success,
+      typename std::enable_if<
+          !std::is_convertible<T, AssertionResult>::value>::type *
+      /*enabler*/
+      = nullptr)
       : success_(success) {}
 
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
   GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
 
   // Assignment operator.
-  AssertionResult& operator=(AssertionResult other) {
+  AssertionResult &operator=(AssertionResult other) {
     swap(other);
     return *this;
   }
 
-  // Returns true iff the assertion succeeded.
+  // Returns true if and only if the assertion succeeded.
   operator bool() const { return success_; }  // NOLINT
 
   // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
@@ -294,37 +322,36 @@
   // use it when they fail (i.e., the predicate's outcome doesn't match the
   // assertion's expectation). When nothing has been streamed into the
   // object, returns an empty string.
-  const char* message() const {
-    return message_.get() != NULL ?  message_->c_str() : "";
+  const char *message() const {
+    return message_.get() != nullptr ? message_->c_str() : "";
   }
-  // TODO(vladl@google.com): Remove this after making sure no clients use it.
   // Deprecated; please use message() instead.
-  const char* failure_message() const { return message(); }
+  const char *failure_message() const { return message(); }
 
   // Streams a custom failure message into this object.
-  template <typename T> AssertionResult& operator<<(const T& value) {
+  template <typename T>
+  AssertionResult &operator<<(const T &value) {
     AppendMessage(Message() << value);
     return *this;
   }
 
   // Allows streaming basic output manipulators such as endl or flush into
   // this object.
-  AssertionResult& operator<<(
-      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+  AssertionResult &operator<<(
+      ::std::ostream &(*basic_manipulator)(::std::ostream &stream)) {
     AppendMessage(Message() << basic_manipulator);
     return *this;
   }
 
  private:
   // Appends the contents of message to message_.
-  void AppendMessage(const Message& a_message) {
-    if (message_.get() == NULL)
-      message_.reset(new ::std::string);
+  void AppendMessage(const Message &a_message) {
+    if (message_.get() == nullptr) message_.reset(new ::std::string);
     message_->append(a_message.GetString().c_str());
   }
 
   // Swap the contents of this AssertionResult with other.
-  void swap(AssertionResult& other);
+  void swap(AssertionResult &other);
 
   // Stores result of the assertion predicate.
   bool success_;
@@ -332,7 +359,7 @@
   // construct is not satisfied with the predicate's outcome.
   // Referenced via a pointer to avoid taking too much stack frame space
   // with test assertions.
-  internal::scoped_ptr< ::std::string> message_;
+  std::unique_ptr< ::std::string> message_;
 };
 
 // Makes a successful assertion result.
@@ -343,19 +370,28 @@
 
 // Makes a failed assertion result with the given failure message.
 // Deprecated; use AssertionFailure() << msg.
-GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+GTEST_API_ AssertionResult AssertionFailure(const Message &msg);
+
+}  // namespace testing
+
+// Includes the auto-generated header that implements a family of generic
+// predicate assertion macros. This include comes late because it relies on
+// APIs declared above.
+#include "gtest/gtest_pred_impl.h"
+
+namespace testing {
 
 // The abstract class that all tests inherit from.
 //
-// In Google Test, a unit test program contains one or many TestCases, and
-// each TestCase contains one or many Tests.
+// In Google Test, a unit test program contains one or many TestSuites, and
+// each TestSuite contains one or many Tests.
 //
 // When you define a test using the TEST macro, you don't need to
 // explicitly derive from Test - the TEST macro automatically does
 // this for you.
 //
 // The only time you derive from Test is when defining a test fixture
-// to be used a TEST_F.  For example:
+// to be used in a TEST_F.  For example:
 //
 //   class FooTest : public testing::Test {
 //    protected:
@@ -372,55 +408,59 @@
  public:
   friend class TestInfo;
 
-  // Defines types for pointers to functions that set up and tear down
-  // a test case.
-  typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
-  typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
-
   // The d'tor is virtual as we intend to inherit from Test.
   virtual ~Test();
 
   // Sets up the stuff shared by all tests in this test case.
   //
-  // Google Test will call Foo::SetUpTestCase() before running the first
+  // Google Test will call Foo::SetUpTestSuite() before running the first
   // test in test case Foo.  Hence a sub-class can define its own
-  // SetUpTestCase() method to shadow the one defined in the super
+  // SetUpTestSuite() method to shadow the one defined in the super
   // class.
-  static void SetUpTestCase() {}
+  static void SetUpTestSuite() {}
 
-  // Tears down the stuff shared by all tests in this test case.
+  // Tears down the stuff shared by all tests in this test suite.
   //
-  // Google Test will call Foo::TearDownTestCase() after running the last
+  // Google Test will call Foo::TearDownTestSuite() after running the last
   // test in test case Foo.  Hence a sub-class can define its own
-  // TearDownTestCase() method to shadow the one defined in the super
+  // TearDownTestSuite() method to shadow the one defined in the super
   // class.
-  static void TearDownTestCase() {}
+  static void TearDownTestSuite() {}
 
-  // Returns true iff the current test has a fatal failure.
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  static void TearDownTestCase() {}
+  static void SetUpTestCase() {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Returns true if and only if the current test has a fatal failure.
   static bool HasFatalFailure();
 
-  // Returns true iff the current test has a non-fatal failure.
+  // Returns true if and only if the current test has a non-fatal failure.
   static bool HasNonfatalFailure();
 
-  // Returns true iff the current test has a (either fatal or
+  // Returns true if and only if the current test was skipped.
+  static bool IsSkipped();
+
+  // Returns true if and only if the current test has a (either fatal or
   // non-fatal) failure.
   static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
 
-  // Logs a property for the current test, test case, or for the entire
+  // Logs a property for the current test, test suite, or for the entire
   // invocation of the test program when used outside of the context of a
-  // test case.  Only the last value for a given key is remembered.  These
+  // test suite.  Only the last value for a given key is remembered.  These
   // are public static so they can be called from utility functions that are
   // not members of the test fixture.  Calls to RecordProperty made during
   // lifespan of the test (from the moment its constructor starts to the
   // moment its destructor finishes) will be output in XML as attributes of
   // the <testcase> element.  Properties recorded from fixture's
-  // SetUpTestCase or TearDownTestCase are logged as attributes of the
+  // SetUpTestSuite or TearDownTestSuite are logged as attributes of the
   // corresponding <testsuite> element.  Calls to RecordProperty made in the
   // global context (before or after invocation of RUN_ALL_TESTS and from
   // SetUp/TearDown method of Environment objects registered with Google
   // Test) will be output as attributes of the <testsuites> element.
-  static void RecordProperty(const std::string& key, const std::string& value);
-  static void RecordProperty(const std::string& key, int value);
+  static void RecordProperty(const std::string &key, const std::string &value);
+  static void RecordProperty(const std::string &key, int value);
 
  protected:
   // Creates a Test object.
@@ -433,8 +473,8 @@
   virtual void TearDown();
 
  private:
-  // Returns true iff the current test has the same fixture class as
-  // the first test in the current test case.
+  // Returns true if and only if the current test has the same fixture class
+  // as the first test in the current test suite.
   static bool HasSameFixtureClass();
 
   // Runs the test after the test fixture has been set up.
@@ -452,7 +492,7 @@
   // internal method to avoid clashing with names used in user TESTs.
   void DeleteSelf_() { delete this; }
 
-  const internal::scoped_ptr< GTEST_FLAG_SAVER_ > gtest_flag_saver_;
+  const std::unique_ptr<GTEST_FLAG_SAVER_> gtest_flag_saver_;
 
   // Often a user misspells SetUp() as Setup() and spends a long time
   // wondering why it is never called by Google Test.  The declaration of
@@ -471,7 +511,7 @@
   // If you see an error about overriding the following function or
   // about it being private, you have mis-spelled SetUp() as Setup().
   struct Setup_should_be_spelled_SetUp {};
-  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+  virtual Setup_should_be_spelled_SetUp *Setup() { return nullptr; }
 
   // We disallow copying Tests.
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
@@ -488,24 +528,17 @@
   // C'tor.  TestProperty does NOT have a default constructor.
   // Always use this constructor (with parameters) to create a
   // TestProperty object.
-  TestProperty(const std::string& a_key, const std::string& a_value) :
-    key_(a_key), value_(a_value) {
-  }
+  TestProperty(const std::string &a_key, const std::string &a_value)
+      : key_(a_key), value_(a_value) {}
 
   // Gets the user supplied key.
-  const char* key() const {
-    return key_.c_str();
-  }
+  const char *key() const { return key_.c_str(); }
 
   // Gets the user supplied value.
-  const char* value() const {
-    return value_.c_str();
-  }
+  const char *value() const { return value_.c_str(); }
 
   // Sets a new value, overriding the one supplied in the constructor.
-  void SetValue(const std::string& new_value) {
-    value_ = new_value;
-  }
+  void SetValue(const std::string &new_value) { value_ = new_value; }
 
  private:
   // The key supplied by the user.
@@ -535,51 +568,61 @@
   // Returns the number of the test properties.
   int test_property_count() const;
 
-  // Returns true iff the test passed (i.e. no test part failed).
-  bool Passed() const { return !Failed(); }
+  // Returns true if and only if the test passed (i.e. no test part failed).
+  bool Passed() const { return !Skipped() && !Failed(); }
 
-  // Returns true iff the test failed.
+  // Returns true if and only if the test was skipped.
+  bool Skipped() const;
+
+  // Returns true if and only if the test failed.
   bool Failed() const;
 
-  // Returns true iff the test fatally failed.
+  // Returns true if and only if the test fatally failed.
   bool HasFatalFailure() const;
 
-  // Returns true iff the test has a non-fatal failure.
+  // Returns true if and only if the test has a non-fatal failure.
   bool HasNonfatalFailure() const;
 
   // Returns the elapsed time, in milliseconds.
   TimeInMillis elapsed_time() const { return elapsed_time_; }
 
-  // Returns the i-th test part result among all the results. i can range
-  // from 0 to test_property_count() - 1. If i is not in that range, aborts
-  // the program.
-  const TestPartResult& GetTestPartResult(int i) const;
+  // Gets the time of the test case start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Returns the i-th test part result among all the results. i can range from 0
+  // to total_part_count() - 1. If i is not in that range, aborts the program.
+  const TestPartResult &GetTestPartResult(int i) const;
 
   // Returns the i-th test property. i can range from 0 to
   // test_property_count() - 1. If i is not in that range, aborts the
   // program.
-  const TestProperty& GetTestProperty(int i) const;
+  const TestProperty &GetTestProperty(int i) const;
 
  private:
   friend class TestInfo;
-  friend class TestCase;
+  friend class TestSuite;
   friend class UnitTest;
   friend class internal::DefaultGlobalTestPartResultReporter;
   friend class internal::ExecDeathTest;
   friend class internal::TestResultAccessor;
   friend class internal::UnitTestImpl;
   friend class internal::WindowsDeathTest;
+  friend class internal::FuchsiaDeathTest;
 
   // Gets the vector of TestPartResults.
-  const std::vector<TestPartResult>& test_part_results() const {
+  const std::vector<TestPartResult> &test_part_results() const {
     return test_part_results_;
   }
 
   // Gets the vector of TestProperties.
-  const std::vector<TestProperty>& test_properties() const {
+  const std::vector<TestProperty> &test_properties() const {
     return test_properties_;
   }
 
+  // Sets the start time.
+  void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; }
+
   // Sets the elapsed time.
   void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
 
@@ -589,17 +632,17 @@
   // value will be updated, rather than storing multiple values for the same
   // key.  xml_element specifies the element for which the property is being
   // recorded and is used for validation.
-  void RecordProperty(const std::string& xml_element,
-                      const TestProperty& test_property);
+  void RecordProperty(const std::string &xml_element,
+                      const TestProperty &test_property);
 
   // Adds a failure if the key is a reserved attribute of Google Test
-  // testcase tags.  Returns true if the property is valid.
-  // TODO(russr): Validate attribute names are legal and human readable.
-  static bool ValidateTestProperty(const std::string& xml_element,
-                                   const TestProperty& test_property);
+  // testsuite tags.  Returns true if the property is valid.
+  // FIXME: Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string &xml_element,
+                                   const TestProperty &test_property);
 
   // Adds a test part result to the list.
-  void AddTestPartResult(const TestPartResult& test_part_result);
+  void AddTestPartResult(const TestPartResult &test_part_result);
 
   // Returns the death test count.
   int death_test_count() const { return death_test_count_; }
@@ -623,6 +666,8 @@
   std::vector<TestProperty> test_properties_;
   // Running count of death tests.
   int death_test_count_;
+  // The start time, in milliseconds since UNIX Epoch.
+  TimeInMillis start_timestamp_;
   // The elapsed time, in milliseconds.
   TimeInMillis elapsed_time_;
 
@@ -632,7 +677,7 @@
 
 // A TestInfo object stores the following information about a test:
 //
-//   Test case name
+//   Test suite name
 //   Test name
 //   Whether the test should be run
 //   A function pointer that creates the test object when invoked
@@ -647,40 +692,46 @@
   // don't inherit from TestInfo.
   ~TestInfo();
 
-  // Returns the test case name.
-  const char* test_case_name() const { return test_case_name_.c_str(); }
+  // Returns the test suite name.
+  const char *test_suite_name() const { return test_suite_name_.c_str(); }
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const char *test_case_name() const { return test_suite_name(); }
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Returns the test name.
-  const char* name() const { return name_.c_str(); }
+  const char *name() const { return name_.c_str(); }
 
   // Returns the name of the parameter type, or NULL if this is not a typed
   // or a type-parameterized test.
-  const char* type_param() const {
-    if (type_param_.get() != NULL)
-      return type_param_->c_str();
-    return NULL;
+  const char *type_param() const {
+    if (type_param_.get() != nullptr) return type_param_->c_str();
+    return nullptr;
   }
 
   // Returns the text representation of the value parameter, or NULL if this
   // is not a value-parameterized test.
-  const char* value_param() const {
-    if (value_param_.get() != NULL)
-      return value_param_->c_str();
-    return NULL;
+  const char *value_param() const {
+    if (value_param_.get() != nullptr) return value_param_->c_str();
+    return nullptr;
   }
 
   // Returns the file name where this test is defined.
-  const char* file() const { return location_.file.c_str(); }
+  const char *file() const { return location_.file.c_str(); }
 
   // Returns the line where this test is defined.
   int line() const { return location_.line; }
 
+  // Return true if this test should not be run because it's in another shard.
+  bool is_in_another_shard() const { return is_in_another_shard_; }
+
   // Returns true if this test should run, that is if the test is not
   // disabled (or it is disabled but the also_run_disabled_tests flag has
   // been specified) and its full name matches the user-specified filter.
   //
   // Google Test allows the user to filter the tests by their full names.
-  // The full name of a test Bar in test case Foo is defined as
+  // The full name of a test Bar in test suite Foo is defined as
   // "Foo.Bar".  Only the tests that match the filter will run.
   //
   // A filter is a colon-separated list of glob (not regex) patterns,
@@ -693,45 +744,39 @@
   // contains the character 'A' or starts with "Foo.".
   bool should_run() const { return should_run_; }
 
-  // Returns true iff this test will appear in the XML report.
+  // Returns true if and only if this test will appear in the XML report.
   bool is_reportable() const {
-    // For now, the XML report includes all tests matching the filter.
-    // In the future, we may trim tests that are excluded because of
-    // sharding.
-    return matches_filter_;
+    // The XML report includes tests matching the filter, excluding those
+    // run in other shards.
+    return matches_filter_ && !is_in_another_shard_;
   }
 
   // Returns the result of the test.
-  const TestResult* result() const { return &result_; }
+  const TestResult *result() const { return &result_; }
 
  private:
 #if GTEST_HAS_DEATH_TEST
   friend class internal::DefaultDeathTestFactory;
 #endif  // GTEST_HAS_DEATH_TEST
   friend class Test;
-  friend class TestCase;
+  friend class TestSuite;
   friend class internal::UnitTestImpl;
   friend class internal::StreamingListenerTest;
-  friend TestInfo* internal::MakeAndRegisterTestInfo(
-      const char* test_case_name,
-      const char* name,
-      const char* type_param,
-      const char* value_param,
-      internal::CodeLocation code_location,
-      internal::TypeId fixture_class_id,
-      Test::SetUpTestCaseFunc set_up_tc,
-      Test::TearDownTestCaseFunc tear_down_tc,
-      internal::TestFactoryBase* factory);
+  friend TestInfo *internal::MakeAndRegisterTestInfo(
+      const char *test_suite_name, const char *name, const char *type_param,
+      const char *value_param, internal::CodeLocation code_location,
+      internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc,
+      internal::TearDownTestSuiteFunc tear_down_tc,
+      internal::TestFactoryBase *factory);
 
   // Constructs a TestInfo object. The newly constructed instance assumes
   // ownership of the factory object.
-  TestInfo(const std::string& test_case_name,
-           const std::string& name,
-           const char* a_type_param,   // NULL if not a type-parameterized test
-           const char* a_value_param,  // NULL if not a value-parameterized test
+  TestInfo(const std::string &test_suite_name, const std::string &name,
+           const char *a_type_param,   // NULL if not a type-parameterized test
+           const char *a_value_param,  // NULL if not a value-parameterized test
            internal::CodeLocation a_code_location,
            internal::TypeId fixture_class_id,
-           internal::TestFactoryBase* factory);
+           internal::TestFactoryBase *factory);
 
   // Increments the number of death tests encountered in this test so
   // far.
@@ -743,26 +788,27 @@
   // deletes it.
   void Run();
 
-  static void ClearTestResult(TestInfo* test_info) {
+  static void ClearTestResult(TestInfo *test_info) {
     test_info->result_.Clear();
   }
 
   // These fields are immutable properties of the test.
-  const std::string test_case_name_;     // Test case name
-  const std::string name_;               // Test name
+  const std::string test_suite_name_;  // test suite name
+  const std::string name_;             // Test name
   // Name of the parameter type, or NULL if this is not a typed or a
   // type-parameterized test.
-  const internal::scoped_ptr<const ::std::string> type_param_;
+  const std::unique_ptr<const ::std::string> type_param_;
   // Text representation of the value parameter, or NULL if this is not a
   // value-parameterized test.
-  const internal::scoped_ptr<const ::std::string> value_param_;
+  const std::unique_ptr<const ::std::string> value_param_;
   internal::CodeLocation location_;
-  const internal::TypeId fixture_class_id_;   // ID of the test fixture class
-  bool should_run_;                 // True iff this test should run
-  bool is_disabled_;                // True iff this test is disabled
-  bool matches_filter_;             // True if this test matches the
-                                    // user-specified filter.
-  internal::TestFactoryBase* const factory_;  // The factory that creates
+  const internal::TypeId fixture_class_id_;  // ID of the test fixture class
+  bool should_run_;           // True if and only if this test should run
+  bool is_disabled_;          // True if and only if this test is disabled
+  bool matches_filter_;       // True if this test matches the
+                              // user-specified filter.
+  bool is_in_another_shard_;  // Will be run in another shard.
+  internal::TestFactoryBase *const factory_;  // The factory that creates
                                               // the test object
 
   // This field is mutable and needs to be reset before running the
@@ -772,187 +818,210 @@
   GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
 };
 
-// A test case, which consists of a vector of TestInfos.
+// A test suite, which consists of a vector of TestInfos.
 //
-// TestCase is not copyable.
-class GTEST_API_ TestCase {
+// TestSuite is not copyable.
+class GTEST_API_ TestSuite {
  public:
-  // Creates a TestCase with the given name.
+  // Creates a TestSuite with the given name.
   //
-  // TestCase does NOT have a default constructor.  Always use this
-  // constructor to create a TestCase object.
+  // TestSuite does NOT have a default constructor.  Always use this
+  // constructor to create a TestSuite object.
   //
   // Arguments:
   //
-  //   name:         name of the test case
+  //   name:         name of the test suite
   //   a_type_param: the name of the test's type parameter, or NULL if
   //                 this is not a type-parameterized test.
-  //   set_up_tc:    pointer to the function that sets up the test case
-  //   tear_down_tc: pointer to the function that tears down the test case
-  TestCase(const char* name, const char* a_type_param,
-           Test::SetUpTestCaseFunc set_up_tc,
-           Test::TearDownTestCaseFunc tear_down_tc);
+  //   set_up_tc:    pointer to the function that sets up the test suite
+  //   tear_down_tc: pointer to the function that tears down the test suite
+  TestSuite(const char *name, const char *a_type_param,
+            internal::SetUpTestSuiteFunc set_up_tc,
+            internal::TearDownTestSuiteFunc tear_down_tc);
 
-  // Destructor of TestCase.
-  virtual ~TestCase();
+  // Destructor of TestSuite.
+  virtual ~TestSuite();
 
-  // Gets the name of the TestCase.
-  const char* name() const { return name_.c_str(); }
+  // Gets the name of the TestSuite.
+  const char *name() const { return name_.c_str(); }
 
   // Returns the name of the parameter type, or NULL if this is not a
-  // type-parameterized test case.
-  const char* type_param() const {
-    if (type_param_.get() != NULL)
-      return type_param_->c_str();
-    return NULL;
+  // type-parameterized test suite.
+  const char *type_param() const {
+    if (type_param_.get() != nullptr) return type_param_->c_str();
+    return nullptr;
   }
 
-  // Returns true if any test in this test case should run.
+  // Returns true if any test in this test suite should run.
   bool should_run() const { return should_run_; }
 
-  // Gets the number of successful tests in this test case.
+  // Gets the number of successful tests in this test suite.
   int successful_test_count() const;
 
-  // Gets the number of failed tests in this test case.
+  // Gets the number of skipped tests in this test suite.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests in this test suite.
   int failed_test_count() const;
 
   // Gets the number of disabled tests that will be reported in the XML report.
   int reportable_disabled_test_count() const;
 
-  // Gets the number of disabled tests in this test case.
+  // Gets the number of disabled tests in this test suite.
   int disabled_test_count() const;
 
   // Gets the number of tests to be printed in the XML report.
   int reportable_test_count() const;
 
-  // Get the number of tests in this test case that should run.
+  // Get the number of tests in this test suite that should run.
   int test_to_run_count() const;
 
-  // Gets the number of all tests in this test case.
+  // Gets the number of all tests in this test suite.
   int total_test_count() const;
 
-  // Returns true iff the test case passed.
+  // Returns true if and only if the test suite passed.
   bool Passed() const { return !Failed(); }
 
-  // Returns true iff the test case failed.
-  bool Failed() const { return failed_test_count() > 0; }
+  // Returns true if and only if the test suite failed.
+  bool Failed() const {
+    return failed_test_count() > 0 || ad_hoc_test_result().Failed();
+  }
 
   // Returns the elapsed time, in milliseconds.
   TimeInMillis elapsed_time() const { return elapsed_time_; }
 
+  // Gets the time of the test suite start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
   // Returns the i-th test among all the tests. i can range from 0 to
   // total_test_count() - 1. If i is not in that range, returns NULL.
-  const TestInfo* GetTestInfo(int i) const;
+  const TestInfo *GetTestInfo(int i) const;
 
   // Returns the TestResult that holds test properties recorded during
-  // execution of SetUpTestCase and TearDownTestCase.
-  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+  // execution of SetUpTestSuite and TearDownTestSuite.
+  const TestResult &ad_hoc_test_result() const { return ad_hoc_test_result_; }
 
  private:
   friend class Test;
   friend class internal::UnitTestImpl;
 
-  // Gets the (mutable) vector of TestInfos in this TestCase.
-  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+  // Gets the (mutable) vector of TestInfos in this TestSuite.
+  std::vector<TestInfo *> &test_info_list() { return test_info_list_; }
 
-  // Gets the (immutable) vector of TestInfos in this TestCase.
-  const std::vector<TestInfo*>& test_info_list() const {
+  // Gets the (immutable) vector of TestInfos in this TestSuite.
+  const std::vector<TestInfo *> &test_info_list() const {
     return test_info_list_;
   }
 
   // Returns the i-th test among all the tests. i can range from 0 to
   // total_test_count() - 1. If i is not in that range, returns NULL.
-  TestInfo* GetMutableTestInfo(int i);
+  TestInfo *GetMutableTestInfo(int i);
 
   // Sets the should_run member.
   void set_should_run(bool should) { should_run_ = should; }
 
-  // Adds a TestInfo to this test case.  Will delete the TestInfo upon
-  // destruction of the TestCase object.
-  void AddTestInfo(TestInfo * test_info);
+  // Adds a TestInfo to this test suite.  Will delete the TestInfo upon
+  // destruction of the TestSuite object.
+  void AddTestInfo(TestInfo *test_info);
 
-  // Clears the results of all tests in this test case.
+  // Clears the results of all tests in this test suite.
   void ClearResult();
 
-  // Clears the results of all tests in the given test case.
-  static void ClearTestCaseResult(TestCase* test_case) {
-    test_case->ClearResult();
+  // Clears the results of all tests in the given test suite.
+  static void ClearTestSuiteResult(TestSuite *test_suite) {
+    test_suite->ClearResult();
   }
 
-  // Runs every test in this TestCase.
+  // Runs every test in this TestSuite.
   void Run();
 
-  // Runs SetUpTestCase() for this TestCase.  This wrapper is needed
-  // for catching exceptions thrown from SetUpTestCase().
-  void RunSetUpTestCase() { (*set_up_tc_)(); }
+  // Runs SetUpTestSuite() for this TestSuite.  This wrapper is needed
+  // for catching exceptions thrown from SetUpTestSuite().
+  void RunSetUpTestSuite() {
+    if (set_up_tc_ != nullptr) {
+      (*set_up_tc_)();
+    }
+  }
 
-  // Runs TearDownTestCase() for this TestCase.  This wrapper is
-  // needed for catching exceptions thrown from TearDownTestCase().
-  void RunTearDownTestCase() { (*tear_down_tc_)(); }
+  // Runs TearDownTestSuite() for this TestSuite.  This wrapper is
+  // needed for catching exceptions thrown from TearDownTestSuite().
+  void RunTearDownTestSuite() {
+    if (tear_down_tc_ != nullptr) {
+      (*tear_down_tc_)();
+    }
+  }
 
-  // Returns true iff test passed.
-  static bool TestPassed(const TestInfo* test_info) {
+  // Returns true if and only if test passed.
+  static bool TestPassed(const TestInfo *test_info) {
     return test_info->should_run() && test_info->result()->Passed();
   }
 
-  // Returns true iff test failed.
-  static bool TestFailed(const TestInfo* test_info) {
+  // Returns true if and only if test skipped.
+  static bool TestSkipped(const TestInfo *test_info) {
+    return test_info->should_run() && test_info->result()->Skipped();
+  }
+
+  // Returns true if and only if test failed.
+  static bool TestFailed(const TestInfo *test_info) {
     return test_info->should_run() && test_info->result()->Failed();
   }
 
-  // Returns true iff the test is disabled and will be reported in the XML
-  // report.
-  static bool TestReportableDisabled(const TestInfo* test_info) {
+  // Returns true if and only if the test is disabled and will be reported in
+  // the XML report.
+  static bool TestReportableDisabled(const TestInfo *test_info) {
     return test_info->is_reportable() && test_info->is_disabled_;
   }
 
-  // Returns true iff test is disabled.
-  static bool TestDisabled(const TestInfo* test_info) {
+  // Returns true if and only if test is disabled.
+  static bool TestDisabled(const TestInfo *test_info) {
     return test_info->is_disabled_;
   }
 
-  // Returns true iff this test will appear in the XML report.
-  static bool TestReportable(const TestInfo* test_info) {
+  // Returns true if and only if this test will appear in the XML report.
+  static bool TestReportable(const TestInfo *test_info) {
     return test_info->is_reportable();
   }
 
   // Returns true if the given test should run.
-  static bool ShouldRunTest(const TestInfo* test_info) {
+  static bool ShouldRunTest(const TestInfo *test_info) {
     return test_info->should_run();
   }
 
-  // Shuffles the tests in this test case.
-  void ShuffleTests(internal::Random* random);
+  // Shuffles the tests in this test suite.
+  void ShuffleTests(internal::Random *random);
 
   // Restores the test order to before the first shuffle.
   void UnshuffleTests();
 
-  // Name of the test case.
+  // Name of the test suite.
   std::string name_;
   // Name of the parameter type, or NULL if this is not a typed or a
   // type-parameterized test.
-  const internal::scoped_ptr<const ::std::string> type_param_;
+  const std::unique_ptr<const ::std::string> type_param_;
   // The vector of TestInfos in their original order.  It owns the
   // elements in the vector.
-  std::vector<TestInfo*> test_info_list_;
+  std::vector<TestInfo *> test_info_list_;
   // Provides a level of indirection for the test list to allow easy
   // shuffling and restoring the test order.  The i-th element in this
   // vector is the index of the i-th test in the shuffled test list.
   std::vector<int> test_indices_;
-  // Pointer to the function that sets up the test case.
-  Test::SetUpTestCaseFunc set_up_tc_;
-  // Pointer to the function that tears down the test case.
-  Test::TearDownTestCaseFunc tear_down_tc_;
-  // True iff any test in this test case should run.
+  // Pointer to the function that sets up the test suite.
+  internal::SetUpTestSuiteFunc set_up_tc_;
+  // Pointer to the function that tears down the test suite.
+  internal::TearDownTestSuiteFunc tear_down_tc_;
+  // True if and only if any test in this test suite should run.
   bool should_run_;
+  // The start time, in milliseconds since UNIX Epoch.
+  TimeInMillis start_timestamp_;
   // Elapsed time, in milliseconds.
   TimeInMillis elapsed_time_;
-  // Holds test properties recorded during execution of SetUpTestCase and
-  // TearDownTestCase.
+  // Holds test properties recorded during execution of SetUpTestSuite and
+  // TearDownTestSuite.
   TestResult ad_hoc_test_result_;
 
-  // We disallow copying TestCases.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+  // We disallow copying TestSuites.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite);
 };
 
 // An Environment object is capable of setting up and tearing down an
@@ -979,13 +1048,26 @@
 
   // Override this to define how to tear down the environment.
   virtual void TearDown() {}
+
  private:
   // If you see an error about overriding the following function or
   // about it being private, you have mis-spelled SetUp() as Setup().
   struct Setup_should_be_spelled_SetUp {};
-  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+  virtual Setup_should_be_spelled_SetUp *Setup() { return nullptr; }
 };
 
+#if GTEST_HAS_EXCEPTIONS
+
+// Exception which can be thrown from TestEventListener::OnTestPartResult.
+class GTEST_API_ AssertionException
+    : public internal::GoogleTestFailureException {
+ public:
+  explicit AssertionException(const TestPartResult &result)
+      : GoogleTestFailureException(result) {}
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
 // The interface for tracing execution of tests. The methods are organized in
 // the order the corresponding events are fired.
 class TestEventListener {
@@ -993,47 +1075,58 @@
   virtual ~TestEventListener() {}
 
   // Fired before any test activity starts.
-  virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
+  virtual void OnTestProgramStart(const UnitTest &unit_test) = 0;
 
   // Fired before each iteration of tests starts.  There may be more than
   // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration
   // index, starting from 0.
-  virtual void OnTestIterationStart(const UnitTest& unit_test,
+  virtual void OnTestIterationStart(const UnitTest &unit_test,
                                     int iteration) = 0;
 
   // Fired before environment set-up for each iteration of tests starts.
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
+  virtual void OnEnvironmentsSetUpStart(const UnitTest &unit_test) = 0;
 
   // Fired after environment set-up for each iteration of tests ends.
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest &unit_test) = 0;
 
-  // Fired before the test case starts.
-  virtual void OnTestCaseStart(const TestCase& test_case) = 0;
+  // Fired before the test suite starts.
+  virtual void OnTestSuiteStart(const TestSuite & /*test_suite*/) {}
+
+  //  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  virtual void OnTestCaseStart(const TestCase & /*test_case*/) {}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Fired before the test starts.
-  virtual void OnTestStart(const TestInfo& test_info) = 0;
+  virtual void OnTestStart(const TestInfo &test_info) = 0;
 
   // Fired after a failed assertion or a SUCCEED() invocation.
-  virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
+  // If you want to throw an exception from this function to skip to the next
+  // TEST, it must be AssertionException defined above, or inherited from it.
+  virtual void OnTestPartResult(const TestPartResult &test_part_result) = 0;
 
   // Fired after the test ends.
-  virtual void OnTestEnd(const TestInfo& test_info) = 0;
+  virtual void OnTestEnd(const TestInfo &test_info) = 0;
 
-  // Fired after the test case ends.
-  virtual void OnTestCaseEnd(const TestCase& test_case) = 0;
+  // Fired after the test suite ends.
+  virtual void OnTestSuiteEnd(const TestSuite & /*test_suite*/) {}
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  virtual void OnTestCaseEnd(const TestCase & /*test_case*/) {}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Fired before environment tear-down for each iteration of tests starts.
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
+  virtual void OnEnvironmentsTearDownStart(const UnitTest &unit_test) = 0;
 
   // Fired after environment tear-down for each iteration of tests ends.
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest &unit_test) = 0;
 
   // Fired after each iteration of tests finishes.
-  virtual void OnTestIterationEnd(const UnitTest& unit_test,
-                                  int iteration) = 0;
+  virtual void OnTestIterationEnd(const UnitTest &unit_test, int iteration) = 0;
 
   // Fired after all test activities have ended.
-  virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
+  virtual void OnTestProgramEnd(const UnitTest &unit_test) = 0;
 };
 
 // The convenience class for users who need to override just one or two
@@ -1043,21 +1136,30 @@
 // above.
 class EmptyTestEventListener : public TestEventListener {
  public:
-  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
-                                    int /*iteration*/) {}
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
-  virtual void OnTestStart(const TestInfo& /*test_info*/) {}
-  virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {}
-  virtual void OnTestEnd(const TestInfo& /*test_info*/) {}
-  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/,
-                                  int /*iteration*/) {}
-  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+  void OnTestProgramStart(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest & /*unit_test*/,
+                            int /*iteration*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest & /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestSuiteStart(const TestSuite & /*test_suite*/) override {}
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase & /*test_case*/) override {}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnTestStart(const TestInfo & /*test_info*/) override {}
+  void OnTestPartResult(const TestPartResult & /*test_part_result*/) override {}
+  void OnTestEnd(const TestInfo & /*test_info*/) override {}
+  void OnTestSuiteEnd(const TestSuite & /*test_suite*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase & /*test_case*/) override {}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest & /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest & /*unit_test*/,
+                          int /*iteration*/) override {}
+  void OnTestProgramEnd(const UnitTest & /*unit_test*/) override {}
 };
 
 // TestEventListeners lets users add listeners to track events in Google Test.
@@ -1069,19 +1171,19 @@
   // Appends an event listener to the end of the list. Google Test assumes
   // the ownership of the listener (i.e. it will delete the listener when
   // the test program finishes).
-  void Append(TestEventListener* listener);
+  void Append(TestEventListener *listener);
 
   // Removes the given event listener from the list and returns it.  It then
   // becomes the caller's responsibility to delete the listener. Returns
   // NULL if the listener is not found in the list.
-  TestEventListener* Release(TestEventListener* listener);
+  TestEventListener *Release(TestEventListener *listener);
 
   // Returns the standard listener responsible for the default console
   // output.  Can be removed from the listeners list to shut down default
   // console output.  Note that removing this object from the listener list
   // with Release transfers its ownership to the caller and makes this
   // function return NULL the next time.
-  TestEventListener* default_result_printer() const {
+  TestEventListener *default_result_printer() const {
     return default_result_printer_;
   }
 
@@ -1092,12 +1194,12 @@
   // removing this object from the listener list with Release transfers its
   // ownership to the caller and makes this function return NULL the next
   // time.
-  TestEventListener* default_xml_generator() const {
+  TestEventListener *default_xml_generator() const {
     return default_xml_generator_;
   }
 
  private:
-  friend class TestCase;
+  friend class TestSuite;
   friend class TestInfo;
   friend class internal::DefaultGlobalTestPartResultReporter;
   friend class internal::NoExecDeathTest;
@@ -1106,21 +1208,21 @@
 
   // Returns repeater that broadcasts the TestEventListener events to all
   // subscribers.
-  TestEventListener* repeater();
+  TestEventListener *repeater();
 
   // Sets the default_result_printer attribute to the provided listener.
   // The listener is also added to the listener list and previous
   // default_result_printer is removed from it and deleted. The listener can
   // also be NULL in which case it will not be added to the list. Does
   // nothing if the previous and the current listener objects are the same.
-  void SetDefaultResultPrinter(TestEventListener* listener);
+  void SetDefaultResultPrinter(TestEventListener *listener);
 
   // Sets the default_xml_generator attribute to the provided listener.  The
   // listener is also added to the listener list and previous
   // default_xml_generator is removed from it and deleted. The listener can
   // also be NULL in which case it will not be added to the list. Does
   // nothing if the previous and the current listener objects are the same.
-  void SetDefaultXmlGenerator(TestEventListener* listener);
+  void SetDefaultXmlGenerator(TestEventListener *listener);
 
   // Controls whether events will be forwarded by the repeater to the
   // listeners in the list.
@@ -1128,17 +1230,17 @@
   void SuppressEventForwarding();
 
   // The actual list of listeners.
-  internal::TestEventRepeater* repeater_;
+  internal::TestEventRepeater *repeater_;
   // Listener responsible for the standard result output.
-  TestEventListener* default_result_printer_;
+  TestEventListener *default_result_printer_;
   // Listener responsible for the creation of the XML output file.
-  TestEventListener* default_xml_generator_;
+  TestEventListener *default_xml_generator_;
 
   // We disallow copying TestEventListeners.
   GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
 };
 
-// A UnitTest consists of a vector of TestCases.
+// A UnitTest consists of a vector of TestSuites.
 //
 // This is a singleton class.  The only instance of UnitTest is
 // created when UnitTest::GetInstance() is first called.  This
@@ -1153,7 +1255,7 @@
   // Gets the singleton UnitTest object.  The first time this method
   // is called, a UnitTest object is constructed and returned.
   // Consecutive calls will return the same object.
-  static UnitTest* GetInstance();
+  static UnitTest *GetInstance();
 
   // Runs all tests in this UnitTest object and prints the result.
   // Returns 0 if successful, or 1 otherwise.
@@ -1165,46 +1267,58 @@
 
   // Returns the working directory when the first TEST() or TEST_F()
   // was executed.  The UnitTest object owns the string.
-  const char* original_working_dir() const;
+  const char *original_working_dir() const;
 
-  // Returns the TestCase object for the test that's currently running,
+  // Returns the TestSuite object for the test that's currently running,
   // or NULL if no test is running.
-  const TestCase* current_test_case() const
-      GTEST_LOCK_EXCLUDED_(mutex_);
+  const TestSuite *current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_);
+#endif
 
   // Returns the TestInfo object for the test that's currently running,
   // or NULL if no test is running.
-  const TestInfo* current_test_info() const
-      GTEST_LOCK_EXCLUDED_(mutex_);
+  const TestInfo *current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Returns the random seed used at the start of the current test run.
   int random_seed() const;
 
-#if GTEST_HAS_PARAM_TEST
-  // Returns the ParameterizedTestCaseRegistry object used to keep track of
+  // Returns the ParameterizedTestSuiteRegistry object used to keep track of
   // value-parameterized tests and instantiate and register them.
   //
   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-  internal::ParameterizedTestCaseRegistry& parameterized_test_registry()
+  internal::ParameterizedTestSuiteRegistry &parameterized_test_registry()
       GTEST_LOCK_EXCLUDED_(mutex_);
-#endif  // GTEST_HAS_PARAM_TEST
 
-  // Gets the number of successful test cases.
-  int successful_test_case_count() const;
+  // Gets the number of successful test suites.
+  int successful_test_suite_count() const;
 
-  // Gets the number of failed test cases.
-  int failed_test_case_count() const;
+  // Gets the number of failed test suites.
+  int failed_test_suite_count() const;
 
-  // Gets the number of all test cases.
-  int total_test_case_count() const;
+  // Gets the number of all test suites.
+  int total_test_suite_count() const;
 
-  // Gets the number of all test cases that contain at least one test
+  // Gets the number of all test suites that contain at least one test
   // that should run.
+  int test_suite_to_run_count() const;
+
+  //  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  int successful_test_case_count() const;
+  int failed_test_case_count() const;
+  int total_test_case_count() const;
   int test_case_to_run_count() const;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Gets the number of successful tests.
   int successful_test_count() const;
 
+  // Gets the number of skipped tests.
+  int skipped_test_count() const;
+
   // Gets the number of failed tests.
   int failed_test_count() const;
 
@@ -1230,24 +1344,30 @@
   // Gets the elapsed time, in milliseconds.
   TimeInMillis elapsed_time() const;
 
-  // Returns true iff the unit test passed (i.e. all test cases passed).
+  // Returns true if and only if the unit test passed (i.e. all test suites
+  // passed).
   bool Passed() const;
 
-  // Returns true iff the unit test failed (i.e. some test case failed
-  // or something outside of all tests failed).
+  // Returns true if and only if the unit test failed (i.e. some test suite
+  // failed or something outside of all tests failed).
   bool Failed() const;
 
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  const TestCase* GetTestCase(int i) const;
+  // Gets the i-th test suite among all the test suites. i can range from 0 to
+  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+  const TestSuite *GetTestSuite(int i) const;
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *GetTestCase(int i) const;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Returns the TestResult containing information on test failures and
-  // properties logged outside of individual test cases.
-  const TestResult& ad_hoc_test_result() const;
+  // properties logged outside of individual test suites.
+  const TestResult &ad_hoc_test_result() const;
 
   // Returns the list of event listeners that can be used to track events
   // inside Google Test.
-  TestEventListeners& listeners();
+  TestEventListeners &listeners();
 
  private:
   // Registers and returns a global test environment.  When a test
@@ -1259,46 +1379,45 @@
   // The UnitTest object takes ownership of the given environment.
   //
   // This method can only be called from the main thread.
-  Environment* AddEnvironment(Environment* env);
+  Environment *AddEnvironment(Environment *env);
 
   // Adds a TestPartResult to the current TestResult object.  All
   // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
   // eventually call this to report their results.  The user code
   // should use the assertion macros instead of calling this directly.
   void AddTestPartResult(TestPartResult::Type result_type,
-                         const char* file_name,
-                         int line_number,
-                         const std::string& message,
-                         const std::string& os_stack_trace)
+                         const char *file_name, int line_number,
+                         const std::string &message,
+                         const std::string &os_stack_trace)
       GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Adds a TestProperty to the current TestResult object when invoked from
-  // inside a test, to current TestCase's ad_hoc_test_result_ when invoked
-  // from SetUpTestCase or TearDownTestCase, or to the global property set
+  // inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
+  // from SetUpTestSuite or TearDownTestSuite, or to the global property set
   // when invoked elsewhere.  If the result already contains a property with
   // the same key, the value will be updated.
-  void RecordProperty(const std::string& key, const std::string& value);
+  void RecordProperty(const std::string &key, const std::string &value);
 
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  TestCase* GetMutableTestCase(int i);
+  // Gets the i-th test suite among all the test suites. i can range from 0 to
+  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+  TestSuite *GetMutableTestSuite(int i);
 
   // Accessors for the implementation object.
-  internal::UnitTestImpl* impl() { return impl_; }
-  const internal::UnitTestImpl* impl() const { return impl_; }
+  internal::UnitTestImpl *impl() { return impl_; }
+  const internal::UnitTestImpl *impl() const { return impl_; }
 
-  // These classes and funcions are friends as they need to access private
+  // These classes and functions are friends as they need to access private
   // members of UnitTest.
+  friend class ScopedTrace;
   friend class Test;
   friend class internal::AssertHelper;
-  friend class internal::ScopedTrace;
   friend class internal::StreamingListenerTest;
   friend class internal::UnitTestRecordPropertyTestHelper;
-  friend Environment* AddGlobalTestEnvironment(Environment* env);
-  friend internal::UnitTestImpl* internal::GetUnitTestImpl();
+  friend Environment *AddGlobalTestEnvironment(Environment *env);
+  friend std::set<std::string> *internal::GetIgnoredParameterizedTestSuites();
+  friend internal::UnitTestImpl *internal::GetUnitTestImpl();
   friend void internal::ReportFailureInUnknownLocation(
-      TestPartResult::Type result_type,
-      const std::string& message);
+      TestPartResult::Type result_type, const std::string &message);
 
   // Creates an empty UnitTest.
   UnitTest();
@@ -1308,12 +1427,11 @@
 
   // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
   // Google Test trace stack.
-  void PushGTestTrace(const internal::TraceInfo& trace)
+  void PushGTestTrace(const internal::TraceInfo &trace)
       GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Pops a trace from the per-thread Google Test trace stack.
-  void PopGTestTrace()
-      GTEST_LOCK_EXCLUDED_(mutex_);
+  void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Protects mutable state in *impl_.  This is mutable as some const
   // methods need to lock it too.
@@ -1323,7 +1441,7 @@
   // the object is constructed.  We don't mark it as const here, as
   // doing so will cause a warning in the constructor of UnitTest.
   // Mutable state in *impl_ is protected by mutex_.
-  internal::UnitTestImpl* impl_;
+  internal::UnitTestImpl *impl_;
 
   // We disallow copying UnitTest.
   GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
@@ -1347,7 +1465,7 @@
 // translation units and the environments have dependencies among them
 // (remember that the compiler doesn't guarantee the order in which
 // global variables from different translation units are initialized).
-inline Environment* AddGlobalTestEnvironment(Environment* env) {
+inline Environment *AddGlobalTestEnvironment(Environment *env) {
   return UnitTest::GetInstance()->AddEnvironment(env);
 }
 
@@ -1360,11 +1478,15 @@
 // updated.
 //
 // Calling the function for the second time has no user-visible effect.
-GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+GTEST_API_ void InitGoogleTest(int *argc, char **argv);
 
 // This overloaded version can be used in Windows programs compiled in
 // UNICODE mode.
-GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
+GTEST_API_ void InitGoogleTest(int *argc, wchar_t **argv);
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleTest();
 
 namespace internal {
 
@@ -1372,27 +1494,29 @@
 // frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers
 // when calling EXPECT_* in a tight loop.
 template <typename T1, typename T2>
-AssertionResult CmpHelperEQFailure(const char* lhs_expression,
-                                   const char* rhs_expression,
-                                   const T1& lhs, const T2& rhs) {
-  return EqFailure(lhs_expression,
-                   rhs_expression,
+AssertionResult CmpHelperEQFailure(const char *lhs_expression,
+                                   const char *rhs_expression, const T1 &lhs,
+                                   const T2 &rhs) {
+  return EqFailure(lhs_expression, rhs_expression,
                    FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs),
-                   false);
+                   FormatForComparisonFailureMessage(rhs, lhs), false);
 }
 
+// This block of code defines operator==/!=
+// to block lexical scope lookup.
+// It prevents using invalid operator==/!= defined at namespace scope.
+struct faketype {};
+inline bool operator==(faketype, faketype) { return true; }
+inline bool operator!=(faketype, faketype) { return false; }
+
 // The helper function for {ASSERT|EXPECT}_EQ.
 template <typename T1, typename T2>
-AssertionResult CmpHelperEQ(const char* lhs_expression,
-                            const char* rhs_expression,
-                            const T1& lhs,
-                            const T2& rhs) {
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4389 /* signed/unsigned mismatch */)
+AssertionResult CmpHelperEQ(const char *lhs_expression,
+                            const char *rhs_expression, const T1 &lhs,
+                            const T2 &rhs) {
   if (lhs == rhs) {
     return AssertionSuccess();
   }
-GTEST_DISABLE_MSC_WARNINGS_POP_()
 
   return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
 }
@@ -1400,24 +1524,22 @@
 // With this overloaded version, we allow anonymous enums to be used
 // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
 // can be implicitly cast to BiggestInt.
-GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression,
-                                       const char* rhs_expression,
-                                       BiggestInt lhs,
-                                       BiggestInt rhs);
+GTEST_API_ AssertionResult CmpHelperEQ(const char *lhs_expression,
+                                       const char *rhs_expression,
+                                       BiggestInt lhs, BiggestInt rhs);
 
-// The helper class for {ASSERT|EXPECT}_EQ.  The template argument
-// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
-// is a null pointer literal.  The following default implementation is
-// for lhs_is_null_literal being false.
-template <bool lhs_is_null_literal>
 class EqHelper {
  public:
   // This templatized version is for the general case.
-  template <typename T1, typename T2>
-  static AssertionResult Compare(const char* lhs_expression,
-                                 const char* rhs_expression,
-                                 const T1& lhs,
-                                 const T2& rhs) {
+  template <
+      typename T1, typename T2,
+      // Disable this overload for cases where one argument is a pointer
+      // and the other is the null pointer constant.
+      typename std::enable_if<!std::is_integral<T1>::value ||
+                              !std::is_pointer<T2>::value>::type * = nullptr>
+  static AssertionResult Compare(const char *lhs_expression,
+                                 const char *rhs_expression, const T1 &lhs,
+                                 const T2 &rhs) {
     return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
   }
 
@@ -1427,55 +1549,20 @@
   //
   // Even though its body looks the same as the above version, we
   // cannot merge the two, as it will make anonymous enums unhappy.
-  static AssertionResult Compare(const char* lhs_expression,
-                                 const char* rhs_expression,
-                                 BiggestInt lhs,
+  static AssertionResult Compare(const char *lhs_expression,
+                                 const char *rhs_expression, BiggestInt lhs,
                                  BiggestInt rhs) {
     return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
   }
-};
 
-// This specialization is used when the first argument to ASSERT_EQ()
-// is a null pointer literal, like NULL, false, or 0.
-template <>
-class EqHelper<true> {
- public:
-  // We define two overloaded versions of Compare().  The first
-  // version will be picked when the second argument to ASSERT_EQ() is
-  // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
-  // EXPECT_EQ(false, a_bool).
-  template <typename T1, typename T2>
-  static AssertionResult Compare(
-      const char* lhs_expression,
-      const char* rhs_expression,
-      const T1& lhs,
-      const T2& rhs,
-      // The following line prevents this overload from being considered if T2
-      // is not a pointer type.  We need this because ASSERT_EQ(NULL, my_ptr)
-      // expands to Compare("", "", NULL, my_ptr), which requires a conversion
-      // to match the Secret* in the other overload, which would otherwise make
-      // this template match better.
-      typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
-    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
-  }
-
-  // This version will be picked when the second argument to ASSERT_EQ() is a
-  // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
   template <typename T>
   static AssertionResult Compare(
-      const char* lhs_expression,
-      const char* rhs_expression,
-      // We used to have a second template parameter instead of Secret*.  That
-      // template parameter would deduce to 'long', making this a better match
-      // than the first overload even without the first overload's EnableIf.
-      // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
-      // non-pointer argument" (even a deduced integral argument), so the old
-      // implementation caused warnings in user code.
-      Secret* /* lhs (NULL) */,
-      T* rhs) {
+      const char *lhs_expression, const char *rhs_expression,
+      // Handle cases where '0' is used as a null pointer literal.
+      std::nullptr_t /* lhs */, T *rhs) {
     // We already know that 'lhs' is a null pointer.
     return CmpHelperEQ(lhs_expression, rhs_expression,
-                       static_cast<T*>(NULL), rhs);
+                       static_cast<T *>(nullptr), rhs);
   }
 };
 
@@ -1483,9 +1570,9 @@
 // frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers
 // when calling EXPECT_OP in a tight loop.
 template <typename T1, typename T2>
-AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
-                                   const T1& val1, const T2& val2,
-                                   const char* op) {
+AssertionResult CmpHelperOpFailure(const char *expr1, const char *expr2,
+                                   const T1 &val1, const T2 &val2,
+                                   const char *op) {
   return AssertionFailure()
          << "Expected: (" << expr1 << ") " << op << " (" << expr2
          << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
@@ -1503,18 +1590,18 @@
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-template <typename T1, typename T2>\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   const T1& val1, const T2& val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
-  }\
-}\
-GTEST_API_ AssertionResult CmpHelper##op_name(\
-    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                \
+  template <typename T1, typename T2>                                      \
+  AssertionResult CmpHelper##op_name(const char *expr1, const char *expr2, \
+                                     const T1 &val1, const T2 &val2) {     \
+    if (val1 op val2) {                                                    \
+      return AssertionSuccess();                                           \
+    } else {                                                               \
+      return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);            \
+    }                                                                      \
+  }                                                                        \
+  GTEST_API_ AssertionResult CmpHelper##op_name(                           \
+      const char *expr1, const char *expr2, BiggestInt val1, BiggestInt val2)
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
@@ -1534,51 +1621,44 @@
 // The helper function for {ASSERT|EXPECT}_STREQ.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
-                                          const char* s2_expression,
-                                          const char* s1,
-                                          const char* s2);
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const char *s1, const char *s2);
 
 // The helper function for {ASSERT|EXPECT}_STRCASEEQ.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
-                                              const char* s2_expression,
-                                              const char* s1,
-                                              const char* s2);
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char *s1_expression,
+                                              const char *s2_expression,
+                                              const char *s1, const char *s2);
 
 // The helper function for {ASSERT|EXPECT}_STRNE.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                                          const char* s2_expression,
-                                          const char* s1,
-                                          const char* s2);
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const char *s1, const char *s2);
 
 // The helper function for {ASSERT|EXPECT}_STRCASENE.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
-                                              const char* s2_expression,
-                                              const char* s1,
-                                              const char* s2);
-
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char *s1_expression,
+                                              const char *s2_expression,
+                                              const char *s1, const char *s2);
 
 // Helper function for *_STREQ on wide strings.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
-                                          const char* s2_expression,
-                                          const wchar_t* s1,
-                                          const wchar_t* s2);
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const wchar_t *s1, const wchar_t *s2);
 
 // Helper function for *_STRNE on wide strings.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                                          const char* s2_expression,
-                                          const wchar_t* s1,
-                                          const wchar_t* s2);
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const wchar_t *s1, const wchar_t *s2);
 
 }  // namespace internal
 
@@ -1590,32 +1670,40 @@
 //
 // The {needle,haystack}_expr arguments are the stringified
 // expressions that generated the two real arguments.
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const char *needle,
+                                       const char *haystack);
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const wchar_t *needle,
+                                       const wchar_t *haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const char *needle,
+                                          const char *haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const wchar_t *needle,
+                                          const wchar_t *haystack);
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const ::std::string &needle,
+                                       const ::std::string &haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const ::std::string &needle,
+                                          const ::std::string &haystack);
 
 #if GTEST_HAS_STD_WSTRING
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const ::std::wstring &needle,
+                                       const ::std::wstring &haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const ::std::wstring &needle,
+                                          const ::std::wstring &haystack);
 #endif  // GTEST_HAS_STD_WSTRING
 
 namespace internal {
@@ -1628,10 +1716,9 @@
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 template <typename RawType>
-AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
-                                         const char* rhs_expression,
-                                         RawType lhs_value,
-                                         RawType rhs_value) {
+AssertionResult CmpHelperFloatingPointEQ(const char *lhs_expression,
+                                         const char *rhs_expression,
+                                         RawType lhs_value, RawType rhs_value) {
   const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
 
   if (lhs.AlmostEquals(rhs)) {
@@ -1646,21 +1733,18 @@
   rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
          << rhs_value;
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   StringStreamToString(&lhs_ss),
-                   StringStreamToString(&rhs_ss),
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
                    false);
 }
 
 // Helper function for implementing ASSERT_NEAR.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
-                                                const char* expr2,
-                                                const char* abs_error_expr,
-                                                double val1,
-                                                double val2,
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char *expr1,
+                                                const char *expr2,
+                                                const char *abs_error_expr,
+                                                double val1, double val2,
                                                 double abs_error);
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -1668,15 +1752,13 @@
 class GTEST_API_ AssertHelper {
  public:
   // Constructor.
-  AssertHelper(TestPartResult::Type type,
-               const char* file,
-               int line,
-               const char* message);
+  AssertHelper(TestPartResult::Type type, const char *file, int line,
+               const char *message);
   ~AssertHelper();
 
   // Message assignment is a semantic trick to enable assertion
   // streaming; see the GTEST_MESSAGE_ macro below.
-  void operator=(const Message& message) const;
+  void operator=(const Message &message) const;
 
  private:
   // We put our data in a struct so that the size of the AssertHelper class can
@@ -1684,14 +1766,12 @@
   // re-using stack space even for temporary variables, so every EXPECT_EQ
   // reserves stack space for another AssertHelper.
   struct AssertHelperData {
-    AssertHelperData(TestPartResult::Type t,
-                     const char* srcfile,
-                     int line_num,
-                     const char* msg)
-        : type(t), file(srcfile), line(line_num), message(msg) { }
+    AssertHelperData(TestPartResult::Type t, const char *srcfile, int line_num,
+                     const char *msg)
+        : type(t), file(srcfile), line(line_num), message(msg) {}
 
     TestPartResult::Type const type;
-    const char* const file;
+    const char *const file;
     int const line;
     std::string const message;
 
@@ -1699,14 +1779,19 @@
     GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
   };
 
-  AssertHelperData* const data_;
+  AssertHelperData *const data_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
 };
 
+enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW };
+
+GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color,
+                                                            const char *fmt,
+                                                            ...);
+
 }  // namespace internal
 
-#if GTEST_HAS_PARAM_TEST
 // The pure interface class that all value-parameterized tests inherit from.
 // A value-parameterized class must inherit from both ::testing::Test and
 // ::testing::WithParamInterface. In most cases that just means inheriting
@@ -1724,13 +1809,13 @@
 //   FooTest() {
 //     // Can use GetParam() here.
 //   }
-//   virtual ~FooTest() {
+//   ~FooTest() override {
 //     // Can use GetParam() here.
 //   }
-//   virtual void SetUp() {
+//   void SetUp() override {
 //     // Can use GetParam() here.
 //   }
-//   virtual void TearDown {
+//   void TearDown override {
 //     // Can use GetParam() here.
 //   }
 // };
@@ -1739,7 +1824,7 @@
 //   Foo foo;
 //   ASSERT_TRUE(foo.DoesBar(GetParam()));
 // }
-// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
 
 template <typename T>
 class WithParamInterface {
@@ -1748,12 +1833,9 @@
   virtual ~WithParamInterface() {}
 
   // The current parameter value. Is also available in the test fixture's
-  // constructor. This member function is non-static, even though it only
-  // references static data, to reduce the opportunity for incorrect uses
-  // like writing 'WithParamInterface<bool>::GetParam()' for a test that
-  // uses a fixture whose parameter type is int.
-  const ParamType& GetParam() const {
-    GTEST_CHECK_(parameter_ != NULL)
+  // constructor.
+  static const ParamType &GetParam() {
+    GTEST_CHECK_(parameter_ != nullptr)
         << "GetParam() can only be called inside a value-parameterized test "
         << "-- did you intend to write TEST_P instead of TEST_F?";
     return *parameter_;
@@ -1762,31 +1844,32 @@
  private:
   // Sets parameter value. The caller is responsible for making sure the value
   // remains alive and unchanged throughout the current test.
-  static void SetParam(const ParamType* parameter) {
-    parameter_ = parameter;
-  }
+  static void SetParam(const ParamType *parameter) { parameter_ = parameter; }
 
   // Static value used for accessing parameter during a test lifetime.
-  static const ParamType* parameter_;
+  static const ParamType *parameter_;
 
   // TestClass must be a subclass of WithParamInterface<T> and Test.
-  template <class TestClass> friend class internal::ParameterizedTestFactory;
+  template <class TestClass>
+  friend class internal::ParameterizedTestFactory;
 };
 
 template <typename T>
-const T* WithParamInterface<T>::parameter_ = NULL;
+const T *WithParamInterface<T>::parameter_ = nullptr;
 
 // Most value-parameterized classes can ignore the existence of
 // WithParamInterface, and can just inherit from ::testing::TestWithParam.
 
 template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {
-};
-
-#endif  // GTEST_HAS_PARAM_TEST
+class TestWithParam : public Test, public WithParamInterface<T> {};
 
 // Macros for indicating success/failure in test code.
 
+// Skips test in runtime.
+// Skipping test aborts current function.
+// Skipped tests are neither successful nor failed.
+#define GTEST_SKIP() GTEST_SKIP_("")
+
 // ADD_FAILURE unconditionally adds a failure to the current test.
 // SUCCEED generates a success - it doesn't automatically make the
 // current test successful, as a test is only successful when it has
@@ -1809,17 +1892,22 @@
 
 // Generates a nonfatal failure at the given source file location with
 // a generic message.
-#define ADD_FAILURE_AT(file, line) \
+#define ADD_FAILURE_AT(file, line)        \
   GTEST_MESSAGE_AT_(file, line, "Failed", \
                     ::testing::TestPartResult::kNonFatalFailure)
 
 // Generates a fatal failure with a generic message.
 #define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
 
+// Like GTEST_FAIL(), but at the given source file location.
+#define GTEST_FAIL_AT(file, line)         \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kFatalFailure)
+
 // Define this macro to 1 to omit the definition of FAIL(), which is a
 // generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_FAIL
-# define FAIL() GTEST_FAIL()
+#define FAIL() GTEST_FAIL()
 #endif
 
 // Generates a success with a generic message.
@@ -1828,7 +1916,7 @@
 // Define this macro to 1 to omit the definition of SUCCEED(), which
 // is a generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_SUCCEED
-# define SUCCEED() GTEST_SUCCEED()
+#define SUCCEED() GTEST_SUCCEED()
 #endif
 
 // Macros for testing exceptions.
@@ -1856,23 +1944,18 @@
 // Boolean assertions. Condition can be either a Boolean expression or an
 // AssertionResult. For more information on how to use AssertionResult with
 // these macros see comments on that class.
-#define EXPECT_TRUE(condition) \
-  GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \
+#define EXPECT_TRUE(condition)                            \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
                       GTEST_NONFATAL_FAILURE_)
-#define EXPECT_FALSE(condition) \
+#define EXPECT_FALSE(condition)                              \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_NONFATAL_FAILURE_)
 #define ASSERT_TRUE(condition) \
-  GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \
-                      GTEST_FATAL_FAILURE_)
-#define ASSERT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition)                              \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_FATAL_FAILURE_)
 
-// Includes the auto-generated header that implements a family of
-// generic predicate assertion macros.
-#include "gtest/gtest_pred_impl.h"
-
 // Macros for testing equalities and inequalities.
 //
 //    * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
@@ -1914,15 +1997,13 @@
 //
 // Examples:
 //
-//   EXPECT_NE(5, Foo());
-//   EXPECT_EQ(NULL, a_pointer);
+//   EXPECT_NE(Foo(), 5);
+//   EXPECT_EQ(a_pointer, NULL);
 //   ASSERT_LT(i, array_size);
 //   ASSERT_GT(records.size(), 0) << "There is no record left.";
 
 #define EXPECT_EQ(val1, val2) \
-  EXPECT_PRED_FORMAT2(::testing::internal:: \
-                      EqHelper<GTEST_IS_NULL_LITERAL_(val1)>::Compare, \
-                      val1, val2)
+  EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
 #define EXPECT_NE(val1, val2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
 #define EXPECT_LE(val1, val2) \
@@ -1935,9 +2016,7 @@
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
 
 #define GTEST_ASSERT_EQ(val1, val2) \
-  ASSERT_PRED_FORMAT2(::testing::internal:: \
-                      EqHelper<GTEST_IS_NULL_LITERAL_(val1)>::Compare, \
-                      val1, val2)
+  ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
 #define GTEST_ASSERT_NE(val1, val2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
 #define GTEST_ASSERT_LE(val1, val2) \
@@ -1953,27 +2032,27 @@
 // ASSERT_XY(), which clashes with some users' own code.
 
 #if !GTEST_DONT_DEFINE_ASSERT_EQ
-# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_NE
-# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_LE
-# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_LT
-# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_GE
-# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_GT
-# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
 #endif
 
 // C-string Comparisons.  All tests treat NULL and any non-NULL string
@@ -1998,7 +2077,7 @@
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
 #define EXPECT_STRCASEEQ(s1, s2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define EXPECT_STRCASENE(s1, s2)\
+#define EXPECT_STRCASENE(s1, s2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
 
 #define ASSERT_STREQ(s1, s2) \
@@ -2007,7 +2086,7 @@
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
 #define ASSERT_STRCASEEQ(s1, s2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define ASSERT_STRCASENE(s1, s2)\
+#define ASSERT_STRCASENE(s1, s2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
 
 // Macros for comparing floating-point numbers.
@@ -2024,29 +2103,29 @@
 // FloatingPoint template class in gtest-internal.h if you are
 // interested in the implementation details.
 
-#define EXPECT_FLOAT_EQ(val1, val2)\
+#define EXPECT_FLOAT_EQ(val1, val2)                                         \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
                       val1, val2)
 
-#define EXPECT_DOUBLE_EQ(val1, val2)\
+#define EXPECT_DOUBLE_EQ(val1, val2)                                         \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
                       val1, val2)
 
-#define ASSERT_FLOAT_EQ(val1, val2)\
+#define ASSERT_FLOAT_EQ(val1, val2)                                         \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
                       val1, val2)
 
-#define ASSERT_DOUBLE_EQ(val1, val2)\
+#define ASSERT_DOUBLE_EQ(val1, val2)                                         \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
                       val1, val2)
 
-#define EXPECT_NEAR(val1, val2, abs_error)\
-  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
-                      val1, val2, abs_error)
+#define EXPECT_NEAR(val1, val2, abs_error)                                   \
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
 
-#define ASSERT_NEAR(val1, val2, abs_error)\
-  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
-                      val1, val2, abs_error)
+#define ASSERT_NEAR(val1, val2, abs_error)                                   \
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
 
 // These predicate format functions work on floating-point values, and
 // can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
@@ -2055,12 +2134,11 @@
 
 // Asserts that val1 is less than, or almost equal to, val2.  Fails
 // otherwise.  In particular, it fails if either val1 or val2 is NaN.
-GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+GTEST_API_ AssertionResult FloatLE(const char *expr1, const char *expr2,
                                    float val1, float val2);
-GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+GTEST_API_ AssertionResult DoubleLE(const char *expr1, const char *expr2,
                                     double val1, double val2);
 
-
 #if GTEST_OS_WINDOWS
 
 // Macros that test for HRESULT failure and success, these are only useful
@@ -2072,17 +2150,17 @@
 // expected result and the actual result with both a human-readable
 // string representation of the error, if available, as well as the
 // hex result code.
-# define EXPECT_HRESULT_SUCCEEDED(expr) \
-    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
 
-# define ASSERT_HRESULT_SUCCEEDED(expr) \
-    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
 
-# define EXPECT_HRESULT_FAILED(expr) \
-    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define EXPECT_HRESULT_FAILED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
 
-# define ASSERT_HRESULT_FAILED(expr) \
-    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define ASSERT_HRESULT_FAILED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
 
 #endif  // GTEST_OS_WINDOWS
 
@@ -2097,9 +2175,54 @@
 //   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
 //
 #define ASSERT_NO_FATAL_FAILURE(statement) \
-    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
 #define EXPECT_NO_FATAL_FAILURE(statement) \
-    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the given source file path and line number,
+// and the given message) to be included in every test failure message generated
+// by code in the scope of the lifetime of an instance of this class. The effect
+// is undone with the destruction of the instance.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// Example:
+//   testing::ScopedTrace trace("file.cc", 123, "message");
+//
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+
+  // Template version. Uses Message() to convert the values into strings.
+  // Slow, but flexible.
+  template <typename T>
+  ScopedTrace(const char *file, int line, const T &message) {
+    PushTrace(file, line, (Message() << message).GetString());
+  }
+
+  // Optimize for some known types.
+  ScopedTrace(const char *file, int line, const char *message) {
+    PushTrace(file, line, message ? message : "(null)");
+  }
+
+  ScopedTrace(const char *file, int line, const std::string &message) {
+    PushTrace(file, line, message);
+  }
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  void PushTrace(const char *file, int line, std::string message);
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor.  Therefore it doesn't
+                            // need to be used otherwise.
 
 // Causes a trace (including the source file path, the current line
 // number, and the given message) to be included in every test failure
@@ -2112,13 +2235,17 @@
 // of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
 // to appear in the same block - as long as they are on different
 // lines.
-#define SCOPED_TRACE(message) \
-  ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
-    __FILE__, __LINE__, ::testing::Message() << (message))
+//
+// Assuming that each thread maintains its own stack of traces.
+// Therefore, a SCOPED_TRACE() would (correctly) only affect the
+// assertions in its own thread.
+#define SCOPED_TRACE(message)                                         \
+  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+      __FILE__, __LINE__, (message))
 
 // Compile-time assertion for type equality.
-// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
-// the same type.  The value it returns is not interesting.
+// StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
+// are the same type.  The value it returns is not interesting.
 //
 // Instead of making StaticAssertTypeEq a class template, we make it a
 // function template that invokes a helper class template.  This
@@ -2147,18 +2274,18 @@
 //
 // to cause a compiler error.
 template <typename T1, typename T2>
-bool StaticAssertTypeEq() {
-  (void)internal::StaticAssertTypeEqHelper<T1, T2>();
+constexpr bool StaticAssertTypeEq() noexcept {
+  static_assert(std::is_same<T1, T2>::value, "T1 and T2 are not the same type");
   return true;
 }
 
 // Defines a test.
 //
-// The first parameter is the name of the test case, and the second
-// parameter is the name of the test within the test case.
+// The first parameter is the name of the test suite, and the second
+// parameter is the name of the test within the test suite.
 //
-// The convention is to end the test case name with "Test".  For
-// example, a test case for the Foo class can be named FooTest.
+// The convention is to end the test suite name with "Test".  For
+// example, a test suite for the Foo class can be named FooTest.
 //
 // Test code should appear between braces after an invocation of
 // this macro.  Example:
@@ -2177,28 +2304,28 @@
 // code.  GetTestTypeId() is guaranteed to always return the same
 // value, as it always calls GetTypeId<>() from the Google Test
 // framework.
-#define GTEST_TEST(test_case_name, test_name)\
-  GTEST_TEST_(test_case_name, test_name, \
-              ::testing::Test, ::testing::internal::GetTestTypeId())
+#define GTEST_TEST(test_suite_name, test_name)             \
+  GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \
+              ::testing::internal::GetTestTypeId())
 
 // Define this macro to 1 to omit the definition of TEST(), which
 // is a generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_TEST
-# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name)
 #endif
 
 // Defines a test that uses a test fixture.
 //
 // The first parameter is the name of the test fixture class, which
-// also doubles as the test case name.  The second parameter is the
-// name of the test within the test case.
+// also doubles as the test suite name.  The second parameter is the
+// name of the test within the test suite.
 //
 // A test fixture class must be declared earlier.  The user should put
-// his test code between braces after using this macro.  Example:
+// the test code between braces after using this macro.  Example:
 //
 //   class FooTest : public testing::Test {
 //    protected:
-//     virtual void SetUp() { b_.AddElement(3); }
+//     void SetUp() override { b_.AddElement(3); }
 //
 //     Foo a_;
 //     Foo b_;
@@ -2209,13 +2336,104 @@
 //   }
 //
 //   TEST_F(FooTest, ReturnsElementCountCorrectly) {
-//     EXPECT_EQ(0, a_.size());
-//     EXPECT_EQ(1, b_.size());
+//     EXPECT_EQ(a_.size(), 0);
+//     EXPECT_EQ(b_.size(), 1);
 //   }
-
-#define TEST_F(test_fixture, test_name)\
+//
+// GOOGLETEST_CM0011 DO NOT DELETE
+#if !GTEST_DONT_DEFINE_TEST
+#define TEST_F(test_fixture, test_name)              \
   GTEST_TEST_(test_fixture, test_name, test_fixture, \
               ::testing::internal::GetTypeId<test_fixture>())
+#endif  // !GTEST_DONT_DEFINE_TEST
+
+// Returns a path to temporary directory.
+// Tries to determine an appropriate directory for the platform.
+GTEST_API_ std::string TempDir();
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Dynamically registers a test with the framework.
+//
+// This is an advanced API only to be used when the `TEST` macros are
+// insufficient. The macros should be preferred when possible, as they avoid
+// most of the complexity of calling this function.
+//
+// The `factory` argument is a factory callable (move-constructible) object or
+// function pointer that creates a new instance of the Test object. It
+// handles ownership to the caller. The signature of the callable is
+// `Fixture*()`, where `Fixture` is the test fixture class for the test. All
+// tests registered with the same `test_suite_name` must return the same
+// fixture type. This is checked at runtime.
+//
+// The framework will infer the fixture class from the factory and will call
+// the `SetUpTestSuite` and `TearDownTestSuite` for it.
+//
+// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is
+// undefined.
+//
+// Use case example:
+//
+// class MyFixture : public ::testing::Test {
+//  public:
+//   // All of these optional, just like in regular macro usage.
+//   static void SetUpTestSuite() { ... }
+//   static void TearDownTestSuite() { ... }
+//   void SetUp() override { ... }
+//   void TearDown() override { ... }
+// };
+//
+// class MyTest : public MyFixture {
+//  public:
+//   explicit MyTest(int data) : data_(data) {}
+//   void TestBody() override { ... }
+//
+//  private:
+//   int data_;
+// };
+//
+// void RegisterMyTests(const std::vector<int>& values) {
+//   for (int v : values) {
+//     ::testing::RegisterTest(
+//         "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr,
+//         std::to_string(v).c_str(),
+//         __FILE__, __LINE__,
+//         // Important to use the fixture type as the return type here.
+//         [=]() -> MyFixture* { return new MyTest(v); });
+//   }
+// }
+// ...
+// int main(int argc, char** argv) {
+//   std::vector<int> values_to_test = LoadValuesFromConfig();
+//   RegisterMyTests(values_to_test);
+//   ...
+//   return RUN_ALL_TESTS();
+// }
+//
+template <int &... ExplicitParameterBarrier, typename Factory>
+TestInfo *RegisterTest(const char *test_suite_name, const char *test_name,
+                       const char *type_param, const char *value_param,
+                       const char *file, int line, Factory factory) {
+  using TestT = typename std::remove_pointer<decltype(factory())>::type;
+
+  class FactoryImpl : public internal::TestFactoryBase {
+   public:
+    explicit FactoryImpl(Factory f) : factory_(std::move(f)) {}
+    Test *CreateTest() override { return factory_(); }
+
+   private:
+    Factory factory_;
+  };
+
+  return internal::MakeAndRegisterTestInfo(
+      test_suite_name, test_name, type_param, value_param,
+      internal::CodeLocation(file, line), internal::GetTypeId<TestT>(),
+      internal::SuiteApiResolver<TestT>::GetSetUpCaseOrSuite(file, line),
+      internal::SuiteApiResolver<TestT>::GetTearDownCaseOrSuite(file, line),
+      new FactoryImpl{ std::move(factory) });
+}
 
 }  // namespace testing
 
@@ -2229,8 +2447,8 @@
 // namespace and has an all-caps name.
 int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
 
-inline int RUN_ALL_TESTS() {
-  return ::testing::UnitTest::GetInstance()->Run();
-}
+inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); }
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
 #endif  // GTEST_INCLUDE_GTEST_GTEST_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
index 30ae712..1fc2191 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
@@ -27,18 +27,18 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command
+// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command
 // 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
 //
 // Implements a family of generic predicate assertion macros.
+// GOOGLETEST_CM0001 DO NOT DELETE
 
 #ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
 #define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
 
-// Makes sure this header is not included before gtest.h.
-#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
-# error Do not include gtest_pred_impl.h directly.  Include gtest.h instead.
-#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+#include "gtest/gtest.h"
+
+namespace testing {
 
 // This header implements a family of generic predicate assertion
 // macros:
@@ -72,90 +72,70 @@
 // GTEST_ASSERT_ is the basic statement to which all of the assertions
 // in this file reduce.  Don't use this in your code.
 
-#define GTEST_ASSERT_(expression, on_failure) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+#define GTEST_ASSERT_(expression, on_failure)                   \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                 \
   if (const ::testing::AssertionResult gtest_ar = (expression)) \
-    ; \
-  else \
+    ;                                                           \
+  else                                                          \
     on_failure(gtest_ar.failure_message())
 
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1>
-AssertionResult AssertPred1Helper(const char* pred_text,
-                                  const char* e1,
-                                  Pred pred,
-                                  const T1& v1) {
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char *pred_text, const char *e1,
+                                  Pred pred, const T1 &v1) {
   if (pred(v1)) return AssertionSuccess();
 
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1;
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1);
 }
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, v1), \
-                on_failure)
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
 // this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
-                                             #v1, \
-                                             pred, \
-                                             v1), on_failure)
+#define GTEST_PRED1_(pred, v1, on_failure) \
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
 
 // Unary predicate assertion macros.
 #define EXPECT_PRED_FORMAT1(pred_format, v1) \
   GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
 #define ASSERT_PRED_FORMAT1(pred_format, v1) \
   GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
-
-
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
 
 // Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2>
-AssertionResult AssertPred2Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2) {
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char *pred_text, const char *e1,
+                                  const char *e2, Pred pred, const T1 &v1,
+                                  const T2 &v2) {
   if (pred(v1, v2)) return AssertionSuccess();
 
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ", "
-                            << e2 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1
-                            << "\n" << e2 << " evaluates to " << v2;
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2);
 }
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
-                on_failure)
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
 // this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             pred, \
-                                             v1, \
-                                             v2), on_failure)
+#define GTEST_PRED2_(pred, v1, v2, on_failure)                               \
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+                on_failure)
 
 // Binary predicate assertion macros.
 #define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
@@ -167,50 +147,34 @@
 #define ASSERT_PRED2(pred, v1, v2) \
   GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
 
-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3>
-AssertionResult AssertPred3Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3) {
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3, Pred pred,
+                                  const T1 &v1, const T2 &v2, const T3 &v3) {
   if (pred(v1, v2, v3)) return AssertionSuccess();
 
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ", "
-                            << e2 << ", "
-                            << e3 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1
-                            << "\n" << e2 << " evaluates to " << v2
-                            << "\n" << e3 << " evaluates to " << v3;
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3);
 }
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
-                on_failure)
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
 // this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3), on_failure)
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)                          \
+  GTEST_ASSERT_(                                                            \
+      ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+      on_failure)
 
 // Ternary predicate assertion macros.
 #define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
@@ -222,57 +186,36 @@
 #define ASSERT_PRED3(pred, v1, v2, v3) \
   GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
 
-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4>
-AssertionResult AssertPred4Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4) {
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3,
+                                  const char *e4, Pred pred, const T1 &v1,
+                                  const T2 &v2, const T3 &v3, const T4 &v4) {
   if (pred(v1, v2, v3, v4)) return AssertionSuccess();
 
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ", "
-                            << e2 << ", "
-                            << e3 << ", "
-                            << e4 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1
-                            << "\n" << e2 << " evaluates to " << v2
-                            << "\n" << e3 << " evaluates to " << v3
-                            << "\n" << e4 << " evaluates to " << v4;
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+         << e4 << " evaluates to " << ::testing::PrintToString(v4);
 }
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
-                on_failure)
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
 // this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4), on_failure)
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)                        \
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+                                             v1, v2, v3, v4),                 \
+                on_failure)
 
 // 4-ary predicate assertion macros.
 #define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
@@ -284,64 +227,40 @@
 #define ASSERT_PRED4(pred, v1, v2, v3, v4) \
   GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
 
-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
           typename T5>
-AssertionResult AssertPred5Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  const char* e5,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4,
-                                  const T5& v5) {
+AssertionResult AssertPred5Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3,
+                                  const char *e4, const char *e5, Pred pred,
+                                  const T1 &v1, const T2 &v2, const T3 &v3,
+                                  const T4 &v4, const T5 &v5) {
   if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
 
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ", "
-                            << e2 << ", "
-                            << e3 << ", "
-                            << e4 << ", "
-                            << e5 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1
-                            << "\n" << e2 << " evaluates to " << v2
-                            << "\n" << e3 << " evaluates to " << v3
-                            << "\n" << e4 << " evaluates to " << v4
-                            << "\n" << e5 << " evaluates to " << v5;
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+         << ", " << e5 << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+         << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n"
+         << e5 << " evaluates to " << ::testing::PrintToString(v5);
 }
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)  \
   GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
                 on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
 // this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             #v5, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4, \
-                                             v5), on_failure)
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)                   \
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+                                             pred, v1, v2, v3, v4, v5),      \
+                on_failure)
 
 // 5-ary predicate assertion macros.
 #define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
@@ -353,6 +272,6 @@
 #define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
   GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
 
-
+}  // namespace testing
 
 #endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h b/libaom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
index da80ddc..3dc5b23 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
@@ -26,10 +26,10 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 //
-// Author: wan@google.com (Zhanyong Wan)
-//
-// Google C++ Testing Framework definitions useful in production code.
+// Google C++ Testing and Mocking Framework definitions useful in production
+// code. GOOGLETEST_CM0003 DO NOT DELETE
 
 #ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
 #define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
@@ -40,19 +40,22 @@
 //
 // class MyClass {
 //  private:
-//   void MyMethod();
-//   FRIEND_TEST(MyClassTest, MyMethod);
+//   void PrivateMethod();
+//   FRIEND_TEST(MyClassTest, PrivateMethodWorks);
 // };
 //
 // class MyClassTest : public testing::Test {
 //   // ...
 // };
 //
-// TEST_F(MyClassTest, MyMethod) {
-//   // Can call MyClass::MyMethod() here.
+// TEST_F(MyClassTest, PrivateMethodWorks) {
+//   // Can call MyClass::PrivateMethod() here.
 // }
+//
+// Note: The test class must be in the same namespace as the class being tested.
+// For example, putting MyClassTest in an anonymous namespace will not work.
 
-#define FRIEND_TEST(test_case_name, test_name)\
-friend class test_case_name##_##test_name##_Test
+#define FRIEND_TEST(test_case_name, test_name) \
+  friend class test_case_name##_##test_name##_Test
 
 #endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md b/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md
new file mode 100644
index 0000000..ff391fb
--- /dev/null
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md
@@ -0,0 +1,56 @@
+# Customization Points
+
+The custom directory is an injection point for custom user configurations.
+
+## Header `gtest.h`
+
+### The following macros can be defined:
+
+*   `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of
+    `OsStackTraceGetterInterface`.
+*   `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See
+    `testing::TempDir` for semantics and signature.
+
+## Header `gtest-port.h`
+
+The following macros can be defined:
+
+### Flag related macros:
+
+*   `GTEST_FLAG(flag_name)`
+*   `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its
+    own flagfile flag parsing.
+*   `GTEST_DECLARE_bool_(name)`
+*   `GTEST_DECLARE_int32_(name)`
+*   `GTEST_DECLARE_string_(name)`
+*   `GTEST_DEFINE_bool_(name, default_val, doc)`
+*   `GTEST_DEFINE_int32_(name, default_val, doc)`
+*   `GTEST_DEFINE_string_(name, default_val, doc)`
+
+### Logging:
+
+*   `GTEST_LOG_(severity)`
+*   `GTEST_CHECK_(condition)`
+*   Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too.
+
+### Threading:
+
+*   `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided.
+*   `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal`
+    are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)`
+    and `GTEST_DEFINE_STATIC_MUTEX_(mutex)`
+*   `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)`
+*   `GTEST_LOCK_EXCLUDED_(locks)`
+
+### Underlying library support features
+
+*   `GTEST_HAS_CXXABI_H_`
+
+### Exporting API symbols:
+
+*   `GTEST_API_` - Specifier for exported symbols.
+
+## Header `gtest-printers.h`
+
+*   See documentation at `gtest/gtest-printers.h` for details on how to define a
+    custom printer.
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
index 7e744bd..cd85d95 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
@@ -27,39 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Injection point for custom user configurations.
-// The following macros can be defined:
-//
-//   Flag related macros:
-//     GTEST_FLAG(flag_name)
-//     GTEST_USE_OWN_FLAGFILE_FLAG_  - Define to 0 when the system provides its
-//                                     own flagfile flag parsing.
-//     GTEST_DECLARE_bool_(name)
-//     GTEST_DECLARE_int32_(name)
-//     GTEST_DECLARE_string_(name)
-//     GTEST_DEFINE_bool_(name, default_val, doc)
-//     GTEST_DEFINE_int32_(name, default_val, doc)
-//     GTEST_DEFINE_string_(name, default_val, doc)
-//
-//   Test filtering:
-//     GTEST_TEST_FILTER_ENV_VAR_ - The name of an environment variable that
-//                                  will be used if --GTEST_FLAG(test_filter)
-//                                  is not provided.
-//
-//   Logging:
-//     GTEST_LOG_(severity)
-//     GTEST_CHECK_(condition)
-//     Functions LogToStderr() and FlushInfoLog() have to be provided too.
-//
-//   Threading:
-//     GTEST_HAS_NOTIFICATION_ - Enabled if Notification is already provided.
-//     GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ - Enabled if Mutex and ThreadLocal are
-//                                         already provided.
-//     Must also provide GTEST_DECLARE_STATIC_MUTEX_(mutex) and
-//     GTEST_DEFINE_STATIC_MUTEX_(mutex)
-//
-//     GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-//     GTEST_LOCK_EXCLUDED_(locks)
+// Injection point for custom user configurations. See README for details
 //
 // ** Custom implementation starts here **
 
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
index 60c1ea0..eb4467a 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
@@ -31,8 +31,8 @@
 // installation of gTest.
 // It will be included from gtest-printers.h and the overrides in this file
 // will be visible to everyone.
-// See documentation at gtest/gtest-printers.h for details on how to define a
-// custom printer.
+//
+// Injection point for custom user configurations. See README for details
 //
 // ** Custom implementation starts here **
 
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
index c27412a..4c8e07b 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
@@ -27,11 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Injection point for custom user configurations.
-// The following macros can be defined:
-//
-// GTEST_OS_STACK_TRACE_GETTER_  - The name of an implementation of
-//                                 OsStackTraceGetterInterface.
+// Injection point for custom user configurations. See README for details
 //
 // ** Custom implementation starts here **
 
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
index 2b3a78f..3e9497d 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
@@ -27,19 +27,20 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines internal utilities needed for implementing
 // death tests.  They are subject to change without notice.
+// GOOGLETEST_CM0001 DO NOT DELETE
 
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
 
+#include "gtest/gtest-matchers.h"
 #include "gtest/internal/gtest-internal.h"
 
 #include <stdio.h>
+#include <memory>
 
 namespace testing {
 namespace internal {
@@ -53,6 +54,9 @@
 
 #if GTEST_HAS_DEATH_TEST
 
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
 // DeathTest is a class that hides much of the complexity of the
 // GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
 // returns a concrete class that depends on the prevailing death test
@@ -76,18 +80,20 @@
   // argument is set.  If the death test should be skipped, the pointer
   // is set to NULL; otherwise, it is set to the address of a new concrete
   // DeathTest object that controls the execution of the current test.
-  static bool Create(const char* statement, const RE* regex,
-                     const char* file, int line, DeathTest** test);
+  static bool Create(const char *statement,
+                     Matcher<const std::string &> matcher, const char *file,
+                     int line, DeathTest **test);
   DeathTest();
-  virtual ~DeathTest() { }
+  virtual ~DeathTest() {}
 
   // A helper class that aborts a death test when it's deleted.
   class ReturnSentinel {
    public:
-    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+    explicit ReturnSentinel(DeathTest *test) : test_(test) {}
     ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
    private:
-    DeathTest* const test_;
+    DeathTest *const test_;
     GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
   } GTEST_ATTRIBUTE_UNUSED_;
 
@@ -125,9 +131,9 @@
 
   // Returns a human-readable outcome message regarding the outcome of
   // the last death test.
-  static const char* LastMessage();
+  static const char *LastMessage();
 
-  static void set_last_death_test_message(const std::string& message);
+  static void set_last_death_test_message(const std::string &message);
 
  private:
   // A string containing a description of the outcome of the last death test.
@@ -136,96 +142,123 @@
   GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
 };
 
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 // Factory interface for death tests.  May be mocked out for testing.
 class DeathTestFactory {
  public:
-  virtual ~DeathTestFactory() { }
-  virtual bool Create(const char* statement, const RE* regex,
-                      const char* file, int line, DeathTest** test) = 0;
+  virtual ~DeathTestFactory() {}
+  virtual bool Create(const char *statement,
+                      Matcher<const std::string &> matcher, const char *file,
+                      int line, DeathTest **test) = 0;
 };
 
 // A concrete DeathTestFactory implementation for normal use.
 class DefaultDeathTestFactory : public DeathTestFactory {
  public:
-  virtual bool Create(const char* statement, const RE* regex,
-                      const char* file, int line, DeathTest** test);
+  bool Create(const char *statement, Matcher<const std::string &> matcher,
+              const char *file, int line, DeathTest **test) override;
 };
 
 // Returns true if exit_status describes a process that was terminated
 // by a signal, or exited normally with a nonzero exit code.
 GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
 
+// A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads
+// and interpreted as a regex (rather than an Eq matcher) for legacy
+// compatibility.
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    ::testing::internal::RE regex) {
+  return ContainsRegex(regex.pattern());
+}
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(const char *regex) {
+  return ContainsRegex(regex);
+}
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    const ::std::string &regex) {
+  return ContainsRegex(regex);
+}
+
+// If a Matcher<const ::std::string&> is passed to EXPECT_DEATH (etc.), it's
+// used directly.
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    Matcher<const ::std::string &> matcher) {
+  return matcher;
+}
+
 // Traps C++ exceptions escaping statement and reports them as test
 // failures. Note that trapping SEH exceptions is not implemented here.
-# if GTEST_HAS_EXCEPTIONS
-#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
-  try { \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-  } catch (const ::std::exception& gtest_exception) { \
-    fprintf(\
-        stderr, \
-        "\n%s: Caught std::exception-derived exception escaping the " \
-        "death test statement. Exception message: %s\n", \
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test)           \
+  try {                                                                      \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);               \
+  } catch (const ::std::exception &gtest_exception) {                        \
+    fprintf(                                                                 \
+        stderr,                                                              \
+        "\n%s: Caught std::exception-derived exception escaping the "        \
+        "death test statement. Exception message: %s\n",                     \
         ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
-        gtest_exception.what()); \
-    fflush(stderr); \
+        gtest_exception.what());                                             \
+    fflush(stderr);                                                          \
     death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
-  } catch (...) { \
+  } catch (...) {                                                            \
     death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
   }
 
-# else
-#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+#else
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
 
-# endif
+#endif
 
 // This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
 // ASSERT_EXIT*, and EXPECT_EXIT*.
-# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    const ::testing::internal::RE& gtest_regex = (regex); \
-    ::testing::internal::DeathTest* gtest_dt; \
-    if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
-        __FILE__, __LINE__, &gtest_dt)) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
-    } \
-    if (gtest_dt != NULL) { \
-      ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
-          gtest_dt_ptr(gtest_dt); \
-      switch (gtest_dt->AssumeRole()) { \
-        case ::testing::internal::DeathTest::OVERSEE_TEST: \
-          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
-            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
-          } \
-          break; \
-        case ::testing::internal::DeathTest::EXECUTE_TEST: { \
-          ::testing::internal::DeathTest::ReturnSentinel \
-              gtest_sentinel(gtest_dt); \
-          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
-          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
-          break; \
-        } \
-        default: \
-          break; \
-      } \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
-      fail(::testing::internal::DeathTest::LastMessage())
+#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail)        \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
+    ::testing::internal::DeathTest *gtest_dt;                                  \
+    if (!::testing::internal::DeathTest::Create(                               \
+            #statement,                                                        \
+            ::testing::internal::MakeDeathTestMatcher(regex_or_matcher),       \
+            __FILE__, __LINE__, &gtest_dt)) {                                  \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                        \
+    }                                                                          \
+    if (gtest_dt != nullptr) {                                                 \
+      std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) {                                        \
+        case ::testing::internal::DeathTest::OVERSEE_TEST:                     \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) {                \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                  \
+          }                                                                    \
+          break;                                                               \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: {                   \
+          ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel(       \
+              gtest_dt);                                                       \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt);            \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE);   \
+          break;                                                               \
+        }                                                                      \
+        default: break;                                                        \
+      }                                                                        \
+    }                                                                          \
+  } else                                                                       \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__)                                \
+        : fail(::testing::internal::DeathTest::LastMessage())
 // The symbol "fail" here expands to something into which a message
 // can be streamed.
 
 // This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
-// NDEBUG mode. In this case we need the statements to be executed, the regex is
-// ignored, and the macro must accept a streamed message even though the message
-// is never printed.
-# define GTEST_EXECUTE_STATEMENT_(statement, regex) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-     GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-  } else \
+// NDEBUG mode. In this case we need the statements to be executed and the macro
+// must accept a streamed message even though the message is never printed.
+// The regex object is not evaluated, but it is used to prevent "unused"
+// warnings and to avoid an expression that doesn't compile in debug mode.
+#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher)    \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                  \
+  if (::testing::internal::AlwaysTrue()) {                       \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);   \
+  } else if (!::testing::internal::AlwaysTrue()) {               \
+    ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \
+  } else                                                         \
     ::testing::Message()
 
 // A class representing the parsed contents of the
@@ -233,19 +266,15 @@
 // RUN_ALL_TESTS was called.
 class InternalRunDeathTestFlag {
  public:
-  InternalRunDeathTestFlag(const std::string& a_file,
-                           int a_line,
-                           int an_index,
+  InternalRunDeathTestFlag(const std::string &a_file, int a_line, int an_index,
                            int a_write_fd)
-      : file_(a_file), line_(a_line), index_(an_index),
-        write_fd_(a_write_fd) {}
+      : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {}
 
   ~InternalRunDeathTestFlag() {
-    if (write_fd_ >= 0)
-      posix::Close(write_fd_);
+    if (write_fd_ >= 0) posix::Close(write_fd_);
   }
 
-  const std::string& file() const { return file_; }
+  const std::string &file() const { return file_; }
   int line() const { return line_; }
   int index() const { return index_; }
   int write_fd() const { return write_fd_; }
@@ -262,54 +291,7 @@
 // Returns a newly created InternalRunDeathTestFlag object with fields
 // initialized from the GTEST_FLAG(internal_run_death_test) flag if
 // the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
-
-#else  // GTEST_HAS_DEATH_TEST
-
-// This macro is used for implementing macros such as
-// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
-// death tests are not supported. Those macros must compile on such systems
-// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
-// systems that support death tests. This allows one to write such a macro
-// on a system that does not support death tests and be sure that it will
-// compile on a death-test supporting system.
-//
-// Parameters:
-//   statement -  A statement that a macro such as EXPECT_DEATH would test
-//                for program termination. This macro has to make sure this
-//                statement is compiled but not executed, to ensure that
-//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
-//                parameter iff EXPECT_DEATH compiles with it.
-//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
-//                the output of statement.  This parameter has to be
-//                compiled but not evaluated by this macro, to ensure that
-//                this macro only accepts expressions that a macro such as
-//                EXPECT_DEATH would accept.
-//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
-//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
-//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
-//                compile inside functions where ASSERT_DEATH doesn't
-//                compile.
-//
-//  The branch that has an always false condition is used to ensure that
-//  statement and regex are compiled (and thus syntactically correct) but
-//  never executed. The unreachable code macro protects the terminator
-//  statement from generating an 'unreachable code' warning in case
-//  statement unconditionally returns or throws. The Message constructor at
-//  the end allows the syntax of streaming additional messages into the
-//  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::AlwaysTrue()) { \
-      GTEST_LOG_(WARNING) \
-          << "Death tests are not supported on this platform.\n" \
-          << "Statement '" #statement "' cannot be verified."; \
-    } else if (::testing::internal::AlwaysFalse()) { \
-      ::testing::internal::RE::PartialMatch(".*", (regex)); \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-      terminator; \
-    } else \
-      ::testing::Message()
+InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag();
 
 #endif  // GTEST_HAS_DEATH_TEST
 
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
index 7a13b4b..b228d47 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
@@ -27,21 +27,24 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Author: keith.ray@gmail.com (Keith Ray)
-//
 // Google Test filepath utilities
 //
 // This header file declares classes and functions used internally by
 // Google Test.  They are subject to change without notice.
 //
-// This file is #included in <gtest/internal/gtest-internal.h>.
+// This file is #included in gtest/internal/gtest-internal.h.
 // Do not include this header file separately!
 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
 
 #include "gtest/internal/gtest-string.h"
 
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
 namespace testing {
 namespace internal {
 
@@ -58,24 +61,22 @@
 
 class GTEST_API_ FilePath {
  public:
-  FilePath() : pathname_("") { }
-  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+  FilePath() : pathname_("") {}
+  FilePath(const FilePath &rhs) : pathname_(rhs.pathname_) {}
 
-  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+  explicit FilePath(const std::string &pathname) : pathname_(pathname) {
     Normalize();
   }
 
-  FilePath& operator=(const FilePath& rhs) {
+  FilePath &operator=(const FilePath &rhs) {
     Set(rhs);
     return *this;
   }
 
-  void Set(const FilePath& rhs) {
-    pathname_ = rhs.pathname_;
-  }
+  void Set(const FilePath &rhs) { pathname_ = rhs.pathname_; }
 
-  const std::string& string() const { return pathname_; }
-  const char* c_str() const { return pathname_.c_str(); }
+  const std::string &string() const { return pathname_; }
+  const char *c_str() const { return pathname_.c_str(); }
 
   // Returns the current working directory, or "" if unsuccessful.
   static FilePath GetCurrentDir();
@@ -84,16 +85,15 @@
   // extension = "xml", returns "dir/test.xml". If number is greater
   // than zero (e.g., 12), returns "dir/test_12.xml".
   // On Windows platform, uses \ as the separator rather than /.
-  static FilePath MakeFileName(const FilePath& directory,
-                               const FilePath& base_name,
-                               int number,
-                               const char* extension);
+  static FilePath MakeFileName(const FilePath &directory,
+                               const FilePath &base_name, int number,
+                               const char *extension);
 
   // Given directory = "dir", relative_path = "test.xml",
   // returns "dir/test.xml".
   // On Windows, uses \ as the separator rather than /.
-  static FilePath ConcatPaths(const FilePath& directory,
-                              const FilePath& relative_path);
+  static FilePath ConcatPaths(const FilePath &directory,
+                              const FilePath &relative_path);
 
   // Returns a pathname for a file that does not currently exist. The pathname
   // will be directory/base_name.extension or
@@ -103,11 +103,11 @@
   // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
   // There could be a race condition if two or more processes are calling this
   // function at the same time -- they could both pick the same filename.
-  static FilePath GenerateUniqueFileName(const FilePath& directory,
-                                         const FilePath& base_name,
-                                         const char* extension);
+  static FilePath GenerateUniqueFileName(const FilePath &directory,
+                                         const FilePath &base_name,
+                                         const char *extension);
 
-  // Returns true iff the path is "".
+  // Returns true if and only if the path is "".
   bool IsEmpty() const { return pathname_.empty(); }
 
   // If input name has a trailing separator character, removes it and returns
@@ -135,7 +135,7 @@
   // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
   // FilePath("dir/file"). If a case-insensitive extension is not
   // found, returns a copy of the original FilePath.
-  FilePath RemoveExtension(const char* extension) const;
+  FilePath RemoveExtension(const char *extension) const;
 
   // Creates directories so that path exists. Returns true if successful or if
   // the directories already exist; returns false if unable to create
@@ -195,7 +195,7 @@
   // Returns a pointer to the last occurence of a valid path separator in
   // the FilePath. On Windows, for example, both '/' and '\' are valid path
   // separators. Returns NULL if no path separator was found.
-  const char* FindLastPathSeparator() const;
+  const char *FindLastPathSeparator() const;
 
   std::string pathname_;
 };  // class FilePath
@@ -203,4 +203,6 @@
 }  // namespace internal
 }  // namespace testing
 
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 #endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
index ebd1cf6..9640aba 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
@@ -27,42 +27,44 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file declares functions and macros used internally by
 // Google Test.  They are subject to change without notice.
 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
 
 #include "gtest/internal/gtest-port.h"
 
 #if GTEST_OS_LINUX
-# include <stdlib.h>
-# include <sys/types.h>
-# include <sys/wait.h>
-# include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
 #endif  // GTEST_OS_LINUX
 
 #if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
 #endif
 
 #include <ctype.h>
 #include <float.h>
 #include <string.h>
+#include <cstdint>
 #include <iomanip>
 #include <limits>
 #include <map>
 #include <set>
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #include "gtest/gtest-message.h"
-#include "gtest/internal/gtest-string.h"
 #include "gtest/internal/gtest-filepath.h"
+#include "gtest/internal/gtest-string.h"
 #include "gtest/internal/gtest-type-util.h"
 
 // Due to C++ preprocessor weirdness, we need double indirection to
@@ -74,71 +76,75 @@
 // the current line number.  For more details, see
 // http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
 #define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
-#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
 
-class ProtocolMessage;
-namespace proto2 { class Message; }
+// Stringifies its argument.
+// Work around a bug in visual studio which doesn't accept code like this:
+//
+//   #define GTEST_STRINGIFY_(name) #name
+//   #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ...
+//   MACRO(, x, y)
+//
+// Complaining about the argument to GTEST_STRINGIFY_ being empty.
+// This is allowed by the spec.
+#define GTEST_STRINGIFY_HELPER_(name, ...) #name
+#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, )
+
+namespace proto2 {
+class Message;
+}
 
 namespace testing {
 
 // Forward declarations.
 
-class AssertionResult;                 // Result of an assertion.
-class Message;                         // Represents a failure message.
-class Test;                            // Represents a test.
-class TestInfo;                        // Information about a test.
-class TestPartResult;                  // Result of a test part.
-class UnitTest;                        // A collection of test cases.
+class AssertionResult;  // Result of an assertion.
+class Message;          // Represents a failure message.
+class Test;             // Represents a test.
+class TestInfo;         // Information about a test.
+class TestPartResult;   // Result of a test part.
+class UnitTest;         // A collection of test suites.
 
 template <typename T>
-::std::string PrintToString(const T& value);
+::std::string PrintToString(const T &value);
 
 namespace internal {
 
-struct TraceInfo;                      // Information about a trace point.
-class ScopedTrace;                     // Implements scoped trace.
-class TestInfoImpl;                    // Opaque implementation of TestInfo
-class UnitTestImpl;                    // Opaque implementation of UnitTest
+struct TraceInfo;    // Information about a trace point.
+class TestInfoImpl;  // Opaque implementation of TestInfo
+class UnitTestImpl;  // Opaque implementation of UnitTest
 
 // The text used in failure messages to indicate the start of the
 // stack trace.
 GTEST_API_ extern const char kStackTraceMarker[];
 
-// Two overloaded helpers for checking at compile time whether an
-// expression is a null pointer literal (i.e. NULL or any 0-valued
-// compile-time integral constant).  Their return values have
-// different sizes, so we can use sizeof() to test which version is
-// picked by the compiler.  These helpers have no implementations, as
-// we only need their signatures.
-//
-// Given IsNullLiteralHelper(x), the compiler will pick the first
-// version if x can be implicitly converted to Secret*, and pick the
-// second version otherwise.  Since Secret is a secret and incomplete
-// type, the only expression a user can write that has type Secret* is
-// a null pointer literal.  Therefore, we know that x is a null
-// pointer literal if and only if the first version is picked by the
-// compiler.
-char IsNullLiteralHelper(Secret* p);
-char (&IsNullLiteralHelper(...))[2];  // NOLINT
+// An IgnoredValue object can be implicitly constructed from ANY value.
+class IgnoredValue {
+  struct Sink {};
 
-// A compile-time bool constant that is true if and only if x is a
-// null pointer literal (i.e. NULL or any 0-valued compile-time
-// integral constant).
-#ifdef GTEST_ELLIPSIS_NEEDS_POD_
-// We lose support for NULL detection where the compiler doesn't like
-// passing non-POD classes through ellipsis (...).
-# define GTEST_IS_NULL_LITERAL_(x) false
-#else
-# define GTEST_IS_NULL_LITERAL_(x) \
-    (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
-#endif  // GTEST_ELLIPSIS_NEEDS_POD_
+ public:
+  // This constructor template allows any value to be implicitly
+  // converted to IgnoredValue.  The object has no data member and
+  // doesn't try to remember anything about the argument.  We
+  // deliberately omit the 'explicit' keyword in order to allow the
+  // conversion to be implicit.
+  // Disable the conversion if T already has a magical conversion operator.
+  // Otherwise we get ambiguity.
+  template <typename T,
+            typename std::enable_if<!std::is_convertible<T, Sink>::value,
+                                    int>::type = 0>
+  IgnoredValue(const T & /* ignored */) {}  // NOLINT(runtime/explicit)
+};
 
 // Appends the user-supplied message to the Google-Test-generated message.
-GTEST_API_ std::string AppendUserMessage(
-    const std::string& gtest_msg, const Message& user_msg);
+GTEST_API_ std::string AppendUserMessage(const std::string &gtest_msg,
+                                         const Message &user_msg);
 
 #if GTEST_HAS_EXCEPTIONS
 
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4275 /* an exported class was derived from a class that was not exported */)
+
 // This exception is thrown by (and only by) a failed Google Test
 // assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
 // are enabled).  We derive it from std::runtime_error, which is for
@@ -147,48 +153,31 @@
 // frameworks know how to extract and print the message inside it.
 class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
  public:
-  explicit GoogleTestFailureException(const TestPartResult& failure);
+  explicit GoogleTestFailureException(const TestPartResult &failure);
 };
 
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4275
+
 #endif  // GTEST_HAS_EXCEPTIONS
 
-// A helper class for creating scoped traces in user programs.
-class GTEST_API_ ScopedTrace {
- public:
-  // The c'tor pushes the given source file location and message onto
-  // a trace stack maintained by Google Test.
-  ScopedTrace(const char* file, int line, const Message& message);
-
-  // The d'tor pops the info pushed by the c'tor.
-  //
-  // Note that the d'tor is not virtual in order to be efficient.
-  // Don't inherit from ScopedTrace!
-  ~ScopedTrace();
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
-} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
-                            // c'tor and d'tor.  Therefore it doesn't
-                            // need to be used otherwise.
-
 namespace edit_distance {
 // Returns the optimal edits to go from 'left' to 'right'.
 // All edits cost the same, with replace having lower priority than
 // add/remove.
-// Simple implementation of the Wagner–Fischer algorithm.
+// Simple implementation of the Wagner-Fischer algorithm.
 // See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
 enum EditType { kMatch, kAdd, kRemove, kReplace };
 GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<size_t>& left, const std::vector<size_t>& right);
+    const std::vector<size_t> &left, const std::vector<size_t> &right);
 
 // Same as above, but the input is represented as strings.
 GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<std::string>& left,
-    const std::vector<std::string>& right);
+    const std::vector<std::string> &left,
+    const std::vector<std::string> &right);
 
 // Create a diff of the input strings in Unified diff format.
-GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
-                                         const std::vector<std::string>& right,
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string> &left,
+                                         const std::vector<std::string> &right,
                                          size_t context = 2);
 
 }  // namespace edit_distance
@@ -197,9 +186,9 @@
 // format.
 // If not null, stores in 'total_line_count' the total number of lines found
 // in left + right.
-GTEST_API_ std::string DiffStrings(const std::string& left,
-                                   const std::string& right,
-                                   size_t* total_line_count);
+GTEST_API_ std::string DiffStrings(const std::string &left,
+                                   const std::string &right,
+                                   size_t *total_line_count);
 
 // Constructs and returns the message for an equality assertion
 // (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
@@ -213,21 +202,19 @@
 //   expected_value:      "5"
 //   actual_value:        "6"
 //
-// The ignoring_case parameter is true iff the assertion is a
+// The ignoring_case parameter is true if and only if the assertion is a
 // *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
 // be inserted into the message.
-GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
-                                     const char* actual_expression,
-                                     const std::string& expected_value,
-                                     const std::string& actual_value,
+GTEST_API_ AssertionResult EqFailure(const char *expected_expression,
+                                     const char *actual_expression,
+                                     const std::string &expected_value,
+                                     const std::string &actual_value,
                                      bool ignoring_case);
 
 // Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
 GTEST_API_ std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value);
+    const AssertionResult &assertion_result, const char *expression_text,
+    const char *actual_predicate_value, const char *expected_predicate_value);
 
 // This template class represents an IEEE floating-point number
 // (either single-precision or double-precision, depending on the
@@ -268,11 +255,11 @@
   // Constants.
 
   // # of bits in a number.
-  static const size_t kBitCount = 8*sizeof(RawType);
+  static const size_t kBitCount = 8 * sizeof(RawType);
 
   // # of fraction bits in a number.
   static const size_t kFractionBitCount =
-    std::numeric_limits<RawType>::digits - 1;
+      std::numeric_limits<RawType>::digits - 1;
 
   // # of exponent bits in a number.
   static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
@@ -281,8 +268,8 @@
   static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
 
   // The mask for the fraction bits.
-  static const Bits kFractionBitMask =
-    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+  static const Bits kFractionBitMask = ~static_cast<Bits>(0) >>
+                                       (kExponentBitCount + 1);
 
   // The mask for the exponent bits.
   static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
@@ -307,7 +294,7 @@
   // around may change its bits, although the new value is guaranteed
   // to be also a NAN.  Therefore, don't expect this constructor to
   // preserve the bits in x when x is a NAN.
-  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+  explicit FloatingPoint(const RawType &x) { u_.value_ = x; }
 
   // Static methods
 
@@ -321,9 +308,7 @@
   }
 
   // Returns the floating-point number that represent positive infinity.
-  static RawType Infinity() {
-    return ReinterpretBits(kExponentBitMask);
-  }
+  static RawType Infinity() { return ReinterpretBits(kExponentBitMask); }
 
   // Returns the maximum representable finite floating-point number.
   static RawType Max();
@@ -342,26 +327,26 @@
   // Returns the sign bit of this number.
   Bits sign_bit() const { return kSignBitMask & u_.bits_; }
 
-  // Returns true iff this is NAN (not a number).
+  // Returns true if and only if this is NAN (not a number).
   bool is_nan() const {
     // It's a NAN if the exponent bits are all ones and the fraction
     // bits are not entirely zeros.
     return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
   }
 
-  // Returns true iff this number is at most kMaxUlps ULP's away from
-  // rhs.  In particular, this function:
+  // Returns true if and only if this number is at most kMaxUlps ULP's away
+  // from rhs.  In particular, this function:
   //
   //   - returns false if either number is (or both are) NAN.
   //   - treats really large numbers as almost equal to infinity.
   //   - thinks +0.0 and -0.0 are 0 DLP's apart.
-  bool AlmostEquals(const FloatingPoint& rhs) const {
+  bool AlmostEquals(const FloatingPoint &rhs) const {
     // The IEEE standard says that any comparison operation involving
     // a NAN must return false.
     if (is_nan() || rhs.is_nan()) return false;
 
-    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
-        <= kMaxUlps;
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <=
+           kMaxUlps;
   }
 
  private:
@@ -411,9 +396,13 @@
 // We cannot use std::numeric_limits<T>::max() as it clashes with the max()
 // macro defined by <windows.h>.
 template <>
-inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+inline float FloatingPoint<float>::Max() {
+  return FLT_MAX;
+}
 template <>
-inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+inline double FloatingPoint<double>::Max() {
+  return DBL_MAX;
+}
 
 // Typedefs the instances of the FloatingPoint template class that we
 // care to use.
@@ -421,12 +410,12 @@
 typedef FloatingPoint<double> Double;
 
 // In order to catch the mistake of putting tests that use different
-// test fixture classes in the same test case, we need to assign
+// test fixture classes in the same test suite, we need to assign
 // unique IDs to fixture classes and compare them.  The TypeId type is
 // used to hold such IDs.  The user should treat TypeId as an opaque
 // type: the only operation allowed on TypeId values is to compare
 // them for equality using the == operator.
-typedef const void* TypeId;
+typedef const void *TypeId;
 
 template <typename T>
 class TypeIdHelper {
@@ -467,7 +456,7 @@
 
   // Creates a test instance to run. The instance is both created and destroyed
   // within TestInfoImpl::Run()
-  virtual Test* CreateTest() = 0;
+  virtual Test *CreateTest() = 0;
 
  protected:
   TestFactoryBase() {}
@@ -481,7 +470,7 @@
 template <class TestClass>
 class TestFactoryImpl : public TestFactoryBase {
  public:
-  virtual Test* CreateTest() { return new TestClass; }
+  Test *CreateTest() override { return new TestClass; }
 };
 
 #if GTEST_OS_WINDOWS
@@ -490,30 +479,83 @@
 // {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
 // We pass a long instead of HRESULT to avoid causing an
 // include dependency for the HRESULT type.
-GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char *expr,
                                             long hr);  // NOLINT
-GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+GTEST_API_ AssertionResult IsHRESULTFailure(const char *expr,
                                             long hr);  // NOLINT
 
 #endif  // GTEST_OS_WINDOWS
 
-// Types of SetUpTestCase() and TearDownTestCase() functions.
-typedef void (*SetUpTestCaseFunc)();
-typedef void (*TearDownTestCaseFunc)();
+// Types of SetUpTestSuite() and TearDownTestSuite() functions.
+using SetUpTestSuiteFunc = void (*)();
+using TearDownTestSuiteFunc = void (*)();
 
 struct CodeLocation {
-  CodeLocation(const string& a_file, int a_line) : file(a_file), line(a_line) {}
+  CodeLocation(const std::string &a_file, int a_line)
+      : file(a_file), line(a_line) {}
 
-  string file;
+  std::string file;
   int line;
 };
 
+//  Helper to identify which setup function for TestCase / TestSuite to call.
+//  Only one function is allowed, either TestCase or TestSuite but not both.
+
+// Utility functions to help SuiteApiResolver
+using SetUpTearDownSuiteFuncType = void (*)();
+
+inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
+    SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) {
+  return a == def ? nullptr : a;
+}
+
+template <typename T>
+//  Note that SuiteApiResolver inherits from T because
+//  SetUpTestSuite()/TearDownTestSuite() could be protected. This way
+//  SuiteApiResolver can access them.
+struct SuiteApiResolver : T {
+  // testing::Test is only forward declared at this point. So we make it a
+  // dependent class for the compiler to be OK with it.
+  using Test =
+      typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
+
+  static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char *filename,
+                                                        int line_num) {
+    SetUpTearDownSuiteFuncType test_case_fp =
+        GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase);
+    SetUpTearDownSuiteFuncType test_suite_fp =
+        GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite);
+
+    GTEST_CHECK_(!test_case_fp || !test_suite_fp)
+        << "Test can not provide both SetUpTestSuite and SetUpTestCase, please "
+           "make sure there is only one present at "
+        << filename << ":" << line_num;
+
+    return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+  }
+
+  static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char *filename,
+                                                           int line_num) {
+    SetUpTearDownSuiteFuncType test_case_fp =
+        GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase);
+    SetUpTearDownSuiteFuncType test_suite_fp =
+        GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite);
+
+    GTEST_CHECK_(!test_case_fp || !test_suite_fp)
+        << "Test can not provide both TearDownTestSuite and TearDownTestCase,"
+           " please make sure there is only one present at"
+        << filename << ":" << line_num;
+
+    return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+  }
+};
+
 // Creates a new TestInfo object and registers it with Google Test;
 // returns the created object.
 //
 // Arguments:
 //
-//   test_case_name:   name of the test case
+//   test_suite_name:   name of the test suite
 //   name:             name of the test
 //   type_param        the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
@@ -521,42 +563,41 @@
 //                     or NULL if this is not a type-parameterized test.
 //   code_location:    code location where the test is defined
 //   fixture_class_id: ID of the test fixture class
-//   set_up_tc:        pointer to the function that sets up the test case
-//   tear_down_tc:     pointer to the function that tears down the test case
+//   set_up_tc:        pointer to the function that sets up the test suite
+//   tear_down_tc:     pointer to the function that tears down the test suite
 //   factory:          pointer to the factory that creates a test object.
 //                     The newly created TestInfo instance will assume
 //                     ownership of the factory object.
-GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
-    const char* test_case_name,
-    const char* name,
-    const char* type_param,
-    const char* value_param,
-    CodeLocation code_location,
-    TypeId fixture_class_id,
-    SetUpTestCaseFunc set_up_tc,
-    TearDownTestCaseFunc tear_down_tc,
-    TestFactoryBase* factory);
+GTEST_API_ TestInfo *MakeAndRegisterTestInfo(
+    const char *test_suite_name, const char *name, const char *type_param,
+    const char *value_param, CodeLocation code_location,
+    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase *factory);
 
 // If *pstr starts with the given prefix, modifies *pstr to be right
 // past the prefix and returns true; otherwise leaves *pstr unchanged
 // and returns false.  None of pstr, *pstr, and prefix can be NULL.
-GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+GTEST_API_ bool SkipPrefix(const char *prefix, const char **pstr);
 
 #if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
 
-// State of the definition of a type-parameterized test case.
-class GTEST_API_ TypedTestCasePState {
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// State of the definition of a type-parameterized test suite.
+class GTEST_API_ TypedTestSuitePState {
  public:
-  TypedTestCasePState() : registered_(false) {}
+  TypedTestSuitePState() : registered_(false) {}
 
   // Adds the given test name to defined_test_names_ and return true
-  // if the test case hasn't been registered; otherwise aborts the
+  // if the test suite hasn't been registered; otherwise aborts the
   // program.
-  bool AddTestName(const char* file, int line, const char* case_name,
-                   const char* test_name) {
+  bool AddTestName(const char *file, int line, const char *case_name,
+                   const char *test_name) {
     if (registered_) {
-      fprintf(stderr, "%s Test %s must be defined before "
-              "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
+      fprintf(stderr,
+              "%s Test %s must be defined before "
+              "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n",
               FormatFileLocation(file, line).c_str(), test_name, case_name);
       fflush(stderr);
       posix::Abort();
@@ -566,11 +607,11 @@
     return true;
   }
 
-  bool TestExists(const std::string& test_name) const {
+  bool TestExists(const std::string &test_name) const {
     return registered_tests_.count(test_name) > 0;
   }
 
-  const CodeLocation& GetCodeLocation(const std::string& test_name) const {
+  const CodeLocation &GetCodeLocation(const std::string &test_name) const {
     RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name);
     GTEST_CHECK_(it != registered_tests_.end());
     return it->second;
@@ -579,8 +620,9 @@
   // Verifies that registered_tests match the test names in
   // defined_test_names_; returns registered_tests if successful, or
   // aborts the program otherwise.
-  const char* VerifyRegisteredTestNames(
-      const char* file, int line, const char* registered_tests);
+  const char *VerifyRegisteredTestNames(const char *test_suite_name,
+                                        const char *file, int line,
+                                        const char *registered_tests);
 
  private:
   typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
@@ -589,28 +631,68 @@
   RegisteredTestsMap registered_tests_;
 };
 
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+using TypedTestCasePState = TypedTestSuitePState;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 // Skips to the first non-space char after the first comma in 'str';
 // returns NULL if no comma is found in 'str'.
-inline const char* SkipComma(const char* str) {
-  const char* comma = strchr(str, ',');
-  if (comma == NULL) {
-    return NULL;
+inline const char *SkipComma(const char *str) {
+  const char *comma = strchr(str, ',');
+  if (comma == nullptr) {
+    return nullptr;
   }
-  while (IsSpace(*(++comma))) {}
+  while (IsSpace(*(++comma))) {
+  }
   return comma;
 }
 
 // Returns the prefix of 'str' before the first comma in it; returns
 // the entire string if it contains no comma.
-inline std::string GetPrefixUntilComma(const char* str) {
-  const char* comma = strchr(str, ',');
-  return comma == NULL ? str : std::string(str, comma);
+inline std::string GetPrefixUntilComma(const char *str) {
+  const char *comma = strchr(str, ',');
+  return comma == nullptr ? str : std::string(str, comma);
 }
 
 // Splits a given string on a given delimiter, populating a given
 // vector with the fields.
-void SplitString(const ::std::string& str, char delimiter,
-                 ::std::vector< ::std::string>* dest);
+void SplitString(const ::std::string &str, char delimiter,
+                 ::std::vector<::std::string> *dest);
+
+// The default argument to the template below for the case when the user does
+// not provide a name generator.
+struct DefaultNameGenerator {
+  template <typename T>
+  static std::string GetName(int i) {
+    return StreamableToString(i);
+  }
+};
+
+template <typename Provided = DefaultNameGenerator>
+struct NameGeneratorSelector {
+  typedef Provided type;
+};
+
+template <typename NameGenerator>
+void GenerateNamesRecursively(internal::None, std::vector<std::string> *, int) {
+}
+
+template <typename NameGenerator, typename Types>
+void GenerateNamesRecursively(Types, std::vector<std::string> *result, int i) {
+  result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
+  GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
+                                          i + 1);
+}
+
+template <typename NameGenerator, typename Types>
+std::vector<std::string> GenerateNames() {
+  std::vector<std::string> result;
+  GenerateNamesRecursively<NameGenerator>(Types(), &result, 0);
+  return result;
+}
 
 // TypeParameterizedTest<Fixture, TestSel, Types>::Register()
 // registers a list of type-parameterized tests with Google Test.  The
@@ -623,13 +705,13 @@
 class TypeParameterizedTest {
  public:
   // 'index' is the index of the test in the type list 'Types'
-  // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
+  // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite,
   // Types).  Valid values for 'index' are [0, N - 1] where N is the
   // length of Types.
-  static bool Register(const char* prefix,
-                       CodeLocation code_location,
-                       const char* case_name, const char* test_names,
-                       int index) {
+  static bool Register(const char *prefix, const CodeLocation &code_location,
+                       const char *case_name, const char *test_names, int index,
+                       const std::vector<std::string> &type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
     typedef typename Types::Head Type;
     typedef Fixture<Type> FixtureClass;
     typedef typename GTEST_BIND_(TestSel, Type) TestClass;
@@ -637,76 +719,97 @@
     // First, registers the first type-parameterized test in the type
     // list.
     MakeAndRegisterTestInfo(
-        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
-         + StreamableToString(index)).c_str(),
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name +
+         "/" + type_names[static_cast<size_t>(index)])
+            .c_str(),
         StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
         GetTypeName<Type>().c_str(),
-        NULL,  // No value parameter.
-        code_location,
-        GetTypeId<FixtureClass>(),
-        TestClass::SetUpTestCase,
-        TestClass::TearDownTestCase,
+        nullptr,  // No value parameter.
+        code_location, GetTypeId<FixtureClass>(),
+        SuiteApiResolver<TestClass>::GetSetUpCaseOrSuite(
+            code_location.file.c_str(), code_location.line),
+        SuiteApiResolver<TestClass>::GetTearDownCaseOrSuite(
+            code_location.file.c_str(), code_location.line),
         new TestFactoryImpl<TestClass>);
 
     // Next, recurses (at compile time) with the tail of the type list.
-    return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
-        ::Register(prefix, code_location, case_name, test_names, index + 1);
+    return TypeParameterizedTest<Fixture, TestSel,
+                                 typename Types::Tail>::Register(prefix,
+                                                                 code_location,
+                                                                 case_name,
+                                                                 test_names,
+                                                                 index + 1,
+                                                                 type_names);
   }
 };
 
 // The base case for the compile time recursion.
 template <GTEST_TEMPLATE_ Fixture, class TestSel>
-class TypeParameterizedTest<Fixture, TestSel, Types0> {
+class TypeParameterizedTest<Fixture, TestSel, internal::None> {
  public:
-  static bool Register(const char* /*prefix*/, CodeLocation,
-                       const char* /*case_name*/, const char* /*test_names*/,
-                       int /*index*/) {
+  static bool Register(const char * /*prefix*/, const CodeLocation &,
+                       const char * /*case_name*/, const char * /*test_names*/,
+                       int /*index*/,
+                       const std::vector<std::string> & =
+                           std::vector<std::string>() /*type_names*/) {
     return true;
   }
 };
 
-// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
+GTEST_API_ void RegisterTypeParameterizedTestSuite(const char *test_suite_name,
+                                                   CodeLocation code_location);
+GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation(
+    const char *case_name);
+
+// TypeParameterizedTestSuite<Fixture, Tests, Types>::Register()
 // registers *all combinations* of 'Tests' and 'Types' with Google
 // Test.  The return value is insignificant - we just need to return
 // something such that we can call this function in a namespace scope.
 template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
-class TypeParameterizedTestCase {
+class TypeParameterizedTestSuite {
  public:
-  static bool Register(const char* prefix, CodeLocation code_location,
-                       const TypedTestCasePState* state,
-                       const char* case_name, const char* test_names) {
-    std::string test_name = StripTrailingSpaces(
-        GetPrefixUntilComma(test_names));
+  static bool Register(const char *prefix, CodeLocation code_location,
+                       const TypedTestSuitePState *state, const char *case_name,
+                       const char *test_names,
+                       const std::vector<std::string> &type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
+    RegisterTypeParameterizedTestSuiteInstantiation(case_name);
+    std::string test_name =
+        StripTrailingSpaces(GetPrefixUntilComma(test_names));
     if (!state->TestExists(test_name)) {
       fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
               case_name, test_name.c_str(),
-              FormatFileLocation(code_location.file.c_str(),
-                                 code_location.line).c_str());
+              FormatFileLocation(code_location.file.c_str(), code_location.line)
+                  .c_str());
       fflush(stderr);
       posix::Abort();
     }
-    const CodeLocation& test_location = state->GetCodeLocation(test_name);
+    const CodeLocation &test_location = state->GetCodeLocation(test_name);
 
     typedef typename Tests::Head Head;
 
     // First, register the first test in 'Test' for each type in 'Types'.
     TypeParameterizedTest<Fixture, Head, Types>::Register(
-        prefix, test_location, case_name, test_names, 0);
+        prefix, test_location, case_name, test_names, 0, type_names);
 
     // Next, recurses (at compile time) with the tail of the test list.
-    return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
-        ::Register(prefix, code_location, state,
-                   case_name, SkipComma(test_names));
+    return TypeParameterizedTestSuite<Fixture, typename Tests::Tail,
+                                      Types>::Register(prefix, code_location,
+                                                       state, case_name,
+                                                       SkipComma(test_names),
+                                                       type_names);
   }
 };
 
 // The base case for the compile time recursion.
 template <GTEST_TEMPLATE_ Fixture, typename Types>
-class TypeParameterizedTestCase<Fixture, Templates0, Types> {
+class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
  public:
-  static bool Register(const char* /*prefix*/, CodeLocation,
-                       const TypedTestCasePState* /*state*/,
-                       const char* /*case_name*/, const char* /*test_names*/) {
+  static bool Register(const char * /*prefix*/, const CodeLocation &,
+                       const TypedTestSuitePState * /*state*/,
+                       const char * /*case_name*/, const char * /*test_names*/,
+                       const std::vector<std::string> & =
+                           std::vector<std::string>() /*type_names*/) {
     return true;
   }
 };
@@ -723,8 +826,8 @@
 // For example, if Foo() calls Bar(), which in turn calls
 // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
 // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
-    UnitTest* unit_test, int skip_count);
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest *unit_test,
+                                                       int skip_count);
 
 // Helpers for suppressing warnings on unreachable code or constant
 // condition.
@@ -739,9 +842,19 @@
 // variable declared in a conditional expression always being NULL in
 // the else branch.
 struct GTEST_API_ ConstCharPtr {
-  ConstCharPtr(const char* str) : value(str) {}
+  ConstCharPtr(const char *str) : value(str) {}
   operator bool() const { return true; }
-  const char* value;
+  const char *value;
+};
+
+// Helper for declaring std::string within 'if' statement
+// in pre C++17 build environment.
+struct TrueWithString {
+  TrueWithString() = default;
+  explicit TrueWithString(const char *str) : value(str) {}
+  explicit TrueWithString(const std::string &str) : value(str) {}
+  explicit operator bool() const { return true; }
+  std::string value;
 };
 
 // A simple Linear Congruential Generator for generating random
@@ -751,160 +864,30 @@
 // but it's good enough for our purposes.
 class GTEST_API_ Random {
  public:
-  static const UInt32 kMaxRange = 1u << 31;
+  static const uint32_t kMaxRange = 1u << 31;
 
-  explicit Random(UInt32 seed) : state_(seed) {}
+  explicit Random(uint32_t seed) : state_(seed) {}
 
-  void Reseed(UInt32 seed) { state_ = seed; }
+  void Reseed(uint32_t seed) { state_ = seed; }
 
   // Generates a random number from [0, range).  Crashes if 'range' is
   // 0 or greater than kMaxRange.
-  UInt32 Generate(UInt32 range);
+  uint32_t Generate(uint32_t range);
 
  private:
-  UInt32 state_;
+  uint32_t state_;
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
 };
 
-// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
-// compiler error iff T1 and T2 are different types.
-template <typename T1, typename T2>
-struct CompileAssertTypesEqual;
-
-template <typename T>
-struct CompileAssertTypesEqual<T, T> {
-};
-
-// Removes the reference from a type if it is a reference type,
-// otherwise leaves it unchanged.  This is the same as
-// tr1::remove_reference, which is not widely available yet.
-template <typename T>
-struct RemoveReference { typedef T type; };  // NOLINT
-template <typename T>
-struct RemoveReference<T&> { typedef T type; };  // NOLINT
-
-// A handy wrapper around RemoveReference that works when the argument
-// T depends on template parameters.
-#define GTEST_REMOVE_REFERENCE_(T) \
-    typename ::testing::internal::RemoveReference<T>::type
-
-// Removes const from a type if it is a const type, otherwise leaves
-// it unchanged.  This is the same as tr1::remove_const, which is not
-// widely available yet.
-template <typename T>
-struct RemoveConst { typedef T type; };  // NOLINT
-template <typename T>
-struct RemoveConst<const T> { typedef T type; };  // NOLINT
-
-// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
-// definition to fail to remove the const in 'const int[3]' and 'const
-// char[3][4]'.  The following specialization works around the bug.
-template <typename T, size_t N>
-struct RemoveConst<const T[N]> {
-  typedef typename RemoveConst<T>::type type[N];
-};
-
-#if defined(_MSC_VER) && _MSC_VER < 1400
-// This is the only specialization that allows VC++ 7.1 to remove const in
-// 'const int[3] and 'const int[3][4]'.  However, it causes trouble with GCC
-// and thus needs to be conditionally compiled.
-template <typename T, size_t N>
-struct RemoveConst<T[N]> {
-  typedef typename RemoveConst<T>::type type[N];
-};
-#endif
-
-// A handy wrapper around RemoveConst that works when the argument
-// T depends on template parameters.
-#define GTEST_REMOVE_CONST_(T) \
-    typename ::testing::internal::RemoveConst<T>::type
-
 // Turns const U&, U&, const U, and U all into U.
 #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
-    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
-
-// Adds reference to a type if it is not a reference type,
-// otherwise leaves it unchanged.  This is the same as
-// tr1::add_reference, which is not widely available yet.
-template <typename T>
-struct AddReference { typedef T& type; };  // NOLINT
-template <typename T>
-struct AddReference<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper around AddReference that works when the argument T
-// depends on template parameters.
-#define GTEST_ADD_REFERENCE_(T) \
-    typename ::testing::internal::AddReference<T>::type
-
-// Adds a reference to const on top of T as necessary.  For example,
-// it transforms
-//
-//   char         ==> const char&
-//   const char   ==> const char&
-//   char&        ==> const char&
-//   const char&  ==> const char&
-//
-// The argument T must depend on some template parameters.
-#define GTEST_REFERENCE_TO_CONST_(T) \
-    GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
-
-// ImplicitlyConvertible<From, To>::value is a compile-time bool
-// constant that's true iff type From can be implicitly converted to
-// type To.
-template <typename From, typename To>
-class ImplicitlyConvertible {
- private:
-  // We need the following helper functions only for their types.
-  // They have no implementations.
-
-  // MakeFrom() is an expression whose type is From.  We cannot simply
-  // use From(), as the type From may not have a public default
-  // constructor.
-  static typename AddReference<From>::type MakeFrom();
-
-  // These two functions are overloaded.  Given an expression
-  // Helper(x), the compiler will pick the first version if x can be
-  // implicitly converted to type To; otherwise it will pick the
-  // second version.
-  //
-  // The first version returns a value of size 1, and the second
-  // version returns a value of size 2.  Therefore, by checking the
-  // size of Helper(x), which can be done at compile time, we can tell
-  // which version of Helper() is used, and hence whether x can be
-  // implicitly converted to type To.
-  static char Helper(To);
-  static char (&Helper(...))[2];  // NOLINT
-
-  // We have to put the 'public' section after the 'private' section,
-  // or MSVC refuses to compile the code.
- public:
-#if defined(__BORLANDC__)
-  // C++Builder cannot use member overload resolution during template
-  // instantiation.  The simplest workaround is to use its C++0x type traits
-  // functions (C++Builder 2009 and above only).
-  static const bool value = __is_convertible(From, To);
-#else
-  // MSVC warns about implicitly converting from double to int for
-  // possible loss of data, so we need to temporarily disable the
-  // warning.
-  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4244)
-  static const bool value =
-      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
-  GTEST_DISABLE_MSC_WARNINGS_POP_()
-#endif  // __BORLANDC__
-};
-template <typename From, typename To>
-const bool ImplicitlyConvertible<From, To>::value;
+  typename std::remove_const<typename std::remove_reference<T>::type>::type
 
 // IsAProtocolMessage<T>::value is a compile-time bool constant that's
-// true iff T is type ProtocolMessage, proto2::Message, or a subclass
-// of those.
+// true if and only if T is type proto2::Message or a subclass of it.
 template <typename T>
 struct IsAProtocolMessage
-    : public bool_constant<
-  ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
-  ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
-};
+    : public std::is_convertible<const T *, const ::proto2::Message *> {};
 
 // When the compiler sees expression IsContainerTest<C>(0), if C is an
 // STL-style container class, the first overload of IsContainerTest
@@ -917,8 +900,11 @@
 // a container class by checking the type of IsContainerTest<C>(0).
 // The value of the expression is insignificant.
 //
-// Note that we look for both C::iterator and C::const_iterator.  The
-// reason is that C++ injects the name of a class as a member of the
+// In C++11 mode we check the existence of a const_iterator and that an
+// iterator is properly implemented for the container.
+//
+// For pre-C++11 that we look for both C::iterator and C::const_iterator.
+// The reason is that C++ injects the name of a class as a member of the
 // class itself (e.g. you can refer to class iterator as either
 // 'iterator' or 'iterator::iterator').  If we look for C::iterator
 // only, for example, we would mistakenly think that a class named
@@ -928,23 +914,71 @@
 // IsContainerTest(typename C::const_iterator*) and
 // IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
 typedef int IsContainer;
-template <class C>
-IsContainer IsContainerTest(int /* dummy */,
-                            typename C::iterator* /* it */ = NULL,
-                            typename C::const_iterator* /* const_it */ = NULL) {
+template <class C,
+          class Iterator = decltype(::std::declval<const C &>().begin()),
+          class = decltype(::std::declval<const C &>().end()),
+          class = decltype(++::std::declval<Iterator &>()),
+          class = decltype(*::std::declval<Iterator>()),
+          class = typename C::const_iterator>
+IsContainer IsContainerTest(int /* dummy */) {
   return 0;
 }
 
 typedef char IsNotContainer;
 template <class C>
-IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+IsNotContainer IsContainerTest(long /* dummy */) {
+  return '\0';
+}
 
-// EnableIf<condition>::type is void when 'Cond' is true, and
-// undefined when 'Cond' is false.  To use SFINAE to make a function
-// overload only apply when a particular expression is true, add
-// "typename EnableIf<expression>::type* = 0" as the last parameter.
-template<bool> struct EnableIf;
-template<> struct EnableIf<true> { typedef void type; };  // NOLINT
+// Trait to detect whether a type T is a hash table.
+// The heuristic used is that the type contains an inner type `hasher` and does
+// not contain an inner type `reverse_iterator`.
+// If the container is iterable in reverse, then order might actually matter.
+template <typename T>
+struct IsHashTable {
+ private:
+  template <typename U>
+  static char test(typename U::hasher *, typename U::reverse_iterator *);
+  template <typename U>
+  static int test(typename U::hasher *, ...);
+  template <typename U>
+  static char test(...);
+
+ public:
+  static const bool value = sizeof(test<T>(nullptr, nullptr)) == sizeof(int);
+};
+
+template <typename T>
+const bool IsHashTable<T>::value;
+
+template <typename C,
+          bool = sizeof(IsContainerTest<C>(0)) == sizeof(IsContainer)>
+struct IsRecursiveContainerImpl;
+
+template <typename C>
+struct IsRecursiveContainerImpl<C, false> : public std::false_type {};
+
+// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to
+// obey the same inconsistencies as the IsContainerTest, namely check if
+// something is a container is relying on only const_iterator in C++11 and
+// is relying on both const_iterator and iterator otherwise
+template <typename C>
+struct IsRecursiveContainerImpl<C, true> {
+  using value_type = decltype(*std::declval<typename C::const_iterator>());
+  using type =
+      std::is_same<typename std::remove_const<
+                       typename std::remove_reference<value_type>::type>::type,
+                   C>;
+};
+
+// IsRecursiveContainer<Type> is a unary compile-time predicate that
+// evaluates whether C is a recursive container type. A recursive container
+// type is a container type whose value_type is equal to the container type
+// itself. An example for a recursive container type is
+// boost::filesystem::path, whose iterator has a value_type that is equal to
+// boost::filesystem::path.
+template <typename C>
+struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
 
 // Utilities for native arrays.
 
@@ -953,15 +987,17 @@
 // 0, ArrayEq() degenerates into comparing a single pair of values.
 
 template <typename T, typename U>
-bool ArrayEq(const T* lhs, size_t size, const U* rhs);
+bool ArrayEq(const T *lhs, size_t size, const U *rhs);
 
 // This generic version is used when k is 0.
 template <typename T, typename U>
-inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+inline bool ArrayEq(const T &lhs, const U &rhs) {
+  return lhs == rhs;
+}
 
 // This overload is used when k >= 1.
 template <typename T, typename U, size_t N>
-inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
   return internal::ArrayEq(lhs, N, rhs);
 }
 
@@ -969,10 +1005,9 @@
 // the previous ArrayEq() function, arrays with different sizes would
 // lead to different copies of the template code.
 template <typename T, typename U>
-bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+bool ArrayEq(const T *lhs, size_t size, const U *rhs) {
   for (size_t i = 0; i != size; i++) {
-    if (!internal::ArrayEq(lhs[i], rhs[i]))
-      return false;
+    if (!internal::ArrayEq(lhs[i], rhs[i])) return false;
   }
   return true;
 }
@@ -980,10 +1015,9 @@
 // Finds the first element in the iterator range [begin, end) that
 // equals elem.  Element may be a native array type itself.
 template <typename Iter, typename Element>
-Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+Iter ArrayAwareFind(Iter begin, Iter end, const Element &elem) {
   for (Iter it = begin; it != end; ++it) {
-    if (internal::ArrayEq(*it, elem))
-      return it;
+    if (internal::ArrayEq(*it, elem)) return it;
   }
   return end;
 }
@@ -993,15 +1027,17 @@
 // CopyArray() degenerates into copying a single value.
 
 template <typename T, typename U>
-void CopyArray(const T* from, size_t size, U* to);
+void CopyArray(const T *from, size_t size, U *to);
 
 // This generic version is used when k is 0.
 template <typename T, typename U>
-inline void CopyArray(const T& from, U* to) { *to = from; }
+inline void CopyArray(const T &from, U *to) {
+  *to = from;
+}
 
 // This overload is used when k >= 1.
 template <typename T, typename U, size_t N>
-inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+inline void CopyArray(const T (&from)[N], U (*to)[N]) {
   internal::CopyArray(from, N, *to);
 }
 
@@ -1009,7 +1045,7 @@
 // the previous CopyArray() function, arrays with different sizes
 // would lead to different copies of the template code.
 template <typename T, typename U>
-void CopyArray(const T* from, size_t size, U* to) {
+void CopyArray(const T *from, size_t size, U *to) {
   for (size_t i = 0; i != size; i++) {
     internal::CopyArray(from[i], to + i);
   }
@@ -1035,47 +1071,44 @@
  public:
   // STL-style container typedefs.
   typedef Element value_type;
-  typedef Element* iterator;
-  typedef const Element* const_iterator;
+  typedef Element *iterator;
+  typedef const Element *const_iterator;
 
   // Constructs from a native array. References the source.
-  NativeArray(const Element* array, size_t count, RelationToSourceReference) {
+  NativeArray(const Element *array, size_t count, RelationToSourceReference) {
     InitRef(array, count);
   }
 
   // Constructs from a native array. Copies the source.
-  NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
+  NativeArray(const Element *array, size_t count, RelationToSourceCopy) {
     InitCopy(array, count);
   }
 
   // Copy constructor.
-  NativeArray(const NativeArray& rhs) {
+  NativeArray(const NativeArray &rhs) {
     (this->*rhs.clone_)(rhs.array_, rhs.size_);
   }
 
   ~NativeArray() {
-    if (clone_ != &NativeArray::InitRef)
-      delete[] array_;
+    if (clone_ != &NativeArray::InitRef) delete[] array_;
   }
 
   // STL-style container methods.
   size_t size() const { return size_; }
   const_iterator begin() const { return array_; }
   const_iterator end() const { return array_ + size_; }
-  bool operator==(const NativeArray& rhs) const {
-    return size() == rhs.size() &&
-        ArrayEq(begin(), size(), rhs.begin());
+  bool operator==(const NativeArray &rhs) const {
+    return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin());
   }
 
  private:
-  enum {
-    kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper<
-        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value,
-  };
+  static_assert(!std::is_const<Element>::value, "Type must not be const");
+  static_assert(!std::is_reference<Element>::value,
+                "Type must not be a reference");
 
   // Initializes this object with a copy of the input.
-  void InitCopy(const Element* array, size_t a_size) {
-    Element* const copy = new Element[a_size];
+  void InitCopy(const Element *array, size_t a_size) {
+    Element *const copy = new Element[a_size];
     CopyArray(array, a_size, copy);
     array_ = copy;
     size_ = a_size;
@@ -1083,25 +1116,161 @@
   }
 
   // Initializes this object with a reference of the input.
-  void InitRef(const Element* array, size_t a_size) {
+  void InitRef(const Element *array, size_t a_size) {
     array_ = array;
     size_ = a_size;
     clone_ = &NativeArray::InitRef;
   }
 
-  const Element* array_;
+  const Element *array_;
   size_t size_;
-  void (NativeArray::*clone_)(const Element*, size_t);
+  void (NativeArray::*clone_)(const Element *, size_t);
 
   GTEST_DISALLOW_ASSIGN_(NativeArray);
 };
 
+// Backport of std::index_sequence.
+template <size_t... Is>
+struct IndexSequence {
+  using type = IndexSequence;
+};
+
+// Double the IndexSequence, and one if plus_one is true.
+template <bool plus_one, typename T, size_t sizeofT>
+struct DoubleSequence;
+template <size_t... I, size_t sizeofT>
+struct DoubleSequence<true, IndexSequence<I...>, sizeofT> {
+  using type = IndexSequence<I..., (sizeofT + I)..., 2 * sizeofT>;
+};
+template <size_t... I, size_t sizeofT>
+struct DoubleSequence<false, IndexSequence<I...>, sizeofT> {
+  using type = IndexSequence<I..., (sizeofT + I)...>;
+};
+
+// Backport of std::make_index_sequence.
+// It uses O(ln(N)) instantiation depth.
+template <size_t N>
+struct MakeIndexSequence
+    : DoubleSequence<N % 2 == 1, typename MakeIndexSequence<N / 2>::type,
+                     N / 2>::type {};
+
+template <>
+struct MakeIndexSequence<0> : IndexSequence<> {};
+
+template <size_t>
+struct Ignore {
+  Ignore(...);  // NOLINT
+};
+
+template <typename>
+struct ElemFromListImpl;
+template <size_t... I>
+struct ElemFromListImpl<IndexSequence<I...>> {
+  // We make Ignore a template to solve a problem with MSVC.
+  // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but
+  // MSVC doesn't understand how to deal with that pack expansion.
+  // Use `0 * I` to have a single instantiation of Ignore.
+  template <typename R>
+  static R Apply(Ignore<0 * I>..., R (*)(), ...);
+};
+
+template <size_t N, typename... T>
+struct ElemFromList {
+  using type =
+      decltype(ElemFromListImpl<typename MakeIndexSequence<N>::type>::Apply(
+          static_cast<T (*)()>(nullptr)...));
+};
+
+template <typename... T>
+class FlatTuple;
+
+template <typename Derived, size_t I>
+struct FlatTupleElemBase;
+
+template <typename... T, size_t I>
+struct FlatTupleElemBase<FlatTuple<T...>, I> {
+  using value_type = typename ElemFromList<I, T...>::type;
+  FlatTupleElemBase() = default;
+  explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {}
+  value_type value;
+};
+
+template <typename Derived, typename Idx>
+struct FlatTupleBase;
+
+template <size_t... Idx, typename... T>
+struct FlatTupleBase<FlatTuple<T...>, IndexSequence<Idx...>>
+    : FlatTupleElemBase<FlatTuple<T...>, Idx>... {
+  using Indices = IndexSequence<Idx...>;
+  FlatTupleBase() = default;
+  explicit FlatTupleBase(T... t)
+      : FlatTupleElemBase<FlatTuple<T...>, Idx>(std::move(t))... {}
+};
+
+// Analog to std::tuple but with different tradeoffs.
+// This class minimizes the template instantiation depth, thus allowing more
+// elements than std::tuple would. std::tuple has been seen to require an
+// instantiation depth of more than 10x the number of elements in some
+// implementations.
+// FlatTuple and ElemFromList are not recursive and have a fixed depth
+// regardless of T...
+// MakeIndexSequence, on the other hand, it is recursive but with an
+// instantiation depth of O(ln(N)).
+template <typename... T>
+class FlatTuple
+    : private FlatTupleBase<FlatTuple<T...>,
+                            typename MakeIndexSequence<sizeof...(T)>::type> {
+  using Indices = typename FlatTupleBase<
+      FlatTuple<T...>, typename MakeIndexSequence<sizeof...(T)>::type>::Indices;
+
+ public:
+  FlatTuple() = default;
+  explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {}
+
+  template <size_t I>
+  const typename ElemFromList<I, T...>::type &Get() const {
+    return static_cast<const FlatTupleElemBase<FlatTuple, I> *>(this)->value;
+  }
+
+  template <size_t I>
+  typename ElemFromList<I, T...>::type &Get() {
+    return static_cast<FlatTupleElemBase<FlatTuple, I> *>(this)->value;
+  }
+};
+
+// Utility functions to be called with static_assert to induce deprecation
+// warnings.
+GTEST_INTERNAL_DEPRECATED(
+    "INSTANTIATE_TEST_CASE_P is deprecated, please use "
+    "INSTANTIATE_TEST_SUITE_P")
+constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+    "TYPED_TEST_CASE_P is deprecated, please use "
+    "TYPED_TEST_SUITE_P")
+constexpr bool TypedTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+    "TYPED_TEST_CASE is deprecated, please use "
+    "TYPED_TEST_SUITE")
+constexpr bool TypedTestCaseIsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+    "REGISTER_TYPED_TEST_CASE_P is deprecated, please use "
+    "REGISTER_TYPED_TEST_SUITE_P")
+constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+    "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use "
+    "INSTANTIATE_TYPED_TEST_SUITE_P")
+constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
+
 }  // namespace internal
 }  // namespace testing
 
-#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
-  ::testing::internal::AssertHelper(result_type, file, line, message) \
-    = ::testing::Message()
+#define GTEST_MESSAGE_AT_(file, line, message, result_type)             \
+  ::testing::internal::AssertHelper(result_type, file, line, message) = \
+      ::testing::Message()
 
 #define GTEST_MESSAGE_(message, result_type) \
   GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
@@ -1115,124 +1284,158 @@
 #define GTEST_SUCCESS_(message) \
   GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
 
-// Suppresses MSVC warnings 4072 (unreachable code) for the code following
+#define GTEST_SKIP_(message) \
+  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip)
+
+// Suppress MSVC warning 4072 (unreachable code) for the code following
 // statement if it returns or throws (or doesn't return or throw in some
 // situations).
 #define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
-  if (::testing::internal::AlwaysTrue()) { statement; }
+  if (::testing::internal::AlwaysTrue()) {                        \
+    statement;                                                    \
+  }
 
-#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
-    bool gtest_caught_expected = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (expected_exception const&) { \
-      gtest_caught_expected = true; \
-    } \
-    catch (...) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws a different type."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-    if (!gtest_caught_expected) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws nothing."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
-      fail(gtest_msg.value)
+#define GTEST_TEST_THROW_(statement, expected_exception, fail)              \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                             \
+  if (::testing::internal::ConstCharPtr gtest_msg = "") {                   \
+    bool gtest_caught_expected = false;                                     \
+    try {                                                                   \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);            \
+    } catch (expected_exception const &) {                                  \
+      gtest_caught_expected = true;                                         \
+    } catch (...) {                                                         \
+      gtest_msg.value = "Expected: " #statement                             \
+                        " throws an exception of type " #expected_exception \
+                        ".\n  Actual: it throws a different type.";         \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
+    }                                                                       \
+    if (!gtest_caught_expected) {                                           \
+      gtest_msg.value = "Expected: " #statement                             \
+                        " throws an exception of type " #expected_exception \
+                        ".\n  Actual: it throws nothing.";                  \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
+    }                                                                       \
+  } else                                                                    \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__)                   \
+        : fail(gtest_msg.value)
 
-#define GTEST_TEST_NO_THROW_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (...) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
-      fail("Expected: " #statement " doesn't throw an exception.\n" \
-           "  Actual: it throws.")
+#if GTEST_HAS_EXCEPTIONS
 
-#define GTEST_TEST_ANY_THROW_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    bool gtest_caught_any = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (...) { \
-      gtest_caught_any = true; \
-    } \
-    if (!gtest_caught_any) { \
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                           \
+  catch (std::exception const &e) {                                          \
+    gtest_msg.value =                                                        \
+        ("it throws std::exception-derived exception with description: \""); \
+    gtest_msg.value += e.what();                                             \
+    gtest_msg.value += "\".";                                                \
+    goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__);            \
+  }
+
+#else  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_(statement, fail)                            \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                          \
+  if (::testing::internal::TrueWithString gtest_msg{}) {                 \
+    try {                                                                \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);         \
+    }                                                                    \
+    GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                           \
+    catch (...) {                                                        \
+      gtest_msg.value = "it throws.";                                    \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__);      \
+    }                                                                    \
+  } else                                                                 \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__)              \
+        : fail(("Expected: " #statement " doesn't throw an exception.\n" \
+                "  Actual: " +                                           \
+                gtest_msg.value)                                         \
+                   .c_str())
+
+#define GTEST_TEST_ANY_THROW_(statement, fail)                       \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                      \
+  if (::testing::internal::AlwaysTrue()) {                           \
+    bool gtest_caught_any = false;                                   \
+    try {                                                            \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);     \
+    } catch (...) {                                                  \
+      gtest_caught_any = true;                                       \
+    }                                                                \
+    if (!gtest_caught_any) {                                         \
       goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
-      fail("Expected: " #statement " throws an exception.\n" \
-           "  Actual: it doesn't.")
-
+    }                                                                \
+  } else                                                             \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__)         \
+        : fail("Expected: " #statement                               \
+               " throws an exception.\n"                             \
+               "  Actual: it doesn't.")
 
 // Implements Boolean test assertions such as EXPECT_TRUE. expression can be
 // either a boolean expression or an AssertionResult. text is a textual
 // represenation of expression as it was passed into the EXPECT_TRUE.
 #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (const ::testing::AssertionResult gtest_ar_ = \
-      ::testing::AssertionResult(expression)) \
-    ; \
-  else \
-    fail(::testing::internal::GetBoolAssertionFailureMessage(\
-        gtest_ar_, text, #actual, #expected).c_str())
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                       \
+  if (const ::testing::AssertionResult gtest_ar_ =                    \
+          ::testing::AssertionResult(expression))                     \
+    ;                                                                 \
+  else                                                                \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(         \
+             gtest_ar_, text, #actual, #expected)                     \
+             .c_str())
 
-#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail)                          \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
     ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
-      fail("Expected: " #statement " doesn't generate new fatal " \
-           "failures in the current thread.\n" \
-           "  Actual: it does.")
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);                 \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) {                 \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__);            \
+    }                                                                          \
+  } else                                                                       \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__)                    \
+        : fail("Expected: " #statement                                         \
+               " doesn't generate new fatal "                                  \
+               "failures in the current thread.\n"                             \
+               "  Actual: it does.")
 
 // Expands to the name of the class that implements the given test.
-#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
-  test_case_name##_##test_name##_Test
+#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+  test_suite_name##_##test_name##_Test
 
 // Helper macro for defining tests.
-#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\
-class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
- public:\
-  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
- private:\
-  virtual void TestBody();\
-  static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(\
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
-};\
-\
-::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
-  ::test_info_ =\
-    ::testing::internal::MakeAndRegisterTestInfo(\
-        #test_case_name, #test_name, NULL, NULL, \
-        ::testing::internal::CodeLocation(__FILE__, __LINE__), \
-        (parent_id), \
-        parent_class::SetUpTestCase, \
-        parent_class::TearDownTestCase, \
-        new ::testing::internal::TestFactoryImpl<\
-            GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
-void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id)      \
+  static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1,                \
+                "test_suite_name must not be empty");                         \
+  static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1,                      \
+                "test_name must not be empty");                               \
+  class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                    \
+      : public parent_class {                                                 \
+   public:                                                                    \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                   \
+    ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,   \
+                                                           test_name));       \
+    GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,   \
+                                                           test_name));       \
+                                                                              \
+   private:                                                                   \
+    void TestBody() override;                                                 \
+    static ::testing::TestInfo *const test_info_ GTEST_ATTRIBUTE_UNUSED_;     \
+  };                                                                          \
+                                                                              \
+  ::testing::TestInfo *const GTEST_TEST_CLASS_NAME_(test_suite_name,          \
+                                                    test_name)::test_info_ =  \
+      ::testing::internal::MakeAndRegisterTestInfo(                           \
+          #test_suite_name, #test_name, nullptr, nullptr,                     \
+          ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
+          ::testing::internal::SuiteApiResolver<                              \
+              parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__),         \
+          ::testing::internal::SuiteApiResolver<                              \
+              parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__),      \
+          new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_(    \
+              test_suite_name, test_name)>);                                  \
+  void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
 
 #endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-linked_ptr.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-linked_ptr.h
deleted file mode 100644
index 3602942..0000000
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-linked_ptr.h
+++ /dev/null
@@ -1,243 +0,0 @@
-// Copyright 2003 Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: Dan Egnor (egnor@google.com)
-//
-// A "smart" pointer type with reference tracking.  Every pointer to a
-// particular object is kept on a circular linked list.  When the last pointer
-// to an object is destroyed or reassigned, the object is deleted.
-//
-// Used properly, this deletes the object when the last reference goes away.
-// There are several caveats:
-// - Like all reference counting schemes, cycles lead to leaks.
-// - Each smart pointer is actually two pointers (8 bytes instead of 4).
-// - Every time a pointer is assigned, the entire list of pointers to that
-//   object is traversed.  This class is therefore NOT SUITABLE when there
-//   will often be more than two or three pointers to a particular object.
-// - References are only tracked as long as linked_ptr<> objects are copied.
-//   If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
-//   will happen (double deletion).
-//
-// A good use of this class is storing object references in STL containers.
-// You can safely put linked_ptr<> in a vector<>.
-// Other uses may not be as good.
-//
-// Note: If you use an incomplete type with linked_ptr<>, the class
-// *containing* linked_ptr<> must have a constructor and destructor (even
-// if they do nothing!).
-//
-// Bill Gibbons suggested we use something like this.
-//
-// Thread Safety:
-//   Unlike other linked_ptr implementations, in this implementation
-//   a linked_ptr object is thread-safe in the sense that:
-//     - it's safe to copy linked_ptr objects concurrently,
-//     - it's safe to copy *from* a linked_ptr and read its underlying
-//       raw pointer (e.g. via get()) concurrently, and
-//     - it's safe to write to two linked_ptrs that point to the same
-//       shared object concurrently.
-// TODO(wan@google.com): rename this to safe_linked_ptr to avoid
-// confusion with normal linked_ptr.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
-
-#include <stdlib.h>
-#include <assert.h>
-
-#include "gtest/internal/gtest-port.h"
-
-namespace testing {
-namespace internal {
-
-// Protects copying of all linked_ptr objects.
-GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
-
-// This is used internally by all instances of linked_ptr<>.  It needs to be
-// a non-template class because different types of linked_ptr<> can refer to
-// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
-// So, it needs to be possible for different types of linked_ptr to participate
-// in the same circular linked list, so we need a single class type here.
-//
-// DO NOT USE THIS CLASS DIRECTLY YOURSELF.  Use linked_ptr<T>.
-class linked_ptr_internal {
- public:
-  // Create a new circle that includes only this instance.
-  void join_new() {
-    next_ = this;
-  }
-
-  // Many linked_ptr operations may change p.link_ for some linked_ptr
-  // variable p in the same circle as this object.  Therefore we need
-  // to prevent two such operations from occurring concurrently.
-  //
-  // Note that different types of linked_ptr objects can coexist in a
-  // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and
-  // linked_ptr<Derived2>).  Therefore we must use a single mutex to
-  // protect all linked_ptr objects.  This can create serious
-  // contention in production code, but is acceptable in a testing
-  // framework.
-
-  // Join an existing circle.
-  void join(linked_ptr_internal const* ptr)
-      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
-    MutexLock lock(&g_linked_ptr_mutex);
-
-    linked_ptr_internal const* p = ptr;
-    while (p->next_ != ptr) {
-      assert(p->next_ != this &&
-             "Trying to join() a linked ring we are already in. "
-             "Is GMock thread safety enabled?");
-      p = p->next_;
-    }
-    p->next_ = this;
-    next_ = ptr;
-  }
-
-  // Leave whatever circle we're part of.  Returns true if we were the
-  // last member of the circle.  Once this is done, you can join() another.
-  bool depart()
-      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
-    MutexLock lock(&g_linked_ptr_mutex);
-
-    if (next_ == this) return true;
-    linked_ptr_internal const* p = next_;
-    while (p->next_ != this) {
-      assert(p->next_ != next_ &&
-             "Trying to depart() a linked ring we are not in. "
-             "Is GMock thread safety enabled?");
-      p = p->next_;
-    }
-    p->next_ = next_;
-    return false;
-  }
-
- private:
-  mutable linked_ptr_internal const* next_;
-};
-
-template <typename T>
-class linked_ptr {
- public:
-  typedef T element_type;
-
-  // Take over ownership of a raw pointer.  This should happen as soon as
-  // possible after the object is created.
-  explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
-  ~linked_ptr() { depart(); }
-
-  // Copy an existing linked_ptr<>, adding ourselves to the list of references.
-  template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
-  linked_ptr(linked_ptr const& ptr) {  // NOLINT
-    assert(&ptr != this);
-    copy(&ptr);
-  }
-
-  // Assignment releases the old value and acquires the new.
-  template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
-    depart();
-    copy(&ptr);
-    return *this;
-  }
-
-  linked_ptr& operator=(linked_ptr const& ptr) {
-    if (&ptr != this) {
-      depart();
-      copy(&ptr);
-    }
-    return *this;
-  }
-
-  // Smart pointer members.
-  void reset(T* ptr = NULL) {
-    depart();
-    capture(ptr);
-  }
-  T* get() const { return value_; }
-  T* operator->() const { return value_; }
-  T& operator*() const { return *value_; }
-
-  bool operator==(T* p) const { return value_ == p; }
-  bool operator!=(T* p) const { return value_ != p; }
-  template <typename U>
-  bool operator==(linked_ptr<U> const& ptr) const {
-    return value_ == ptr.get();
-  }
-  template <typename U>
-  bool operator!=(linked_ptr<U> const& ptr) const {
-    return value_ != ptr.get();
-  }
-
- private:
-  template <typename U>
-  friend class linked_ptr;
-
-  T* value_;
-  linked_ptr_internal link_;
-
-  void depart() {
-    if (link_.depart()) delete value_;
-  }
-
-  void capture(T* ptr) {
-    value_ = ptr;
-    link_.join_new();
-  }
-
-  template <typename U> void copy(linked_ptr<U> const* ptr) {
-    value_ = ptr->get();
-    if (value_)
-      link_.join(&ptr->link_);
-    else
-      link_.join_new();
-  }
-};
-
-template<typename T> inline
-bool operator==(T* ptr, const linked_ptr<T>& x) {
-  return ptr == x.get();
-}
-
-template<typename T> inline
-bool operator!=(T* ptr, const linked_ptr<T>& x) {
-  return ptr != x.get();
-}
-
-// A function to convert T* into linked_ptr<T>
-// Doing e.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
-// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg))
-template <typename T>
-linked_ptr<T> make_linked_ptr(T* ptr) {
-  return linked_ptr<T>(ptr);
-}
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h
deleted file mode 100644
index 4d1d81d..0000000
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h
+++ /dev/null
@@ -1,5146 +0,0 @@
-// This file was GENERATED by command:
-//     pump.py gtest-param-util-generated.h.pump
-// DO NOT EDIT BY HAND!!!
-
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: vladl@google.com (Vlad Losev)
-
-// Type and function utilities for implementing parameterized tests.
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently Google Test supports at most 50 arguments in Values,
-// and at most 10 arguments in Combine. Please contact
-// googletestframework@googlegroups.com if you need more.
-// Please note that the number of arguments to Combine is limited
-// by the maximum arity of the implementation of tuple which is
-// currently set at 10.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
-
-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
-#include "gtest/internal/gtest-param-util.h"
-#include "gtest/internal/gtest-port.h"
-
-#if GTEST_HAS_PARAM_TEST
-
-namespace testing {
-
-// Forward declarations of ValuesIn(), which is implemented in
-// include/gtest/gtest-param-test.h.
-template <typename ForwardIterator>
-internal::ParamGenerator<
-  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
-ValuesIn(ForwardIterator begin, ForwardIterator end);
-
-template <typename T, size_t N>
-internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
-
-template <class Container>
-internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container& container);
-
-namespace internal {
-
-// Used in the Values() function to provide polymorphic capabilities.
-template <typename T1>
-class ValueArray1 {
- public:
-  explicit ValueArray1(T1 v1) : v1_(v1) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray1& other);
-
-  const T1 v1_;
-};
-
-template <typename T1, typename T2>
-class ValueArray2 {
- public:
-  ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray2& other);
-
-  const T1 v1_;
-  const T2 v2_;
-};
-
-template <typename T1, typename T2, typename T3>
-class ValueArray3 {
- public:
-  ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray3& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4>
-class ValueArray4 {
- public:
-  ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray4& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class ValueArray5 {
- public:
-  ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray5& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-class ValueArray6 {
- public:
-  ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray6& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-class ValueArray7 {
- public:
-  ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray7& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-class ValueArray8 {
- public:
-  ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-      T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray8& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-class ValueArray9 {
- public:
-  ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
-      T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray9& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-class ValueArray10 {
- public:
-  ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray10& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-class ValueArray11 {
- public:
-  ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray11& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-class ValueArray12 {
- public:
-  ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray12& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-class ValueArray13 {
- public:
-  ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray13& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-class ValueArray14 {
- public:
-  ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray14& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-class ValueArray15 {
- public:
-  ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray15& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-class ValueArray16 {
- public:
-  ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray16& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-class ValueArray17 {
- public:
-  ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
-      T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray17& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-class ValueArray18 {
- public:
-  ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray18& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-class ValueArray19 {
- public:
-  ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
-      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray19& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-class ValueArray20 {
- public:
-  ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
-      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
-      v19_(v19), v20_(v20) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray20& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-class ValueArray21 {
- public:
-  ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
-      v18_(v18), v19_(v19), v20_(v20), v21_(v21) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray21& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-class ValueArray22 {
- public:
-  ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray22& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-class ValueArray23 {
- public:
-  ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray23& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-class ValueArray24 {
- public:
-  ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
-      v22_(v22), v23_(v23), v24_(v24) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray24& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-class ValueArray25 {
- public:
-  ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
-      T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray25& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-class ValueArray26 {
- public:
-  ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray26& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-class ValueArray27 {
- public:
-  ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
-      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
-      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
-      v26_(v26), v27_(v27) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray27& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-class ValueArray28 {
- public:
-  ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
-      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
-      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
-      v25_(v25), v26_(v26), v27_(v27), v28_(v28) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray28& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-class ValueArray29 {
- public:
-  ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
-      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
-      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray29& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-class ValueArray30 {
- public:
-  ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray30& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-class ValueArray31 {
- public:
-  ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray31& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-class ValueArray32 {
- public:
-  ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
-      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
-      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray32& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-class ValueArray33 {
- public:
-  ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
-      T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray33& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-class ValueArray34 {
- public:
-  ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray34& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-class ValueArray35 {
- public:
-  ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
-      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
-      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
-      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
-      v32_(v32), v33_(v33), v34_(v34), v35_(v35) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray35& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-class ValueArray36 {
- public:
-  ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
-      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
-      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
-      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
-      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray36& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-class ValueArray37 {
- public:
-  ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
-      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
-      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
-      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
-      v36_(v36), v37_(v37) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray37& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-class ValueArray38 {
- public:
-  ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
-      v35_(v35), v36_(v36), v37_(v37), v38_(v38) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray38& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-class ValueArray39 {
- public:
-  ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
-      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray39& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-class ValueArray40 {
- public:
-  ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
-      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
-      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
-      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
-      v40_(v40) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray40& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-class ValueArray41 {
- public:
-  ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
-      T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
-      v39_(v39), v40_(v40), v41_(v41) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray41& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-class ValueArray42 {
- public:
-  ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
-      v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray42& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-class ValueArray43 {
- public:
-  ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
-      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
-      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
-      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
-      v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37),
-      v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray43& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-class ValueArray44 {
- public:
-  ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
-      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
-      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
-      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
-      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
-      v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
-      v43_(v43), v44_(v44) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray44& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-class ValueArray45 {
- public:
-  ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
-      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
-      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
-      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
-      v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41),
-      v42_(v42), v43_(v43), v44_(v44), v45_(v45) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray45& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-class ValueArray46 {
- public:
-  ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
-      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
-      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray46& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-class ValueArray47 {
- public:
-  ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
-      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
-      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46),
-      v47_(v47) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray47& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-class ValueArray48 {
- public:
-  ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
-      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
-      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
-      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
-      v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45),
-      v46_(v46), v47_(v47), v48_(v48) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
-        static_cast<T>(v48_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray48& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-  const T48 v48_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-class ValueArray49 {
- public:
-  ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
-      T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
-      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
-      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
-        static_cast<T>(v48_), static_cast<T>(v49_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray49& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-  const T48 v48_;
-  const T49 v49_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-class ValueArray50 {
- public:
-  ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49,
-      T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
-      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
-      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
-        static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray50& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-  const T48 v48_;
-  const T49 v49_;
-  const T50 v50_;
-};
-
-# if GTEST_HAS_COMBINE
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Generates values from the Cartesian product of values produced
-// by the argument generators.
-//
-template <typename T1, typename T2>
-class CartesianProductGenerator2
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2> > {
- public:
-  typedef ::testing::tuple<T1, T2> ParamType;
-
-  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2)
-      : g1_(g1), g2_(g2) {}
-  virtual ~CartesianProductGenerator2() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current2_;
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator2::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator2& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-};  // class CartesianProductGenerator2
-
-
-template <typename T1, typename T2, typename T3>
-class CartesianProductGenerator3
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3> > {
- public:
-  typedef ::testing::tuple<T1, T2, T3> ParamType;
-
-  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
-      : g1_(g1), g2_(g2), g3_(g3) {}
-  virtual ~CartesianProductGenerator3() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current3_;
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator3::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator3& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-};  // class CartesianProductGenerator3
-
-
-template <typename T1, typename T2, typename T3, typename T4>
-class CartesianProductGenerator4
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4> > {
- public:
-  typedef ::testing::tuple<T1, T2, T3, T4> ParamType;
-
-  CartesianProductGenerator4(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
-  virtual ~CartesianProductGenerator4() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current4_;
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator4::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator4& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-};  // class CartesianProductGenerator4
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class CartesianProductGenerator5
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5> > {
- public:
-  typedef ::testing::tuple<T1, T2, T3, T4, T5> ParamType;
-
-  CartesianProductGenerator5(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
-  virtual ~CartesianProductGenerator5() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current5_;
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator5::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator5& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-};  // class CartesianProductGenerator5
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-class CartesianProductGenerator6
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5,
-        T6> > {
- public:
-  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6> ParamType;
-
-  CartesianProductGenerator6(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
-  virtual ~CartesianProductGenerator6() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current6_;
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator6::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator6& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-};  // class CartesianProductGenerator6
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-class CartesianProductGenerator7
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
-        T7> > {
- public:
-  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType;
-
-  CartesianProductGenerator7(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
-  virtual ~CartesianProductGenerator7() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
-        g7_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6,
-      const ParamGenerator<T7>& g7,
-      const typename ParamGenerator<T7>::iterator& current7)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
-          begin7_(g7.begin()), end7_(g7.end()), current7_(current7)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current7_;
-      if (current7_ == end7_) {
-        current7_ = begin7_;
-        ++current6_;
-      }
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_ &&
-          current7_ == typed_other->current7_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_),
-        begin7_(other.begin7_),
-        end7_(other.end7_),
-        current7_(other.current7_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_, *current7_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_ ||
-          current7_ == end7_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    const typename ParamGenerator<T7>::iterator begin7_;
-    const typename ParamGenerator<T7>::iterator end7_;
-    typename ParamGenerator<T7>::iterator current7_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator7::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator7& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-  const ParamGenerator<T7> g7_;
-};  // class CartesianProductGenerator7
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-class CartesianProductGenerator8
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
-        T7, T8> > {
- public:
-  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType;
-
-  CartesianProductGenerator8(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
-      const ParamGenerator<T8>& g8)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
-          g8_(g8) {}
-  virtual ~CartesianProductGenerator8() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
-        g7_.begin(), g8_, g8_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
-        g8_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6,
-      const ParamGenerator<T7>& g7,
-      const typename ParamGenerator<T7>::iterator& current7,
-      const ParamGenerator<T8>& g8,
-      const typename ParamGenerator<T8>::iterator& current8)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
-          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
-          begin8_(g8.begin()), end8_(g8.end()), current8_(current8)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current8_;
-      if (current8_ == end8_) {
-        current8_ = begin8_;
-        ++current7_;
-      }
-      if (current7_ == end7_) {
-        current7_ = begin7_;
-        ++current6_;
-      }
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_ &&
-          current7_ == typed_other->current7_ &&
-          current8_ == typed_other->current8_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_),
-        begin7_(other.begin7_),
-        end7_(other.end7_),
-        current7_(other.current7_),
-        begin8_(other.begin8_),
-        end8_(other.end8_),
-        current8_(other.current8_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_, *current7_, *current8_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_ ||
-          current7_ == end7_ ||
-          current8_ == end8_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    const typename ParamGenerator<T7>::iterator begin7_;
-    const typename ParamGenerator<T7>::iterator end7_;
-    typename ParamGenerator<T7>::iterator current7_;
-    const typename ParamGenerator<T8>::iterator begin8_;
-    const typename ParamGenerator<T8>::iterator end8_;
-    typename ParamGenerator<T8>::iterator current8_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator8::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator8& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-  const ParamGenerator<T7> g7_;
-  const ParamGenerator<T8> g8_;
-};  // class CartesianProductGenerator8
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-class CartesianProductGenerator9
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
-        T7, T8, T9> > {
- public:
-  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType;
-
-  CartesianProductGenerator9(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
-      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
-          g9_(g9) {}
-  virtual ~CartesianProductGenerator9() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
-        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
-        g8_.end(), g9_, g9_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6,
-      const ParamGenerator<T7>& g7,
-      const typename ParamGenerator<T7>::iterator& current7,
-      const ParamGenerator<T8>& g8,
-      const typename ParamGenerator<T8>::iterator& current8,
-      const ParamGenerator<T9>& g9,
-      const typename ParamGenerator<T9>::iterator& current9)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
-          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
-          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
-          begin9_(g9.begin()), end9_(g9.end()), current9_(current9)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current9_;
-      if (current9_ == end9_) {
-        current9_ = begin9_;
-        ++current8_;
-      }
-      if (current8_ == end8_) {
-        current8_ = begin8_;
-        ++current7_;
-      }
-      if (current7_ == end7_) {
-        current7_ = begin7_;
-        ++current6_;
-      }
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_ &&
-          current7_ == typed_other->current7_ &&
-          current8_ == typed_other->current8_ &&
-          current9_ == typed_other->current9_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_),
-        begin7_(other.begin7_),
-        end7_(other.end7_),
-        current7_(other.current7_),
-        begin8_(other.begin8_),
-        end8_(other.end8_),
-        current8_(other.current8_),
-        begin9_(other.begin9_),
-        end9_(other.end9_),
-        current9_(other.current9_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_, *current7_, *current8_,
-            *current9_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_ ||
-          current7_ == end7_ ||
-          current8_ == end8_ ||
-          current9_ == end9_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    const typename ParamGenerator<T7>::iterator begin7_;
-    const typename ParamGenerator<T7>::iterator end7_;
-    typename ParamGenerator<T7>::iterator current7_;
-    const typename ParamGenerator<T8>::iterator begin8_;
-    const typename ParamGenerator<T8>::iterator end8_;
-    typename ParamGenerator<T8>::iterator current8_;
-    const typename ParamGenerator<T9>::iterator begin9_;
-    const typename ParamGenerator<T9>::iterator end9_;
-    typename ParamGenerator<T9>::iterator current9_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator9::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator9& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-  const ParamGenerator<T7> g7_;
-  const ParamGenerator<T8> g8_;
-  const ParamGenerator<T9> g9_;
-};  // class CartesianProductGenerator9
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-class CartesianProductGenerator10
-    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
-        T7, T8, T9, T10> > {
- public:
-  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType;
-
-  CartesianProductGenerator10(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
-      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9,
-      const ParamGenerator<T10>& g10)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
-          g9_(g9), g10_(g10) {}
-  virtual ~CartesianProductGenerator10() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
-        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
-        g8_.end(), g9_, g9_.end(), g10_, g10_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6,
-      const ParamGenerator<T7>& g7,
-      const typename ParamGenerator<T7>::iterator& current7,
-      const ParamGenerator<T8>& g8,
-      const typename ParamGenerator<T8>::iterator& current8,
-      const ParamGenerator<T9>& g9,
-      const typename ParamGenerator<T9>::iterator& current9,
-      const ParamGenerator<T10>& g10,
-      const typename ParamGenerator<T10>::iterator& current10)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
-          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
-          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
-          begin9_(g9.begin()), end9_(g9.end()), current9_(current9),
-          begin10_(g10.begin()), end10_(g10.end()), current10_(current10)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current10_;
-      if (current10_ == end10_) {
-        current10_ = begin10_;
-        ++current9_;
-      }
-      if (current9_ == end9_) {
-        current9_ = begin9_;
-        ++current8_;
-      }
-      if (current8_ == end8_) {
-        current8_ = begin8_;
-        ++current7_;
-      }
-      if (current7_ == end7_) {
-        current7_ = begin7_;
-        ++current6_;
-      }
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_ &&
-          current7_ == typed_other->current7_ &&
-          current8_ == typed_other->current8_ &&
-          current9_ == typed_other->current9_ &&
-          current10_ == typed_other->current10_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_),
-        begin7_(other.begin7_),
-        end7_(other.end7_),
-        current7_(other.current7_),
-        begin8_(other.begin8_),
-        end8_(other.end8_),
-        current8_(other.current8_),
-        begin9_(other.begin9_),
-        end9_(other.end9_),
-        current9_(other.current9_),
-        begin10_(other.begin10_),
-        end10_(other.end10_),
-        current10_(other.current10_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_, *current7_, *current8_,
-            *current9_, *current10_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_ ||
-          current7_ == end7_ ||
-          current8_ == end8_ ||
-          current9_ == end9_ ||
-          current10_ == end10_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    const typename ParamGenerator<T7>::iterator begin7_;
-    const typename ParamGenerator<T7>::iterator end7_;
-    typename ParamGenerator<T7>::iterator current7_;
-    const typename ParamGenerator<T8>::iterator begin8_;
-    const typename ParamGenerator<T8>::iterator end8_;
-    typename ParamGenerator<T8>::iterator current8_;
-    const typename ParamGenerator<T9>::iterator begin9_;
-    const typename ParamGenerator<T9>::iterator end9_;
-    typename ParamGenerator<T9>::iterator current9_;
-    const typename ParamGenerator<T10>::iterator begin10_;
-    const typename ParamGenerator<T10>::iterator end10_;
-    typename ParamGenerator<T10>::iterator current10_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator10::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator10& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-  const ParamGenerator<T7> g7_;
-  const ParamGenerator<T8> g8_;
-  const ParamGenerator<T9> g9_;
-  const ParamGenerator<T10> g10_;
-};  // class CartesianProductGenerator10
-
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Helper classes providing Combine() with polymorphic features. They allow
-// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
-// convertible to U.
-//
-template <class Generator1, class Generator2>
-class CartesianProductHolder2 {
- public:
-CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
-      : g1_(g1), g2_(g2) {}
-  template <typename T1, typename T2>
-  operator ParamGenerator< ::testing::tuple<T1, T2> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2> >(
-        new CartesianProductGenerator2<T1, T2>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder2& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-};  // class CartesianProductHolder2
-
-template <class Generator1, class Generator2, class Generator3>
-class CartesianProductHolder3 {
- public:
-CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3)
-      : g1_(g1), g2_(g2), g3_(g3) {}
-  template <typename T1, typename T2, typename T3>
-  operator ParamGenerator< ::testing::tuple<T1, T2, T3> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2, T3> >(
-        new CartesianProductGenerator3<T1, T2, T3>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder3& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-};  // class CartesianProductHolder3
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4>
-class CartesianProductHolder4 {
- public:
-CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
-  template <typename T1, typename T2, typename T3, typename T4>
-  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4> >(
-        new CartesianProductGenerator4<T1, T2, T3, T4>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder4& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-};  // class CartesianProductHolder4
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5>
-class CartesianProductHolder5 {
- public:
-CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5>
-  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5> >(
-        new CartesianProductGenerator5<T1, T2, T3, T4, T5>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder5& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-};  // class CartesianProductHolder5
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6>
-class CartesianProductHolder6 {
- public:
-CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6>
-  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6> >(
-        new CartesianProductGenerator6<T1, T2, T3, T4, T5, T6>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder6& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-};  // class CartesianProductHolder6
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6, class Generator7>
-class CartesianProductHolder7 {
- public:
-CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6, const Generator7& g7)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6, typename T7>
-  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6,
-      T7> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7> >(
-        new CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_),
-        static_cast<ParamGenerator<T7> >(g7_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder7& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-  const Generator7 g7_;
-};  // class CartesianProductHolder7
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6, class Generator7,
-    class Generator8>
-class CartesianProductHolder8 {
- public:
-CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6, const Generator7& g7, const Generator8& g8)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
-          g8_(g8) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6, typename T7, typename T8>
-  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7,
-      T8> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >(
-        new CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_),
-        static_cast<ParamGenerator<T7> >(g7_),
-        static_cast<ParamGenerator<T8> >(g8_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder8& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-  const Generator7 g7_;
-  const Generator8 g8_;
-};  // class CartesianProductHolder8
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6, class Generator7,
-    class Generator8, class Generator9>
-class CartesianProductHolder9 {
- public:
-CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6, const Generator7& g7, const Generator8& g8,
-    const Generator9& g9)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
-          g9_(g9) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6, typename T7, typename T8, typename T9>
-  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
-      T9> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
-        T9> >(
-        new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_),
-        static_cast<ParamGenerator<T7> >(g7_),
-        static_cast<ParamGenerator<T8> >(g8_),
-        static_cast<ParamGenerator<T9> >(g9_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder9& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-  const Generator7 g7_;
-  const Generator8 g8_;
-  const Generator9 g9_;
-};  // class CartesianProductHolder9
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6, class Generator7,
-    class Generator8, class Generator9, class Generator10>
-class CartesianProductHolder10 {
- public:
-CartesianProductHolder10(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6, const Generator7& g7, const Generator8& g8,
-    const Generator9& g9, const Generator10& g10)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
-          g9_(g9), g10_(g10) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6, typename T7, typename T8, typename T9, typename T10>
-  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9,
-      T10> >() const {
-    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9,
-        T10> >(
-        new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9,
-            T10>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_),
-        static_cast<ParamGenerator<T7> >(g7_),
-        static_cast<ParamGenerator<T8> >(g8_),
-        static_cast<ParamGenerator<T9> >(g9_),
-        static_cast<ParamGenerator<T10> >(g10_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder10& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-  const Generator7 g7_;
-  const Generator8 g8_;
-  const Generator9 g9_;
-  const Generator10 g10_;
-};  // class CartesianProductHolder10
-
-# endif  // GTEST_HAS_COMBINE
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  //  GTEST_HAS_PARAM_TEST
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h.pump b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h.pump
deleted file mode 100644
index 5c7c47a..0000000
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h.pump
+++ /dev/null
@@ -1,286 +0,0 @@
-$$ -*- mode: c++; -*-
-$var n = 50  $$ Maximum length of Values arguments we want to support.
-$var maxtuple = 10  $$ Maximum number of Combine arguments we want to support.
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: vladl@google.com (Vlad Losev)
-
-// Type and function utilities for implementing parameterized tests.
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently Google Test supports at most $n arguments in Values,
-// and at most $maxtuple arguments in Combine. Please contact
-// googletestframework@googlegroups.com if you need more.
-// Please note that the number of arguments to Combine is limited
-// by the maximum arity of the implementation of tuple which is
-// currently set at $maxtuple.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
-
-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
-#include "gtest/internal/gtest-param-util.h"
-#include "gtest/internal/gtest-port.h"
-
-#if GTEST_HAS_PARAM_TEST
-
-namespace testing {
-
-// Forward declarations of ValuesIn(), which is implemented in
-// include/gtest/gtest-param-test.h.
-template <typename ForwardIterator>
-internal::ParamGenerator<
-  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
-ValuesIn(ForwardIterator begin, ForwardIterator end);
-
-template <typename T, size_t N>
-internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
-
-template <class Container>
-internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container& container);
-
-namespace internal {
-
-// Used in the Values() function to provide polymorphic capabilities.
-$range i 1..n
-$for i [[
-$range j 1..i
-
-template <$for j, [[typename T$j]]>
-class ValueArray$i {
- public:
-  $if i==1 [[explicit ]]ValueArray$i($for j, [[T$j v$j]]) : $for j, [[v$(j)_(v$j)]] {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {$for j, [[static_cast<T>(v$(j)_)]]};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray$i& other);
-
-$for j [[
-
-  const T$j v$(j)_;
-]]
-
-};
-
-]]
-
-# if GTEST_HAS_COMBINE
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Generates values from the Cartesian product of values produced
-// by the argument generators.
-//
-$range i 2..maxtuple
-$for i [[
-$range j 1..i
-$range k 2..i
-
-template <$for j, [[typename T$j]]>
-class CartesianProductGenerator$i
-    : public ParamGeneratorInterface< ::testing::tuple<$for j, [[T$j]]> > {
- public:
-  typedef ::testing::tuple<$for j, [[T$j]]> ParamType;
-
-  CartesianProductGenerator$i($for j, [[const ParamGenerator<T$j>& g$j]])
-      : $for j, [[g$(j)_(g$j)]] {}
-  virtual ~CartesianProductGenerator$i() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, $for j, [[g$(j)_, g$(j)_.begin()]]);
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, $for j, [[g$(j)_, g$(j)_.end()]]);
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base, $for j, [[
-
-      const ParamGenerator<T$j>& g$j,
-      const typename ParamGenerator<T$j>::iterator& current$(j)]])
-        : base_(base),
-$for j, [[
-
-          begin$(j)_(g$j.begin()), end$(j)_(g$j.end()), current$(j)_(current$j)
-]]    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current$(i)_;
-
-$for k [[
-      if (current$(i+2-k)_ == end$(i+2-k)_) {
-        current$(i+2-k)_ = begin$(i+2-k)_;
-        ++current$(i+2-k-1)_;
-      }
-
-]]
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         ($for j  && [[
-
-          current$(j)_ == typed_other->current$(j)_
-]]);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_), $for j, [[
-
-        begin$(j)_(other.begin$(j)_),
-        end$(j)_(other.end$(j)_),
-        current$(j)_(other.current$(j)_)
-]] {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType($for j, [[*current$(j)_]]);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-$for j  || [[
-
-          current$(j)_ == end$(j)_
-]];
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-$for j [[
-
-    const typename ParamGenerator<T$j>::iterator begin$(j)_;
-    const typename ParamGenerator<T$j>::iterator end$(j)_;
-    typename ParamGenerator<T$j>::iterator current$(j)_;
-]]
-
-    ParamType current_value_;
-  };  // class CartesianProductGenerator$i::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator$i& other);
-
-
-$for j [[
-  const ParamGenerator<T$j> g$(j)_;
-
-]]
-};  // class CartesianProductGenerator$i
-
-
-]]
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Helper classes providing Combine() with polymorphic features. They allow
-// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
-// convertible to U.
-//
-$range i 2..maxtuple
-$for i [[
-$range j 1..i
-
-template <$for j, [[class Generator$j]]>
-class CartesianProductHolder$i {
- public:
-CartesianProductHolder$i($for j, [[const Generator$j& g$j]])
-      : $for j, [[g$(j)_(g$j)]] {}
-  template <$for j, [[typename T$j]]>
-  operator ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >() const {
-    return ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >(
-        new CartesianProductGenerator$i<$for j, [[T$j]]>(
-$for j,[[
-
-        static_cast<ParamGenerator<T$j> >(g$(j)_)
-]]));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder$i& other);
-
-
-$for j [[
-  const Generator$j g$(j)_;
-
-]]
-};  // class CartesianProductHolder$i
-
-]]
-
-# endif  // GTEST_HAS_COMBINE
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  //  GTEST_HAS_PARAM_TEST
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
index 82cab9b..0d8fc71 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
@@ -26,40 +26,37 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: vladl@google.com (Vlad Losev)
 
 // Type and function utilities for implementing parameterized tests.
 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
 
 #include <ctype.h>
 
+#include <cassert>
 #include <iterator>
+#include <memory>
 #include <set>
+#include <tuple>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
 #include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-linked_ptr.h"
 #include "gtest/internal/gtest-port.h"
 #include "gtest/gtest-printers.h"
-
-#if GTEST_HAS_PARAM_TEST
+#include "gtest/gtest-test-part.h"
 
 namespace testing {
-
 // Input to a parameterized test name generator, describing a test parameter.
 // Consists of the parameter value and the integer parameter index.
 template <class ParamType>
 struct TestParamInfo {
-  TestParamInfo(const ParamType& a_param, size_t an_index) :
-    param(a_param),
-    index(an_index) {}
+  TestParamInfo(const ParamType &a_param, size_t an_index)
+      : param(a_param), index(an_index) {}
   ParamType param;
   size_t index;
 };
@@ -68,7 +65,7 @@
 // testing::PrintToString.
 struct PrintToStringParamName {
   template <class ParamType>
-  std::string operator()(const TestParamInfo<ParamType>& info) const {
+  std::string operator()(const TestParamInfo<ParamType> &info) const {
     return PrintToString(info.param);
   }
 };
@@ -76,16 +73,19 @@
 namespace internal {
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
+// Utility Functions
+
 // Outputs a message explaining invalid registration of different
-// fixture class for the same test case. This may happen when
+// fixture class for the same test suite. This may happen when
 // TEST_P macro is used to define two tests with the same name
 // but in different namespaces.
-GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name,
-                                          CodeLocation code_location);
+GTEST_API_ void ReportInvalidTestSuiteType(const char *test_suite_name,
+                                           CodeLocation code_location);
 
-template <typename> class ParamGeneratorInterface;
-template <typename> class ParamGenerator;
+template <typename>
+class ParamGeneratorInterface;
+template <typename>
+class ParamGenerator;
 
 // Interface for iterating over elements provided by an implementation
 // of ParamGeneratorInterface<T>.
@@ -96,7 +96,7 @@
   // A pointer to the base generator instance.
   // Used only for the purposes of iterator comparison
   // to make sure that two iterators belong to the same generator.
-  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+  virtual const ParamGeneratorInterface<T> *BaseGenerator() const = 0;
   // Advances iterator to point to the next element
   // provided by the generator. The caller is responsible
   // for not calling Advance() on an iterator equal to
@@ -104,16 +104,16 @@
   virtual void Advance() = 0;
   // Clones the iterator object. Used for implementing copy semantics
   // of ParamIterator<T>.
-  virtual ParamIteratorInterface* Clone() const = 0;
+  virtual ParamIteratorInterface *Clone() const = 0;
   // Dereferences the current iterator and provides (read-only) access
   // to the pointed value. It is the caller's responsibility not to call
   // Current() on an iterator equal to BaseGenerator()->End().
   // Used for implementing ParamGenerator<T>::operator*().
-  virtual const T* Current() const = 0;
+  virtual const T *Current() const = 0;
   // Determines whether the given iterator and other point to the same
   // element in the sequence generated by the generator.
   // Used for implementing ParamGenerator<T>::operator==().
-  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+  virtual bool Equals(const ParamIteratorInterface &other) const = 0;
 };
 
 // Class iterating over elements provided by an implementation of
@@ -123,41 +123,40 @@
 class ParamIterator {
  public:
   typedef T value_type;
-  typedef const T& reference;
+  typedef const T &reference;
   typedef ptrdiff_t difference_type;
 
   // ParamIterator assumes ownership of the impl_ pointer.
-  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
-  ParamIterator& operator=(const ParamIterator& other) {
-    if (this != &other)
-      impl_.reset(other.impl_->Clone());
+  ParamIterator(const ParamIterator &other) : impl_(other.impl_->Clone()) {}
+  ParamIterator &operator=(const ParamIterator &other) {
+    if (this != &other) impl_.reset(other.impl_->Clone());
     return *this;
   }
 
-  const T& operator*() const { return *impl_->Current(); }
-  const T* operator->() const { return impl_->Current(); }
+  const T &operator*() const { return *impl_->Current(); }
+  const T *operator->() const { return impl_->Current(); }
   // Prefix version of operator++.
-  ParamIterator& operator++() {
+  ParamIterator &operator++() {
     impl_->Advance();
     return *this;
   }
   // Postfix version of operator++.
   ParamIterator operator++(int /*unused*/) {
-    ParamIteratorInterface<T>* clone = impl_->Clone();
+    ParamIteratorInterface<T> *clone = impl_->Clone();
     impl_->Advance();
     return ParamIterator(clone);
   }
-  bool operator==(const ParamIterator& other) const {
+  bool operator==(const ParamIterator &other) const {
     return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
   }
-  bool operator!=(const ParamIterator& other) const {
+  bool operator!=(const ParamIterator &other) const {
     return !(*this == other);
   }
 
  private:
   friend class ParamGenerator<T>;
-  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
-  scoped_ptr<ParamIteratorInterface<T> > impl_;
+  explicit ParamIterator(ParamIteratorInterface<T> *impl) : impl_(impl) {}
+  std::unique_ptr<ParamIteratorInterface<T>> impl_;
 };
 
 // ParamGeneratorInterface<T> is the binary interface to access generators
@@ -170,8 +169,8 @@
   virtual ~ParamGeneratorInterface() {}
 
   // Generator interface definition
-  virtual ParamIteratorInterface<T>* Begin() const = 0;
-  virtual ParamIteratorInterface<T>* End() const = 0;
+  virtual ParamIteratorInterface<T> *Begin() const = 0;
+  virtual ParamIteratorInterface<T> *End() const = 0;
 };
 
 // Wraps ParamGeneratorInterface<T> and provides general generator syntax
@@ -179,15 +178,15 @@
 // This class implements copy initialization semantics and the contained
 // ParamGeneratorInterface<T> instance is shared among all copies
 // of the original object. This is possible because that instance is immutable.
-template<typename T>
+template <typename T>
 class ParamGenerator {
  public:
   typedef ParamIterator<T> iterator;
 
-  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
-  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
+  explicit ParamGenerator(ParamGeneratorInterface<T> *impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator &other) : impl_(other.impl_) {}
 
-  ParamGenerator& operator=(const ParamGenerator& other) {
+  ParamGenerator &operator=(const ParamGenerator &other) {
     impl_ = other.impl_;
     return *this;
   }
@@ -196,7 +195,7 @@
   iterator end() const { return iterator(impl_->End()); }
 
  private:
-  linked_ptr<const ParamGeneratorInterface<T> > impl_;
+  std::shared_ptr<const ParamGeneratorInterface<T>> impl_;
 };
 
 // Generates values from a range of two comparable values. Can be used to
@@ -207,37 +206,37 @@
 class RangeGenerator : public ParamGeneratorInterface<T> {
  public:
   RangeGenerator(T begin, T end, IncrementT step)
-      : begin_(begin), end_(end),
-        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
-  virtual ~RangeGenerator() {}
+      : begin_(begin), end_(end), step_(step),
+        end_index_(CalculateEndIndex(begin, end, step)) {}
+  ~RangeGenerator() override {}
 
-  virtual ParamIteratorInterface<T>* Begin() const {
+  ParamIteratorInterface<T> *Begin() const override {
     return new Iterator(this, begin_, 0, step_);
   }
-  virtual ParamIteratorInterface<T>* End() const {
+  ParamIteratorInterface<T> *End() const override {
     return new Iterator(this, end_, end_index_, step_);
   }
 
  private:
   class Iterator : public ParamIteratorInterface<T> {
    public:
-    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+    Iterator(const ParamGeneratorInterface<T> *base, T value, int index,
              IncrementT step)
         : base_(base), value_(value), index_(index), step_(step) {}
-    virtual ~Iterator() {}
+    ~Iterator() override {}
 
-    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+    const ParamGeneratorInterface<T> *BaseGenerator() const override {
       return base_;
     }
-    virtual void Advance() {
+    void Advance() override {
       value_ = static_cast<T>(value_ + step_);
       index_++;
     }
-    virtual ParamIteratorInterface<T>* Clone() const {
+    ParamIteratorInterface<T> *Clone() const override {
       return new Iterator(*this);
     }
-    virtual const T* Current() const { return &value_; }
-    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+    const T *Current() const override { return &value_; }
+    bool Equals(const ParamIteratorInterface<T> &other) const override {
       // Having the same base generator guarantees that the other
       // iterator is of the same type and we can downcast.
       GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
@@ -249,31 +248,28 @@
     }
 
    private:
-    Iterator(const Iterator& other)
-        : ParamIteratorInterface<T>(),
-          base_(other.base_), value_(other.value_), index_(other.index_),
-          step_(other.step_) {}
+    Iterator(const Iterator &other)
+        : ParamIteratorInterface<T>(), base_(other.base_), value_(other.value_),
+          index_(other.index_), step_(other.step_) {}
 
     // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
+    void operator=(const Iterator &other);
 
-    const ParamGeneratorInterface<T>* const base_;
+    const ParamGeneratorInterface<T> *const base_;
     T value_;
     int index_;
     const IncrementT step_;
   };  // class RangeGenerator::Iterator
 
-  static int CalculateEndIndex(const T& begin,
-                               const T& end,
-                               const IncrementT& step) {
+  static int CalculateEndIndex(const T &begin, const T &end,
+                               const IncrementT &step) {
     int end_index = 0;
-    for (T i = begin; i < end; i = static_cast<T>(i + step))
-      end_index++;
+    for (T i = begin; i < end; i = static_cast<T>(i + step)) end_index++;
     return end_index;
   }
 
   // No implementation - assignment is unsupported.
-  void operator=(const RangeGenerator& other);
+  void operator=(const RangeGenerator &other);
 
   const T begin_;
   const T end_;
@@ -283,7 +279,6 @@
   const int end_index_;
 };  // class RangeGenerator
 
-
 // Generates values from a pair of STL-style iterators. Used in the
 // ValuesIn() function. The elements are copied from the source range
 // since the source can be located on the stack, and the generator
@@ -294,12 +289,12 @@
   template <typename ForwardIterator>
   ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
       : container_(begin, end) {}
-  virtual ~ValuesInIteratorRangeGenerator() {}
+  ~ValuesInIteratorRangeGenerator() override {}
 
-  virtual ParamIteratorInterface<T>* Begin() const {
+  ParamIteratorInterface<T> *Begin() const override {
     return new Iterator(this, container_.begin());
   }
-  virtual ParamIteratorInterface<T>* End() const {
+  ParamIteratorInterface<T> *End() const override {
     return new Iterator(this, container_.end());
   }
 
@@ -308,19 +303,19 @@
 
   class Iterator : public ParamIteratorInterface<T> {
    public:
-    Iterator(const ParamGeneratorInterface<T>* base,
+    Iterator(const ParamGeneratorInterface<T> *base,
              typename ContainerType::const_iterator iterator)
         : base_(base), iterator_(iterator) {}
-    virtual ~Iterator() {}
+    ~Iterator() override {}
 
-    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+    const ParamGeneratorInterface<T> *BaseGenerator() const override {
       return base_;
     }
-    virtual void Advance() {
+    void Advance() override {
       ++iterator_;
       value_.reset();
     }
-    virtual ParamIteratorInterface<T>* Clone() const {
+    ParamIteratorInterface<T> *Clone() const override {
       return new Iterator(*this);
     }
     // We need to use cached value referenced by iterator_ because *iterator_
@@ -330,41 +325,39 @@
     // can advance iterator_ beyond the end of the range, and we cannot
     // detect that fact. The client code, on the other hand, is
     // responsible for not calling Current() on an out-of-range iterator.
-    virtual const T* Current() const {
-      if (value_.get() == NULL)
-        value_.reset(new T(*iterator_));
+    const T *Current() const override {
+      if (value_.get() == nullptr) value_.reset(new T(*iterator_));
       return value_.get();
     }
-    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+    bool Equals(const ParamIteratorInterface<T> &other) const override {
       // Having the same base generator guarantees that the other
       // iterator is of the same type and we can downcast.
       GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
           << "The program attempted to compare iterators "
           << "from different generators." << std::endl;
       return iterator_ ==
-          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+             CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
     }
 
    private:
-    Iterator(const Iterator& other)
-          // The explicit constructor call suppresses a false warning
-          // emitted by gcc when supplied with the -Wextra option.
-        : ParamIteratorInterface<T>(),
-          base_(other.base_),
+    Iterator(const Iterator &other)
+        // The explicit constructor call suppresses a false warning
+        // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(), base_(other.base_),
           iterator_(other.iterator_) {}
 
-    const ParamGeneratorInterface<T>* const base_;
+    const ParamGeneratorInterface<T> *const base_;
     typename ContainerType::const_iterator iterator_;
     // A cached value of *iterator_. We keep it here to allow access by
     // pointer in the wrapping iterator's operator->().
     // value_ needs to be mutable to be accessed in Current().
-    // Use of scoped_ptr helps manage cached value's lifetime,
+    // Use of std::unique_ptr helps manage cached value's lifetime,
     // which is bound by the lifespan of the iterator itself.
-    mutable scoped_ptr<const T> value_;
+    mutable std::unique_ptr<const T> value_;
   };  // class ValuesInIteratorRangeGenerator::Iterator
 
   // No implementation - assignment is unsupported.
-  void operator=(const ValuesInIteratorRangeGenerator& other);
+  void operator=(const ValuesInIteratorRangeGenerator &other);
 
   const ContainerType container_;
 };  // class ValuesInIteratorRangeGenerator
@@ -374,31 +367,18 @@
 // Default parameterized test name generator, returns a string containing the
 // integer test parameter index.
 template <class ParamType>
-std::string DefaultParamName(const TestParamInfo<ParamType>& info) {
+std::string DefaultParamName(const TestParamInfo<ParamType> &info) {
   Message name_stream;
   name_stream << info.index;
   return name_stream.GetString();
 }
 
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Parameterized test name overload helpers, which help the
-// INSTANTIATE_TEST_CASE_P macro choose between the default parameterized
-// test name generator and user param name generator.
-template <class ParamType, class ParamNameGenFunctor>
-ParamNameGenFunctor GetParamNameGen(ParamNameGenFunctor func) {
-  return func;
+template <typename T = int>
+void TestNotEmpty() {
+  static_assert(sizeof(T) == 0, "Empty arguments are not allowed.");
 }
-
-template <class ParamType>
-struct ParamNameGenFunc {
-  typedef std::string Type(const TestParamInfo<ParamType>&);
-};
-
-template <class ParamType>
-typename ParamNameGenFunc<ParamType>::Type *GetParamNameGen() {
-  return DefaultParamName;
-}
+template <typename T = int>
+void TestNotEmpty(const T &) {}
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
@@ -408,9 +388,9 @@
 class ParameterizedTestFactory : public TestFactoryBase {
  public:
   typedef typename TestClass::ParamType ParamType;
-  explicit ParameterizedTestFactory(ParamType parameter) :
-      parameter_(parameter) {}
-  virtual Test* CreateTest() {
+  explicit ParameterizedTestFactory(ParamType parameter)
+      : parameter_(parameter) {}
+  Test *CreateTest() override {
     TestClass::SetParam(&parameter_);
     return new TestClass();
   }
@@ -430,7 +410,7 @@
  public:
   virtual ~TestMetaFactoryBase() {}
 
-  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+  virtual TestFactoryBase *CreateTestFactory(ParamType parameter) = 0;
 };
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -438,19 +418,19 @@
 // TestMetaFactory creates test factories for passing into
 // MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
 // ownership of test factory pointer, same factory object cannot be passed
-// into that method twice. But ParameterizedTestCaseInfo is going to call
+// into that method twice. But ParameterizedTestSuiteInfo is going to call
 // it for each Test/Parameter value combination. Thus it needs meta factory
 // creator class.
-template <class TestCase>
+template <class TestSuite>
 class TestMetaFactory
-    : public TestMetaFactoryBase<typename TestCase::ParamType> {
+    : public TestMetaFactoryBase<typename TestSuite::ParamType> {
  public:
-  typedef typename TestCase::ParamType ParamType;
+  using ParamType = typename TestSuite::ParamType;
 
   TestMetaFactory() {}
 
-  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) {
-    return new ParameterizedTestFactory<TestCase>(parameter);
+  TestFactoryBase *CreateTestFactory(ParamType parameter) override {
+    return new ParameterizedTestFactory<TestSuite>(parameter);
   }
 
  private:
@@ -459,273 +439,484 @@
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
-// ParameterizedTestCaseInfoBase is a generic interface
-// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase
+// ParameterizedTestSuiteInfoBase is a generic interface
+// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase
 // accumulates test information provided by TEST_P macro invocations
-// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations
+// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations
 // and uses that information to register all resulting test instances
-// in RegisterTests method. The ParameterizeTestCaseRegistry class holds
-// a collection of pointers to the ParameterizedTestCaseInfo objects
+// in RegisterTests method. The ParameterizeTestSuiteRegistry class holds
+// a collection of pointers to the ParameterizedTestSuiteInfo objects
 // and calls RegisterTests() on each of them when asked.
-class ParameterizedTestCaseInfoBase {
+class ParameterizedTestSuiteInfoBase {
  public:
-  virtual ~ParameterizedTestCaseInfoBase() {}
+  virtual ~ParameterizedTestSuiteInfoBase() {}
 
-  // Base part of test case name for display purposes.
-  virtual const string& GetTestCaseName() const = 0;
+  // Base part of test suite name for display purposes.
+  virtual const std::string &GetTestSuiteName() const = 0;
   // Test case id to verify identity.
-  virtual TypeId GetTestCaseTypeId() const = 0;
+  virtual TypeId GetTestSuiteTypeId() const = 0;
   // UnitTest class invokes this method to register tests in this
-  // test case right before running them in RUN_ALL_TESTS macro.
-  // This method should not be called more then once on any single
-  // instance of a ParameterizedTestCaseInfoBase derived class.
+  // test suite right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestSuiteInfoBase derived class.
   virtual void RegisterTests() = 0;
 
  protected:
-  ParameterizedTestCaseInfoBase() {}
+  ParameterizedTestSuiteInfoBase() {}
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase);
 };
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
-// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
-// macro invocations for a particular test case and generators
-// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
-// test case. It registers tests with all values generated by all
+// Report a the name of a test_suit as safe to ignore
+// as the side effect of construction of this type.
+struct MarkAsIgnored {
+  explicit MarkAsIgnored(const char *test_suite);
+};
+
+GTEST_API_ void InsertSyntheticTestCase(const std::string &name,
+                                        CodeLocation location, bool has_test_p);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test suite and generators
+// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that
+// test suite. It registers tests with all values generated by all
 // generators when asked.
-template <class TestCase>
-class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
+template <class TestSuite>
+class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
  public:
   // ParamType and GeneratorCreationFunc are private types but are required
   // for declarations of public methods AddTestPattern() and
-  // AddTestCaseInstantiation().
-  typedef typename TestCase::ParamType ParamType;
+  // AddTestSuiteInstantiation().
+  using ParamType = typename TestSuite::ParamType;
   // A function that returns an instance of appropriate generator type.
   typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
-  typedef typename ParamNameGenFunc<ParamType>::Type ParamNameGeneratorFunc;
+  using ParamNameGeneratorFunc = std::string(const TestParamInfo<ParamType> &);
 
-  explicit ParameterizedTestCaseInfo(
-      const char* name, CodeLocation code_location)
-      : test_case_name_(name), code_location_(code_location) {}
+  explicit ParameterizedTestSuiteInfo(const char *name,
+                                      CodeLocation code_location)
+      : test_suite_name_(name), code_location_(code_location) {}
 
   // Test case base name for display purposes.
-  virtual const string& GetTestCaseName() const { return test_case_name_; }
+  const std::string &GetTestSuiteName() const override {
+    return test_suite_name_;
+  }
   // Test case id to verify identity.
-  virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
+  TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
   // TEST_P macro uses AddTestPattern() to record information
   // about a single test in a LocalTestInfo structure.
-  // test_case_name is the base name of the test case (without invocation
+  // test_suite_name is the base name of the test suite (without invocation
   // prefix). test_base_name is the name of an individual test without
   // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
-  // test case base name and DoBar is test base name.
-  void AddTestPattern(const char* test_case_name,
-                      const char* test_base_name,
-                      TestMetaFactoryBase<ParamType>* meta_factory) {
-    tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
-                                                       test_base_name,
-                                                       meta_factory)));
+  // test suite base name and DoBar is test base name.
+  void AddTestPattern(const char *test_suite_name, const char *test_base_name,
+                      TestMetaFactoryBase<ParamType> *meta_factory) {
+    tests_.push_back(std::shared_ptr<TestInfo>(
+        new TestInfo(test_suite_name, test_base_name, meta_factory)));
   }
-  // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information
+  // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information
   // about a generator.
-  int AddTestCaseInstantiation(const string& instantiation_name,
-                               GeneratorCreationFunc* func,
-                               ParamNameGeneratorFunc* name_func,
-                               const char* file,
-                               int line) {
+  int AddTestSuiteInstantiation(const std::string &instantiation_name,
+                                GeneratorCreationFunc *func,
+                                ParamNameGeneratorFunc *name_func,
+                                const char *file, int line) {
     instantiations_.push_back(
         InstantiationInfo(instantiation_name, func, name_func, file, line));
     return 0;  // Return value used only to run this method in namespace scope.
   }
-  // UnitTest class invokes this method to register tests in this test case
-  // test cases right before running tests in RUN_ALL_TESTS macro.
-  // This method should not be called more then once on any single
-  // instance of a ParameterizedTestCaseInfoBase derived class.
-  // UnitTest has a guard to prevent from calling this method more then once.
-  virtual void RegisterTests() {
+  // UnitTest class invokes this method to register tests in this test suite
+  // right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestSuiteInfoBase derived class.
+  // UnitTest has a guard to prevent from calling this method more than once.
+  void RegisterTests() override {
+    bool generated_instantiations = false;
+
     for (typename TestInfoContainer::iterator test_it = tests_.begin();
          test_it != tests_.end(); ++test_it) {
-      linked_ptr<TestInfo> test_info = *test_it;
+      std::shared_ptr<TestInfo> test_info = *test_it;
       for (typename InstantiationContainer::iterator gen_it =
-               instantiations_.begin(); gen_it != instantiations_.end();
-               ++gen_it) {
-        const string& instantiation_name = gen_it->name;
+               instantiations_.begin();
+           gen_it != instantiations_.end(); ++gen_it) {
+        const std::string &instantiation_name = gen_it->name;
         ParamGenerator<ParamType> generator((*gen_it->generator)());
-        ParamNameGeneratorFunc* name_func = gen_it->name_func;
-        const char* file = gen_it->file;
+        ParamNameGeneratorFunc *name_func = gen_it->name_func;
+        const char *file = gen_it->file;
         int line = gen_it->line;
 
-        string test_case_name;
-        if ( !instantiation_name.empty() )
-          test_case_name = instantiation_name + "/";
-        test_case_name += test_info->test_case_base_name;
+        std::string test_suite_name;
+        if (!instantiation_name.empty())
+          test_suite_name = instantiation_name + "/";
+        test_suite_name += test_info->test_suite_base_name;
 
         size_t i = 0;
         std::set<std::string> test_param_names;
         for (typename ParamGenerator<ParamType>::iterator param_it =
                  generator.begin();
              param_it != generator.end(); ++param_it, ++i) {
+          generated_instantiations = true;
+
           Message test_name_stream;
 
-          std::string param_name = name_func(
-              TestParamInfo<ParamType>(*param_it, i));
+          std::string param_name =
+              name_func(TestParamInfo<ParamType>(*param_it, i));
 
           GTEST_CHECK_(IsValidParamName(param_name))
               << "Parameterized test name '" << param_name
-              << "' is invalid, in " << file
-              << " line " << line << std::endl;
+              << "' is invalid, in " << file << " line " << line << std::endl;
 
           GTEST_CHECK_(test_param_names.count(param_name) == 0)
-              << "Duplicate parameterized test name '" << param_name
-              << "', in " << file << " line " << line << std::endl;
+              << "Duplicate parameterized test name '" << param_name << "', in "
+              << file << " line " << line << std::endl;
 
           test_param_names.insert(param_name);
 
-          test_name_stream << test_info->test_base_name << "/" << param_name;
+          if (!test_info->test_base_name.empty()) {
+            test_name_stream << test_info->test_base_name << "/";
+          }
+          test_name_stream << param_name;
           MakeAndRegisterTestInfo(
-              test_case_name.c_str(),
-              test_name_stream.GetString().c_str(),
-              NULL,  // No type parameter.
-              PrintToString(*param_it).c_str(),
-              code_location_,
-              GetTestCaseTypeId(),
-              TestCase::SetUpTestCase,
-              TestCase::TearDownTestCase,
+              test_suite_name.c_str(), test_name_stream.GetString().c_str(),
+              nullptr,  // No type parameter.
+              PrintToString(*param_it).c_str(), code_location_,
+              GetTestSuiteTypeId(),
+              SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
+              SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
               test_info->test_meta_factory->CreateTestFactory(*param_it));
         }  // for param_it
-      }  // for gen_it
-    }  // for test_it
+      }    // for gen_it
+    }      // for test_it
+
+    if (!generated_instantiations) {
+      // There are no generaotrs, or they all generate nothing ...
+      InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
+                              !tests_.empty());
+    }
   }  // RegisterTests
 
  private:
   // LocalTestInfo structure keeps information about a single test registered
   // with TEST_P macro.
   struct TestInfo {
-    TestInfo(const char* a_test_case_base_name,
-             const char* a_test_base_name,
-             TestMetaFactoryBase<ParamType>* a_test_meta_factory) :
-        test_case_base_name(a_test_case_base_name),
-        test_base_name(a_test_base_name),
-        test_meta_factory(a_test_meta_factory) {}
+    TestInfo(const char *a_test_suite_base_name, const char *a_test_base_name,
+             TestMetaFactoryBase<ParamType> *a_test_meta_factory)
+        : test_suite_base_name(a_test_suite_base_name),
+          test_base_name(a_test_base_name),
+          test_meta_factory(a_test_meta_factory) {}
 
-    const string test_case_base_name;
-    const string test_base_name;
-    const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+    const std::string test_suite_base_name;
+    const std::string test_base_name;
+    const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory;
   };
-  typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer;
-  // Records data received from INSTANTIATE_TEST_CASE_P macros:
+  using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>;
+  // Records data received from INSTANTIATE_TEST_SUITE_P macros:
   //  <Instantiation name, Sequence generator creation function,
   //     Name generator function, Source file, Source line>
   struct InstantiationInfo {
-      InstantiationInfo(const std::string &name_in,
-                        GeneratorCreationFunc* generator_in,
-                        ParamNameGeneratorFunc* name_func_in,
-                        const char* file_in,
-                        int line_in)
-          : name(name_in),
-            generator(generator_in),
-            name_func(name_func_in),
-            file(file_in),
-            line(line_in) {}
+    InstantiationInfo(const std::string &name_in,
+                      GeneratorCreationFunc *generator_in,
+                      ParamNameGeneratorFunc *name_func_in, const char *file_in,
+                      int line_in)
+        : name(name_in), generator(generator_in), name_func(name_func_in),
+          file(file_in), line(line_in) {}
 
-      std::string name;
-      GeneratorCreationFunc* generator;
-      ParamNameGeneratorFunc* name_func;
-      const char* file;
-      int line;
+    std::string name;
+    GeneratorCreationFunc *generator;
+    ParamNameGeneratorFunc *name_func;
+    const char *file;
+    int line;
   };
   typedef ::std::vector<InstantiationInfo> InstantiationContainer;
 
-  static bool IsValidParamName(const std::string& name) {
+  static bool IsValidParamName(const std::string &name) {
     // Check for empty string
-    if (name.empty())
-      return false;
+    if (name.empty()) return false;
 
     // Check for invalid characters
     for (std::string::size_type index = 0; index < name.size(); ++index) {
-      if (!isalnum(name[index]) && name[index] != '_')
-        return false;
+      if (!isalnum(name[index]) && name[index] != '_') return false;
     }
 
     return true;
   }
 
-  const string test_case_name_;
+  const std::string test_suite_name_;
   CodeLocation code_location_;
   TestInfoContainer tests_;
   InstantiationContainer instantiations_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo);
-};  // class ParameterizedTestCaseInfo
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo);
+};  // class ParameterizedTestSuiteInfo
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+template <class TestCase>
+using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo<TestCase>;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
-// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase
-// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P
-// macros use it to locate their corresponding ParameterizedTestCaseInfo
-// descriptors.
-class ParameterizedTestCaseRegistry {
+// ParameterizedTestSuiteRegistry contains a map of
+// ParameterizedTestSuiteInfoBase classes accessed by test suite names. TEST_P
+// and INSTANTIATE_TEST_SUITE_P macros use it to locate their corresponding
+// ParameterizedTestSuiteInfo descriptors.
+class ParameterizedTestSuiteRegistry {
  public:
-  ParameterizedTestCaseRegistry() {}
-  ~ParameterizedTestCaseRegistry() {
-    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
-         it != test_case_infos_.end(); ++it) {
-      delete *it;
+  ParameterizedTestSuiteRegistry() {}
+  ~ParameterizedTestSuiteRegistry() {
+    for (auto &test_suite_info : test_suite_infos_) {
+      delete test_suite_info;
     }
   }
 
   // Looks up or creates and returns a structure containing information about
-  // tests and instantiations of a particular test case.
-  template <class TestCase>
-  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
-      const char* test_case_name,
-      CodeLocation code_location) {
-    ParameterizedTestCaseInfo<TestCase>* typed_test_info = NULL;
-    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
-         it != test_case_infos_.end(); ++it) {
-      if ((*it)->GetTestCaseName() == test_case_name) {
-        if ((*it)->GetTestCaseTypeId() != GetTypeId<TestCase>()) {
+  // tests and instantiations of a particular test suite.
+  template <class TestSuite>
+  ParameterizedTestSuiteInfo<TestSuite> *GetTestSuitePatternHolder(
+      const char *test_suite_name, CodeLocation code_location) {
+    ParameterizedTestSuiteInfo<TestSuite> *typed_test_info = nullptr;
+    for (auto &test_suite_info : test_suite_infos_) {
+      if (test_suite_info->GetTestSuiteName() == test_suite_name) {
+        if (test_suite_info->GetTestSuiteTypeId() != GetTypeId<TestSuite>()) {
           // Complain about incorrect usage of Google Test facilities
           // and terminate the program since we cannot guaranty correct
-          // test case setup and tear-down in this case.
-          ReportInvalidTestCaseType(test_case_name, code_location);
+          // test suite setup and tear-down in this case.
+          ReportInvalidTestSuiteType(test_suite_name, code_location);
           posix::Abort();
         } else {
           // At this point we are sure that the object we found is of the same
           // type we are looking for, so we downcast it to that type
           // without further checks.
           typed_test_info = CheckedDowncastToActualType<
-              ParameterizedTestCaseInfo<TestCase> >(*it);
+              ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
         }
         break;
       }
     }
-    if (typed_test_info == NULL) {
-      typed_test_info = new ParameterizedTestCaseInfo<TestCase>(
-          test_case_name, code_location);
-      test_case_infos_.push_back(typed_test_info);
+    if (typed_test_info == nullptr) {
+      typed_test_info = new ParameterizedTestSuiteInfo<TestSuite>(
+          test_suite_name, code_location);
+      test_suite_infos_.push_back(typed_test_info);
     }
     return typed_test_info;
   }
   void RegisterTests() {
-    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
-         it != test_case_infos_.end(); ++it) {
-      (*it)->RegisterTests();
+    for (auto &test_suite_info : test_suite_infos_) {
+      test_suite_info->RegisterTests();
     }
   }
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase> *GetTestCasePatternHolder(
+      const char *test_case_name, CodeLocation code_location) {
+    return GetTestSuitePatternHolder<TestCase>(test_case_name, code_location);
+  }
+
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ private:
+  using TestSuiteInfoContainer =
+      ::std::vector<ParameterizedTestSuiteInfoBase *>;
+
+  TestSuiteInfoContainer test_suite_infos_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry);
+};
+
+// Keep track of which type-parameterized test suites are defined and
+// where, as well as which are instantiated. This allows subsequently
+// identifying suites that are defined but never used.
+class TypeParameterizedTestSuiteRegistry {
+ public:
+  // Add a suite definition
+  void RegisterTestSuite(const char *test_suite_name,
+                         CodeLocation code_location);
+
+  // Add an instantiation of a suite.
+  void RegisterInstantiation(const char *test_suite_name);
+
+  // For each suite reported as defined but not reported as instantiated,
+  // emit a test that reports that fact (configurably, as an error).
+  void CheckForInstantiations();
+
+ private:
+  struct TypeParameterizedTestSuiteInfo {
+    explicit TypeParameterizedTestSuiteInfo(CodeLocation c)
+        : code_location(c), instantiated(false) {}
+
+    CodeLocation code_location;
+    bool instantiated;
+  };
+
+  std::map<std::string, TypeParameterizedTestSuiteInfo> suites_;
+};
+
+}  // namespace internal
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container &container);
+
+namespace internal {
+// Used in the Values() function to provide polymorphic capabilities.
+
+template <typename... Ts>
+class ValueArray {
+ public:
+  ValueArray(Ts... v) : v_{ std::move(v)... } {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // NOLINT
+    return ValuesIn(MakeVector<T>(MakeIndexSequence<sizeof...(Ts)>()));
+  }
 
  private:
-  typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer;
+  template <typename T, size_t... I>
+  std::vector<T> MakeVector(IndexSequence<I...>) const {
+    return std::vector<T>{ static_cast<T>(v_.template Get<I>())... };
+  }
 
-  TestCaseInfoContainer test_case_infos_;
+  FlatTuple<Ts...> v_;
+};
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry);
+template <typename... T>
+class CartesianProductGenerator
+    : public ParamGeneratorInterface<::std::tuple<T...>> {
+ public:
+  typedef ::std::tuple<T...> ParamType;
+
+  CartesianProductGenerator(const std::tuple<ParamGenerator<T>...> &g)
+      : generators_(g) {}
+  ~CartesianProductGenerator() override {}
+
+  ParamIteratorInterface<ParamType> *Begin() const override {
+    return new Iterator(this, generators_, false);
+  }
+  ParamIteratorInterface<ParamType> *End() const override {
+    return new Iterator(this, generators_, true);
+  }
+
+ private:
+  template <class I>
+  class IteratorImpl;
+  template <size_t... I>
+  class IteratorImpl<IndexSequence<I...>>
+      : public ParamIteratorInterface<ParamType> {
+   public:
+    IteratorImpl(const ParamGeneratorInterface<ParamType> *base,
+                 const std::tuple<ParamGenerator<T>...> &generators,
+                 bool is_end)
+        : base_(base), begin_(std::get<I>(generators).begin()...),
+          end_(std::get<I>(generators).end()...),
+          current_(is_end ? end_ : begin_) {
+      ComputeCurrentValue();
+    }
+    ~IteratorImpl() override {}
+
+    const ParamGeneratorInterface<ParamType> *BaseGenerator() const override {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    void Advance() override {
+      assert(!AtEnd());
+      // Advance the last iterator.
+      ++std::get<sizeof...(T) - 1>(current_);
+      // if that reaches end, propagate that up.
+      AdvanceIfEnd<sizeof...(T) - 1>();
+      ComputeCurrentValue();
+    }
+    ParamIteratorInterface<ParamType> *Clone() const override {
+      return new IteratorImpl(*this);
+    }
+
+    const ParamType *Current() const override { return current_value_.get(); }
+
+    bool Equals(const ParamIteratorInterface<ParamType> &other) const override {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const IteratorImpl *typed_other =
+          CheckedDowncastToActualType<const IteratorImpl>(&other);
+
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      if (AtEnd() && typed_other->AtEnd()) return true;
+
+      bool same = true;
+      bool dummy[] = { (same = same &&
+                               std::get<I>(current_) ==
+                                   std::get<I>(typed_other->current_))... };
+      (void)dummy;
+      return same;
+    }
+
+   private:
+    template <size_t ThisI>
+    void AdvanceIfEnd() {
+      if (std::get<ThisI>(current_) != std::get<ThisI>(end_)) return;
+
+      bool last = ThisI == 0;
+      if (last) {
+        // We are done. Nothing else to propagate.
+        return;
+      }
+
+      constexpr size_t NextI = ThisI - (ThisI != 0);
+      std::get<ThisI>(current_) = std::get<ThisI>(begin_);
+      ++std::get<NextI>(current_);
+      AdvanceIfEnd<NextI>();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = std::make_shared<ParamType>(*std::get<I>(current_)...);
+    }
+    bool AtEnd() const {
+      bool at_end = false;
+      bool dummy[] = { (at_end = at_end || std::get<I>(current_) ==
+                                               std::get<I>(end_))... };
+      (void)dummy;
+      return at_end;
+    }
+
+    const ParamGeneratorInterface<ParamType> *const base_;
+    std::tuple<typename ParamGenerator<T>::iterator...> begin_;
+    std::tuple<typename ParamGenerator<T>::iterator...> end_;
+    std::tuple<typename ParamGenerator<T>::iterator...> current_;
+    std::shared_ptr<ParamType> current_value_;
+  };
+
+  using Iterator = IteratorImpl<typename MakeIndexSequence<sizeof...(T)>::type>;
+
+  std::tuple<ParamGenerator<T>...> generators_;
+};
+
+template <class... Gen>
+class CartesianProductHolder {
+ public:
+  CartesianProductHolder(const Gen &... g) : generators_(g...) {}
+  template <typename... T>
+  operator ParamGenerator<::std::tuple<T...>>() const {
+    return ParamGenerator<::std::tuple<T...>>(
+        new CartesianProductGenerator<T...>(generators_));
+  }
+
+ private:
+  std::tuple<Gen...> generators_;
 };
 
 }  // namespace internal
 }  // namespace testing
 
-#endif  //  GTEST_HAS_PARAM_TEST
-
 #endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
index 74ab949..f803a19 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
@@ -27,7 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the GTEST_OS_* macro.
 // It is separate from gtest-port.h so that custom/gtest-port.h can include it.
@@ -37,57 +37,75 @@
 
 // Determines the platform on which Google Test is compiled.
 #ifdef __CYGWIN__
-# define GTEST_OS_CYGWIN 1
-#elif defined __SYMBIAN32__
-# define GTEST_OS_SYMBIAN 1
+#define GTEST_OS_CYGWIN 1
+#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
+#define GTEST_OS_WINDOWS_MINGW 1
+#define GTEST_OS_WINDOWS 1
 #elif defined _WIN32
-# define GTEST_OS_WINDOWS 1
-# ifdef _WIN32_WCE
-#  define GTEST_OS_WINDOWS_MOBILE 1
-# elif defined(__MINGW__) || defined(__MINGW32__)
-#  define GTEST_OS_WINDOWS_MINGW 1
-# elif defined(WINAPI_FAMILY)
-#  include <winapifamily.h>
-#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#   define GTEST_OS_WINDOWS_DESKTOP 1
-#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
-#   define GTEST_OS_WINDOWS_PHONE 1
-#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
-#   define GTEST_OS_WINDOWS_RT 1
-#  else
-    // WINAPI_FAMILY defined but no known partition matched.
-    // Default to desktop.
-#   define GTEST_OS_WINDOWS_DESKTOP 1
-#  endif
-# else
-#  define GTEST_OS_WINDOWS_DESKTOP 1
-# endif  // _WIN32_WCE
+#define GTEST_OS_WINDOWS 1
+#ifdef _WIN32_WCE
+#define GTEST_OS_WINDOWS_MOBILE 1
+#elif defined(WINAPI_FAMILY)
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#define GTEST_OS_WINDOWS_PHONE 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#define GTEST_OS_WINDOWS_RT 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+#define GTEST_OS_WINDOWS_PHONE 1
+#define GTEST_OS_WINDOWS_TV_TITLE 1
+#else
+// WINAPI_FAMILY defined but no known partition matched.
+// Default to desktop.
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif
+#else
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif  // _WIN32_WCE
+#elif defined __OS2__
+#define GTEST_OS_OS2 1
 #elif defined __APPLE__
-# define GTEST_OS_MAC 1
-# if TARGET_OS_IPHONE
-#  define GTEST_OS_IOS 1
-# endif
+#define GTEST_OS_MAC 1
+#if TARGET_OS_IPHONE
+#define GTEST_OS_IOS 1
+#endif
+#elif defined __DragonFly__
+#define GTEST_OS_DRAGONFLY 1
 #elif defined __FreeBSD__
-# define GTEST_OS_FREEBSD 1
+#define GTEST_OS_FREEBSD 1
+#elif defined __Fuchsia__
+#define GTEST_OS_FUCHSIA 1
+#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
+#define GTEST_OS_GNU_KFREEBSD 1
 #elif defined __linux__
-# define GTEST_OS_LINUX 1
-# if defined __ANDROID__
-#  define GTEST_OS_LINUX_ANDROID 1
-# endif
+#define GTEST_OS_LINUX 1
+#if defined __ANDROID__
+#define GTEST_OS_LINUX_ANDROID 1
+#endif
 #elif defined __MVS__
-# define GTEST_OS_ZOS 1
+#define GTEST_OS_ZOS 1
 #elif defined(__sun) && defined(__SVR4)
-# define GTEST_OS_SOLARIS 1
+#define GTEST_OS_SOLARIS 1
 #elif defined(_AIX)
-# define GTEST_OS_AIX 1
+#define GTEST_OS_AIX 1
 #elif defined(__hpux)
-# define GTEST_OS_HPUX 1
+#define GTEST_OS_HPUX 1
 #elif defined __native_client__
-# define GTEST_OS_NACL 1
+#define GTEST_OS_NACL 1
+#elif defined __NetBSD__
+#define GTEST_OS_NETBSD 1
 #elif defined __OpenBSD__
-# define GTEST_OS_OPENBSD 1
+#define GTEST_OS_OPENBSD 1
 #elif defined __QNX__
-# define GTEST_OS_QNX 1
+#define GTEST_OS_QNX 1
+#elif defined(__HAIKU__)
+#define GTEST_OS_HAIKU 1
+#elif defined ESP8266
+#define GTEST_OS_ESP8266 1
+#elif defined ESP32
+#define GTEST_OS_ESP32 1
 #endif  // __CYGWIN__
 
 #endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
index da57e65..083da56 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
@@ -27,8 +27,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Authors: wan@google.com (Zhanyong Wan)
-//
 // Low-level types and utilities for porting Google Test to various
 // platforms.  All macros ending with _ and symbols defined in an
 // internal namespace are subject to change without notice.  Code
@@ -40,6 +38,8 @@
 // files are expected to #include this.  Therefore, it cannot #include
 // any other Google Test header.
 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
 
@@ -72,12 +72,6 @@
 //                              is/isn't available.
 //   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
 //                              are enabled.
-//   GTEST_HAS_GLOBAL_STRING  - Define it to 1/0 to indicate that ::string
-//                              is/isn't available (some systems define
-//                              ::string, which is different to std::string).
-//   GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string
-//                              is/isn't available (some systems define
-//                              ::wstring, which is different to std::wstring).
 //   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
 //                              expressions are/aren't available.
 //   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
@@ -87,8 +81,6 @@
 //   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
 //                              std::wstring does/doesn't work (Google Test can
 //                              be used where std::wstring is unavailable).
-//   GTEST_HAS_TR1_TUPLE      - Define it to 1/0 to indicate tr1::tuple
-//                              is/isn't available.
 //   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
 //                              compiler supports Microsoft's "Structured
 //                              Exception Handling".
@@ -96,12 +88,6 @@
 //                            - Define it to 1/0 to indicate whether the
 //                              platform supports I/O stream redirection using
 //                              dup() and dup2().
-//   GTEST_USE_OWN_TR1_TUPLE  - Define it to 1/0 to indicate whether Google
-//                              Test's own tr1 tuple implementation should be
-//                              used.  Unused when the user sets
-//                              GTEST_HAS_TR1_TUPLE to 0.
-//   GTEST_LANG_CXX11         - Define it to 1/0 to indicate that Google Test
-//                              is building in C++11/C++98 mode.
 //   GTEST_LINKED_AS_SHARED_LIBRARY
 //                            - Define to 1 when compiling tests that use
 //                              Google Test as a shared library (known as
@@ -109,6 +95,12 @@
 //   GTEST_CREATE_SHARED_LIBRARY
 //                            - Define to 1 when compiling Google Test itself
 //                              as a shared library.
+//   GTEST_DEFAULT_DEATH_TEST_STYLE
+//                            - The default value of --gtest_death_test_style.
+//                              The legacy default has been "fast" in the open
+//                              source version since 2008. The recommended value
+//                              is "threadsafe", and can be set in
+//                              custom/gtest-port.h.
 
 // Platform-indicating macros
 // --------------------------
@@ -121,17 +113,22 @@
 //
 //   GTEST_OS_AIX      - IBM AIX
 //   GTEST_OS_CYGWIN   - Cygwin
+//   GTEST_OS_DRAGONFLY - DragonFlyBSD
 //   GTEST_OS_FREEBSD  - FreeBSD
+//   GTEST_OS_FUCHSIA  - Fuchsia
+//   GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD
+//   GTEST_OS_HAIKU    - Haiku
 //   GTEST_OS_HPUX     - HP-UX
 //   GTEST_OS_LINUX    - Linux
 //     GTEST_OS_LINUX_ANDROID - Google Android
 //   GTEST_OS_MAC      - Mac OS X
 //     GTEST_OS_IOS    - iOS
 //   GTEST_OS_NACL     - Google Native Client (NaCl)
+//   GTEST_OS_NETBSD   - NetBSD
 //   GTEST_OS_OPENBSD  - OpenBSD
+//   GTEST_OS_OS2      - OS/2
 //   GTEST_OS_QNX      - QNX
 //   GTEST_OS_SOLARIS  - Sun Solaris
-//   GTEST_OS_SYMBIAN  - Symbian
 //   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
 //     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
 //     GTEST_OS_WINDOWS_MINGW    - MinGW
@@ -140,7 +137,7 @@
 //     GTEST_OS_WINDOWS_RT       - Windows Store App/WinRT
 //   GTEST_OS_ZOS      - z/OS
 //
-// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
 // most stable support.  Since core members of the Google Test project
 // don't have access to other platforms, support for them may be less
 // stable.  If you notice any problems on your platform, please notify
@@ -166,19 +163,16 @@
 //   EXPECT_DEATH(DoSomethingDeadly());
 // #endif
 //
-//   GTEST_HAS_COMBINE      - the Combine() function (for value-parameterized
-//                            tests)
 //   GTEST_HAS_DEATH_TEST   - death tests
-//   GTEST_HAS_PARAM_TEST   - value-parameterized tests
 //   GTEST_HAS_TYPED_TEST   - typed tests
 //   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
 //   GTEST_IS_THREADSAFE    - Google Test is thread-safe.
+//   GOOGLETEST_CM0007 DO NOT DELETE
 //   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
 //                            GTEST_HAS_POSIX_RE (see above) which users can
 //                            define themselves.
 //   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
-//                            the above two are mutually exclusive.
-//   GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
+//                            the above RE\b(s) are mutually exclusive.
 
 // Misc public macros
 // ------------------
@@ -196,36 +190,29 @@
 //   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
 //   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
 //                              variable don't have to be used.
-//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
+//   GTEST_DISALLOW_ASSIGN_   - disables copy operator=.
 //   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
+//   GTEST_DISALLOW_MOVE_ASSIGN_   - disables move operator=.
+//   GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=.
 //   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
 //   GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
 //                                        suppressed (constant conditional).
 //   GTEST_INTENTIONAL_CONST_COND_POP_  - finish code section where MSVC C4127
 //                                        is suppressed.
-//
-// C++11 feature wrappers:
-//
-//   testing::internal::move  - portability wrapper for std::move.
+//   GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher<std::string_view> or
+//                                    Matcher<absl::string_view>
+//                                    specializations.
 //
 // Synchronization:
 //   Mutex, MutexLock, ThreadLocal, GetThreadCount()
 //                            - synchronization primitives.
 //
-// Template meta programming:
-//   is_pointer     - as in TR1; needed on Symbian and IBM XL C/C++ only.
-//   IteratorTraits - partial implementation of std::iterator_traits, which
-//                    is not available in libCstd when compiled with Sun C++.
-//
-// Smart pointers:
-//   scoped_ptr     - as in TR2.
-//
 // Regular expressions:
 //   RE             - a simple regular expression class using the POSIX
-//                    Extended Regular Expression syntax on UNIX-like
-//                    platforms, or a reduced regular exception syntax on
-//                    other platforms, including Windows.
-//
+//                    Extended Regular Expression syntax on UNIX-like platforms
+//                    GOOGLETEST_CM0008 DO NOT DELETE
+//                    or a reduced regular exception syntax on other
+//                    platforms, including Windows.
 // Logging:
 //   GTEST_LOG_()   - logs messages at the specified severity level.
 //   LogToStderr()  - directs all log messages to stderr.
@@ -241,8 +228,7 @@
 //
 // Integer types:
 //   TypeWithSize   - maps an integer to a int type.
-//   Int32, UInt32, Int64, UInt64, TimeInMillis
-//                  - integers of known sizes.
+//   TimeInMillis   - integers of known sizes.
 //   BiggestInt     - the biggest signed integer type.
 //
 // Command-line utilities:
@@ -253,52 +239,60 @@
 // Environment variable utilities:
 //   GetEnv()             - gets the value of an environment variable.
 //   BoolFromGTestEnv()   - parses a bool environment variable.
-//   Int32FromGTestEnv()  - parses an Int32 environment variable.
+//   Int32FromGTestEnv()  - parses an int32_t environment variable.
 //   StringFromGTestEnv() - parses a string environment variable.
+//
+// Deprecation warnings:
+//   GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as
+//                                        deprecated; calling a marked function
+//                                        should generate a compiler warning
 
 #include <ctype.h>   // for isspace, etc
 #include <stddef.h>  // for ptrdiff_t
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
 #ifndef _WIN32_WCE
-# include <sys/types.h>
-# include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #endif  // !_WIN32_WCE
 
 #if defined __APPLE__
-# include <AvailabilityMacros.h>
-# include <TargetConditionals.h>
+#include <AvailabilityMacros.h>
+#include <TargetConditionals.h>
 #endif
 
-#include <algorithm>  // NOLINT
 #include <iostream>  // NOLINT
-#include <sstream>  // NOLINT
+#include <memory>
 #include <string>  // NOLINT
-#include <utility>
+#include <tuple>
 #include <vector>  // NOLINT
 
-#include "gtest/internal/gtest-port-arch.h"
 #include "gtest/internal/custom/gtest-port.h"
+#include "gtest/internal/gtest-port-arch.h"
 
 #if !defined(GTEST_DEV_EMAIL_)
-# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
-# define GTEST_FLAG_PREFIX_ "gtest_"
-# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
-# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
-# define GTEST_NAME_ "Google Test"
-# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
 #endif  // !defined(GTEST_DEV_EMAIL_)
 
 #if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
-# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
 #endif  // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
 
 // Determines the version of gcc that is used to compile this.
 #ifdef __GNUC__
 // 40302 means version 4.3.2.
-# define GTEST_GCC_VER_ \
-    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#define GTEST_GCC_VER_ \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
 #endif  // __GNUC__
 
 // Macros for disabling Microsoft Visual C++ warnings.
@@ -306,121 +300,71 @@
 //   GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385)
 //   /* code that triggers warnings C4800 and C4385 */
 //   GTEST_DISABLE_MSC_WARNINGS_POP_()
-#if _MSC_VER >= 1500
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
-    __pragma(warning(push))                        \
-    __pragma(warning(disable: warnings))
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()          \
-    __pragma(warning(pop))
+#if defined(_MSC_VER)
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+  __pragma(warning(push)) __pragma(warning(disable : warnings))
+#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop))
 #else
-// Older versions of MSVC don't have __pragma.
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()
+// Not all compilers are MSVC
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+#define GTEST_DISABLE_MSC_WARNINGS_POP_()
 #endif
 
-#ifndef GTEST_LANG_CXX11
-// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
-// -std={c,gnu}++{0x,11} is passed.  The C++11 standard specifies a
-// value for __cplusplus, and recent versions of clang, gcc, and
-// probably other compilers set that too in C++11 mode.
-# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
-// Compiling in at least C++11 mode.
-#  define GTEST_LANG_CXX11 1
-# else
-#  define GTEST_LANG_CXX11 0
-# endif
-#endif
-
-// Distinct from C++11 language support, some environments don't provide
-// proper C++11 library support. Notably, it's possible to build in
-// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++
-// with no C++11 support.
-//
-// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__
-// 20110325, but maintenance releases in the 4.4 and 4.5 series followed
-// this date, so check for those versions by their date stamps.
-// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning
-#if GTEST_LANG_CXX11 && \
-    (!defined(__GLIBCXX__) || ( \
-        __GLIBCXX__ >= 20110325ul &&  /* GCC >= 4.6.0 */ \
-        /* Blacklist of patch releases of older branches: */ \
-        __GLIBCXX__ != 20110416ul &&  /* GCC 4.4.6 */ \
-        __GLIBCXX__ != 20120313ul &&  /* GCC 4.4.7 */ \
-        __GLIBCXX__ != 20110428ul &&  /* GCC 4.5.3 */ \
-        __GLIBCXX__ != 20120702ul))   /* GCC 4.5.4 */
-# define GTEST_STDLIB_CXX11 1
-#endif
-
-// Only use C++11 library features if the library provides them.
-#if GTEST_STDLIB_CXX11
-# define GTEST_HAS_STD_BEGIN_AND_END_ 1
-# define GTEST_HAS_STD_FORWARD_LIST_ 1
-# define GTEST_HAS_STD_FUNCTION_ 1
-# define GTEST_HAS_STD_INITIALIZER_LIST_ 1
-# define GTEST_HAS_STD_MOVE_ 1
-# define GTEST_HAS_STD_SHARED_PTR_ 1
-# define GTEST_HAS_STD_TYPE_TRAITS_ 1
-# define GTEST_HAS_STD_UNIQUE_PTR_ 1
-#endif
-
-// C++11 specifies that <tuple> provides std::tuple.
-// Some platforms still might not have it, however.
-#if GTEST_LANG_CXX11
-# define GTEST_HAS_STD_TUPLE_ 1
-# if defined(__clang__)
-// Inspired by http://clang.llvm.org/docs/LanguageExtensions.html#__has_include
-#  if defined(__has_include) && !__has_include(<tuple>)
-#   undef GTEST_HAS_STD_TUPLE_
-#  endif
-# elif defined(_MSC_VER)
-// Inspired by boost/config/stdlib/dinkumware.hpp
-#  if defined(_CPPLIB_VER) && _CPPLIB_VER < 520
-#   undef GTEST_HAS_STD_TUPLE_
-#  endif
-# elif defined(__GLIBCXX__)
-// Inspired by boost/config/stdlib/libstdcpp3.hpp,
-// http://gcc.gnu.org/gcc-4.2/changes.html and
-// http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x
-#  if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
-#   undef GTEST_HAS_STD_TUPLE_
-#  endif
-# endif
+// Clang on Windows does not understand MSVC's pragma warning.
+// We need clang-specific way to disable function deprecation warning.
+#ifdef __clang__
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_()                            \
+  _Pragma("clang diagnostic push")                                      \
+      _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+          _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop")
+#else
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
 #endif
 
 // Brings in definitions for functions used in the testing::internal::posix
 // namespace (read, write, close, chdir, isatty, stat). We do not currently
 // use them on Windows Mobile.
 #if GTEST_OS_WINDOWS
-# if !GTEST_OS_WINDOWS_MOBILE
-#  include <direct.h>
-#  include <io.h>
-# endif
+#if !GTEST_OS_WINDOWS_MOBILE
+#include <direct.h>
+#include <io.h>
+#endif
 // In order to avoid having to include <windows.h>, use forward declaration
-// assuming CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
+#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
+// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
+// separate (equivalent) structs, instead of using typedef
+typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION;
+#else
+// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
 // This assumption is verified by
 // WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
-struct _RTL_CRITICAL_SECTION;
+typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
+#endif
 #else
 // This assumes that non-Windows OSes provide unistd.h. For OSes where this
 // is not the case, we need to include headers that provide the functions
 // mentioned above.
-# include <unistd.h>
-# include <strings.h>
+#include <unistd.h>
+#include <strings.h>
 #endif  // GTEST_OS_WINDOWS
 
 #if GTEST_OS_LINUX_ANDROID
 // Used to define __ANDROID_API__ matching the target NDK API level.
-#  include <android/api-level.h>  // NOLINT
+#include <android/api-level.h>  // NOLINT
 #endif
 
-// Defines this to true iff Google Test can use POSIX regular expressions.
+// Defines this to true if and only if Google Test can use POSIX regular
+// expressions.
 #ifndef GTEST_HAS_POSIX_RE
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
 // On Android, <regex.h> is only available starting with Gingerbread.
-#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
-# else
-#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
-# endif
+#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+#else
+#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+#endif
 #endif
 
 #if GTEST_USES_PCRE
@@ -432,165 +376,143 @@
 // won't compile otherwise.  We can #include it here as we already
 // included <stdlib.h>, which is guaranteed to define size_t through
 // <stddef.h>.
-# include <regex.h>  // NOLINT
+#include <regex.h>  // NOLINT
 
-# define GTEST_USES_POSIX_RE 1
+#define GTEST_USES_POSIX_RE 1
 
 #elif GTEST_OS_WINDOWS
 
 // <regex.h> is not available on Windows.  Use our own simple regex
 // implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
+#define GTEST_USES_SIMPLE_RE 1
 
 #else
 
 // <regex.h> may not be available on this platform.  Use our own
 // simple regex implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
+#define GTEST_USES_SIMPLE_RE 1
 
 #endif  // GTEST_USES_PCRE
 
 #ifndef GTEST_HAS_EXCEPTIONS
 // The user didn't tell us whether exceptions are enabled, so we need
 // to figure it out.
-# if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
+// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
+// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
 // macro to enable exceptions, so we'll do the same.
 // Assumes that exceptions are enabled by default.
-#  ifndef _HAS_EXCEPTIONS
-#   define _HAS_EXCEPTIONS 1
-#  endif  // _HAS_EXCEPTIONS
-#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
-# elif defined(__clang__)
-// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714,
-// but iff cleanups are enabled after that. In Obj-C++ files, there can be
-// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions
-// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++
-// exceptions starting at clang r206352, but which checked for cleanups prior to
-// that. To reliably check for C++ exception availability with clang, check for
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif  // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
+// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang
+// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files,
+// there can be cleanups for ObjC exceptions which also need cleanups, even if
+// C++ exceptions are disabled. clang has __has_feature(cxx_exceptions) which
+// checks for C++ exceptions starting at clang r206352, but which checked for
+// cleanups prior to that. To reliably check for C++ exception availability with
+// clang, check for
 // __EXCEPTIONS && __has_feature(cxx_exceptions).
-#  define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
-# elif defined(__GNUC__) && __EXCEPTIONS
-// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__SUNPRO_CC)
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
 // Sun Pro CC supports exceptions.  However, there is no compile-time way of
 // detecting whether they are enabled or not.  Therefore, we assume that
 // they are enabled unless the user tells us otherwise.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__IBMCPP__) && __EXCEPTIONS
-// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__HP_aCC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
 // Exception handling is in effect by default in HP aCC compiler. It has to
 // be turned of by +noeh compiler option if desired.
-#  define GTEST_HAS_EXCEPTIONS 1
-# else
+#define GTEST_HAS_EXCEPTIONS 1
+#else
 // For other compilers, we assume exceptions are disabled to be
 // conservative.
-#  define GTEST_HAS_EXCEPTIONS 0
-# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 0
+#endif  // defined(_MSC_VER) || defined(__BORLANDC__)
 #endif  // GTEST_HAS_EXCEPTIONS
 
-#if !defined(GTEST_HAS_STD_STRING)
-// Even though we don't use this macro any longer, we keep it in case
-// some clients still depend on it.
-# define GTEST_HAS_STD_STRING 1
-#elif !GTEST_HAS_STD_STRING
-// The user told us that ::std::string isn't available.
-# error "Google Test cannot be used where ::std::string isn't available."
-#endif  // !defined(GTEST_HAS_STD_STRING)
-
-#ifndef GTEST_HAS_GLOBAL_STRING
-// The user didn't tell us whether ::string is available, so we need
-// to figure it out.
-
-# define GTEST_HAS_GLOBAL_STRING 0
-
-#endif  // GTEST_HAS_GLOBAL_STRING
-
 #ifndef GTEST_HAS_STD_WSTRING
 // The user didn't tell us whether ::std::wstring is available, so we need
 // to figure it out.
-// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring
-//   is available.
-
 // Cygwin 1.7 and below doesn't support ::std::wstring.
 // Solaris' libc++ doesn't support it either.  Android has
 // no support for it at least as recent as Froyo (2.2).
-# define GTEST_HAS_STD_WSTRING \
-    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
+#define GTEST_HAS_STD_WSTRING                                         \
+  (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266))
 
 #endif  // GTEST_HAS_STD_WSTRING
 
-#ifndef GTEST_HAS_GLOBAL_WSTRING
-// The user didn't tell us whether ::wstring is available, so we need
-// to figure it out.
-# define GTEST_HAS_GLOBAL_WSTRING \
-    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
 // Determines whether RTTI is available.
 #ifndef GTEST_HAS_RTTI
 // The user didn't tell us whether RTTI is enabled, so we need to
 // figure it out.
 
-# ifdef _MSC_VER
+#ifdef _MSC_VER
 
-#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
-#   define GTEST_HAS_RTTI 1
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif
+#ifdef _CPPRTTI  // MSVC defines this macro if and only if RTTI is enabled.
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
 
-// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
-# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
+// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is
+// enabled.
+#elif defined(__GNUC__)
 
-#  ifdef __GXX_RTTI
+#ifdef __GXX_RTTI
 // When building against STLport with the Android NDK and with
 // -frtti -fno-exceptions, the build fails at link time with undefined
 // references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
 // so disable RTTI when detected.
-#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
-       !defined(__EXCEPTIONS)
-#    define GTEST_HAS_RTTI 0
-#   else
-#    define GTEST_HAS_RTTI 1
-#   endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif  // __GXX_RTTI
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS)
+#define GTEST_HAS_RTTI 0
+#else
+#define GTEST_HAS_RTTI 1
+#endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#else
+#define GTEST_HAS_RTTI 0
+#endif  // __GXX_RTTI
 
 // Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
 // using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
 // first version with C++ support.
-# elif defined(__clang__)
+#elif defined(__clang__)
 
-#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+#define GTEST_HAS_RTTI __has_feature(cxx_rtti)
 
 // Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
 // both the typeid and dynamic_cast features are present.
-# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
 
-#  ifdef __RTTI_ALL__
-#   define GTEST_HAS_RTTI 1
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif
+#ifdef __RTTI_ALL__
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
 
-# else
+#else
 
 // For all other compilers, we assume RTTI is enabled.
-#  define GTEST_HAS_RTTI 1
+#define GTEST_HAS_RTTI 1
 
-# endif  // _MSC_VER
+#endif  // _MSC_VER
 
 #endif  // GTEST_HAS_RTTI
 
 // It's this header's responsibility to #include <typeinfo> when RTTI
 // is enabled.
 #if GTEST_HAS_RTTI
-# include <typeinfo>
+#include <typeinfo>
 #endif
 
 // Determines whether Google Test can use the pthreads library.
@@ -600,151 +522,22 @@
 //
 // To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
 // to your compiler flags.
-# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
-    || GTEST_OS_QNX || GTEST_OS_FREEBSD || GTEST_OS_NACL)
+#define GTEST_HAS_PTHREAD                                                      \
+  (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX ||          \
+   GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
+   GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD ||          \
+   GTEST_OS_HAIKU)
 #endif  // GTEST_HAS_PTHREAD
 
 #if GTEST_HAS_PTHREAD
 // gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
 // true.
-# include <pthread.h>  // NOLINT
+#include <pthread.h>  // NOLINT
 
 // For timespec and nanosleep, used below.
-# include <time.h>  // NOLINT
+#include <time.h>  // NOLINT
 #endif
 
-// Determines if hash_map/hash_set are available.
-// Only used for testing against those containers.
-#if !defined(GTEST_HAS_HASH_MAP_)
-# if _MSC_VER
-#  define GTEST_HAS_HASH_MAP_ 1  // Indicates that hash_map is available.
-#  define GTEST_HAS_HASH_SET_ 1  // Indicates that hash_set is available.
-# endif  // _MSC_VER
-#endif  // !defined(GTEST_HAS_HASH_MAP_)
-
-// Determines whether Google Test can use tr1/tuple.  You can define
-// this macro to 0 to prevent Google Test from using tuple (any
-// feature depending on tuple with be disabled in this mode).
-#ifndef GTEST_HAS_TR1_TUPLE
-# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
-// STLport, provided with the Android NDK, has neither <tr1/tuple> or <tuple>.
-#  define GTEST_HAS_TR1_TUPLE 0
-# else
-// The user didn't tell us not to do it, so we assume it's OK.
-#  define GTEST_HAS_TR1_TUPLE 1
-# endif
-#endif  // GTEST_HAS_TR1_TUPLE
-
-// Determines whether Google Test's own tr1 tuple implementation
-// should be used.
-#ifndef GTEST_USE_OWN_TR1_TUPLE
-// The user didn't tell us, so we need to figure it out.
-
-// We use our own TR1 tuple if we aren't sure the user has an
-// implementation of it already.  At this time, libstdc++ 4.0.0+ and
-// MSVC 2010 are the only mainstream standard libraries that come
-// with a TR1 tuple implementation.  NVIDIA's CUDA NVCC compiler
-// pretends to be GCC by defining __GNUC__ and friends, but cannot
-// compile GCC's tuple implementation.  MSVC 2008 (9.0) provides TR1
-// tuple in a 323 MB Feature Pack download, which we cannot assume the
-// user has.  QNX's QCC compiler is a modified GCC but it doesn't
-// support TR1 tuple.  libc++ only provides std::tuple, in C++11 mode,
-// and it can be used with some compilers that define __GNUC__.
-# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
-      && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
-#  define GTEST_ENV_HAS_TR1_TUPLE_ 1
-# endif
-
-// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
-// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
-// can build with clang but need to use gcc4.2's libstdc++).
-# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
-#  define GTEST_ENV_HAS_STD_TUPLE_ 1
-# endif
-
-# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
-#  define GTEST_USE_OWN_TR1_TUPLE 0
-# else
-#  define GTEST_USE_OWN_TR1_TUPLE 1
-# endif
-
-#endif  // GTEST_USE_OWN_TR1_TUPLE
-
-// To avoid conditional compilation everywhere, we make it
-// gtest-port.h's responsibility to #include the header implementing
-// tuple.
-#if GTEST_HAS_STD_TUPLE_
-# include <tuple>  // IWYU pragma: export
-# define GTEST_TUPLE_NAMESPACE_ ::std
-#endif  // GTEST_HAS_STD_TUPLE_
-
-// We include tr1::tuple even if std::tuple is available to define printers for
-// them.
-#if GTEST_HAS_TR1_TUPLE
-# ifndef GTEST_TUPLE_NAMESPACE_
-#  define GTEST_TUPLE_NAMESPACE_ ::std::tr1
-# endif  // GTEST_TUPLE_NAMESPACE_
-
-# if GTEST_USE_OWN_TR1_TUPLE
-#  include "gtest/internal/gtest-tuple.h"  // IWYU pragma: export  // NOLINT
-# elif GTEST_ENV_HAS_STD_TUPLE_
-#  include <tuple>
-// C++11 puts its tuple into the ::std namespace rather than
-// ::std::tr1.  gtest expects tuple to live in ::std::tr1, so put it there.
-// This causes undefined behavior, but supported compilers react in
-// the way we intend.
-namespace std {
-namespace tr1 {
-using ::std::get;
-using ::std::make_tuple;
-using ::std::tuple;
-using ::std::tuple_element;
-using ::std::tuple_size;
-}
-}
-
-# elif GTEST_OS_SYMBIAN
-
-// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
-// use STLport's tuple implementation, which unfortunately doesn't
-// work as the copy of STLport distributed with Symbian is incomplete.
-// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
-// use its own tuple implementation.
-#  ifdef BOOST_HAS_TR1_TUPLE
-#   undef BOOST_HAS_TR1_TUPLE
-#  endif  // BOOST_HAS_TR1_TUPLE
-
-// This prevents <boost/tr1/detail/config.hpp>, which defines
-// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
-#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
-#  include <tuple>  // IWYU pragma: export  // NOLINT
-
-# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
-// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
-// not conform to the TR1 spec, which requires the header to be <tuple>.
-
-#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
-// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
-// which is #included by <tr1/tuple>, to not compile when RTTI is
-// disabled.  _TR1_FUNCTIONAL is the header guard for
-// <tr1/functional>.  Hence the following #define is a hack to prevent
-// <tr1/functional> from being included.
-#   define _TR1_FUNCTIONAL 1
-#   include <tr1/tuple>
-#   undef _TR1_FUNCTIONAL  // Allows the user to #include
-                        // <tr1/functional> if he chooses to.
-#  else
-#   include <tr1/tuple>  // NOLINT
-#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
-
-# else
-// If the compiler is not GCC 4.0+, we assume the user is using a
-// spec-conforming TR1 implementation.
-#  include <tuple>  // IWYU pragma: export  // NOLINT
-# endif  // GTEST_USE_OWN_TR1_TUPLE
-
-#endif  // GTEST_HAS_TR1_TUPLE
-
 // Determines whether clone(2) is supported.
 // Usually it will only be available on Linux, excluding
 // Linux on the Itanium architecture.
@@ -752,20 +545,23 @@
 #ifndef GTEST_HAS_CLONE
 // The user didn't tell us, so we need to figure it out.
 
-# if GTEST_OS_LINUX && !defined(__ia64__)
-#  if GTEST_OS_LINUX_ANDROID
-// On Android, clone() is only available on ARM starting with Gingerbread.
-#    if defined(__arm__) && __ANDROID_API__ >= 9
-#     define GTEST_HAS_CLONE 1
-#    else
-#     define GTEST_HAS_CLONE 0
-#    endif
-#  else
-#   define GTEST_HAS_CLONE 1
-#  endif
-# else
-#  define GTEST_HAS_CLONE 0
-# endif  // GTEST_OS_LINUX && !defined(__ia64__)
+#if GTEST_OS_LINUX && !defined(__ia64__)
+#if GTEST_OS_LINUX_ANDROID
+// On Android, clone() became available at different API levels for each 32-bit
+// architecture.
+#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \
+    (defined(__mips__) && __ANDROID_API__ >= 12) ||                    \
+    (defined(__i386__) && __ANDROID_API__ >= 17)
+#define GTEST_HAS_CLONE 1
+#else
+#define GTEST_HAS_CLONE 0
+#endif
+#else
+#define GTEST_HAS_CLONE 1
+#endif
+#else
+#define GTEST_HAS_CLONE 0
+#endif  // GTEST_OS_LINUX && !defined(__ia64__)
 
 #endif  // GTEST_HAS_CLONE
 
@@ -774,56 +570,43 @@
 #ifndef GTEST_HAS_STREAM_REDIRECTION
 // By default, we assume that stream redirection is supported on all
 // platforms except known mobile ones.
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || \
-    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
-#  define GTEST_HAS_STREAM_REDIRECTION 0
-# else
-#  define GTEST_HAS_STREAM_REDIRECTION 1
-# endif  // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266
+#define GTEST_HAS_STREAM_REDIRECTION 0
+#else
+#define GTEST_HAS_STREAM_REDIRECTION 1
+#endif  // !GTEST_OS_WINDOWS_MOBILE
 #endif  // GTEST_HAS_STREAM_REDIRECTION
 
 // Determines whether to support death tests.
-// Google Test does not support death tests for VC 7.1 and earlier as
-// abort() in a VC 7.1 application compiled as GUI in debug config
 // pops up a dialog window that cannot be suppressed programmatically.
-#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
-     (GTEST_OS_MAC && !GTEST_OS_IOS) || \
-     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
-     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
-     GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD)
-# define GTEST_HAS_DEATH_TEST 1
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS ||             \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) ||                                   \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW ||  \
+     GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
+     GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA ||           \
+     GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU)
+#define GTEST_HAS_DEATH_TEST 1
 #endif
 
-// We don't support MSVC 7.1 with exceptions disabled now.  Therefore
-// all the compilers we care about are adequate for supporting
-// value-parameterized tests.
-#define GTEST_HAS_PARAM_TEST 1
-
 // Determines whether to support type-driven tests.
 
 // Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
 // Sun Pro CC, IBM Visual Age, and HP aCC support.
-#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
     defined(__IBMCPP__) || defined(__HP_aCC)
-# define GTEST_HAS_TYPED_TEST 1
-# define GTEST_HAS_TYPED_TEST_P 1
-#endif
-
-// Determines whether to support Combine(). This only makes sense when
-// value-parameterized tests are enabled.  The implementation doesn't
-// work on Sun Studio since it doesn't understand templated conversion
-// operators.
-#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC)
-# define GTEST_HAS_COMBINE 1
+#define GTEST_HAS_TYPED_TEST 1
+#define GTEST_HAS_TYPED_TEST_P 1
 #endif
 
 // Determines whether the system compiler uses UTF-16 for encoding wide strings.
 #define GTEST_WIDE_STRING_USES_UTF16_ \
-    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+  (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2)
 
 // Determines whether test results can be streamed to a socket.
-#if GTEST_OS_LINUX
-# define GTEST_CAN_STREAM_RESULTS_ 1
+#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
+    GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD
+#define GTEST_CAN_STREAM_RESULTS_ 1
 #endif
 
 // Defines some utility macros.
@@ -837,9 +620,12 @@
 //
 // The "switch (0) case 0:" idiom is used to suppress this.
 #ifdef __INTEL_COMPILER
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_
 #else
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  switch (0)                          \
+  case 0:                             \
+  default:  // NOLINT
 #endif
 
 // Use this annotation at the end of a struct/class definition to
@@ -854,37 +640,64 @@
 // Also use it after a variable or parameter declaration to tell the
 // compiler the variable/parameter does not have to be used.
 #if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
 #elif defined(__clang__)
-# if __has_attribute(unused)
-#  define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
-# endif
+#if __has_attribute(unused)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#endif
 #endif
 #ifndef GTEST_ATTRIBUTE_UNUSED_
-# define GTEST_ATTRIBUTE_UNUSED_
+#define GTEST_ATTRIBUTE_UNUSED_
 #endif
 
-// A macro to disallow operator=
+// Use this annotation before a function that takes a printf format string.
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
+#if defined(__MINGW_PRINTF_FORMAT)
+// MinGW has two different printf implementations. Ensure the format macro
+// matches the selected implementation. See
+// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+  __attribute__(                                              \
+      (__format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check)))
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+#endif
+
+// A macro to disallow copy operator=
 // This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_ASSIGN_(type)\
-  void operator=(type const &)
+#define GTEST_DISALLOW_ASSIGN_(type) type &operator=(type const &) = delete
 
 // A macro to disallow copy constructor and operator=
 // This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\
-  type(type const &);\
+#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
+  type(type const &) = delete;                \
   GTEST_DISALLOW_ASSIGN_(type)
 
+// A macro to disallow move operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \
+  type &operator=(type &&) noexcept = delete
+
+// A macro to disallow move constructor and operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \
+  type(type &&) noexcept = delete;            \
+  GTEST_DISALLOW_MOVE_ASSIGN_(type)
+
 // Tell the compiler to warn about unused return values for functions declared
 // with this macro.  The macro should be used on function declarations
 // following the argument list:
 //
 //   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
-#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC)
-# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result))
 #else
-# define GTEST_MUST_USE_RESULT_
-#endif  // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC
+#define GTEST_MUST_USE_RESULT_
+#endif  // __GNUC__ && !COMPILER_ICC
 
 // MS C++ compiler emits warning when a conditional expression is compile time
 // constant. In some contexts this warning is false positive and needs to be
@@ -894,10 +707,9 @@
 // while (true) {
 // GTEST_INTENTIONAL_CONST_COND_POP_()
 // }
-# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
-    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
-# define GTEST_INTENTIONAL_CONST_COND_POP_() \
-    GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
 
 // Determine whether the compiler supports Microsoft's Structured Exception
 // Handling.  This is supported by several Windows compilers but generally
@@ -905,113 +717,124 @@
 #ifndef GTEST_HAS_SEH
 // The user didn't tell us, so we need to figure it out.
 
-# if defined(_MSC_VER) || defined(__BORLANDC__)
+#if defined(_MSC_VER) || defined(__BORLANDC__)
 // These two compilers are known to support SEH.
-#  define GTEST_HAS_SEH 1
-# else
+#define GTEST_HAS_SEH 1
+#else
 // Assume no SEH.
-#  define GTEST_HAS_SEH 0
-# endif
-
-#define GTEST_IS_THREADSAFE \
-    (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ \
-     || (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) \
-     || GTEST_HAS_PTHREAD)
+#define GTEST_HAS_SEH 0
+#endif
 
 #endif  // GTEST_HAS_SEH
 
+#ifndef GTEST_IS_THREADSAFE
+
+#define GTEST_IS_THREADSAFE                                                 \
+  (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ ||                                     \
+   (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \
+   GTEST_HAS_PTHREAD)
+
+#endif  // GTEST_IS_THREADSAFE
+
+// GTEST_API_ qualifies all symbols that must be exported. The definitions below
+// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
+// gtest/internal/custom/gtest-port.h
+#ifndef GTEST_API_
+
 #ifdef _MSC_VER
-# if GTEST_LINKED_AS_SHARED_LIBRARY
-#  define GTEST_API_ __declspec(dllimport)
-# elif GTEST_CREATE_SHARED_LIBRARY
-#  define GTEST_API_ __declspec(dllexport)
-# endif
+#if GTEST_LINKED_AS_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllimport)
+#elif GTEST_CREATE_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllexport)
+#endif
 #elif __GNUC__ >= 4 || defined(__clang__)
-# define GTEST_API_ __attribute__((visibility ("default")))
-#endif // _MSC_VER
+#define GTEST_API_ __attribute__((visibility("default")))
+#endif  // _MSC_VER
+
+#endif  // GTEST_API_
 
 #ifndef GTEST_API_
-# define GTEST_API_
-#endif
+#define GTEST_API_
+#endif  // GTEST_API_
+
+#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
+#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
+#endif  // GTEST_DEFAULT_DEATH_TEST_STYLE
 
 #ifdef __GNUC__
 // Ask the compiler to never inline a given function.
-# define GTEST_NO_INLINE_ __attribute__((noinline))
+#define GTEST_NO_INLINE_ __attribute__((noinline))
 #else
-# define GTEST_NO_INLINE_
+#define GTEST_NO_INLINE_
 #endif
 
 // _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
-#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION)
-# define GTEST_HAS_CXXABI_H_ 1
+#if !defined(GTEST_HAS_CXXABI_H_)
+#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#define GTEST_HAS_CXXABI_H_ 1
 #else
-# define GTEST_HAS_CXXABI_H_ 0
+#define GTEST_HAS_CXXABI_H_ 0
+#endif
 #endif
 
 // A function level attribute to disable checking for use of uninitialized
 // memory when built with MemorySanitizer.
 #if defined(__clang__)
-# if __has_feature(memory_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
-       __attribute__((no_sanitize_memory))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-# endif  // __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory))
 #else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif  // __has_feature(memory_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
 #endif  // __clang__
 
 // A function level attribute to disable AddressSanitizer instrumentation.
 #if defined(__clang__)
-# if __has_feature(address_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
-       __attribute__((no_sanitize_address))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-# endif  // __has_feature(address_sanitizer)
+#if __has_feature(address_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+  __attribute__((no_sanitize_address))
 #else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif  // __has_feature(address_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif  // __clang__
+
+// A function level attribute to disable HWAddressSanitizer instrumentation.
+#if defined(__clang__)
+#if __has_feature(hwaddress_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
+  __attribute__((no_sanitize("hwaddress")))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif  // __has_feature(hwaddress_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 #endif  // __clang__
 
 // A function level attribute to disable ThreadSanitizer instrumentation.
 #if defined(__clang__)
-# if __has_feature(thread_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
-       __attribute__((no_sanitize_thread))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-# endif  // __has_feature(thread_sanitizer)
+#if __has_feature(thread_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread))
 #else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-#endif  // __clang__
-
-// A function level attribute to disable UndefinedBehaviorSanitizer's (defined)
-// unsigned integer overflow instrumentation.
-#if defined(__clang__)
-# if defined(__has_attribute) && __has_attribute(no_sanitize)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \
-       __attribute__((no_sanitize("unsigned-integer-overflow")))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
-# endif  // defined(__has_attribute) && __has_attribute(no_sanitize)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif  // __has_feature(thread_sanitizer)
 #else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
 #endif  // __clang__
 
 namespace testing {
 
 class Message;
 
-#if defined(GTEST_TUPLE_NAMESPACE_)
-// Import tuple and friends into the ::testing namespace.
-// It is part of our interface, having them in ::testing allows us to change
-// their types as needed.
-using GTEST_TUPLE_NAMESPACE_::get;
-using GTEST_TUPLE_NAMESPACE_::make_tuple;
-using GTEST_TUPLE_NAMESPACE_::tuple;
-using GTEST_TUPLE_NAMESPACE_::tuple_size;
-using GTEST_TUPLE_NAMESPACE_::tuple_element;
-#endif  // defined(GTEST_TUPLE_NAMESPACE_)
+// Legacy imports for backwards compatibility.
+// New code should use std:: names directly.
+using std::get;
+using std::make_tuple;
+using std::tuple;
+using std::tuple_element;
+using std::tuple_size;
 
 namespace internal {
 
@@ -1020,208 +843,61 @@
 // Secret object, which is what we want.
 class Secret;
 
-// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time
-// expression is true. For example, you could use it to verify the
-// size of a static array:
+// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile
+// time expression is true (in new code, use static_assert instead). For
+// example, you could use it to verify the size of a static array:
 //
 //   GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
 //                         names_incorrect_size);
 //
-// or to make sure a struct is smaller than a certain size:
-//
-//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
-//
-// The second argument to the macro is the name of the variable. If
-// the expression is false, most compilers will issue a warning/error
-// containing the name of the variable.
-
-#if GTEST_LANG_CXX11
-# define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
-#else  // !GTEST_LANG_CXX11
-template <bool>
-  struct CompileAssert {
-};
-
-# define GTEST_COMPILE_ASSERT_(expr, msg) \
-  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
-      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
-#endif  // !GTEST_LANG_CXX11
-
-// Implementation details of GTEST_COMPILE_ASSERT_:
-//
-// (In C++11, we simply use static_assert instead of the following)
-//
-// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
-//   elements (and thus is invalid) when the expression is false.
-//
-// - The simpler definition
-//
-//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
-//
-//   does not work, as gcc supports variable-length arrays whose sizes
-//   are determined at run-time (this is gcc's extension and not part
-//   of the C++ standard).  As a result, gcc fails to reject the
-//   following code with the simple definition:
-//
-//     int foo;
-//     GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
-//                                      // not a compile-time constant.
-//
-// - By using the type CompileAssert<(bool(expr))>, we ensures that
-//   expr is a compile-time constant.  (Template arguments must be
-//   determined at compile-time.)
-//
-// - The outter parentheses in CompileAssert<(bool(expr))> are necessary
-//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
-//
-//     CompileAssert<bool(expr)>
-//
-//   instead, these compilers will refuse to compile
-//
-//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
-//
-//   (They seem to think the ">" in "5 > 0" marks the end of the
-//   template argument list.)
-//
-// - The array size is (bool(expr) ? 1 : -1), instead of simply
-//
-//     ((expr) ? 1 : -1).
-//
-//   This is to avoid running into a bug in MS VC 7.1, which
-//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
-
-// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
-//
-// This template is declared, but intentionally undefined.
-template <typename T1, typename T2>
-struct StaticAssertTypeEqHelper;
-
-template <typename T>
-struct StaticAssertTypeEqHelper<T, T> {
-  enum { value = true };
-};
-
-// Evaluates to the number of elements in 'array'.
-#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0]))
-
-#if GTEST_HAS_GLOBAL_STRING
-typedef ::string string;
-#else
-typedef ::std::string string;
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-#if GTEST_HAS_GLOBAL_WSTRING
-typedef ::wstring wstring;
-#elif GTEST_HAS_STD_WSTRING
-typedef ::std::wstring wstring;
-#endif  // GTEST_HAS_GLOBAL_WSTRING
+// The second argument to the macro must be a valid C++ identifier. If the
+// expression is false, compiler will issue an error containing this identifier.
+#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
 
 // A helper for suppressing warnings on constant condition.  It just
 // returns 'condition'.
 GTEST_API_ bool IsTrue(bool condition);
 
-// Defines scoped_ptr.
-
-// This implementation of scoped_ptr is PARTIAL - it only contains
-// enough stuff to satisfy Google Test's need.
-template <typename T>
-class scoped_ptr {
- public:
-  typedef T element_type;
-
-  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
-  ~scoped_ptr() { reset(); }
-
-  T& operator*() const { return *ptr_; }
-  T* operator->() const { return ptr_; }
-  T* get() const { return ptr_; }
-
-  T* release() {
-    T* const ptr = ptr_;
-    ptr_ = NULL;
-    return ptr;
-  }
-
-  void reset(T* p = NULL) {
-    if (p != ptr_) {
-      if (IsTrue(sizeof(T) > 0)) {  // Makes sure T is a complete type.
-        delete ptr_;
-      }
-      ptr_ = p;
-    }
-  }
-
-  friend void swap(scoped_ptr& a, scoped_ptr& b) {
-    using std::swap;
-    swap(a.ptr_, b.ptr_);
-  }
-
- private:
-  T* ptr_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
-};
-
 // Defines RE.
 
+#if GTEST_USES_PCRE
+// if used, PCRE is injected by custom/gtest-port.h
+#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
+
 // A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
 // Regular Expression syntax.
 class GTEST_API_ RE {
  public:
   // A copy constructor is required by the Standard to initialize object
   // references from r-values.
-  RE(const RE& other) { Init(other.pattern()); }
+  RE(const RE &other) { Init(other.pattern()); }
 
   // Constructs an RE from a string.
-  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+  RE(const ::std::string &regex) { Init(regex.c_str()); }  // NOLINT
 
-#if GTEST_HAS_GLOBAL_STRING
-
-  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
-
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-  RE(const char* regex) { Init(regex); }  // NOLINT
+  RE(const char *regex) { Init(regex); }  // NOLINT
   ~RE();
 
   // Returns the string representation of the regex.
-  const char* pattern() const { return pattern_; }
+  const char *pattern() const { return pattern_; }
 
-  // FullMatch(str, re) returns true iff regular expression re matches
-  // the entire str.
-  // PartialMatch(str, re) returns true iff regular expression re
+  // FullMatch(str, re) returns true if and only if regular expression re
+  // matches the entire str.
+  // PartialMatch(str, re) returns true if and only if regular expression re
   // matches a substring of str (including str itself).
-  //
-  // TODO(wan@google.com): make FullMatch() and PartialMatch() work
-  // when str contains NUL characters.
-  static bool FullMatch(const ::std::string& str, const RE& re) {
+  static bool FullMatch(const ::std::string &str, const RE &re) {
     return FullMatch(str.c_str(), re);
   }
-  static bool PartialMatch(const ::std::string& str, const RE& re) {
+  static bool PartialMatch(const ::std::string &str, const RE &re) {
     return PartialMatch(str.c_str(), re);
   }
 
-#if GTEST_HAS_GLOBAL_STRING
-
-  static bool FullMatch(const ::string& str, const RE& re) {
-    return FullMatch(str.c_str(), re);
-  }
-  static bool PartialMatch(const ::string& str, const RE& re) {
-    return PartialMatch(str.c_str(), re);
-  }
-
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-  static bool FullMatch(const char* str, const RE& re);
-  static bool PartialMatch(const char* str, const RE& re);
+  static bool FullMatch(const char *str, const RE &re);
+  static bool PartialMatch(const char *str, const RE &re);
 
  private:
-  void Init(const char* regex);
-
-  // We use a const char* instead of an std::string, as Google Test used to be
-  // used where std::string is not available.  TODO(wan@google.com): change to
-  // std::string.
-  const char* pattern_;
+  void Init(const char *regex);
+  const char *pattern_;
   bool is_valid_;
 
 #if GTEST_USES_POSIX_RE
@@ -1231,21 +907,23 @@
 
 #else  // GTEST_USES_SIMPLE_RE
 
-  const char* full_pattern_;  // For FullMatch();
+  const char *full_pattern_;  // For FullMatch();
 
 #endif
 
   GTEST_DISALLOW_ASSIGN_(RE);
 };
 
+#endif  // GTEST_USES_PCRE
+
 // Formats a source file path and a line number as they would appear
 // in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+GTEST_API_ ::std::string FormatFileLocation(const char *file, int line);
 
 // Formats a file location for compiler-independent XML output.
 // Although this function is not platform dependent, we put it next to
 // FormatFileLocation in order to contrast the two functions.
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
                                                                int line);
 
 // Defines logging utilities:
@@ -1254,24 +932,19 @@
 //   LogToStderr()  - directs all log messages to stderr.
 //   FlushInfoLog() - flushes informational log messages.
 
-enum GTestLogSeverity {
-  GTEST_INFO,
-  GTEST_WARNING,
-  GTEST_ERROR,
-  GTEST_FATAL
-};
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
 
 // Formats log entry severity, provides a stream object for streaming the
 // log message, and terminates the message with a newline when going out of
 // scope.
 class GTEST_API_ GTestLog {
  public:
-  GTestLog(GTestLogSeverity severity, const char* file, int line);
+  GTestLog(GTestLogSeverity severity, const char *file, int line);
 
   // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
   ~GTestLog();
 
-  ::std::ostream& GetStream() { return ::std::cerr; }
+  ::std::ostream &GetStream() { return ::std::cerr; }
 
  private:
   const GTestLogSeverity severity_;
@@ -1281,12 +954,13 @@
 
 #if !defined(GTEST_LOG_)
 
-# define GTEST_LOG_(severity) \
-    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
-                                  __FILE__, __LINE__).GetStream()
+#define GTEST_LOG_(severity)                                           \
+  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                __FILE__, __LINE__)                    \
+      .GetStream()
 
 inline void LogToStderr() {}
-inline void FlushInfoLog() { fflush(NULL); }
+inline void FlushInfoLog() { fflush(nullptr); }
 
 #endif  // !defined(GTEST_LOG_)
 
@@ -1305,12 +979,12 @@
 //    condition itself, plus additional message streamed into it, if any,
 //    and then it aborts the program. It aborts the program irrespective of
 //    whether it is built in the debug mode or not.
-# define GTEST_CHECK_(condition) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::IsTrue(condition)) \
-      ; \
-    else \
-      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#define GTEST_CHECK_(condition)               \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_               \
+  if (::testing::internal::IsTrue(condition)) \
+    ;                                         \
+  else                                        \
+    GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
 #endif  // !defined(GTEST_CHECK_)
 
 // An all-mode assert to verify that the given POSIX-style function
@@ -1319,18 +993,32 @@
 // in {} if you need to use it as the only statement in an 'if'
 // branch.
 #define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
-  if (const int gtest_error = (posix_call)) \
-    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
-                      << gtest_error
+  if (const int gtest_error = (posix_call))    \
+  GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
 
-#if GTEST_HAS_STD_MOVE_
-using std::move;
-#else  // GTEST_HAS_STD_MOVE_
+// Transforms "T" into "const T&" according to standard reference collapsing
+// rules (this is only needed as a backport for C++98 compilers that do not
+// support reference collapsing). Specifically, it transforms:
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> char&
+//   const char&  ==> const char&
+//
+// Note that the non-const reference will not have "const" added. This is
+// standard, and necessary so that "T" can always bind to "const T&".
 template <typename T>
-const T& move(const T& t) {
-  return t;
-}
-#endif  // GTEST_HAS_STD_MOVE_
+struct ConstRef {
+  typedef const T &type;
+};
+template <typename T>
+struct ConstRef<T &> {
+  typedef T &type;
+};
+
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+  typename ::testing::internal::ConstRef<T>::type
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
@@ -1352,8 +1040,10 @@
 // This relatively ugly name is intentional. It prevents clashes with
 // similar functions users may have (e.g., implicit_cast). The internal
 // namespace alone is not enough because the function can be found by ADL.
-template<typename To>
-inline To ImplicitCast_(To x) { return x; }
+template <typename To>
+inline To ImplicitCast_(To x) {
+  return x;
+}
 
 // When you upcast (that is, cast a pointer from type Foo to type
 // SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
@@ -1376,22 +1066,22 @@
 // This relatively ugly name is intentional. It prevents clashes with
 // similar functions users may have (e.g., down_cast). The internal
 // namespace alone is not enough because the function can be found by ADL.
-template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
-inline To DownCast_(From* f) {  // so we only accept pointers
+template <typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From *f) {         // so we only accept pointers
   // Ensures that To is a sub-type of From *.  This test is here only
   // for compile-time type checking, and has no overhead in an
   // optimized build at run-time, as it will be optimized away
   // completely.
   GTEST_INTENTIONAL_CONST_COND_PUSH_()
   if (false) {
-  GTEST_INTENTIONAL_CONST_COND_POP_()
-    const To to = NULL;
-    ::testing::internal::ImplicitCast_<From*>(to);
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    const To to = nullptr;
+    ::testing::internal::ImplicitCast_<From *>(to);
   }
 
 #if GTEST_HAS_RTTI
   // RTTI: debug mode only!
-  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+  GTEST_CHECK_(f == nullptr || dynamic_cast<To>(f) != nullptr);
 #endif
   return static_cast<To>(f);
 }
@@ -1402,17 +1092,17 @@
 // When RTTI is available, the function performs a runtime
 // check to enforce this.
 template <class Derived, class Base>
-Derived* CheckedDowncastToActualType(Base* base) {
+Derived *CheckedDowncastToActualType(Base *base) {
 #if GTEST_HAS_RTTI
   GTEST_CHECK_(typeid(*base) == typeid(Derived));
 #endif
 
 #if GTEST_HAS_DOWNCAST_
-  return ::down_cast<Derived*>(base);
+  return ::down_cast<Derived *>(base);
 #elif GTEST_HAS_RTTI
-  return dynamic_cast<Derived*>(base);  // NOLINT
+  return dynamic_cast<Derived *>(base);  // NOLINT
 #else
-  return static_cast<Derived*>(base);  // Poor man's downcast.
+  return static_cast<Derived *>(base);  // Poor man's downcast.
 #endif
 }
 
@@ -1430,31 +1120,28 @@
 GTEST_API_ std::string GetCapturedStderr();
 
 #endif  // GTEST_HAS_STREAM_REDIRECTION
-
-// Returns a path to temporary directory.
-GTEST_API_ std::string TempDir();
-
 // Returns the size (in bytes) of a file.
-GTEST_API_ size_t GetFileSize(FILE* file);
+GTEST_API_ size_t GetFileSize(FILE *file);
 
 // Reads the entire content of a file as a string.
-GTEST_API_ std::string ReadEntireFile(FILE* file);
+GTEST_API_ std::string ReadEntireFile(FILE *file);
 
 // All command line arguments.
-GTEST_API_ const ::std::vector<testing::internal::string>& GetArgvs();
+GTEST_API_ std::vector<std::string> GetArgvs();
 
 #if GTEST_HAS_DEATH_TEST
 
-const ::std::vector<testing::internal::string>& GetInjectableArgvs();
-void SetInjectableArgvs(const ::std::vector<testing::internal::string>*
-                             new_argvs);
-
+std::vector<std::string> GetInjectableArgvs();
+// Deprecated: pass the args vector by value instead.
+void SetInjectableArgvs(const std::vector<std::string> *new_argvs);
+void SetInjectableArgvs(const std::vector<std::string> &new_argvs);
+void ClearInjectableArgvs();
 
 #endif  // GTEST_HAS_DEATH_TEST
 
 // Defines synchronization primitives.
 #if GTEST_IS_THREADSAFE
-# if GTEST_HAS_PTHREAD
+#if GTEST_HAS_PTHREAD
 // Sleeps for (roughly) n milliseconds.  This function is only for testing
 // Google Test's own constructs.  Don't use it in user tests, either
 // directly or indirectly.
@@ -1463,15 +1150,15 @@
     0,                  // 0 seconds.
     n * 1000L * 1000L,  // And n ms.
   };
-  nanosleep(&time, NULL);
+  nanosleep(&time, nullptr);
 }
-# endif  // GTEST_HAS_PTHREAD
+#endif  // GTEST_HAS_PTHREAD
 
-# if GTEST_HAS_NOTIFICATION_
+#if GTEST_HAS_NOTIFICATION_
 // Notification has already been imported into the namespace.
 // Nothing to do here.
 
-# elif GTEST_HAS_PTHREAD
+#elif GTEST_HAS_PTHREAD
 // Allows a controller thread to pause execution of newly created
 // threads until notified.  Instances of this class must be created
 // and destroyed in the controller thread.
@@ -1481,11 +1168,9 @@
 class Notification {
  public:
   Notification() : notified_(false) {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
   }
-  ~Notification() {
-    pthread_mutex_destroy(&mutex_);
-  }
+  ~Notification() { pthread_mutex_destroy(&mutex_); }
 
   // Notifies all threads created with this notification to start. Must
   // be called from the controller thread.
@@ -1502,8 +1187,7 @@
       pthread_mutex_lock(&mutex_);
       const bool notified = notified_;
       pthread_mutex_unlock(&mutex_);
-      if (notified)
-        break;
+      if (notified) break;
       SleepMilliseconds(10);
     }
   }
@@ -1515,7 +1199,7 @@
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
 };
 
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
 
 GTEST_API_ void SleepMilliseconds(int n);
 
@@ -1528,7 +1212,7 @@
   // undesirable because it defines a lot of symbols and macros that tend to
   // conflict with client code. This assumption is verified by
   // WindowsTypesTest.HANDLEIsVoidStar.
-  typedef void* Handle;
+  typedef void *Handle;
   AutoHandle();
   explicit AutoHandle(Handle handle);
 
@@ -1539,7 +1223,8 @@
   void Reset(Handle handle);
 
  private:
-  // Returns true iff the handle is a valid handle object that can be closed.
+  // Returns true if and only if the handle is a valid handle object that can be
+  // closed.
   bool IsCloseable() const;
 
   Handle handle_;
@@ -1564,12 +1249,12 @@
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
 };
-# endif  // GTEST_HAS_NOTIFICATION_
+#endif  // GTEST_HAS_NOTIFICATION_
 
 // On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
 // defined, but we don't want to use MinGW's pthreads implementation, which
 // has conformance problems with some versions of the POSIX standard.
-# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
 
 // As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
 // Consequently, it cannot select a correct instantiation of ThreadWithParam
@@ -1588,9 +1273,9 @@
 // example, SunStudio) treat them as different types.  Since class methods
 // cannot be defined with C-linkage we need to define a free C-function to
 // pass into pthread_create().
-extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
-  static_cast<ThreadWithParamBase*>(thread)->Run();
-  return NULL;
+extern "C" inline void *ThreadFuncWithCLinkage(void *thread) {
+  static_cast<ThreadWithParamBase *>(thread)->Run();
+  return nullptr;
 }
 
 // Helper class for testing Google Test's multi-threading constructs.
@@ -1610,51 +1295,49 @@
  public:
   typedef void UserThreadFunc(T);
 
-  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
-      : func_(func),
-        param_(param),
-        thread_can_start_(thread_can_start),
+  ThreadWithParam(UserThreadFunc *func, T param, Notification *thread_can_start)
+      : func_(func), param_(param), thread_can_start_(thread_can_start),
         finished_(false) {
-    ThreadWithParamBase* const base = this;
+    ThreadWithParamBase *const base = this;
     // The thread can be created only after all fields except thread_
     // have been initialized.
     GTEST_CHECK_POSIX_SUCCESS_(
-        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+        pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base));
   }
-  ~ThreadWithParam() { Join(); }
+  ~ThreadWithParam() override { Join(); }
 
   void Join() {
     if (!finished_) {
-      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr));
       finished_ = true;
     }
   }
 
-  virtual void Run() {
-    if (thread_can_start_ != NULL)
-      thread_can_start_->WaitForNotification();
+  void Run() override {
+    if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification();
     func_(param_);
   }
 
  private:
-  UserThreadFunc* const func_;  // User-supplied thread function.
+  UserThreadFunc *const func_;  // User-supplied thread function.
   const T param_;  // User-supplied parameter to the thread function.
   // When non-NULL, used to block execution until the controller thread
   // notifies.
-  Notification* const thread_can_start_;
-  bool finished_;  // true iff we know that the thread function has finished.
+  Notification *const thread_can_start_;
+  bool finished_;  // true if and only if we know that the thread function has
+                   // finished.
   pthread_t thread_;  // The native thread object.
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
 };
-# endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
-         // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+        // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 
-# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 // Mutex and ThreadLocal have already been imported into the namespace.
 // Nothing to do here.
 
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
 
 // Mutex implements mutex on Windows platforms.  It is used in conjunction
 // with class MutexLock:
@@ -1698,7 +1381,7 @@
   // Initializes owner_thread_id_ and critical_section_ in static mutexes.
   void ThreadSafeLazyInit();
 
-  // Per http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx,
+  // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503,
   // we assume that 0 is an invalid value for thread IDs.
   unsigned int owner_thread_id_;
 
@@ -1706,16 +1389,16 @@
   // by the linker.
   MutexType type_;
   long critical_section_init_phase_;  // NOLINT
-  _RTL_CRITICAL_SECTION* critical_section_;
+  GTEST_CRITICAL_SECTION *critical_section_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
 };
 
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-    extern ::testing::internal::Mutex mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
 
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
-    ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+  ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
 
 // We cannot name this class MutexLock because the ctor declaration would
 // conflict with a macro named MutexLock, which is defined on some
@@ -1724,13 +1407,12 @@
 // "MutexLock l(&mu)".  Hence the typedef trick below.
 class GTestMutexLock {
  public:
-  explicit GTestMutexLock(Mutex* mutex)
-      : mutex_(mutex) { mutex_->Lock(); }
+  explicit GTestMutexLock(Mutex *mutex) : mutex_(mutex) { mutex_->Lock(); }
 
   ~GTestMutexLock() { mutex_->Unlock(); }
 
  private:
-  Mutex* const mutex_;
+  Mutex *const mutex_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
 };
@@ -1752,7 +1434,7 @@
   // this ThreadLocal<T>'s constructor and returns it.  It is the caller's
   // responsibility not to call this when the ThreadLocal<T> instance already
   // has a value on the current thread.
-  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0;
+  virtual ThreadLocalValueHolderBase *NewValueForCurrentThread() const = 0;
 
  protected:
   ThreadLocalBase() {}
@@ -1769,12 +1451,12 @@
  public:
   // Registers thread_local_instance as having value on the current thread.
   // Returns a value that can be used to identify the thread from other threads.
-  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
-      const ThreadLocalBase* thread_local_instance);
+  static ThreadLocalValueHolderBase *GetValueOnCurrentThread(
+      const ThreadLocalBase *thread_local_instance);
 
   // Invoked when a ThreadLocal instance is destroyed.
   static void OnThreadLocalDestroyed(
-      const ThreadLocalBase* thread_local_instance);
+      const ThreadLocalBase *thread_local_instance);
 };
 
 class GTEST_API_ ThreadWithParamBase {
@@ -1788,7 +1470,7 @@
     virtual void Run() = 0;
   };
 
-  ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
+  ThreadWithParamBase(Runnable *runnable, Notification *thread_can_start);
   virtual ~ThreadWithParamBase();
 
  private:
@@ -1801,25 +1483,19 @@
  public:
   typedef void UserThreadFunc(T);
 
-  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
-      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
-  }
+  ThreadWithParam(UserThreadFunc *func, T param, Notification *thread_can_start)
+      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {}
   virtual ~ThreadWithParam() {}
 
  private:
   class RunnableImpl : public Runnable {
    public:
-    RunnableImpl(UserThreadFunc* func, T param)
-        : func_(func),
-          param_(param) {
-    }
+    RunnableImpl(UserThreadFunc *func, T param) : func_(func), param_(param) {}
     virtual ~RunnableImpl() {}
-    virtual void Run() {
-      func_(param_);
-    }
+    virtual void Run() { func_(param_); }
 
    private:
-    UserThreadFunc* const func_;
+    UserThreadFunc *const func_;
     const T param_;
 
     GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
@@ -1859,15 +1535,15 @@
 class ThreadLocal : public ThreadLocalBase {
  public:
   ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {}
-  explicit ThreadLocal(const T& value)
+  explicit ThreadLocal(const T &value)
       : default_factory_(new InstanceValueHolderFactory(value)) {}
 
   ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
 
-  T* pointer() { return GetOrCreateValue(); }
-  const T* pointer() const { return GetOrCreateValue(); }
-  const T& get() const { return *pointer(); }
-  void set(const T& value) { *pointer() = value; }
+  T *pointer() { return GetOrCreateValue(); }
+  const T *pointer() const { return GetOrCreateValue(); }
+  const T &get() const { return *pointer(); }
+  void set(const T &value) { *pointer() = value; }
 
  private:
   // Holds a value of T.  Can be deleted via its base class without the caller
@@ -1875,22 +1551,22 @@
   class ValueHolder : public ThreadLocalValueHolderBase {
    public:
     ValueHolder() : value_() {}
-    explicit ValueHolder(const T& value) : value_(value) {}
+    explicit ValueHolder(const T &value) : value_(value) {}
 
-    T* pointer() { return &value_; }
+    T *pointer() { return &value_; }
 
    private:
     T value_;
     GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
   };
 
-
-  T* GetOrCreateValue() const {
-    return static_cast<ValueHolder*>(
-        ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
+  T *GetOrCreateValue() const {
+    return static_cast<ValueHolder *>(
+               ThreadLocalRegistry::GetValueOnCurrentThread(this))
+        ->pointer();
   }
 
-  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
+  virtual ThreadLocalValueHolderBase *NewValueForCurrentThread() const {
     return default_factory_->MakeNewHolder();
   }
 
@@ -1898,7 +1574,7 @@
    public:
     ValueHolderFactory() {}
     virtual ~ValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const = 0;
+    virtual ValueHolder *MakeNewHolder() const = 0;
 
    private:
     GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
@@ -1907,7 +1583,7 @@
   class DefaultValueHolderFactory : public ValueHolderFactory {
    public:
     DefaultValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
+    ValueHolder *MakeNewHolder() const override { return new ValueHolder(); }
 
    private:
     GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
@@ -1915,8 +1591,8 @@
 
   class InstanceValueHolderFactory : public ValueHolderFactory {
    public:
-    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
-    virtual ValueHolder* MakeNewHolder() const {
+    explicit InstanceValueHolderFactory(const T &value) : value_(value) {}
+    ValueHolder *MakeNewHolder() const override {
       return new ValueHolder(value_);
     }
 
@@ -1926,12 +1602,12 @@
     GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
   };
 
-  scoped_ptr<ValueHolderFactory> default_factory_;
+  std::unique_ptr<ValueHolderFactory> default_factory_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
 };
 
-# elif GTEST_HAS_PTHREAD
+#elif GTEST_HAS_PTHREAD
 
 // MutexBase and Mutex implement mutex on pthreads-based platforms.
 class MutexBase {
@@ -1978,24 +1654,27 @@
 };
 
 // Forward-declares a static mutex.
-#  define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-     extern ::testing::internal::MutexBase mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::MutexBase mutex
 
 // Defines and statically (i.e. at link time) initializes a static mutex.
-#  define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
-     ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, pthread_t() }
+// The initialization list here does not explicitly initialize each field,
+// instead relying on default initialization for the unspecified fields. In
+// particular, the owner_ field (a pthread_t) is not explicitly initialized.
+// This allows initialization to work whether pthread_t is a scalar or struct.
+// The flag -Wmissing-field-initializers must not be specified for this to work.
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+  ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, 0 }
 
 // The Mutex class can only be used for mutexes created at runtime. It
 // shares its API with MutexBase otherwise.
 class Mutex : public MutexBase {
  public:
   Mutex() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
     has_owner_ = false;
   }
-  ~Mutex() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
-  }
+  ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
 
  private:
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
@@ -2008,13 +1687,12 @@
 // "MutexLock l(&mu)".  Hence the typedef trick below.
 class GTestMutexLock {
  public:
-  explicit GTestMutexLock(MutexBase* mutex)
-      : mutex_(mutex) { mutex_->Lock(); }
+  explicit GTestMutexLock(MutexBase *mutex) : mutex_(mutex) { mutex_->Lock(); }
 
   ~GTestMutexLock() { mutex_->Unlock(); }
 
  private:
-  MutexBase* const mutex_;
+  MutexBase *const mutex_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
 };
@@ -2034,17 +1712,17 @@
 
 // Called by pthread to delete thread-local data stored by
 // pthread_setspecific().
-extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
-  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
+extern "C" inline void DeleteThreadLocalValue(void *value_holder) {
+  delete static_cast<ThreadLocalValueHolderBase *>(value_holder);
 }
 
 // Implements thread-local storage on pthreads-based systems.
 template <typename T>
-class ThreadLocal {
+class GTEST_API_ ThreadLocal {
  public:
   ThreadLocal()
       : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {}
-  explicit ThreadLocal(const T& value)
+  explicit ThreadLocal(const T &value)
       : key_(CreateKey()),
         default_factory_(new InstanceValueHolderFactory(value)) {}
 
@@ -2057,19 +1735,19 @@
     GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
   }
 
-  T* pointer() { return GetOrCreateValue(); }
-  const T* pointer() const { return GetOrCreateValue(); }
-  const T& get() const { return *pointer(); }
-  void set(const T& value) { *pointer() = value; }
+  T *pointer() { return GetOrCreateValue(); }
+  const T *pointer() const { return GetOrCreateValue(); }
+  const T &get() const { return *pointer(); }
+  void set(const T &value) { *pointer() = value; }
 
  private:
   // Holds a value of type T.
   class ValueHolder : public ThreadLocalValueHolderBase {
    public:
     ValueHolder() : value_() {}
-    explicit ValueHolder(const T& value) : value_(value) {}
+    explicit ValueHolder(const T &value) : value_(value) {}
 
-    T* pointer() { return &value_; }
+    T *pointer() { return &value_; }
 
    private:
     T value_;
@@ -2085,15 +1763,15 @@
     return key;
   }
 
-  T* GetOrCreateValue() const {
-    ThreadLocalValueHolderBase* const holder =
-        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
-    if (holder != NULL) {
+  T *GetOrCreateValue() const {
+    ThreadLocalValueHolderBase *const holder =
+        static_cast<ThreadLocalValueHolderBase *>(pthread_getspecific(key_));
+    if (holder != nullptr) {
       return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
     }
 
-    ValueHolder* const new_holder = default_factory_->MakeNewHolder();
-    ThreadLocalValueHolderBase* const holder_base = new_holder;
+    ValueHolder *const new_holder = default_factory_->MakeNewHolder();
+    ThreadLocalValueHolderBase *const holder_base = new_holder;
     GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
     return new_holder->pointer();
   }
@@ -2102,7 +1780,7 @@
    public:
     ValueHolderFactory() {}
     virtual ~ValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const = 0;
+    virtual ValueHolder *MakeNewHolder() const = 0;
 
    private:
     GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
@@ -2111,7 +1789,7 @@
   class DefaultValueHolderFactory : public ValueHolderFactory {
    public:
     DefaultValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
+    ValueHolder *MakeNewHolder() const override { return new ValueHolder(); }
 
    private:
     GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
@@ -2119,8 +1797,8 @@
 
   class InstanceValueHolderFactory : public ValueHolderFactory {
    public:
-    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
-    virtual ValueHolder* MakeNewHolder() const {
+    explicit InstanceValueHolderFactory(const T &value) : value_(value) {}
+    ValueHolder *MakeNewHolder() const override {
       return new ValueHolder(value_);
     }
 
@@ -2132,12 +1810,12 @@
 
   // A key pthreads uses for looking up per-thread values.
   const pthread_key_t key_;
-  scoped_ptr<ValueHolderFactory> default_factory_;
+  std::unique_ptr<ValueHolderFactory> default_factory_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
 };
 
-# endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 
 #else  // GTEST_IS_THREADSAFE
 
@@ -2154,10 +1832,10 @@
   void AssertHeld() const {}
 };
 
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
   extern ::testing::internal::Mutex mutex
 
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
 
 // We cannot name this class MutexLock because the ctor declaration would
 // conflict with a macro named MutexLock, which is defined on some
@@ -2166,20 +1844,21 @@
 // "MutexLock l(&mu)".  Hence the typedef trick below.
 class GTestMutexLock {
  public:
-  explicit GTestMutexLock(Mutex*) {}  // NOLINT
+  explicit GTestMutexLock(Mutex *) {}  // NOLINT
 };
 
 typedef GTestMutexLock MutexLock;
 
 template <typename T>
-class ThreadLocal {
+class GTEST_API_ ThreadLocal {
  public:
   ThreadLocal() : value_() {}
-  explicit ThreadLocal(const T& value) : value_(value) {}
-  T* pointer() { return &value_; }
-  const T* pointer() const { return &value_; }
-  const T& get() const { return value_; }
-  void set(const T& value) { value_ = value; }
+  explicit ThreadLocal(const T &value) : value_(value) {}
+  T *pointer() { return &value_; }
+  const T *pointer() const { return &value_; }
+  const T &get() const { return value_; }
+  void set(const T &value) { value_ = value; }
+
  private:
   T value_;
 };
@@ -2190,68 +1869,12 @@
 // we cannot detect it.
 GTEST_API_ size_t GetThreadCount();
 
-// Passing non-POD classes through ellipsis (...) crashes the ARM
-// compiler and generates a warning in Sun Studio.  The Nokia Symbian
-// and the IBM XL C/C++ compiler try to instantiate a copy constructor
-// for objects passed through ellipsis (...), failing for uncopyable
-// objects.  We define this to ensure that only POD is passed through
-// ellipsis on these systems.
-#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC)
-// We lose support for NULL detection where the compiler doesn't like
-// passing non-POD classes through ellipsis (...).
-# define GTEST_ELLIPSIS_NEEDS_POD_ 1
-#else
-# define GTEST_CAN_COMPARE_NULL 1
-#endif
-
-// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
-// const T& and const T* in a function template.  These compilers
-// _can_ decide between class template specializations for T and T*,
-// so a tr1::type_traits-like is_pointer works.
-#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
-# define GTEST_NEEDS_IS_POINTER_ 1
-#endif
-
-template <bool bool_value>
-struct bool_constant {
-  typedef bool_constant<bool_value> type;
-  static const bool value = bool_value;
-};
-template <bool bool_value> const bool bool_constant<bool_value>::value;
-
-typedef bool_constant<false> false_type;
-typedef bool_constant<true> true_type;
-
-template <typename T>
-struct is_pointer : public false_type {};
-
-template <typename T>
-struct is_pointer<T*> : public true_type {};
-
-template <typename Iterator>
-struct IteratorTraits {
-  typedef typename Iterator::value_type value_type;
-};
-
-template <typename T>
-struct IteratorTraits<T*> {
-  typedef T value_type;
-};
-
-template <typename T>
-struct IteratorTraits<const T*> {
-  typedef T value_type;
-};
-
 #if GTEST_OS_WINDOWS
-# define GTEST_PATH_SEP_ "\\"
-# define GTEST_HAS_ALT_PATH_SEP_ 1
-// The biggest signed integer type the compiler supports.
-typedef __int64 BiggestInt;
+#define GTEST_PATH_SEP_ "\\"
+#define GTEST_HAS_ALT_PATH_SEP_ 1
 #else
-# define GTEST_PATH_SEP_ "/"
-# define GTEST_HAS_ALT_PATH_SEP_ 0
-typedef long long BiggestInt;  // NOLINT
+#define GTEST_PATH_SEP_ "/"
+#define GTEST_HAS_ALT_PATH_SEP_ 0
 #endif  // GTEST_OS_WINDOWS
 
 // Utilities for char.
@@ -2296,8 +1919,7 @@
 
 inline std::string StripTrailingSpaces(std::string str) {
   std::string::iterator it = str.end();
-  while (it != str.begin() && IsSpace(*--it))
-    it = str.erase(it);
+  while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
   return str;
 }
 
@@ -2315,112 +1937,125 @@
 
 typedef struct _stat StatStruct;
 
-# ifdef __BORLANDC__
+#ifdef __BORLANDC__
 inline int IsATTY(int fd) { return isatty(fd); }
-inline int StrCaseCmp(const char* s1, const char* s2) {
+inline int StrCaseCmp(const char *s1, const char *s2) {
   return stricmp(s1, s2);
 }
-inline char* StrDup(const char* src) { return strdup(src); }
-# else  // !__BORLANDC__
-#  if GTEST_OS_WINDOWS_MOBILE
+inline char *StrDup(const char *src) { return strdup(src); }
+#else  // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE
 inline int IsATTY(int /* fd */) { return 0; }
-#  else
+#else
 inline int IsATTY(int fd) { return _isatty(fd); }
-#  endif  // GTEST_OS_WINDOWS_MOBILE
-inline int StrCaseCmp(const char* s1, const char* s2) {
+#endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char *s1, const char *s2) {
   return _stricmp(s1, s2);
 }
-inline char* StrDup(const char* src) { return _strdup(src); }
-# endif  // __BORLANDC__
+inline char *StrDup(const char *src) { return _strdup(src); }
+#endif  // __BORLANDC__
 
-# if GTEST_OS_WINDOWS_MOBILE
-inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+#if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE *file) { return reinterpret_cast<int>(_fileno(file)); }
 // Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
 // time and thus not defined there.
-# else
-inline int FileNo(FILE* file) { return _fileno(file); }
-inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
-inline int RmDir(const char* dir) { return _rmdir(dir); }
-inline bool IsDir(const StatStruct& st) {
-  return (_S_IFDIR & st.st_mode) != 0;
+#else
+inline int FileNo(FILE *file) { return _fileno(file); }
+inline int Stat(const char *path, StatStruct *buf) { return _stat(path, buf); }
+inline int RmDir(const char *dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct &st) { return (_S_IFDIR & st.st_mode) != 0; }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#elif GTEST_OS_ESP8266
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE *file) { return fileno(file); }
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char *path, StatStruct *buf) {
+  // stat function not implemented on ESP8266
+  return 0;
 }
-# endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char *s1, const char *s2) {
+  return strcasecmp(s1, s2);
+}
+inline char *StrDup(const char *src) { return strdup(src); }
+inline int RmDir(const char *dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct &st) { return S_ISDIR(st.st_mode); }
 
 #else
 
 typedef struct stat StatStruct;
 
-inline int FileNo(FILE* file) { return fileno(file); }
+inline int FileNo(FILE *file) { return fileno(file); }
 inline int IsATTY(int fd) { return isatty(fd); }
-inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
-inline int StrCaseCmp(const char* s1, const char* s2) {
+inline int Stat(const char *path, StatStruct *buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char *s1, const char *s2) {
   return strcasecmp(s1, s2);
 }
-inline char* StrDup(const char* src) { return strdup(src); }
-inline int RmDir(const char* dir) { return rmdir(dir); }
-inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+inline char *StrDup(const char *src) { return strdup(src); }
+inline int RmDir(const char *dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct &st) { return S_ISDIR(st.st_mode); }
 
 #endif  // GTEST_OS_WINDOWS
 
 // Functions deprecated by MSVC 8.0.
 
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996 /* deprecated function */)
-
-inline const char* StrNCpy(char* dest, const char* src, size_t n) {
-  return strncpy(dest, src, n);
-}
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
 
 // ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
 // StrError() aren't needed on Windows CE at this time and thus not
 // defined there.
 
 #if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-inline int ChDir(const char* dir) { return chdir(dir); }
+inline int ChDir(const char *dir) { return chdir(dir); }
 #endif
-inline FILE* FOpen(const char* path, const char* mode) {
+inline FILE *FOpen(const char *path, const char *mode) {
   return fopen(path, mode);
 }
 #if !GTEST_OS_WINDOWS_MOBILE
-inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+inline FILE *FReopen(const char *path, const char *mode, FILE *stream) {
   return freopen(path, mode, stream);
 }
-inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+inline FILE *FDOpen(int fd, const char *mode) { return fdopen(fd, mode); }
 #endif
-inline int FClose(FILE* fp) { return fclose(fp); }
+inline int FClose(FILE *fp) { return fclose(fp); }
 #if !GTEST_OS_WINDOWS_MOBILE
-inline int Read(int fd, void* buf, unsigned int count) {
+inline int Read(int fd, void *buf, unsigned int count) {
   return static_cast<int>(read(fd, buf, count));
 }
-inline int Write(int fd, const void* buf, unsigned int count) {
+inline int Write(int fd, const void *buf, unsigned int count) {
   return static_cast<int>(write(fd, buf, count));
 }
 inline int Close(int fd) { return close(fd); }
-inline const char* StrError(int errnum) { return strerror(errnum); }
+inline const char *StrError(int errnum) { return strerror(errnum); }
 #endif
-inline const char* GetEnv(const char* name) {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE | GTEST_OS_WINDOWS_RT
-  // We are on Windows CE, which has no environment variables.
+inline const char *GetEnv(const char *name) {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266
+  // We are on an embedded platform, which has no environment variables.
   static_cast<void>(name);  // To prevent 'unused argument' warning.
-  return NULL;
+  return nullptr;
 #elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
   // Environment variables which we programmatically clear will be set to the
   // empty string rather than unset (NULL).  Handle that case.
-  const char* const env = getenv(name);
-  return (env != NULL && env[0] != '\0') ? env : NULL;
+  const char *const env = getenv(name);
+  return (env != nullptr && env[0] != '\0') ? env : nullptr;
 #else
   return getenv(name);
 #endif
 }
 
-GTEST_DISABLE_MSC_WARNINGS_POP_()
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
 
 #if GTEST_OS_WINDOWS_MOBILE
 // Windows CE has no C library. The abort() function is used in
 // several places in Google Test. This implementation provides a reasonable
 // imitation of standard behaviour.
-void Abort();
+[[noreturn]] void Abort();
 #else
-inline void Abort() { abort(); }
+[[noreturn]] inline void Abort() {
+  abort();
+}
 #endif  // GTEST_OS_WINDOWS_MOBILE
 
 }  // namespace posix
@@ -2430,27 +2065,24 @@
 // MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
 // function in order to achieve that.  We use macro definition here because
 // snprintf is a variadic function.
-#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
 // MSVC 2005 and above support variadic macros.
-# define GTEST_SNPRINTF_(buffer, size, format, ...) \
-     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#define GTEST_SNPRINTF_(buffer, size, format, ...) \
+  _snprintf_s(buffer, size, size, format, __VA_ARGS__)
 #elif defined(_MSC_VER)
-// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
-// complain about _snprintf.
-# define GTEST_SNPRINTF_ _snprintf
+// Windows CE does not define _snprintf_s
+#define GTEST_SNPRINTF_ _snprintf
 #else
-# define GTEST_SNPRINTF_ snprintf
+#define GTEST_SNPRINTF_ snprintf
 #endif
 
-// The maximum number a BiggestInt can represent.  This definition
-// works no matter BiggestInt is represented in one's complement or
-// two's complement.
+// The biggest signed integer type the compiler supports.
 //
-// We cannot rely on numeric_limits in STL, as __int64 and long long
-// are not part of standard C++ and numeric_limits doesn't need to be
-// defined for them.
-const BiggestInt kMaxBiggestInt =
-    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
+// long long is guaranteed to be at least 64-bits in C++11.
+using BiggestInt = long long;  // NOLINT
+
+// The maximum number a BiggestInt can represent.
+constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits<BiggestInt>::max)();
 
 // This template class serves as a compile-time function from size to
 // type.  It maps a size in bytes to a primitive type with that
@@ -2475,93 +2107,126 @@
  public:
   // This prevents the user from using TypeWithSize<N> with incorrect
   // values of N.
-  typedef void UInt;
+  using UInt = void;
 };
 
 // The specialization for size 4.
 template <>
 class TypeWithSize<4> {
  public:
-  // unsigned int has size 4 in both gcc and MSVC.
-  //
-  // As base/basictypes.h doesn't compile on Windows, we cannot use
-  // uint32, uint64, and etc here.
-  typedef int Int;
-  typedef unsigned int UInt;
+  using Int = std::int32_t;
+  using UInt = std::uint32_t;
 };
 
 // The specialization for size 8.
 template <>
 class TypeWithSize<8> {
  public:
-#if GTEST_OS_WINDOWS
-  typedef __int64 Int;
-  typedef unsigned __int64 UInt;
-#else
-  typedef long long Int;  // NOLINT
-  typedef unsigned long long UInt;  // NOLINT
-#endif  // GTEST_OS_WINDOWS
+  using Int = std::int64_t;
+  using UInt = std::uint64_t;
 };
 
 // Integer types of known sizes.
-typedef TypeWithSize<4>::Int Int32;
-typedef TypeWithSize<4>::UInt UInt32;
-typedef TypeWithSize<8>::Int Int64;
-typedef TypeWithSize<8>::UInt UInt64;
-typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
+using TimeInMillis = int64_t;  // Represents time in milliseconds.
 
 // Utilities for command line flags and environment variables.
 
 // Macro for referencing flags.
 #if !defined(GTEST_FLAG)
-# define GTEST_FLAG(name) FLAGS_gtest_##name
+#define GTEST_FLAG(name) FLAGS_gtest_##name
 #endif  // !defined(GTEST_FLAG)
 
 #if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
-# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
 #endif  // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
 
 #if !defined(GTEST_DECLARE_bool_)
-# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
 
 // Macros for declaring flags.
-# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
-# define GTEST_DECLARE_int32_(name) \
-    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
+#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+#define GTEST_DECLARE_int32_(name) \
+  GTEST_API_ extern std::int32_t GTEST_FLAG(name)
 #define GTEST_DECLARE_string_(name) \
-    GTEST_API_ extern ::std::string GTEST_FLAG(name)
+  GTEST_API_ extern ::std::string GTEST_FLAG(name)
 
 // Macros for defining flags.
 #define GTEST_DEFINE_bool_(name, default_val, doc) \
-    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+  GTEST_API_ bool GTEST_FLAG(name) = (default_val)
 #define GTEST_DEFINE_int32_(name, default_val, doc) \
-    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
+  GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val)
 #define GTEST_DEFINE_string_(name, default_val, doc) \
-    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+  GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
 
 #endif  // !defined(GTEST_DECLARE_bool_)
 
 // Thread annotations
 #if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
-# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-# define GTEST_LOCK_EXCLUDED_(locks)
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
 #endif  // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
 
 // Parses 'str' for a 32-bit signed integer.  If successful, writes the result
 // to *value and returns true; otherwise leaves *value unchanged and returns
 // false.
-// TODO(chandlerc): Find a better way to refactor flag and environment parsing
-// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
-// function.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value);
+bool ParseInt32(const Message &src_text, const char *str, int32_t *value);
 
-// Parses a bool/Int32/string from the environment variable
+// Parses a bool/int32_t/string from the environment variable
 // corresponding to the given Google Test flag.
-bool BoolFromGTestEnv(const char* flag, bool default_val);
-GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
-std::string StringFromGTestEnv(const char* flag, const char* default_val);
+bool BoolFromGTestEnv(const char *flag, bool default_val);
+GTEST_API_ int32_t Int32FromGTestEnv(const char *flag, int32_t default_val);
+std::string OutputFlagAlsoCheckEnvVar();
+const char *StringFromGTestEnv(const char *flag, const char *default_val);
 
 }  // namespace internal
 }  // namespace testing
 
+#if !defined(GTEST_INTERNAL_DEPRECATED)
+
+// Internal Macro to mark an API deprecated, for googletest usage only
+// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or
+// GTEST_INTERNAL_DEPRECATED(message) <return_type> myFunction(); Every usage of
+// a deprecated entity will trigger a warning when compiled with
+// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler).
+// For msvc /W3 option will need to be used
+// Note that for 'other' compilers this macro evaluates to nothing to prevent
+// compilations errors.
+#if defined(_MSC_VER)
+#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message))
+#elif defined(__GNUC__)
+#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message)))
+#else
+#define GTEST_INTERNAL_DEPRECATED(message)
+#endif
+
+#endif  // !defined(GTEST_INTERNAL_DEPRECATED)
+
+#if GTEST_HAS_ABSL
+// Always use absl::string_view for Matcher<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#include "absl/strings/string_view.h"
+namespace testing {
+namespace internal {
+using StringView = ::absl::string_view;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<string_view>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::string_view for Matcher<>
+// specializations.
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#include <string_view>
+namespace testing {
+namespace internal {
+using StringView = ::std::string_view;
+}  // namespace internal
+}  // namespace testing
+   // The case where absl is configured NOT to alias std::string_view is not
+   // supported.
+#endif  // __has_include(<string_view>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
 #endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
index 97f1a7f..f1f9330 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
@@ -27,26 +27,27 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file declares the String class and functions used internally by
 // Google Test.  They are subject to change without notice. They should not used
 // by code external to Google Test.
 //
-// This header file is #included by <gtest/internal/gtest-internal.h>.
+// This header file is #included by gtest-internal.h.
 // It should not be #included by other files.
 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
 
 #ifdef __BORLANDC__
 // string.h is not guaranteed to provide strcpy on C++ Builder.
-# include <mem.h>
+#include <mem.h>
 #endif
 
 #include <string.h>
+#include <cstdint>
 #include <string>
 
 #include "gtest/internal/gtest-port.h"
@@ -66,7 +67,7 @@
   //
   // This is different from strdup() in string.h, which allocates
   // memory using malloc().
-  static const char* CloneCString(const char* c_str);
+  static const char *CloneCString(const char *c_str);
 
 #if GTEST_OS_WINDOWS_MOBILE
   // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
@@ -81,7 +82,7 @@
   // The wide string is created using the ANSI codepage (CP_ACP) to
   // match the behaviour of the ANSI versions of Win32 calls and the
   // C runtime.
-  static LPCWSTR AnsiToUtf16(const char* c_str);
+  static LPCWSTR AnsiToUtf16(const char *c_str);
 
   // Creates an ANSI string from the given wide string, allocating
   // memory using new. The caller is responsible for deleting the return
@@ -91,41 +92,41 @@
   // The returned string is created using the ANSI codepage (CP_ACP) to
   // match the behaviour of the ANSI versions of Win32 calls and the
   // C runtime.
-  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+  static const char *Utf16ToAnsi(LPCWSTR utf16_str);
 #endif
 
-  // Compares two C strings.  Returns true iff they have the same content.
+  // Compares two C strings.  Returns true if and only if they have the same
+  // content.
   //
   // Unlike strcmp(), this function can handle NULL argument(s).  A
   // NULL C string is considered different to any non-NULL C string,
   // including the empty string.
-  static bool CStringEquals(const char* lhs, const char* rhs);
+  static bool CStringEquals(const char *lhs, const char *rhs);
 
   // Converts a wide C string to a String using the UTF-8 encoding.
   // NULL will be converted to "(null)".  If an error occurred during
   // the conversion, "(failed to convert from wide string)" is
   // returned.
-  static std::string ShowWideCString(const wchar_t* wide_c_str);
+  static std::string ShowWideCString(const wchar_t *wide_c_str);
 
-  // Compares two wide C strings.  Returns true iff they have the same
-  // content.
+  // Compares two wide C strings.  Returns true if and only if they have the
+  // same content.
   //
   // Unlike wcscmp(), this function can handle NULL argument(s).  A
   // NULL C string is considered different to any non-NULL C string,
   // including the empty string.
-  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+  static bool WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs);
 
-  // Compares two C strings, ignoring case.  Returns true iff they
-  // have the same content.
+  // Compares two C strings, ignoring case.  Returns true if and only if
+  // they have the same content.
   //
   // Unlike strcasecmp(), this function can handle NULL argument(s).
   // A NULL C string is considered different to any non-NULL C string,
   // including the empty string.
-  static bool CaseInsensitiveCStringEquals(const char* lhs,
-                                           const char* rhs);
+  static bool CaseInsensitiveCStringEquals(const char *lhs, const char *rhs);
 
-  // Compares two wide C strings, ignoring case.  Returns true iff they
-  // have the same content.
+  // Compares two wide C strings, ignoring case.  Returns true if and only if
+  // they have the same content.
   //
   // Unlike wcscasecmp(), this function can handle NULL argument(s).
   // A NULL C string is considered different to any non-NULL wide C string,
@@ -136,13 +137,13 @@
   // which compares according to LC_CTYPE category of the current locale.
   // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
   // current locale.
-  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
-                                               const wchar_t* rhs);
+  static bool CaseInsensitiveWideCStringEquals(const wchar_t *lhs,
+                                               const wchar_t *rhs);
 
-  // Returns true iff the given string ends with the given suffix, ignoring
-  // case. Any string is considered to end with an empty suffix.
-  static bool EndsWithCaseInsensitive(
-      const std::string& str, const std::string& suffix);
+  // Returns true if and only if the given string ends with the given suffix,
+  // ignoring case. Any string is considered to end with an empty suffix.
+  static bool EndsWithCaseInsensitive(const std::string &str,
+                                      const std::string &suffix);
 
   // Formats an int value as "%02d".
   static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
@@ -150,16 +151,19 @@
   // Formats an int value as "%X".
   static std::string FormatHexInt(int value);
 
+  // Formats an int value as "%X".
+  static std::string FormatHexUInt32(uint32_t value);
+
   // Formats a byte as "%02X".
   static std::string FormatByte(unsigned char value);
 
  private:
   String();  // Not meant to be instantiated.
-};  // class String
+};           // class String
 
 // Gets the content of the stringstream's buffer as an std::string.  Each '\0'
 // character in the buffer is replaced with "\\0".
-GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+GTEST_API_ std::string StringStreamToString(::std::stringstream *stream);
 
 }  // namespace internal
 }  // namespace testing
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h
deleted file mode 100644
index e9b4053..0000000
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h
+++ /dev/null
@@ -1,1020 +0,0 @@
-// This file was GENERATED by command:
-//     pump.py gtest-tuple.h.pump
-// DO NOT EDIT BY HAND!!!
-
-// Copyright 2009 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
-
-#include <utility>  // For ::std::pair.
-
-// The compiler used in Symbian has a bug that prevents us from declaring the
-// tuple template as a friend (it complains that tuple is redefined).  This
-// hack bypasses the bug by declaring the members that should otherwise be
-// private as public.
-// Sun Studio versions < 12 also have the above bug.
-#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
-# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
-#else
-# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
-    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
-   private:
-#endif
-
-// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict
-// with our own definitions. Therefore using our own tuple does not work on
-// those compilers.
-#if defined(_MSC_VER) && _MSC_VER >= 1600  /* 1600 is Visual Studio 2010 */
-# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \
-GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers."
-#endif
-
-// GTEST_n_TUPLE_(T) is the type of an n-tuple.
-#define GTEST_0_TUPLE_(T) tuple<>
-#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
-    void, void, void>
-#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
-    void, void, void>
-#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
-    void, void, void>
-#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
-    void, void, void>
-#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
-    void, void, void>
-#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
-    void, void, void>
-#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
-    void, void, void>
-#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
-    T##7, void, void>
-#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
-    T##7, T##8, void>
-#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
-    T##7, T##8, T##9>
-
-// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
-#define GTEST_0_TYPENAMES_(T)
-#define GTEST_1_TYPENAMES_(T) typename T##0
-#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
-#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
-#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3
-#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4
-#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5
-#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5, typename T##6
-#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
-#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5, typename T##6, \
-    typename T##7, typename T##8
-#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5, typename T##6, \
-    typename T##7, typename T##8, typename T##9
-
-// In theory, defining stuff in the ::std namespace is undefined
-// behavior.  We can do this as we are playing the role of a standard
-// library vendor.
-namespace std {
-namespace tr1 {
-
-template <typename T0 = void, typename T1 = void, typename T2 = void,
-    typename T3 = void, typename T4 = void, typename T5 = void,
-    typename T6 = void, typename T7 = void, typename T8 = void,
-    typename T9 = void>
-class tuple;
-
-// Anything in namespace gtest_internal is Google Test's INTERNAL
-// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
-namespace gtest_internal {
-
-// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
-template <typename T>
-struct ByRef { typedef const T& type; };  // NOLINT
-template <typename T>
-struct ByRef<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper for ByRef.
-#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
-
-// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
-// is the same as tr1::add_reference<T>::type.
-template <typename T>
-struct AddRef { typedef T& type; };  // NOLINT
-template <typename T>
-struct AddRef<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper for AddRef.
-#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
-
-// A helper for implementing get<k>().
-template <int k> class Get;
-
-// A helper for implementing tuple_element<k, T>.  kIndexValid is true
-// iff k < the number of fields in tuple type T.
-template <bool kIndexValid, int kIndex, class Tuple>
-struct TupleElement;
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
-  typedef T0 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
-  typedef T1 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
-  typedef T2 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
-  typedef T3 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
-  typedef T4 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
-  typedef T5 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
-  typedef T6 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
-  typedef T7 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
-  typedef T8 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
-  typedef T9 type;
-};
-
-}  // namespace gtest_internal
-
-template <>
-class tuple<> {
- public:
-  tuple() {}
-  tuple(const tuple& /* t */)  {}
-  tuple& operator=(const tuple& /* t */) { return *this; }
-};
-
-template <GTEST_1_TYPENAMES_(T)>
-class GTEST_1_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}
-
-  tuple(const tuple& t) : f0_(t.f0_) {}
-
-  template <GTEST_1_TYPENAMES_(U)>
-  tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_1_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_1_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    return *this;
-  }
-
-  T0 f0_;
-};
-
-template <GTEST_2_TYPENAMES_(T)>
-class GTEST_2_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
-      f1_(f1) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
-
-  template <GTEST_2_TYPENAMES_(U)>
-  tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
-  template <typename U0, typename U1>
-  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_2_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-  template <typename U0, typename U1>
-  tuple& operator=(const ::std::pair<U0, U1>& p) {
-    f0_ = p.first;
-    f1_ = p.second;
-    return *this;
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_2_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-};
-
-template <GTEST_3_TYPENAMES_(T)>
-class GTEST_3_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
-
-  template <GTEST_3_TYPENAMES_(U)>
-  tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_3_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_3_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-};
-
-template <GTEST_4_TYPENAMES_(T)>
-class GTEST_4_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
-      f3_(f3) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
-
-  template <GTEST_4_TYPENAMES_(U)>
-  tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_4_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_4_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-};
-
-template <GTEST_5_TYPENAMES_(T)>
-class GTEST_5_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
-      GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_) {}
-
-  template <GTEST_5_TYPENAMES_(U)>
-  tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_5_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_5_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-};
-
-template <GTEST_6_TYPENAMES_(T)>
-class GTEST_6_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
-      f5_(f5) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_) {}
-
-  template <GTEST_6_TYPENAMES_(U)>
-  tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_6_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_6_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-};
-
-template <GTEST_7_TYPENAMES_(T)>
-class GTEST_7_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
-      f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
-
-  template <GTEST_7_TYPENAMES_(U)>
-  tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_7_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_7_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    f6_ = t.f6_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-  T6 f6_;
-};
-
-template <GTEST_8_TYPENAMES_(T)>
-class GTEST_8_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
-      GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
-      f5_(f5), f6_(f6), f7_(f7) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
-
-  template <GTEST_8_TYPENAMES_(U)>
-  tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_8_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_8_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    f6_ = t.f6_;
-    f7_ = t.f7_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-  T6 f6_;
-  T7 f7_;
-};
-
-template <GTEST_9_TYPENAMES_(T)>
-class GTEST_9_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
-      GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
-      f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
-
-  template <GTEST_9_TYPENAMES_(U)>
-  tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_9_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_9_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    f6_ = t.f6_;
-    f7_ = t.f7_;
-    f8_ = t.f8_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-  T6 f6_;
-  T7 f7_;
-  T8 f8_;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-class tuple {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
-      f9_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
-      GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
-      f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
-
-  template <GTEST_10_TYPENAMES_(U)>
-  tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
-      f9_(t.f9_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_10_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_10_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    f6_ = t.f6_;
-    f7_ = t.f7_;
-    f8_ = t.f8_;
-    f9_ = t.f9_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-  T6 f6_;
-  T7 f7_;
-  T8 f8_;
-  T9 f9_;
-};
-
-// 6.1.3.2 Tuple creation functions.
-
-// Known limitations: we don't support passing an
-// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
-// implement tie().
-
-inline tuple<> make_tuple() { return tuple<>(); }
-
-template <GTEST_1_TYPENAMES_(T)>
-inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
-  return GTEST_1_TUPLE_(T)(f0);
-}
-
-template <GTEST_2_TYPENAMES_(T)>
-inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
-  return GTEST_2_TUPLE_(T)(f0, f1);
-}
-
-template <GTEST_3_TYPENAMES_(T)>
-inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
-  return GTEST_3_TUPLE_(T)(f0, f1, f2);
-}
-
-template <GTEST_4_TYPENAMES_(T)>
-inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3) {
-  return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
-}
-
-template <GTEST_5_TYPENAMES_(T)>
-inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4) {
-  return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
-}
-
-template <GTEST_6_TYPENAMES_(T)>
-inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5) {
-  return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
-}
-
-template <GTEST_7_TYPENAMES_(T)>
-inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
-  return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
-}
-
-template <GTEST_8_TYPENAMES_(T)>
-inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
-  return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
-}
-
-template <GTEST_9_TYPENAMES_(T)>
-inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
-    const T8& f8) {
-  return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
-}
-
-template <GTEST_10_TYPENAMES_(T)>
-inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
-    const T8& f8, const T9& f9) {
-  return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
-}
-
-// 6.1.3.3 Tuple helper classes.
-
-template <typename Tuple> struct tuple_size;
-
-template <GTEST_0_TYPENAMES_(T)>
-struct tuple_size<GTEST_0_TUPLE_(T) > {
-  static const int value = 0;
-};
-
-template <GTEST_1_TYPENAMES_(T)>
-struct tuple_size<GTEST_1_TUPLE_(T) > {
-  static const int value = 1;
-};
-
-template <GTEST_2_TYPENAMES_(T)>
-struct tuple_size<GTEST_2_TUPLE_(T) > {
-  static const int value = 2;
-};
-
-template <GTEST_3_TYPENAMES_(T)>
-struct tuple_size<GTEST_3_TUPLE_(T) > {
-  static const int value = 3;
-};
-
-template <GTEST_4_TYPENAMES_(T)>
-struct tuple_size<GTEST_4_TUPLE_(T) > {
-  static const int value = 4;
-};
-
-template <GTEST_5_TYPENAMES_(T)>
-struct tuple_size<GTEST_5_TUPLE_(T) > {
-  static const int value = 5;
-};
-
-template <GTEST_6_TYPENAMES_(T)>
-struct tuple_size<GTEST_6_TUPLE_(T) > {
-  static const int value = 6;
-};
-
-template <GTEST_7_TYPENAMES_(T)>
-struct tuple_size<GTEST_7_TUPLE_(T) > {
-  static const int value = 7;
-};
-
-template <GTEST_8_TYPENAMES_(T)>
-struct tuple_size<GTEST_8_TUPLE_(T) > {
-  static const int value = 8;
-};
-
-template <GTEST_9_TYPENAMES_(T)>
-struct tuple_size<GTEST_9_TUPLE_(T) > {
-  static const int value = 9;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct tuple_size<GTEST_10_TUPLE_(T) > {
-  static const int value = 10;
-};
-
-template <int k, class Tuple>
-struct tuple_element {
-  typedef typename gtest_internal::TupleElement<
-      k < (tuple_size<Tuple>::value), k, Tuple>::type type;
-};
-
-#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
-
-// 6.1.3.4 Element access.
-
-namespace gtest_internal {
-
-template <>
-class Get<0> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
-  Field(Tuple& t) { return t.f0_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
-  ConstField(const Tuple& t) { return t.f0_; }
-};
-
-template <>
-class Get<1> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
-  Field(Tuple& t) { return t.f1_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
-  ConstField(const Tuple& t) { return t.f1_; }
-};
-
-template <>
-class Get<2> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
-  Field(Tuple& t) { return t.f2_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
-  ConstField(const Tuple& t) { return t.f2_; }
-};
-
-template <>
-class Get<3> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
-  Field(Tuple& t) { return t.f3_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
-  ConstField(const Tuple& t) { return t.f3_; }
-};
-
-template <>
-class Get<4> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
-  Field(Tuple& t) { return t.f4_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
-  ConstField(const Tuple& t) { return t.f4_; }
-};
-
-template <>
-class Get<5> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
-  Field(Tuple& t) { return t.f5_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
-  ConstField(const Tuple& t) { return t.f5_; }
-};
-
-template <>
-class Get<6> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
-  Field(Tuple& t) { return t.f6_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
-  ConstField(const Tuple& t) { return t.f6_; }
-};
-
-template <>
-class Get<7> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
-  Field(Tuple& t) { return t.f7_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
-  ConstField(const Tuple& t) { return t.f7_; }
-};
-
-template <>
-class Get<8> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
-  Field(Tuple& t) { return t.f8_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
-  ConstField(const Tuple& t) { return t.f8_; }
-};
-
-template <>
-class Get<9> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
-  Field(Tuple& t) { return t.f9_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
-  ConstField(const Tuple& t) { return t.f9_; }
-};
-
-}  // namespace gtest_internal
-
-template <int k, GTEST_10_TYPENAMES_(T)>
-GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
-get(GTEST_10_TUPLE_(T)& t) {
-  return gtest_internal::Get<k>::Field(t);
-}
-
-template <int k, GTEST_10_TYPENAMES_(T)>
-GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_10_TUPLE_(T)))
-get(const GTEST_10_TUPLE_(T)& t) {
-  return gtest_internal::Get<k>::ConstField(t);
-}
-
-// 6.1.3.5 Relational operators
-
-// We only implement == and !=, as we don't have a need for the rest yet.
-
-namespace gtest_internal {
-
-// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
-// first k fields of t1 equals the first k fields of t2.
-// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if
-// k1 != k2.
-template <int kSize1, int kSize2>
-struct SameSizeTuplePrefixComparator;
-
-template <>
-struct SameSizeTuplePrefixComparator<0, 0> {
-  template <class Tuple1, class Tuple2>
-  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
-    return true;
-  }
-};
-
-template <int k>
-struct SameSizeTuplePrefixComparator<k, k> {
-  template <class Tuple1, class Tuple2>
-  static bool Eq(const Tuple1& t1, const Tuple2& t2) {
-    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
-        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
-  }
-};
-
-}  // namespace gtest_internal
-
-template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
-inline bool operator==(const GTEST_10_TUPLE_(T)& t,
-                       const GTEST_10_TUPLE_(U)& u) {
-  return gtest_internal::SameSizeTuplePrefixComparator<
-      tuple_size<GTEST_10_TUPLE_(T) >::value,
-      tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
-}
-
-template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
-inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
-                       const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
-
-// 6.1.4 Pairs.
-// Unimplemented.
-
-}  // namespace tr1
-}  // namespace std
-
-#undef GTEST_0_TUPLE_
-#undef GTEST_1_TUPLE_
-#undef GTEST_2_TUPLE_
-#undef GTEST_3_TUPLE_
-#undef GTEST_4_TUPLE_
-#undef GTEST_5_TUPLE_
-#undef GTEST_6_TUPLE_
-#undef GTEST_7_TUPLE_
-#undef GTEST_8_TUPLE_
-#undef GTEST_9_TUPLE_
-#undef GTEST_10_TUPLE_
-
-#undef GTEST_0_TYPENAMES_
-#undef GTEST_1_TYPENAMES_
-#undef GTEST_2_TYPENAMES_
-#undef GTEST_3_TYPENAMES_
-#undef GTEST_4_TYPENAMES_
-#undef GTEST_5_TYPENAMES_
-#undef GTEST_6_TYPENAMES_
-#undef GTEST_7_TYPENAMES_
-#undef GTEST_8_TYPENAMES_
-#undef GTEST_9_TYPENAMES_
-#undef GTEST_10_TYPENAMES_
-
-#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
-#undef GTEST_BY_REF_
-#undef GTEST_ADD_REF_
-#undef GTEST_TUPLE_ELEMENT_
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h.pump b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h.pump
deleted file mode 100644
index 429ddfe..0000000
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h.pump
+++ /dev/null
@@ -1,347 +0,0 @@
-$$ -*- mode: c++; -*-
-$var n = 10  $$ Maximum number of tuple fields we want to support.
-$$ This meta comment fixes auto-indentation in Emacs. }}
-// Copyright 2009 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
-
-#include <utility>  // For ::std::pair.
-
-// The compiler used in Symbian has a bug that prevents us from declaring the
-// tuple template as a friend (it complains that tuple is redefined).  This
-// hack bypasses the bug by declaring the members that should otherwise be
-// private as public.
-// Sun Studio versions < 12 also have the above bug.
-#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
-# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
-#else
-# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
-    template <GTEST_$(n)_TYPENAMES_(U)> friend class tuple; \
-   private:
-#endif
-
-// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict
-// with our own definitions. Therefore using our own tuple does not work on
-// those compilers.
-#if defined(_MSC_VER) && _MSC_VER >= 1600  /* 1600 is Visual Studio 2010 */
-# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \
-GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers."
-#endif
-
-
-$range i 0..n-1
-$range j 0..n
-$range k 1..n
-// GTEST_n_TUPLE_(T) is the type of an n-tuple.
-#define GTEST_0_TUPLE_(T) tuple<>
-
-$for k [[
-$range m 0..k-1
-$range m2 k..n-1
-#define GTEST_$(k)_TUPLE_(T) tuple<$for m, [[T##$m]]$for m2 [[, void]]>
-
-]]
-
-// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
-
-$for j [[
-$range m 0..j-1
-#define GTEST_$(j)_TYPENAMES_(T) $for m, [[typename T##$m]]
-
-
-]]
-
-// In theory, defining stuff in the ::std namespace is undefined
-// behavior.  We can do this as we are playing the role of a standard
-// library vendor.
-namespace std {
-namespace tr1 {
-
-template <$for i, [[typename T$i = void]]>
-class tuple;
-
-// Anything in namespace gtest_internal is Google Test's INTERNAL
-// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
-namespace gtest_internal {
-
-// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
-template <typename T>
-struct ByRef { typedef const T& type; };  // NOLINT
-template <typename T>
-struct ByRef<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper for ByRef.
-#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
-
-// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
-// is the same as tr1::add_reference<T>::type.
-template <typename T>
-struct AddRef { typedef T& type; };  // NOLINT
-template <typename T>
-struct AddRef<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper for AddRef.
-#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
-
-// A helper for implementing get<k>().
-template <int k> class Get;
-
-// A helper for implementing tuple_element<k, T>.  kIndexValid is true
-// iff k < the number of fields in tuple type T.
-template <bool kIndexValid, int kIndex, class Tuple>
-struct TupleElement;
-
-
-$for i [[
-template <GTEST_$(n)_TYPENAMES_(T)>
-struct TupleElement<true, $i, GTEST_$(n)_TUPLE_(T) > {
-  typedef T$i type;
-};
-
-
-]]
-}  // namespace gtest_internal
-
-template <>
-class tuple<> {
- public:
-  tuple() {}
-  tuple(const tuple& /* t */)  {}
-  tuple& operator=(const tuple& /* t */) { return *this; }
-};
-
-
-$for k [[
-$range m 0..k-1
-template <GTEST_$(k)_TYPENAMES_(T)>
-class $if k < n [[GTEST_$(k)_TUPLE_(T)]] $else [[tuple]] {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : $for m, [[f$(m)_()]] {}
-
-  explicit tuple($for m, [[GTEST_BY_REF_(T$m) f$m]]) : [[]]
-$for m, [[f$(m)_(f$m)]] {}
-
-  tuple(const tuple& t) : $for m, [[f$(m)_(t.f$(m)_)]] {}
-
-  template <GTEST_$(k)_TYPENAMES_(U)>
-  tuple(const GTEST_$(k)_TUPLE_(U)& t) : $for m, [[f$(m)_(t.f$(m)_)]] {}
-
-$if k == 2 [[
-  template <typename U0, typename U1>
-  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
-
-]]
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_$(k)_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_$(k)_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-$if k == 2 [[
-  template <typename U0, typename U1>
-  tuple& operator=(const ::std::pair<U0, U1>& p) {
-    f0_ = p.first;
-    f1_ = p.second;
-    return *this;
-  }
-
-]]
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_$(k)_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_$(k)_TUPLE_(U)& t) {
-
-$for m [[
-    f$(m)_ = t.f$(m)_;
-
-]]
-    return *this;
-  }
-
-
-$for m [[
-  T$m f$(m)_;
-
-]]
-};
-
-
-]]
-// 6.1.3.2 Tuple creation functions.
-
-// Known limitations: we don't support passing an
-// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
-// implement tie().
-
-inline tuple<> make_tuple() { return tuple<>(); }
-
-$for k [[
-$range m 0..k-1
-
-template <GTEST_$(k)_TYPENAMES_(T)>
-inline GTEST_$(k)_TUPLE_(T) make_tuple($for m, [[const T$m& f$m]]) {
-  return GTEST_$(k)_TUPLE_(T)($for m, [[f$m]]);
-}
-
-]]
-
-// 6.1.3.3 Tuple helper classes.
-
-template <typename Tuple> struct tuple_size;
-
-
-$for j [[
-template <GTEST_$(j)_TYPENAMES_(T)>
-struct tuple_size<GTEST_$(j)_TUPLE_(T) > {
-  static const int value = $j;
-};
-
-
-]]
-template <int k, class Tuple>
-struct tuple_element {
-  typedef typename gtest_internal::TupleElement<
-      k < (tuple_size<Tuple>::value), k, Tuple>::type type;
-};
-
-#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
-
-// 6.1.3.4 Element access.
-
-namespace gtest_internal {
-
-
-$for i [[
-template <>
-class Get<$i> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple))
-  Field(Tuple& t) { return t.f$(i)_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple))
-  ConstField(const Tuple& t) { return t.f$(i)_; }
-};
-
-
-]]
-}  // namespace gtest_internal
-
-template <int k, GTEST_$(n)_TYPENAMES_(T)>
-GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T)))
-get(GTEST_$(n)_TUPLE_(T)& t) {
-  return gtest_internal::Get<k>::Field(t);
-}
-
-template <int k, GTEST_$(n)_TYPENAMES_(T)>
-GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_$(n)_TUPLE_(T)))
-get(const GTEST_$(n)_TUPLE_(T)& t) {
-  return gtest_internal::Get<k>::ConstField(t);
-}
-
-// 6.1.3.5 Relational operators
-
-// We only implement == and !=, as we don't have a need for the rest yet.
-
-namespace gtest_internal {
-
-// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
-// first k fields of t1 equals the first k fields of t2.
-// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if
-// k1 != k2.
-template <int kSize1, int kSize2>
-struct SameSizeTuplePrefixComparator;
-
-template <>
-struct SameSizeTuplePrefixComparator<0, 0> {
-  template <class Tuple1, class Tuple2>
-  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
-    return true;
-  }
-};
-
-template <int k>
-struct SameSizeTuplePrefixComparator<k, k> {
-  template <class Tuple1, class Tuple2>
-  static bool Eq(const Tuple1& t1, const Tuple2& t2) {
-    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
-        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
-  }
-};
-
-}  // namespace gtest_internal
-
-template <GTEST_$(n)_TYPENAMES_(T), GTEST_$(n)_TYPENAMES_(U)>
-inline bool operator==(const GTEST_$(n)_TUPLE_(T)& t,
-                       const GTEST_$(n)_TUPLE_(U)& u) {
-  return gtest_internal::SameSizeTuplePrefixComparator<
-      tuple_size<GTEST_$(n)_TUPLE_(T) >::value,
-      tuple_size<GTEST_$(n)_TUPLE_(U) >::value>::Eq(t, u);
-}
-
-template <GTEST_$(n)_TYPENAMES_(T), GTEST_$(n)_TYPENAMES_(U)>
-inline bool operator!=(const GTEST_$(n)_TUPLE_(T)& t,
-                       const GTEST_$(n)_TUPLE_(U)& u) { return !(t == u); }
-
-// 6.1.4 Pairs.
-// Unimplemented.
-
-}  // namespace tr1
-}  // namespace std
-
-
-$for j [[
-#undef GTEST_$(j)_TUPLE_
-
-]]
-
-
-$for j [[
-#undef GTEST_$(j)_TYPENAMES_
-
-]]
-
-#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
-#undef GTEST_BY_REF_
-#undef GTEST_ADD_REF_
-#undef GTEST_TUPLE_ELEMENT_
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
index e46f7cf..3b3a651 100644
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
+++ b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
@@ -1,7 +1,3 @@
-// This file was GENERATED by command:
-//     pump.py gtest-type-util.h.pump
-// DO NOT EDIT BY HAND!!!
-
 // Copyright 2008 Google Inc.
 // All Rights Reserved.
 //
@@ -30,16 +26,11 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
 
 // Type utilities needed for implementing typed and type-parameterized
-// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently we support at most 50 types in a list, and at most 50
-// type-parameterized tests in one type-parameterized test case.
-// Please contact googletestframework@googlegroups.com if you need
-// more.
+// tests.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
 
 #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
 #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
@@ -48,1578 +39,69 @@
 
 // #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
 // libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-#  include <cxxabi.h>
-# elif defined(__HP_aCC)
-#  include <acxx_demangle.h>
-# endif  // GTEST_HASH_CXXABI_H_
+#if GTEST_HAS_CXXABI_H_
+#include <cxxabi.h>
+#elif defined(__HP_aCC)
+#include <acxx_demangle.h>
+#endif  // GTEST_HASH_CXXABI_H_
 
 namespace testing {
 namespace internal {
 
+// Canonicalizes a given name with respect to the Standard C++ Library.
+// This handles removing the inline namespace within `std` that is
+// used by various standard libraries (e.g., `std::__1`).  Names outside
+// of namespace std are returned unmodified.
+inline std::string CanonicalizeForStdLibVersioning(std::string s) {
+  static const char prefix[] = "std::__";
+  if (s.compare(0, strlen(prefix), prefix) == 0) {
+    std::string::size_type end = s.find("::", strlen(prefix));
+    if (end != s.npos) {
+      // Erase everything between the initial `std` and the second `::`.
+      s.erase(strlen("std"), end - strlen("std"));
+    }
+  }
+  return s;
+}
+
 // GetTypeName<T>() returns a human-readable name of type T.
 // NB: This function is also used in Google Mock, so don't move it inside of
 // the typed-test-only section below.
 template <typename T>
 std::string GetTypeName() {
-# if GTEST_HAS_RTTI
+#if GTEST_HAS_RTTI
 
-  const char* const name = typeid(T).name();
-#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  const char *const name = typeid(T).name();
+#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
   int status = 0;
   // gcc's implementation of typeid(T).name() mangles the type name,
   // so we have to demangle it.
-#   if GTEST_HAS_CXXABI_H_
+#if GTEST_HAS_CXXABI_H_
   using abi::__cxa_demangle;
-#   endif  // GTEST_HAS_CXXABI_H_
-  char* const readable_name = __cxa_demangle(name, 0, 0, &status);
+#endif  // GTEST_HAS_CXXABI_H_
+  char *const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
   const std::string name_str(status == 0 ? readable_name : name);
   free(readable_name);
-  return name_str;
-#  else
+  return CanonicalizeForStdLibVersioning(name_str);
+#else
   return name;
-#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+#endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
 
-# else
+#else
 
   return "<type>";
 
-# endif  // GTEST_HAS_RTTI
+#endif  // GTEST_HAS_RTTI
 }
 
 #if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
 
-// AssertyTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
-// type.  This can be used as a compile-time assertion to ensure that
-// two types are equal.
-
-template <typename T1, typename T2>
-struct AssertTypeEq;
-
-template <typename T>
-struct AssertTypeEq<T, T> {
-  typedef bool type;
-};
-
-// A unique type used as the default value for the arguments of class
-// template Types.  This allows us to simulate variadic templates
-// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't
-// support directly.
+// A unique type indicating an empty node
 struct None {};
 
-// The following family of struct and struct templates are used to
-// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
-// represents a type list with N types (T1, T2, ..., and TN) in it.
-// Except for Types0, every struct in the family has two member types:
-// Head for the first type in the list, and Tail for the rest of the
-// list.
-
-// The empty type list.
-struct Types0 {};
-
-// Type lists of length 1, 2, 3, and so on.
-
-template <typename T1>
-struct Types1 {
-  typedef T1 Head;
-  typedef Types0 Tail;
-};
-template <typename T1, typename T2>
-struct Types2 {
-  typedef T1 Head;
-  typedef Types1<T2> Tail;
-};
-
-template <typename T1, typename T2, typename T3>
-struct Types3 {
-  typedef T1 Head;
-  typedef Types2<T2, T3> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4>
-struct Types4 {
-  typedef T1 Head;
-  typedef Types3<T2, T3, T4> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-struct Types5 {
-  typedef T1 Head;
-  typedef Types4<T2, T3, T4, T5> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-struct Types6 {
-  typedef T1 Head;
-  typedef Types5<T2, T3, T4, T5, T6> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-struct Types7 {
-  typedef T1 Head;
-  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-struct Types8 {
-  typedef T1 Head;
-  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-struct Types9 {
-  typedef T1 Head;
-  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-struct Types10 {
-  typedef T1 Head;
-  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-struct Types11 {
-  typedef T1 Head;
-  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-struct Types12 {
-  typedef T1 Head;
-  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-struct Types13 {
-  typedef T1 Head;
-  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-struct Types14 {
-  typedef T1 Head;
-  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-struct Types15 {
-  typedef T1 Head;
-  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-struct Types16 {
-  typedef T1 Head;
-  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-struct Types17 {
-  typedef T1 Head;
-  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-struct Types18 {
-  typedef T1 Head;
-  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-struct Types19 {
-  typedef T1 Head;
-  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-struct Types20 {
-  typedef T1 Head;
-  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-struct Types21 {
-  typedef T1 Head;
-  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-struct Types22 {
-  typedef T1 Head;
-  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-struct Types23 {
-  typedef T1 Head;
-  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-struct Types24 {
-  typedef T1 Head;
-  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-struct Types25 {
-  typedef T1 Head;
-  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-struct Types26 {
-  typedef T1 Head;
-  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-struct Types27 {
-  typedef T1 Head;
-  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-struct Types28 {
-  typedef T1 Head;
-  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-struct Types29 {
-  typedef T1 Head;
-  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-struct Types30 {
-  typedef T1 Head;
-  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-struct Types31 {
-  typedef T1 Head;
-  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-struct Types32 {
-  typedef T1 Head;
-  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-struct Types33 {
-  typedef T1 Head;
-  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-struct Types34 {
-  typedef T1 Head;
-  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-struct Types35 {
-  typedef T1 Head;
-  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-struct Types36 {
-  typedef T1 Head;
-  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-struct Types37 {
-  typedef T1 Head;
-  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-struct Types38 {
-  typedef T1 Head;
-  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-struct Types39 {
-  typedef T1 Head;
-  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-struct Types40 {
-  typedef T1 Head;
-  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-struct Types41 {
-  typedef T1 Head;
-  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-struct Types42 {
-  typedef T1 Head;
-  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-struct Types43 {
-  typedef T1 Head;
-  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-struct Types44 {
-  typedef T1 Head;
-  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-struct Types45 {
-  typedef T1 Head;
-  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-struct Types46 {
-  typedef T1 Head;
-  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-struct Types47 {
-  typedef T1 Head;
-  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-struct Types48 {
-  typedef T1 Head;
-  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-struct Types49 {
-  typedef T1 Head;
-  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48, T49> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-struct Types50 {
-  typedef T1 Head;
-  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48, T49, T50> Tail;
-};
-
-
-}  // namespace internal
-
-// We don't want to require the users to write TypesN<...> directly,
-// as that would require them to count the length.  Types<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Types<int>
-// will appear as Types<int, None, None, ..., None> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Types<T1, ..., TN>, and Google Test will translate
-// that to TypesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Types template.
-template <typename T1 = internal::None, typename T2 = internal::None,
-    typename T3 = internal::None, typename T4 = internal::None,
-    typename T5 = internal::None, typename T6 = internal::None,
-    typename T7 = internal::None, typename T8 = internal::None,
-    typename T9 = internal::None, typename T10 = internal::None,
-    typename T11 = internal::None, typename T12 = internal::None,
-    typename T13 = internal::None, typename T14 = internal::None,
-    typename T15 = internal::None, typename T16 = internal::None,
-    typename T17 = internal::None, typename T18 = internal::None,
-    typename T19 = internal::None, typename T20 = internal::None,
-    typename T21 = internal::None, typename T22 = internal::None,
-    typename T23 = internal::None, typename T24 = internal::None,
-    typename T25 = internal::None, typename T26 = internal::None,
-    typename T27 = internal::None, typename T28 = internal::None,
-    typename T29 = internal::None, typename T30 = internal::None,
-    typename T31 = internal::None, typename T32 = internal::None,
-    typename T33 = internal::None, typename T34 = internal::None,
-    typename T35 = internal::None, typename T36 = internal::None,
-    typename T37 = internal::None, typename T38 = internal::None,
-    typename T39 = internal::None, typename T40 = internal::None,
-    typename T41 = internal::None, typename T42 = internal::None,
-    typename T43 = internal::None, typename T44 = internal::None,
-    typename T45 = internal::None, typename T46 = internal::None,
-    typename T47 = internal::None, typename T48 = internal::None,
-    typename T49 = internal::None, typename T50 = internal::None>
-struct Types {
-  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
-};
-
-template <>
-struct Types<internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types0 type;
-};
-template <typename T1>
-struct Types<T1, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types1<T1> type;
-};
-template <typename T1, typename T2>
-struct Types<T1, T2, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types2<T1, T2> type;
-};
-template <typename T1, typename T2, typename T3>
-struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types3<T1, T2, T3> type;
-};
-template <typename T1, typename T2, typename T3, typename T4>
-struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types4<T1, T2, T3, T4> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types5<T1, T2, T3, T4, T5> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, internal::None, internal::None, internal::None> {
-  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, T48, internal::None, internal::None> {
-  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, T48, T49, internal::None> {
-  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
-};
-
-namespace internal {
-
-# define GTEST_TEMPLATE_ template <typename T> class
+#define GTEST_TEMPLATE_ \
+  template <typename T> \
+  class
 
 // The template "selector" struct TemplateSel<Tmpl> is used to
 // represent Tmpl, which must be a class template with one type
@@ -1637,1695 +119,66 @@
   };
 };
 
-# define GTEST_BIND_(TmplSel, T) \
-  TmplSel::template Bind<T>::type
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
 
-// A unique struct template used as the default value for the
-// arguments of class template Templates.  This allows us to simulate
-// variadic templates (e.g. Templates<int>, Templates<int, double>,
-// and etc), which C++ doesn't support directly.
-template <typename T>
-struct NoneT {};
-
-// The following family of struct and struct templates are used to
-// represent template lists.  In particular, TemplatesN<T1, T2, ...,
-// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
-// for Templates0, every struct in the family has two member types:
-// Head for the selector of the first template in the list, and Tail
-// for the rest of the list.
-
-// The empty template list.
-struct Templates0 {};
-
-// Template lists of length 1, 2, 3, and so on.
-
-template <GTEST_TEMPLATE_ T1>
-struct Templates1 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates0 Tail;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
-struct Templates2 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates1<T2> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
-struct Templates3 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates2<T2, T3> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4>
-struct Templates4 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates3<T2, T3, T4> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
-struct Templates5 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates4<T2, T3, T4, T5> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
-struct Templates6 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates5<T2, T3, T4, T5, T6> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7>
-struct Templates7 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
-struct Templates8 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
-struct Templates9 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10>
-struct Templates10 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
-struct Templates11 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
-struct Templates12 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13>
-struct Templates13 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
-struct Templates14 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
-struct Templates15 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16>
-struct Templates16 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
-struct Templates17 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
-struct Templates18 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19>
-struct Templates19 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
-struct Templates20 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
-struct Templates21 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22>
-struct Templates22 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
-struct Templates23 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
-struct Templates24 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25>
-struct Templates25 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
-struct Templates26 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
-struct Templates27 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28>
-struct Templates28 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
-struct Templates29 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
-struct Templates30 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31>
-struct Templates31 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
-struct Templates32 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
-struct Templates33 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34>
-struct Templates34 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
-struct Templates35 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
-struct Templates36 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37>
-struct Templates37 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
-struct Templates38 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
-struct Templates39 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40>
-struct Templates40 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
-struct Templates41 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
-struct Templates42 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43>
-struct Templates43 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
-struct Templates44 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
-struct Templates45 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46>
-struct Templates46 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
-struct Templates47 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
-struct Templates48 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49>
-struct Templates49 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48, T49> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
-struct Templates50 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48, T49, T50> Tail;
-};
-
-
-// We don't want to require the users to write TemplatesN<...> directly,
-// as that would require them to count the length.  Templates<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Templates<list>
-// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Templates<T1, ..., TN>, and Google Test will translate
-// that to TemplatesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Templates template.
-template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
-    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
-    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
-    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
-    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
-    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
-    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
-    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
-    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
-    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
-    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
-    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
-    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
-    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
-    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
-    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
-    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
-    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
-    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
-    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
-    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
-    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
-    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
-    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
-    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
+template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
 struct Templates {
-  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+  using Head = TemplateSel<Head_>;
+  using Tail = Templates<Tail_...>;
 };
 
-template <>
-struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates0 type;
-};
-template <GTEST_TEMPLATE_ T1>
-struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates1<T1> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
-struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates2<T1, T2> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
-struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates3<T1, T2, T3> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4>
-struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates4<T1, T2, T3, T4> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
-struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates5<T1, T2, T3, T4, T5> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
-struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, NoneT, NoneT, NoneT> {
-  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, T48, NoneT, NoneT> {
-  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, T48, T49, NoneT> {
-  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48, T49> type;
+template <GTEST_TEMPLATE_ Head_>
+struct Templates<Head_> {
+  using Head = TemplateSel<Head_>;
+  using Tail = None;
 };
 
-// The TypeList template makes it possible to use either a single type
-// or a Types<...> list in TYPED_TEST_CASE() and
-// INSTANTIATE_TYPED_TEST_CASE_P().
+// Tuple-like type lists
+template <typename Head_, typename... Tail_>
+struct Types {
+  using Head = Head_;
+  using Tail = Types<Tail_...>;
+};
 
+template <typename Head_>
+struct Types<Head_> {
+  using Head = Head_;
+  using Tail = None;
+};
+
+// Helper metafunctions to tell apart a single type from types
+// generated by ::testing::Types
+template <typename... Ts>
+struct ProxyTypeList {
+  using type = Types<Ts...>;
+};
+
+template <typename>
+struct is_proxy_type_list : std::false_type {};
+
+template <typename... Ts>
+struct is_proxy_type_list<ProxyTypeList<Ts...>> : std::true_type {};
+
+// Generator which conditionally creates type lists.
+// It recognizes if a requested type list should be created
+// and prevents creating a new type list nested within another one.
 template <typename T>
-struct TypeList {
-  typedef Types1<T> type;
-};
+struct GenerateTypeList {
+ private:
+  using proxy = typename std::conditional<is_proxy_type_list<T>::value, T,
+                                          ProxyTypeList<T>>::type;
 
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48, T49, T50> > {
-  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+ public:
+  using type = typename proxy::type;
 };
 
 #endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
 
 }  // namespace internal
+
+template <typename... Ts>
+using Types = internal::ProxyTypeList<Ts...>;
+
 }  // namespace testing
 
 #endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h.pump b/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h.pump
deleted file mode 100644
index 251fdf0..0000000
--- a/libaom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h.pump
+++ /dev/null
@@ -1,297 +0,0 @@
-$$ -*- mode: c++; -*-
-$var n = 50  $$ Maximum length of type lists we want to support.
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-// Type utilities needed for implementing typed and type-parameterized
-// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently we support at most $n types in a list, and at most $n
-// type-parameterized tests in one type-parameterized test case.
-// Please contact googletestframework@googlegroups.com if you need
-// more.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-
-#include "gtest/internal/gtest-port.h"
-
-// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
-// libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-#  include <cxxabi.h>
-# elif defined(__HP_aCC)
-#  include <acxx_demangle.h>
-# endif  // GTEST_HASH_CXXABI_H_
-
-namespace testing {
-namespace internal {
-
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
-# if GTEST_HAS_RTTI
-
-  const char* const name = typeid(T).name();
-#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
-  int status = 0;
-  // gcc's implementation of typeid(T).name() mangles the type name,
-  // so we have to demangle it.
-#   if GTEST_HAS_CXXABI_H_
-  using abi::__cxa_demangle;
-#   endif  // GTEST_HAS_CXXABI_H_
-  char* const readable_name = __cxa_demangle(name, 0, 0, &status);
-  const std::string name_str(status == 0 ? readable_name : name);
-  free(readable_name);
-  return name_str;
-#  else
-  return name;
-#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
-
-# else
-
-  return "<type>";
-
-# endif  // GTEST_HAS_RTTI
-}
-
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// AssertyTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
-// type.  This can be used as a compile-time assertion to ensure that
-// two types are equal.
-
-template <typename T1, typename T2>
-struct AssertTypeEq;
-
-template <typename T>
-struct AssertTypeEq<T, T> {
-  typedef bool type;
-};
-
-// A unique type used as the default value for the arguments of class
-// template Types.  This allows us to simulate variadic templates
-// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't
-// support directly.
-struct None {};
-
-// The following family of struct and struct templates are used to
-// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
-// represents a type list with N types (T1, T2, ..., and TN) in it.
-// Except for Types0, every struct in the family has two member types:
-// Head for the first type in the list, and Tail for the rest of the
-// list.
-
-// The empty type list.
-struct Types0 {};
-
-// Type lists of length 1, 2, 3, and so on.
-
-template <typename T1>
-struct Types1 {
-  typedef T1 Head;
-  typedef Types0 Tail;
-};
-
-$range i 2..n
-
-$for i [[
-$range j 1..i
-$range k 2..i
-template <$for j, [[typename T$j]]>
-struct Types$i {
-  typedef T1 Head;
-  typedef Types$(i-1)<$for k, [[T$k]]> Tail;
-};
-
-
-]]
-
-}  // namespace internal
-
-// We don't want to require the users to write TypesN<...> directly,
-// as that would require them to count the length.  Types<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Types<int>
-// will appear as Types<int, None, None, ..., None> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Types<T1, ..., TN>, and Google Test will translate
-// that to TypesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Types template.
-
-$range i 1..n
-template <$for i, [[typename T$i = internal::None]]>
-struct Types {
-  typedef internal::Types$n<$for i, [[T$i]]> type;
-};
-
-template <>
-struct Types<$for i, [[internal::None]]> {
-  typedef internal::Types0 type;
-};
-
-$range i 1..n-1
-$for i [[
-$range j 1..i
-$range k i+1..n
-template <$for j, [[typename T$j]]>
-struct Types<$for j, [[T$j]]$for k[[, internal::None]]> {
-  typedef internal::Types$i<$for j, [[T$j]]> type;
-};
-
-]]
-
-namespace internal {
-
-# define GTEST_TEMPLATE_ template <typename T> class
-
-// The template "selector" struct TemplateSel<Tmpl> is used to
-// represent Tmpl, which must be a class template with one type
-// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
-// as the type Tmpl<T>.  This allows us to actually instantiate the
-// template "selected" by TemplateSel<Tmpl>.
-//
-// This trick is necessary for simulating typedef for class templates,
-// which C++ doesn't support directly.
-template <GTEST_TEMPLATE_ Tmpl>
-struct TemplateSel {
-  template <typename T>
-  struct Bind {
-    typedef Tmpl<T> type;
-  };
-};
-
-# define GTEST_BIND_(TmplSel, T) \
-  TmplSel::template Bind<T>::type
-
-// A unique struct template used as the default value for the
-// arguments of class template Templates.  This allows us to simulate
-// variadic templates (e.g. Templates<int>, Templates<int, double>,
-// and etc), which C++ doesn't support directly.
-template <typename T>
-struct NoneT {};
-
-// The following family of struct and struct templates are used to
-// represent template lists.  In particular, TemplatesN<T1, T2, ...,
-// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
-// for Templates0, every struct in the family has two member types:
-// Head for the selector of the first template in the list, and Tail
-// for the rest of the list.
-
-// The empty template list.
-struct Templates0 {};
-
-// Template lists of length 1, 2, 3, and so on.
-
-template <GTEST_TEMPLATE_ T1>
-struct Templates1 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates0 Tail;
-};
-
-$range i 2..n
-
-$for i [[
-$range j 1..i
-$range k 2..i
-template <$for j, [[GTEST_TEMPLATE_ T$j]]>
-struct Templates$i {
-  typedef TemplateSel<T1> Head;
-  typedef Templates$(i-1)<$for k, [[T$k]]> Tail;
-};
-
-
-]]
-
-// We don't want to require the users to write TemplatesN<...> directly,
-// as that would require them to count the length.  Templates<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Templates<list>
-// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Templates<T1, ..., TN>, and Google Test will translate
-// that to TemplatesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Templates template.
-
-$range i 1..n
-template <$for i, [[GTEST_TEMPLATE_ T$i = NoneT]]>
-struct Templates {
-  typedef Templates$n<$for i, [[T$i]]> type;
-};
-
-template <>
-struct Templates<$for i, [[NoneT]]> {
-  typedef Templates0 type;
-};
-
-$range i 1..n-1
-$for i [[
-$range j 1..i
-$range k i+1..n
-template <$for j, [[GTEST_TEMPLATE_ T$j]]>
-struct Templates<$for j, [[T$j]]$for k[[, NoneT]]> {
-  typedef Templates$i<$for j, [[T$j]]> type;
-};
-
-]]
-
-// The TypeList template makes it possible to use either a single type
-// or a Types<...> list in TYPED_TEST_CASE() and
-// INSTANTIATE_TYPED_TEST_CASE_P().
-
-template <typename T>
-struct TypeList {
-  typedef Types1<T> type;
-};
-
-
-$range i 1..n
-template <$for i, [[typename T$i]]>
-struct TypeList<Types<$for i, [[T$i]]> > {
-  typedef typename Types<$for i, [[T$i]]>::type type;
-};
-
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-all.cc b/libaom/third_party/googletest/src/googletest/src/gtest-all.cc
index 0a9cee5..ad29290 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest-all.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-all.cc
@@ -26,10 +26,9 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 //
-// Author: mheule@google.com (Markus Heule)
-//
-// Google C++ Testing Framework (Google Test)
+// Google C++ Testing and Mocking Framework (Google Test)
 //
 // Sometimes it's desirable to build Google Test by compiling a single file.
 // This file serves this purpose.
@@ -42,6 +41,7 @@
 #include "src/gtest.cc"
 #include "src/gtest-death-test.cc"
 #include "src/gtest-filepath.cc"
+#include "src/gtest-matchers.cc"
 #include "src/gtest-port.cc"
 #include "src/gtest-printers.cc"
 #include "src/gtest-test-part.cc"
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-death-test.cc b/libaom/third_party/googletest/src/googletest/src/gtest-death-test.cc
index a01a369..c38551c 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest-death-test.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-death-test.cc
@@ -26,62 +26,74 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
+
 //
 // This file implements death tests.
 
 #include "gtest/gtest-death-test.h"
+
+#include <utility>
+
 #include "gtest/internal/gtest-port.h"
 #include "gtest/internal/custom/gtest.h"
 
 #if GTEST_HAS_DEATH_TEST
 
-# if GTEST_OS_MAC
-#  include <crt_externs.h>
-# endif  // GTEST_OS_MAC
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif  // GTEST_OS_MAC
 
-# include <errno.h>
-# include <fcntl.h>
-# include <limits.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
 
-# if GTEST_OS_LINUX
-#  include <signal.h>
-# endif  // GTEST_OS_LINUX
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif  // GTEST_OS_LINUX
 
-# include <stdarg.h>
+#include <stdarg.h>
 
-# if GTEST_OS_WINDOWS
-#  include <windows.h>
-# else
-#  include <sys/mman.h>
-#  include <sys/wait.h>
-# endif  // GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif  // GTEST_OS_WINDOWS
 
-# if GTEST_OS_QNX
-#  include <spawn.h>
-# endif  // GTEST_OS_QNX
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif  // GTEST_OS_FUCHSIA
 
 #endif  // GTEST_HAS_DEATH_TEST
 
 #include "gtest/gtest-message.h"
 #include "gtest/internal/gtest-string.h"
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick exists to
-// prevent the accidental inclusion of gtest-internal-inl.h in the
-// user's code.
-#define GTEST_IMPLEMENTATION_ 1
 #include "src/gtest-internal-inl.h"
-#undef GTEST_IMPLEMENTATION_
 
 namespace testing {
 
 // Constants.
 
 // The default death test style.
-static const char kDefaultDeathTestStyle[] = "fast";
+//
+// This is defined in internal/gtest-port.h as "fast", but can be overridden by
+// a definition in internal/custom/gtest-port.h. The recommended value, which is
+// used internally at Google, is "threadsafe".
+static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
 
 GTEST_DEFINE_string_(
     death_test_style,
@@ -110,8 +122,8 @@
     "Indicates the file, line number, temporal index of "
     "the single death test to run, and a file descriptor to "
     "which a success code may be sent, all separated by "
-    "the '|' characters.  This flag is specified if and only if the current "
-    "process is a sub-process launched for running a thread-safe "
+    "the '|' characters.  This flag is specified if and only if the "
+    "current process is a sub-process launched for running a thread-safe "
     "death test.  FOR INTERNAL USE ONLY.");
 }  // namespace internal
 
@@ -121,9 +133,9 @@
 
 // Valid only for fast death tests. Indicates the code is running in the
 // child process of a fast style death test.
-# if !GTEST_OS_WINDOWS
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 static bool g_in_fast_death_test_child = false;
-# endif
+#endif
 
 // Returns a Boolean value indicating whether the caller is currently
 // executing in the context of the death test child process.  Tools such as
@@ -131,13 +143,13 @@
 // tests.  IMPORTANT: This is an internal utility.  Using it may break the
 // implementation of death tests.  User code MUST NOT use it.
 bool InDeathTestChild() {
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
-  // On Windows, death tests are thread-safe regardless of the value of the
-  // death_test_style flag.
+  // On Windows and Fuchsia, death tests are thread-safe regardless of the value
+  // of the death_test_style flag.
   return !GTEST_FLAG(internal_run_death_test).empty();
 
-# else
+#else
 
   if (GTEST_FLAG(death_test_style) == "threadsafe")
     return !GTEST_FLAG(internal_run_death_test).empty();
@@ -149,40 +161,38 @@
 }  // namespace internal
 
 // ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
-}
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
 
 // ExitedWithCode function-call operator.
 bool ExitedWithCode::operator()(int exit_status) const {
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   return exit_status == exit_code_;
 
-# else
+#else
 
   return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
 
-# endif  // GTEST_OS_WINDOWS
+#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 }
 
-# if !GTEST_OS_WINDOWS
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
-}
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
 
 // KilledBySignal function-call operator.
 bool KilledBySignal::operator()(int exit_status) const {
-#  if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
   {
     bool result;
     if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
       return result;
     }
   }
-#  endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
   return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
 }
-# endif  // !GTEST_OS_WINDOWS
+#endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 
 namespace internal {
 
@@ -193,23 +203,23 @@
 static std::string ExitSummary(int exit_code) {
   Message m;
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   m << "Exited with exit status " << exit_code;
 
-# else
+#else
 
   if (WIFEXITED(exit_code)) {
     m << "Exited with exit status " << WEXITSTATUS(exit_code);
   } else if (WIFSIGNALED(exit_code)) {
     m << "Terminated by signal " << WTERMSIG(exit_code);
   }
-#  ifdef WCOREDUMP
+#ifdef WCOREDUMP
   if (WCOREDUMP(exit_code)) {
     m << " (core dumped)";
   }
-#  endif
-# endif  // GTEST_OS_WINDOWS
+#endif
+#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   return m.GetString();
 }
@@ -220,7 +230,7 @@
   return !ExitedWithCode(0)(exit_status);
 }
 
-# if !GTEST_OS_WINDOWS
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // Generates a textual failure message when a death test finds more than
 // one thread running, or cannot determine the number of threads, prior
 // to executing the given statement.  It is the responsibility of the
@@ -229,13 +239,19 @@
   Message msg;
   msg << "Death tests use fork(), which is unsafe particularly"
       << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
-  if (thread_count == 0)
+  if (thread_count == 0) {
     msg << "couldn't detect the number of threads.";
-  else
+  } else {
     msg << "detected " << thread_count << " threads.";
+  }
+  msg << " See "
+         "https://github.com/google/googletest/blob/master/googletest/docs/"
+         "advanced.md#death-tests-and-threads"
+      << " for more explanation and suggested solutions, especially if"
+      << " this is the last message you see before your test times out.";
   return msg.GetString();
 }
-# endif  // !GTEST_OS_WINDOWS
+#endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 
 // Flag characters for reporting a death test that did not die.
 static const char kDeathTestLived = 'L';
@@ -243,6 +259,13 @@
 static const char kDeathTestThrew = 'T';
 static const char kDeathTestInternalError = 'I';
 
+#if GTEST_OS_FUCHSIA
+
+// File descriptor used for the pipe in the child process.
+static const int kFuchsiaReadPipeFd = 3;
+
+#endif
+
 // An enumeration describing all of the possible ways that a death test can
 // conclude.  DIED means that the process died while executing the test
 // code; LIVED means that process lived beyond the end of the test code;
@@ -250,8 +273,6 @@
 // statement, which is not allowed; THREW means that the test statement
 // returned control by throwing an exception.  IN_PROGRESS means the test
 // has not yet concluded.
-// TODO(vladl@google.com): Unify names and possibly values for
-// AbortReason, DeathTestOutcome, and flag characters above.
 enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
 
 // Routine for aborting the program which is safe to call from an
@@ -259,14 +280,14 @@
 // message is propagated back to the parent process.  Otherwise, the
 // message is simply printed to stderr.  In either case, the program
 // then exits with status 1.
-void DeathTestAbort(const std::string& message) {
+static void DeathTestAbort(const std::string &message) {
   // On a POSIX system, this function may be called from a threadsafe-style
   // death test child process, which operates on a very small stack.  Use
   // the heap for any additional non-minuscule memory requirements.
-  const InternalRunDeathTestFlag* const flag =
+  const InternalRunDeathTestFlag *const flag =
       GetUnitTestImpl()->internal_run_death_test_flag();
-  if (flag != NULL) {
-    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+  if (flag != nullptr) {
+    FILE *parent = posix::FDOpen(flag->write_fd(), "w");
     fputc(kDeathTestInternalError, parent);
     fprintf(parent, "%s", message.c_str());
     fflush(parent);
@@ -280,14 +301,14 @@
 
 // A replacement for CHECK that calls DeathTestAbort if the assertion
 // fails.
-# define GTEST_DEATH_TEST_CHECK_(expression) \
-  do { \
-    if (!::testing::internal::IsTrue(expression)) { \
-      DeathTestAbort( \
-          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
-          + ::testing::internal::StreamableToString(__LINE__) + ": " \
-          + #expression); \
-    } \
+#define GTEST_DEATH_TEST_CHECK_(expression)                              \
+  do {                                                                   \
+    if (!::testing::internal::IsTrue(expression)) {                      \
+      DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ +   \
+                     ", line " +                                         \
+                     ::testing::internal::StreamableToString(__LINE__) + \
+                     ": " + #expression);                                \
+    }                                                                    \
   } while (::testing::internal::AlwaysFalse())
 
 // This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
@@ -297,23 +318,23 @@
 // evaluates the expression as long as it evaluates to -1 and sets
 // errno to EINTR.  If the expression evaluates to -1 but errno is
 // something other than EINTR, DeathTestAbort is called.
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
-  do { \
-    int gtest_retval; \
-    do { \
-      gtest_retval = (expression); \
-    } while (gtest_retval == -1 && errno == EINTR); \
-    if (gtest_retval == -1) { \
-      DeathTestAbort( \
-          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
-          + ::testing::internal::StreamableToString(__LINE__) + ": " \
-          + #expression + " != -1"); \
-    } \
+#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression)                      \
+  do {                                                                   \
+    int gtest_retval;                                                    \
+    do {                                                                 \
+      gtest_retval = (expression);                                       \
+    } while (gtest_retval == -1 && errno == EINTR);                      \
+    if (gtest_retval == -1) {                                            \
+      DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ +   \
+                     ", line " +                                         \
+                     ::testing::internal::StreamableToString(__LINE__) + \
+                     ": " + #expression + " != -1");                     \
+    }                                                                    \
   } while (::testing::internal::AlwaysFalse())
 
 // Returns the message describing the last system error in errno.
 std::string GetLastErrnoDescription() {
-    return errno == 0 ? "" : posix::StrError(errno);
+  return errno == 0 ? "" : posix::StrError(errno);
 }
 
 // This is called from a death test parent process to read a failure
@@ -344,26 +365,28 @@
 // Death test constructor.  Increments the running death test count
 // for the current test.
 DeathTest::DeathTest() {
-  TestInfo* const info = GetUnitTestImpl()->current_test_info();
-  if (info == NULL) {
-    DeathTestAbort("Cannot run a death test outside of a TEST or "
-                   "TEST_F construct");
+  TestInfo *const info = GetUnitTestImpl()->current_test_info();
+  if (info == nullptr) {
+    DeathTestAbort(
+        "Cannot run a death test outside of a TEST or "
+        "TEST_F construct");
   }
 }
 
 // Creates and returns a death test by dispatching to the current
 // death test factory.
-bool DeathTest::Create(const char* statement, const RE* regex,
-                       const char* file, int line, DeathTest** test) {
+bool DeathTest::Create(const char *statement,
+                       Matcher<const std::string &> matcher, const char *file,
+                       int line, DeathTest **test) {
   return GetUnitTestImpl()->death_test_factory()->Create(
-      statement, regex, file, line, test);
+      statement, std::move(matcher), file, line, test);
 }
 
-const char* DeathTest::LastMessage() {
+const char *DeathTest::LastMessage() {
   return last_death_test_message_.c_str();
 }
 
-void DeathTest::set_last_death_test_message(const std::string& message) {
+void DeathTest::set_last_death_test_message(const std::string &message) {
   last_death_test_message_ = message;
 }
 
@@ -372,23 +395,17 @@
 // Provides cross platform implementation for some death functionality.
 class DeathTestImpl : public DeathTest {
  protected:
-  DeathTestImpl(const char* a_statement, const RE* a_regex)
-      : statement_(a_statement),
-        regex_(a_regex),
-        spawned_(false),
-        status_(-1),
-        outcome_(IN_PROGRESS),
-        read_fd_(-1),
-        write_fd_(-1) {}
+  DeathTestImpl(const char *a_statement, Matcher<const std::string &> matcher)
+      : statement_(a_statement), matcher_(std::move(matcher)), spawned_(false),
+        status_(-1), outcome_(IN_PROGRESS), read_fd_(-1), write_fd_(-1) {}
 
   // read_fd_ is expected to be closed and cleared by a derived class.
-  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+  ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
 
-  void Abort(AbortReason reason);
-  virtual bool Passed(bool status_ok);
+  void Abort(AbortReason reason) override;
+  bool Passed(bool status_ok) override;
 
-  const char* statement() const { return statement_; }
-  const RE* regex() const { return regex_; }
+  const char *statement() const { return statement_; }
   bool spawned() const { return spawned_; }
   void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
   int status() const { return status_; }
@@ -406,13 +423,15 @@
   // case of unexpected codes.
   void ReadAndInterpretStatusByte();
 
+  // Returns stderr output from the child process.
+  virtual std::string GetErrorLogs();
+
  private:
   // The textual content of the code this object is testing.  This class
   // doesn't own this string and should not attempt to delete it.
-  const char* const statement_;
-  // The regular expression which test output must match.  DeathTestImpl
-  // doesn't own this object and should not attempt to delete it.
-  const RE* const regex_;
+  const char *const statement_;
+  // A matcher that's expected to match the stderr output by the child process.
+  Matcher<const std::string &> matcher_;
   // True if the death test child process has been successfully spawned.
   bool spawned_;
   // The exit status of the child process.
@@ -449,15 +468,9 @@
     set_outcome(DIED);
   } else if (bytes_read == 1) {
     switch (flag) {
-      case kDeathTestReturned:
-        set_outcome(RETURNED);
-        break;
-      case kDeathTestThrew:
-        set_outcome(THREW);
-        break;
-      case kDeathTestLived:
-        set_outcome(LIVED);
-        break;
+      case kDeathTestReturned: set_outcome(RETURNED); break;
+      case kDeathTestThrew: set_outcome(THREW); break;
+      case kDeathTestLived: set_outcome(LIVED); break;
       case kDeathTestInternalError:
         FailFromInternalError(read_fd());  // Does not return.
         break;
@@ -474,6 +487,8 @@
   set_read_fd(-1);
 }
 
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
+
 // Signals that the death test code which should have exited, didn't.
 // Should be called only in a death test child process.
 // Writes a status byte to the child's status file descriptor, then
@@ -482,9 +497,11 @@
   // The parent process considers the death test to be a failure if
   // it finds any data in our pipe.  So, here we write a single flag byte
   // to the pipe, then exit.
-  const char status_ch =
-      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
-      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+  const char status_ch = reason == TEST_DID_NOT_DIE
+                             ? kDeathTestLived
+                             : reason == TEST_THREW_EXCEPTION
+                                   ? kDeathTestThrew
+                                   : kDeathTestReturned;
 
   GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
   // We are leaking the descriptor here because on some platforms (i.e.,
@@ -501,9 +518,9 @@
 // Returns an indented copy of stderr output for a death test.
 // This makes distinguishing death test output lines from regular log lines
 // much easier.
-static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+static ::std::string FormatDeathTestOutput(const ::std::string &output) {
   ::std::string ret;
-  for (size_t at = 0; ; ) {
+  for (size_t at = 0;;) {
     const size_t line_end = output.find('\n', at);
     ret += "[  DEATH   ] ";
     if (line_end == ::std::string::npos) {
@@ -527,22 +544,20 @@
 //             in the format specified by wait(2). On Windows, this is the
 //             value supplied to the ExitProcess() API or a numeric code
 //             of the exception that terminated the program.
-//   regex:    A regular expression object to be applied to
-//             the test's captured standard error output; the death test
-//             fails if it does not match.
+//   matcher_: A matcher that's expected to match the stderr output by the child
+//             process.
 //
 // Argument:
 //   status_ok: true if exit_status is acceptable in the context of
 //              this particular death test, which fails if it is false
 //
-// Returns true iff all of the above conditions are met.  Otherwise, the
-// first failing condition, in the order given above, is the one that is
+// Returns true if and only if all of the above conditions are met.  Otherwise,
+// the first failing condition, in the order given above, is the one that is
 // reported. Also sets the last death test message string.
 bool DeathTestImpl::Passed(bool status_ok) {
-  if (!spawned())
-    return false;
+  if (!spawned()) return false;
 
-  const std::string error_message = GetCapturedStderr();
+  const std::string error_message = GetErrorLogs();
 
   bool success = false;
   Message buffer;
@@ -551,30 +566,36 @@
   switch (outcome()) {
     case LIVED:
       buffer << "    Result: failed to die.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
       break;
     case THREW:
       buffer << "    Result: threw an exception.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
       break;
     case RETURNED:
       buffer << "    Result: illegal return in test statement.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
       break;
     case DIED:
       if (status_ok) {
-        const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
-        if (matched) {
+        if (matcher_.Matches(error_message)) {
           success = true;
         } else {
+          std::ostringstream stream;
+          matcher_.DescribeTo(&stream);
           buffer << "    Result: died but not with expected error.\n"
-                 << "  Expected: " << regex()->pattern() << "\n"
-                 << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+                 << "  Expected: " << stream.str() << "\n"
+                 << "Actual msg:\n"
+                 << FormatDeathTestOutput(error_message);
         }
       } else {
         buffer << "    Result: died but not with expected exit code:\n"
                << "            " << ExitSummary(status()) << "\n"
-               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+               << "Actual msg:\n"
+               << FormatDeathTestOutput(error_message);
       }
       break;
     case IN_PROGRESS:
@@ -587,7 +608,7 @@
   return success;
 }
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
 // WindowsDeathTest implements death tests on Windows. Due to the
 // specifics of starting new processes on Windows, death tests there are
 // always threadsafe, and Google Test considers the
@@ -618,11 +639,11 @@
 //
 class WindowsDeathTest : public DeathTestImpl {
  public:
-  WindowsDeathTest(const char* a_statement,
-                   const RE* a_regex,
-                   const char* file,
+  WindowsDeathTest(const char *a_statement,
+                   Matcher<const std::string &> matcher, const char *file,
                    int line)
-      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+      : DeathTestImpl(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
 
   // All of these virtual functions are inherited from DeathTest.
   virtual int Wait();
@@ -630,7 +651,7 @@
 
  private:
   // The name of the file in which the death test is located.
-  const char* const file_;
+  const char *const file_;
   // The line number on which the death test is located.
   const int line_;
   // Handle to the write end of the pipe to the child process.
@@ -648,21 +669,17 @@
 // status, or 0 if no child process exists.  As a side effect, sets the
 // outcome data member.
 int WindowsDeathTest::Wait() {
-  if (!spawned())
-    return 0;
+  if (!spawned()) return 0;
 
   // Wait until the child either signals that it has acquired the write end
   // of the pipe or it dies.
   const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
-  switch (::WaitForMultipleObjects(2,
-                                   wait_handles,
+  switch (::WaitForMultipleObjects(2, wait_handles,
                                    FALSE,  // Waits for any of the handles.
                                    INFINITE)) {
     case WAIT_OBJECT_0:
-    case WAIT_OBJECT_0 + 1:
-      break;
-    default:
-      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+    case WAIT_OBJECT_0 + 1: break;
+    default: GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
   }
 
   // The child has acquired the write end of the pipe or exited.
@@ -676,9 +693,8 @@
   // returns immediately if the child has already exited, regardless of
   // whether previous calls to WaitForMultipleObjects synchronized on this
   // handle or not.
-  GTEST_DEATH_TEST_CHECK_(
-      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
-                                             INFINITE));
+  GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+                          ::WaitForSingleObject(child_handle_.Get(), INFINITE));
   DWORD status_code;
   GTEST_DEATH_TEST_CHECK_(
       ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
@@ -693,13 +709,13 @@
 // --gtest_internal_run_death_test flags such that it knows to run the
 // current death test only.
 DeathTest::TestRole WindowsDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
+  const UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
       impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
+  const TestInfo *const info = impl->current_test_info();
   const int death_test_index = info->result()->death_test_count();
 
-  if (flag != NULL) {
+  if (flag != nullptr) {
     // ParseInternalRunDeathTestFlag() has performed all the necessary
     // processing.
     set_write_fd(flag->write_fd());
@@ -708,45 +724,43 @@
 
   // WindowsDeathTest uses an anonymous pipe to communicate results of
   // a death test.
-  SECURITY_ATTRIBUTES handles_are_inheritable = {
-    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+  SECURITY_ATTRIBUTES handles_are_inheritable = { sizeof(SECURITY_ATTRIBUTES),
+                                                  nullptr, TRUE };
   HANDLE read_handle, write_handle;
-  GTEST_DEATH_TEST_CHECK_(
-      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
-                   0)  // Default buffer size.
-      != FALSE);
-  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
-                                O_RDONLY));
+  GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+                                       &handles_are_inheritable,
+                                       0)  // Default buffer size.
+                          != FALSE);
+  set_read_fd(
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
   write_handle_.Reset(write_handle);
   event_handle_.Reset(::CreateEvent(
       &handles_are_inheritable,
-      TRUE,    // The event will automatically reset to non-signaled state.
-      FALSE,   // The initial state is non-signalled.
-      NULL));  // The even is unnamed.
-  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
-  const std::string filter_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" +
-      info->test_case_name() + "." + info->name();
+      TRUE,       // The event will automatically reset to non-signaled state.
+      FALSE,      // The initial state is non-signalled.
+      nullptr));  // The event is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  kFilterFlag + "=" + info->test_suite_name() +
+                                  "." + info->name();
   const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
-      "=" + file_ + "|" + StreamableToString(line_) + "|" +
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" +
+      file_ + "|" + StreamableToString(line_) + "|" +
       StreamableToString(death_test_index) + "|" +
       StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
       // size_t has the same width as pointers on both 32-bit and 64-bit
       // Windows platforms.
       // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
-      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
-      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+      StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
 
   char executable_path[_MAX_PATH + 1];  // NOLINT
-  GTEST_DEATH_TEST_CHECK_(
-      _MAX_PATH + 1 != ::GetModuleFileNameA(NULL,
-                                            executable_path,
-                                            _MAX_PATH));
+  GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
+                                                                executable_path,
+                                                                _MAX_PATH));
 
-  std::string command_line =
-      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
-      internal_flag + "\"";
+  std::string command_line = std::string(::GetCommandLineA()) + " " +
+                             filter_flag + " \"" + internal_flag + "\"";
 
   DeathTest::set_last_death_test_message("");
 
@@ -763,33 +777,279 @@
   startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
 
   PROCESS_INFORMATION process_info;
-  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
-      executable_path,
-      const_cast<char*>(command_line.c_str()),
-      NULL,   // Retuned process handle is not inheritable.
-      NULL,   // Retuned thread handle is not inheritable.
-      TRUE,   // Child inherits all inheritable handles (for write_handle_).
-      0x0,    // Default creation flags.
-      NULL,   // Inherit the parent's environment.
-      UnitTest::GetInstance()->original_working_dir(),
-      &startup_info,
-      &process_info) != FALSE);
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreateProcessA(
+          executable_path, const_cast<char *>(command_line.c_str()),
+          nullptr,  // Returned process handle is not inheritable.
+          nullptr,  // Returned thread handle is not inheritable.
+          TRUE,  // Child inherits all inheritable handles (for write_handle_).
+          0x0,   // Default creation flags.
+          nullptr,  // Inherit the parent's environment.
+          UnitTest::GetInstance()->original_working_dir(), &startup_info,
+          &process_info) != FALSE);
   child_handle_.Reset(process_info.hProcess);
   ::CloseHandle(process_info.hThread);
   set_spawned(true);
   return OVERSEE_TEST;
 }
-# else  // We are not on Windows.
+
+#elif GTEST_OS_FUCHSIA
+
+class FuchsiaDeathTest : public DeathTestImpl {
+ public:
+  FuchsiaDeathTest(const char *a_statement,
+                   Matcher<const std::string &> matcher, const char *file,
+                   int line)
+      : DeathTestImpl(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  int Wait() override;
+  TestRole AssumeRole() override;
+  std::string GetErrorLogs() override;
+
+ private:
+  // The name of the file in which the death test is located.
+  const char *const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // The stderr data captured by the child process.
+  std::string captured_stderr_;
+
+  zx::process child_process_;
+  zx::channel exception_channel_;
+  zx::socket stderr_socket_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  Arguments() { args_.push_back(nullptr); }
+
+  ~Arguments() {
+    for (std::vector<char *>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char *argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str> &arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end(); ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char *const *Argv() { return &args_[0]; }
+
+  int size() { return args_.size() - 1; }
+
+ private:
+  std::vector<char *> args_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int FuchsiaDeathTest::Wait() {
+  const int kProcessKey = 0;
+  const int kSocketKey = 1;
+  const int kExceptionKey = 2;
+
+  if (!spawned()) return 0;
+
+  // Create a port to wait for socket/task/exception events.
+  zx_status_t status_zx;
+  zx::port port;
+  status_zx = zx::port::create(0, &port);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for the child process to terminate.
+  status_zx = child_process_.wait_async(
+      port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for the socket to be readable or closed.
+  status_zx = stderr_socket_.wait_async(
+      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
+      ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for an exception.
+  status_zx = exception_channel_.wait_async(
+      port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  bool process_terminated = false;
+  bool socket_closed = false;
+  do {
+    zx_port_packet_t packet = {};
+    status_zx = port.wait(zx::time::infinite(), &packet);
+    GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+    if (packet.key == kExceptionKey) {
+      // Process encountered an exception. Kill it directly rather than
+      // letting other handlers process the event. We will get a kProcessKey
+      // event when the process actually terminates.
+      status_zx = child_process_.kill();
+      GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+    } else if (packet.key == kProcessKey) {
+      // Process terminated.
+      GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
+      GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED);
+      process_terminated = true;
+    } else if (packet.key == kSocketKey) {
+      GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
+      if (packet.signal.observed & ZX_SOCKET_READABLE) {
+        // Read data from the socket.
+        constexpr size_t kBufferSize = 1024;
+        do {
+          size_t old_length = captured_stderr_.length();
+          size_t bytes_read = 0;
+          captured_stderr_.resize(old_length + kBufferSize);
+          status_zx =
+              stderr_socket_.read(0, &captured_stderr_.front() + old_length,
+                                  kBufferSize, &bytes_read);
+          captured_stderr_.resize(old_length + bytes_read);
+        } while (status_zx == ZX_OK);
+        if (status_zx == ZX_ERR_PEER_CLOSED) {
+          socket_closed = true;
+        } else {
+          GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT);
+          status_zx = stderr_socket_.wait_async(
+              port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
+              ZX_WAIT_ASYNC_ONCE);
+          GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+        }
+      } else {
+        GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED);
+        socket_closed = true;
+      }
+    }
+  } while (!process_terminated && !socket_closed);
+
+  ReadAndInterpretStatusByte();
+
+  zx_info_process_t buffer;
+  status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer),
+                                      nullptr, nullptr);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  GTEST_DEATH_TEST_CHECK_(buffer.exited);
+  set_status(buffer.return_code);
+  return status();
+}
+
+// The AssumeRole process for a Fuchsia death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
+  const UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo *const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != nullptr) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(kFuchsiaReadPipeFd);
+    return EXECUTE_TEST;
+  }
+
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // Build the child process command line.
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  kFilterFlag + "=" + info->test_suite_name() +
+                                  "." + info->name();
+  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                    kInternalRunDeathTestFlag + "=" + file_ +
+                                    "|" + StreamableToString(line_) + "|" +
+                                    StreamableToString(death_test_index);
+  Arguments args;
+  args.AddArguments(GetInjectableArgvs());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  // Build the pipe for communication with the child.
+  zx_status_t status;
+  zx_handle_t child_pipe_handle;
+  int child_pipe_fd;
+  status = fdio_pipe_half(&child_pipe_fd, &child_pipe_handle);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+  set_read_fd(child_pipe_fd);
+
+  // Set the pipe handle for the child.
+  fdio_spawn_action_t spawn_actions[2] = {};
+  fdio_spawn_action_t *add_handle_action = &spawn_actions[0];
+  add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE;
+  add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd);
+  add_handle_action->h.handle = child_pipe_handle;
+
+  // Create a socket pair that will be used to receive the child process' stderr.
+  zx::socket stderr_producer_socket;
+  status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+  int stderr_producer_fd = -1;
+  status =
+      fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+
+  // Make the stderr socket nonblocking.
+  GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0);
+
+  fdio_spawn_action_t *add_stderr_action = &spawn_actions[1];
+  add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD;
+  add_stderr_action->fd.local_fd = stderr_producer_fd;
+  add_stderr_action->fd.target_fd = STDERR_FILENO;
+
+  // Create a child job.
+  zx_handle_t child_job = ZX_HANDLE_INVALID;
+  status = zx_job_create(zx_job_default(), 0, &child_job);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+  zx_policy_basic_t policy;
+  policy.condition = ZX_POL_NEW_ANY;
+  policy.policy = ZX_POL_ACTION_ALLOW;
+  status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+                             &policy, 1);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Create an exception channel attached to the |child_job|, to allow
+  // us to suppress the system default exception handler from firing.
+  status = zx_task_create_exception_channel(
+      child_job, 0, exception_channel_.reset_and_get_address());
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Spawn the child process.
+  status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+                          args.Argv(), nullptr, 2, spawn_actions,
+                          child_process_.reset_and_get_address(), nullptr);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
+
+#else  // We are neither on Windows, nor on Fuchsia.
 
 // ForkingDeathTest provides implementations for most of the abstract
 // methods of the DeathTest interface.  Only the AssumeRole method is
 // left undefined.
 class ForkingDeathTest : public DeathTestImpl {
  public:
-  ForkingDeathTest(const char* statement, const RE* regex);
+  ForkingDeathTest(const char *statement, Matcher<const std::string &> matcher);
 
   // All of these virtual functions are inherited from DeathTest.
-  virtual int Wait();
+  int Wait() override;
 
  protected:
   void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
@@ -800,16 +1060,15 @@
 };
 
 // Constructs a ForkingDeathTest.
-ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
-    : DeathTestImpl(a_statement, a_regex),
-      child_pid_(-1) {}
+ForkingDeathTest::ForkingDeathTest(const char *a_statement,
+                                   Matcher<const std::string &> matcher)
+    : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {}
 
 // Waits for the child in a death test to exit, returning its exit
 // status, or 0 if no child process exists.  As a side effect, sets the
 // outcome data member.
 int ForkingDeathTest::Wait() {
-  if (!spawned())
-    return 0;
+  if (!spawned()) return 0;
 
   ReadAndInterpretStatusByte();
 
@@ -823,9 +1082,9 @@
 // in the child process.
 class NoExecDeathTest : public ForkingDeathTest {
  public:
-  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
-      ForkingDeathTest(a_statement, a_regex) { }
-  virtual TestRole AssumeRole();
+  NoExecDeathTest(const char *a_statement, Matcher<const std::string &> matcher)
+      : ForkingDeathTest(a_statement, std::move(matcher)) {}
+  TestRole AssumeRole() override;
 };
 
 // The AssumeRole process for a fork-and-run death test.  It implements a
@@ -878,23 +1137,24 @@
 // only this specific death test to be run.
 class ExecDeathTest : public ForkingDeathTest {
  public:
-  ExecDeathTest(const char* a_statement, const RE* a_regex,
-                const char* file, int line) :
-      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
-  virtual TestRole AssumeRole();
+  ExecDeathTest(const char *a_statement, Matcher<const std::string &> matcher,
+                const char *file, int line)
+      : ForkingDeathTest(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
+  TestRole AssumeRole() override;
+
  private:
-  static ::std::vector<testing::internal::string>
-  GetArgvsForDeathTestChildProcess() {
-    ::std::vector<testing::internal::string> args = GetInjectableArgvs();
-#  if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
-    ::std::vector<testing::internal::string> extra_args =
+  static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
+    ::std::vector<std::string> args = GetInjectableArgvs();
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    ::std::vector<std::string> extra_args =
         GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
     args.insert(args.end(), extra_args.begin(), extra_args.end());
-#  endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
     return args;
   }
   // The name of the file in which the death test is located.
-  const char* const file_;
+  const char *const file_;
   // The line number on which the death test is located.
   const int line_;
 };
@@ -902,74 +1162,69 @@
 // Utility class for accumulating command-line arguments.
 class Arguments {
  public:
-  Arguments() {
-    args_.push_back(NULL);
-  }
+  Arguments() { args_.push_back(nullptr); }
 
   ~Arguments() {
-    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+    for (std::vector<char *>::iterator i = args_.begin(); i != args_.end();
          ++i) {
       free(*i);
     }
   }
-  void AddArgument(const char* argument) {
+  void AddArgument(const char *argument) {
     args_.insert(args_.end() - 1, posix::StrDup(argument));
   }
 
   template <typename Str>
-  void AddArguments(const ::std::vector<Str>& arguments) {
+  void AddArguments(const ::std::vector<Str> &arguments) {
     for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end();
-         ++i) {
+         i != arguments.end(); ++i) {
       args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
     }
   }
-  char* const* Argv() {
-    return &args_[0];
-  }
+  char *const *Argv() { return &args_[0]; }
 
  private:
-  std::vector<char*> args_;
+  std::vector<char *> args_;
 };
 
 // A struct that encompasses the arguments to the child process of a
 // threadsafe-style death test process.
 struct ExecDeathTestArgs {
-  char* const* argv;  // Command-line arguments for the child's call to exec
+  char *const *argv;  // Command-line arguments for the child's call to exec
   int close_fd;       // File descriptor to close; the read end of a pipe
 };
 
-#  if GTEST_OS_MAC
-inline char** GetEnviron() {
+#if GTEST_OS_MAC
+inline char **GetEnviron() {
   // When Google Test is built as a framework on MacOS X, the environ variable
   // is unavailable. Apple's documentation (man environ) recommends using
   // _NSGetEnviron() instead.
   return *_NSGetEnviron();
 }
-#  else
+#else
 // Some POSIX platforms expect you to declare environ. extern "C" makes
 // it reside in the global namespace.
-extern "C" char** environ;
-inline char** GetEnviron() { return environ; }
-#  endif  // GTEST_OS_MAC
+extern "C" char **environ;
+inline char **GetEnviron() { return environ; }
+#endif  // GTEST_OS_MAC
 
-#  if !GTEST_OS_QNX
+#if !GTEST_OS_QNX
 // The main function for a threadsafe-style death test child process.
 // This function is called in a clone()-ed process and thus must avoid
 // any potentially unsafe operations like malloc or libc functions.
-static int ExecDeathTestChildMain(void* child_arg) {
-  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+static int ExecDeathTestChildMain(void *child_arg) {
+  ExecDeathTestArgs *const args = static_cast<ExecDeathTestArgs *>(child_arg);
   GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
 
   // We need to execute the test program in the same environment where
   // it was originally invoked.  Therefore we change to the original
   // working directory first.
-  const char* const original_dir =
+  const char *const original_dir =
       UnitTest::GetInstance()->original_working_dir();
   // We can safely call chdir() as it's a direct system call.
   if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
-                   GetLastErrnoDescription());
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
     return EXIT_FAILURE;
   }
 
@@ -980,12 +1235,12 @@
   // one path separator.
   execve(args->argv[0], args->argv, GetEnviron());
   DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
-                 original_dir + " failed: " +
-                 GetLastErrnoDescription());
+                 original_dir + " failed: " + GetLastErrnoDescription());
   return EXIT_FAILURE;
 }
-#  endif  // !GTEST_OS_QNX
+#endif  // !GTEST_OS_QNX
 
+#if GTEST_HAS_CLONE
 // Two utility routines that together determine the direction the stack
 // grows.
 // This could be accomplished more elegantly by a single recursive
@@ -995,20 +1250,26 @@
 // GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
 // StackLowerThanAddress into StackGrowsDown, which then doesn't give
 // correct answer.
-void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_;
-void StackLowerThanAddress(const void* ptr, bool* result) {
+static void StackLowerThanAddress(const void *ptr,
+                                  bool *result) GTEST_NO_INLINE_;
+// HWAddressSanitizer add a random tag to the MSB of the local variable address,
+// making comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static void StackLowerThanAddress(const void *ptr, bool *result) {
   int dummy;
   *result = (&dummy < ptr);
 }
 
 // Make sure AddressSanitizer does not tamper with the stack here.
 GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-bool StackGrowsDown() {
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static bool StackGrowsDown() {
   int dummy;
   bool result;
   StackLowerThanAddress(&dummy, &result);
   return result;
 }
+#endif  // GTEST_HAS_CLONE
 
 // Spawns a child process with the same executable as the current process in
 // a thread-safe manner and instructs it to run the death test.  The
@@ -1017,11 +1278,11 @@
 // fork supports only single-threaded environments, so this function uses
 // spawn(2) there instead.  The function dies with an error message if
 // anything goes wrong.
-static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
   ExecDeathTestArgs args = { argv, close_fd };
   pid_t child_pid = -1;
 
-#  if GTEST_OS_QNX
+#if GTEST_OS_QNX
   // Obtains the current directory and sets it to be closed in the child
   // process.
   const int cwd_fd = open(".", O_RDONLY);
@@ -1030,29 +1291,30 @@
   // We need to execute the test program in the same environment where
   // it was originally invoked.  Therefore we change to the original
   // working directory first.
-  const char* const original_dir =
+  const char *const original_dir =
       UnitTest::GetInstance()->original_working_dir();
   // We can safely call chdir() as it's a direct system call.
   if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
-                   GetLastErrnoDescription());
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
     return EXIT_FAILURE;
   }
 
   int fd_flags;
   // Set close_fd to be closed after spawn.
   GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
-                                        fd_flags | FD_CLOEXEC));
-  struct inheritance inherit = {0};
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = { 0 };
   // spawn is a system call.
-  child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
+  child_pid =
+      spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron());
   // Restores the current working directory.
   GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
   GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
 
-#  else   // GTEST_OS_QNX
-#   if GTEST_OS_LINUX
+#else  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
   // When a SIGPROF signal is received while fork() or clone() are executing,
   // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
   // it after the call to fork()/clone() is complete.
@@ -1061,18 +1323,18 @@
   memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
   sigemptyset(&ignore_sigprof_action.sa_mask);
   ignore_sigprof_action.sa_handler = SIG_IGN;
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
-      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
-#   endif  // GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif  // GTEST_OS_LINUX
 
-#   if GTEST_HAS_CLONE
+#if GTEST_HAS_CLONE
   const bool use_fork = GTEST_FLAG(death_test_use_fork);
 
   if (!use_fork) {
     static const bool stack_grows_down = StackGrowsDown();
-    const size_t stack_size = getpagesize();
+    const auto stack_size = static_cast<size_t>(getpagesize() * 2);
     // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
-    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+    void *const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
                              MAP_ANON | MAP_PRIVATE, -1, 0);
     GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
 
@@ -1083,29 +1345,30 @@
     // than 64.  We assume stack and stack_size already have alignment of
     // kMaxStackAlignment.
     const size_t kMaxStackAlignment = 64;
-    void* const stack_top =
-        static_cast<char*>(stack) +
-            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
-    GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
-        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+    void *const stack_top =
+        static_cast<char *>(stack) +
+        (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(
+        static_cast<size_t>(stack_size) > kMaxStackAlignment &&
+        reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
 
     child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
 
     GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
   }
-#   else
+#else
   const bool use_fork = true;
-#   endif  // GTEST_HAS_CLONE
+#endif  // GTEST_HAS_CLONE
 
   if (use_fork && (child_pid = fork()) == 0) {
-      ExecDeathTestChildMain(&args);
-      _exit(0);
+    ExecDeathTestChildMain(&args);
+    _exit(0);
   }
-#  endif  // GTEST_OS_QNX
-#  if GTEST_OS_LINUX
+#endif  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
   GTEST_DEATH_TEST_CHECK_SYSCALL_(
-      sigaction(SIGPROF, &saved_sigprof_action, NULL));
-#  endif  // GTEST_OS_LINUX
+      sigaction(SIGPROF, &saved_sigprof_action, nullptr));
+#endif  // GTEST_OS_LINUX
 
   GTEST_DEATH_TEST_CHECK_(child_pid != -1);
   return child_pid;
@@ -1116,13 +1379,13 @@
 // and --gtest_internal_run_death_test flags to cause only the current
 // death test to be re-run.
 DeathTest::TestRole ExecDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
+  const UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
       impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
+  const TestInfo *const info = impl->current_test_info();
   const int death_test_index = info->result()->death_test_count();
 
-  if (flag != NULL) {
+  if (flag != nullptr) {
     set_write_fd(flag->write_fd());
     return EXECUTE_TEST;
   }
@@ -1133,14 +1396,14 @@
   // it be closed when the child process does an exec:
   GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
 
-  const std::string filter_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
-      + info->test_case_name() + "." + info->name();
-  const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
-      + file_ + "|" + StreamableToString(line_) + "|"
-      + StreamableToString(death_test_index) + "|"
-      + StreamableToString(pipe_fd[1]);
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  kFilterFlag + "=" + info->test_suite_name() +
+                                  "." + info->name();
+  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                    kInternalRunDeathTestFlag + "=" + file_ +
+                                    "|" + StreamableToString(line_) + "|" +
+                                    StreamableToString(death_test_index) + "|" +
+                                    StreamableToString(pipe_fd[1]);
   Arguments args;
   args.AddArguments(GetArgvsForDeathTestChildProcess());
   args.AddArgument(filter_flag.c_str());
@@ -1161,89 +1424,94 @@
   return OVERSEE_TEST;
 }
 
-# endif  // !GTEST_OS_WINDOWS
+#endif  // !GTEST_OS_WINDOWS
 
 // Creates a concrete DeathTest-derived class that depends on the
 // --gtest_death_test_style flag, and sets the pointer pointed to
 // by the "test" argument to its address.  If the test should be
 // skipped, sets that pointer to NULL.  Returns true, unless the
 // flag is set to an invalid value.
-bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
-                                     const char* file, int line,
-                                     DeathTest** test) {
-  UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
+bool DefaultDeathTestFactory::Create(const char *statement,
+                                     Matcher<const std::string &> matcher,
+                                     const char *file, int line,
+                                     DeathTest **test) {
+  UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
       impl->internal_run_death_test_flag();
-  const int death_test_index = impl->current_test_info()
-      ->increment_death_test_count();
+  const int death_test_index =
+      impl->current_test_info()->increment_death_test_count();
 
-  if (flag != NULL) {
+  if (flag != nullptr) {
     if (death_test_index > flag->index()) {
       DeathTest::set_last_death_test_message(
-          "Death test count (" + StreamableToString(death_test_index)
-          + ") somehow exceeded expected maximum ("
-          + StreamableToString(flag->index()) + ")");
+          "Death test count (" + StreamableToString(death_test_index) +
+          ") somehow exceeded expected maximum (" +
+          StreamableToString(flag->index()) + ")");
       return false;
     }
 
     if (!(flag->file() == file && flag->line() == line &&
           flag->index() == death_test_index)) {
-      *test = NULL;
+      *test = nullptr;
       return true;
     }
   }
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
 
   if (GTEST_FLAG(death_test_style) == "threadsafe" ||
       GTEST_FLAG(death_test_style) == "fast") {
-    *test = new WindowsDeathTest(statement, regex, file, line);
+    *test = new WindowsDeathTest(statement, std::move(matcher), file, line);
   }
 
-# else
+#elif GTEST_OS_FUCHSIA
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
+  }
+
+#else
 
   if (GTEST_FLAG(death_test_style) == "threadsafe") {
-    *test = new ExecDeathTest(statement, regex, file, line);
+    *test = new ExecDeathTest(statement, std::move(matcher), file, line);
   } else if (GTEST_FLAG(death_test_style) == "fast") {
-    *test = new NoExecDeathTest(statement, regex);
+    *test = new NoExecDeathTest(statement, std::move(matcher));
   }
 
-# endif  // GTEST_OS_WINDOWS
+#endif  // GTEST_OS_WINDOWS
 
   else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
-    DeathTest::set_last_death_test_message(
-        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
-        + "\" encountered");
+    DeathTest::set_last_death_test_message("Unknown death test style \"" +
+                                           GTEST_FLAG(death_test_style) +
+                                           "\" encountered");
     return false;
   }
 
   return true;
 }
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
 // Recreates the pipe and event handles from the provided parameters,
 // signals the event, and returns a file descriptor wrapped around the pipe
 // handle. This function is called in the child process only.
-int GetStatusFileDescriptor(unsigned int parent_process_id,
-                            size_t write_handle_as_size_t,
-                            size_t event_handle_as_size_t) {
+static int GetStatusFileDescriptor(unsigned int parent_process_id,
+                                   size_t write_handle_as_size_t,
+                                   size_t event_handle_as_size_t) {
   AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
-                                                   FALSE,  // Non-inheritable.
-                                                   parent_process_id));
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
   if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
     DeathTestAbort("Unable to open parent process " +
                    StreamableToString(parent_process_id));
   }
 
-  // TODO(vladl@google.com): Replace the following check with a
-  // compile-time assertion when available.
   GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
 
-  const HANDLE write_handle =
-      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
   HANDLE dup_write_handle;
 
-  // The newly initialized handle is accessible only in in the parent
+  // The newly initialized handle is accessible only in the parent
   // process. To obtain one accessible within the child, we need to use
   // DuplicateHandle.
   if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
@@ -1262,9 +1530,7 @@
   HANDLE dup_event_handle;
 
   if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
-                         ::GetCurrentProcess(), &dup_event_handle,
-                         0x0,
-                         FALSE,
+                         ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
                          DUPLICATE_SAME_ACCESS)) {
     DeathTestAbort("Unable to duplicate the event handle " +
                    StreamableToString(event_handle_as_size_t) +
@@ -1286,13 +1552,13 @@
 
   return write_fd;
 }
-# endif  // GTEST_OS_WINDOWS
+#endif  // GTEST_OS_WINDOWS
 
 // Returns a newly created InternalRunDeathTestFlag object with fields
 // initialized from the GTEST_FLAG(internal_run_death_test) flag if
 // the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
-  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
+InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
 
   // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
   // can use it here.
@@ -1302,35 +1568,41 @@
   SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
   int write_fd = -1;
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
 
   unsigned int parent_process_id = 0;
   size_t write_handle_as_size_t = 0;
   size_t event_handle_as_size_t = 0;
 
-  if (fields.size() != 6
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &parent_process_id)
-      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
-      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+  if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &parent_process_id) ||
+      !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+      !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
     DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
                    GTEST_FLAG(internal_run_death_test));
   }
-  write_fd = GetStatusFileDescriptor(parent_process_id,
-                                     write_handle_as_size_t,
+  write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
                                      event_handle_as_size_t);
-# else
 
-  if (fields.size() != 4
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &write_fd)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
-        + GTEST_FLAG(internal_run_death_test));
+#elif GTEST_OS_FUCHSIA
+
+  if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
   }
 
-# endif  // GTEST_OS_WINDOWS
+#else
+
+  if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+
+#endif  // GTEST_OS_WINDOWS
 
   return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
 }
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-filepath.cc b/libaom/third_party/googletest/src/googletest/src/gtest-filepath.cc
index 0292dc1..f9427e0 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest-filepath.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-filepath.cc
@@ -26,40 +26,35 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: keith.ray@gmail.com (Keith Ray)
 
-#include "gtest/gtest-message.h"
 #include "gtest/internal/gtest-filepath.h"
-#include "gtest/internal/gtest-port.h"
 
 #include <stdlib.h>
+#include "gtest/internal/gtest-port.h"
+#include "gtest/gtest-message.h"
 
 #if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>
+#include <windows.h>
 #elif GTEST_OS_WINDOWS
-# include <direct.h>
-# include <io.h>
-#elif GTEST_OS_SYMBIAN
-// Symbian OpenC has PATH_MAX in sys/syslimits.h
-# include <sys/syslimits.h>
+#include <direct.h>
+#include <io.h>
 #else
-# include <limits.h>
-# include <climits>  // Some Linux distributions define PATH_MAX here.
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-#if GTEST_OS_WINDOWS
-# define GTEST_PATH_MAX_ _MAX_PATH
-#elif defined(PATH_MAX)
-# define GTEST_PATH_MAX_ PATH_MAX
-#elif defined(_XOPEN_PATH_MAX)
-# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
-#else
-# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
-#endif  // GTEST_OS_WINDOWS
+#include <limits.h>
+#include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif              // GTEST_OS_WINDOWS_MOBILE
 
 #include "gtest/internal/gtest-string.h"
 
+#if GTEST_OS_WINDOWS
+#define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+#define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
 namespace testing {
 namespace internal {
 
@@ -71,16 +66,16 @@
 const char kPathSeparator = '\\';
 const char kAlternatePathSeparator = '/';
 const char kAlternatePathSeparatorString[] = "/";
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
 // Windows CE doesn't have a current directory. You should not use
 // the current directory in tests on Windows CE, but this at least
 // provides a reasonable fallback.
 const char kCurrentDirectoryString[] = "\\";
 // Windows CE doesn't define INVALID_FILE_ATTRIBUTES
 const DWORD kInvalidFileAttributes = 0xffffffff;
-# else
+#else
 const char kCurrentDirectoryString[] = ".\\";
-# endif  // GTEST_OS_WINDOWS_MOBILE
+#endif  // GTEST_OS_WINDOWS_MOBILE
 #else
 const char kPathSeparator = '/';
 const char kCurrentDirectoryString[] = "./";
@@ -97,23 +92,24 @@
 
 // Returns the current working directory, or "" if unsuccessful.
 FilePath FilePath::GetCurrentDir() {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
-  // Windows CE doesn't have a current directory, so we just return
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32
+  // These platforms do not have a current directory, so we just return
   // something reasonable.
   return FilePath(kCurrentDirectoryString);
 #elif GTEST_OS_WINDOWS
   char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
-  return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
+  return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
 #else
   char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
-  char* result = getcwd(cwd, sizeof(cwd));
-# if GTEST_OS_NACL
+  char *result = getcwd(cwd, sizeof(cwd));
+#if GTEST_OS_NACL
   // getcwd will likely fail in NaCl due to the sandbox, so return something
   // reasonable. The user may have provided a shim implementation for getcwd,
   // however, so fallback only when failure is detected.
-  return FilePath(result == NULL ? kCurrentDirectoryString : cwd);
-# endif  // GTEST_OS_NACL
-  return FilePath(result == NULL ? "" : cwd);
+  return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
+#endif  // GTEST_OS_NACL
+  return FilePath(result == nullptr ? "" : cwd);
 #endif  // GTEST_OS_WINDOWS_MOBILE
 }
 
@@ -121,25 +117,25 @@
 // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
 // FilePath("dir/file"). If a case-insensitive extension is not
 // found, returns a copy of the original FilePath.
-FilePath FilePath::RemoveExtension(const char* extension) const {
+FilePath FilePath::RemoveExtension(const char *extension) const {
   const std::string dot_extension = std::string(".") + extension;
   if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
-    return FilePath(pathname_.substr(
-        0, pathname_.length() - dot_extension.length()));
+    return FilePath(
+        pathname_.substr(0, pathname_.length() - dot_extension.length()));
   }
   return *this;
 }
 
-// Returns a pointer to the last occurence of a valid path separator in
+// Returns a pointer to the last occurrence of a valid path separator in
 // the FilePath. On Windows, for example, both '/' and '\' are valid path
 // separators. Returns NULL if no path separator was found.
-const char* FilePath::FindLastPathSeparator() const {
-  const char* const last_sep = strrchr(c_str(), kPathSeparator);
+const char *FilePath::FindLastPathSeparator() const {
+  const char *const last_sep = strrchr(c_str(), kPathSeparator);
 #if GTEST_HAS_ALT_PATH_SEP_
-  const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
+  const char *const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
   // Comparing two pointers of which only one is NULL is undefined.
-  if (last_alt_sep != NULL &&
-      (last_sep == NULL || last_alt_sep > last_sep)) {
+  if (last_alt_sep != nullptr &&
+      (last_sep == nullptr || last_alt_sep > last_sep)) {
     return last_alt_sep;
   }
 #endif
@@ -153,7 +149,7 @@
 // returns an empty FilePath ("").
 // On Windows platform, '\' is the path separator, otherwise it is '/'.
 FilePath FilePath::RemoveDirectoryName() const {
-  const char* const last_sep = FindLastPathSeparator();
+  const char *const last_sep = FindLastPathSeparator();
   return last_sep ? FilePath(last_sep + 1) : *this;
 }
 
@@ -164,10 +160,10 @@
 // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
 // On Windows platform, '\' is the path separator, otherwise it is '/'.
 FilePath FilePath::RemoveFileName() const {
-  const char* const last_sep = FindLastPathSeparator();
+  const char *const last_sep = FindLastPathSeparator();
   std::string dir;
   if (last_sep) {
-    dir = std::string(c_str(), last_sep + 1 - c_str());
+    dir = std::string(c_str(), static_cast<size_t>(last_sep + 1 - c_str()));
   } else {
     dir = kCurrentDirectoryString;
   }
@@ -180,26 +176,24 @@
 // extension = "xml", returns "dir/test.xml". If number is greater
 // than zero (e.g., 12), returns "dir/test_12.xml".
 // On Windows platform, uses \ as the separator rather than /.
-FilePath FilePath::MakeFileName(const FilePath& directory,
-                                const FilePath& base_name,
-                                int number,
-                                const char* extension) {
+FilePath FilePath::MakeFileName(const FilePath &directory,
+                                const FilePath &base_name, int number,
+                                const char *extension) {
   std::string file;
   if (number == 0) {
     file = base_name.string() + "." + extension;
   } else {
-    file = base_name.string() + "_" + StreamableToString(number)
-        + "." + extension;
+    file =
+        base_name.string() + "_" + StreamableToString(number) + "." + extension;
   }
   return ConcatPaths(directory, FilePath(file));
 }
 
 // Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
 // On Windows, uses \ as the separator rather than /.
-FilePath FilePath::ConcatPaths(const FilePath& directory,
-                               const FilePath& relative_path) {
-  if (directory.IsEmpty())
-    return relative_path;
+FilePath FilePath::ConcatPaths(const FilePath &directory,
+                               const FilePath &relative_path) {
+  if (directory.IsEmpty()) return relative_path;
   const FilePath dir(directory.RemoveTrailingPathSeparator());
   return FilePath(dir.string() + kPathSeparator + relative_path.string());
 }
@@ -210,7 +204,7 @@
 #if GTEST_OS_WINDOWS_MOBILE
   LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
   const DWORD attributes = GetFileAttributes(unicode);
-  delete [] unicode;
+  delete[] unicode;
   return attributes != kInvalidFileAttributes;
 #else
   posix::StatStruct file_stat;
@@ -225,24 +219,24 @@
 #if GTEST_OS_WINDOWS
   // Don't strip off trailing separator if path is a root directory on
   // Windows (like "C:\\").
-  const FilePath& path(IsRootDirectory() ? *this :
-                                           RemoveTrailingPathSeparator());
+  const FilePath &path(IsRootDirectory() ? *this
+                                         : RemoveTrailingPathSeparator());
 #else
-  const FilePath& path(*this);
+  const FilePath &path(*this);
 #endif
 
 #if GTEST_OS_WINDOWS_MOBILE
   LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
   const DWORD attributes = GetFileAttributes(unicode);
-  delete [] unicode;
+  delete[] unicode;
   if ((attributes != kInvalidFileAttributes) &&
       (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
     result = true;
   }
 #else
   posix::StatStruct file_stat;
-  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
-      posix::IsDir(file_stat);
+  result =
+      posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
 #endif  // GTEST_OS_WINDOWS_MOBILE
 
   return result;
@@ -252,9 +246,6 @@
 // root directory per disk drive.)
 bool FilePath::IsRootDirectory() const {
 #if GTEST_OS_WINDOWS
-  // TODO(wan@google.com): on Windows a network share like
-  // \\server\share can be a root directory, although it cannot be the
-  // current directory.  Handle this properly.
   return pathname_.length() == 3 && IsAbsolutePath();
 #else
   return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
@@ -263,13 +254,12 @@
 
 // Returns true if pathname describes an absolute path.
 bool FilePath::IsAbsolutePath() const {
-  const char* const name = pathname_.c_str();
+  const char *const name = pathname_.c_str();
 #if GTEST_OS_WINDOWS
   return pathname_.length() >= 3 &&
-     ((name[0] >= 'a' && name[0] <= 'z') ||
-      (name[0] >= 'A' && name[0] <= 'Z')) &&
-     name[1] == ':' &&
-     IsPathSeparator(name[2]);
+         ((name[0] >= 'a' && name[0] <= 'z') ||
+          (name[0] >= 'A' && name[0] <= 'Z')) &&
+         name[1] == ':' && IsPathSeparator(name[2]);
 #else
   return IsPathSeparator(name[0]);
 #endif
@@ -283,9 +273,9 @@
 // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
 // There could be a race condition if two or more processes are calling this
 // function at the same time -- they could both pick the same filename.
-FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
-                                          const FilePath& base_name,
-                                          const char* extension) {
+FilePath FilePath::GenerateUniqueFileName(const FilePath &directory,
+                                          const FilePath &base_name,
+                                          const char *extension) {
   FilePath full_pathname;
   int number = 0;
   do {
@@ -326,10 +316,13 @@
 #if GTEST_OS_WINDOWS_MOBILE
   FilePath removed_sep(this->RemoveTrailingPathSeparator());
   LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
-  int result = CreateDirectory(unicode, NULL) ? 0 : -1;
-  delete [] unicode;
+  int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
+  delete[] unicode;
 #elif GTEST_OS_WINDOWS
   int result = _mkdir(pathname_.c_str());
+#elif GTEST_OS_ESP8266
+  // do nothing
+  int result = 0;
 #else
   int result = mkdir(pathname_.c_str(), 0777);
 #endif  // GTEST_OS_WINDOWS_MOBILE
@@ -344,23 +337,21 @@
 // name, otherwise return the name string unmodified.
 // On Windows platform, uses \ as the separator, other platforms use /.
 FilePath FilePath::RemoveTrailingPathSeparator() const {
-  return IsDirectory()
-      ? FilePath(pathname_.substr(0, pathname_.length() - 1))
-      : *this;
+  return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+                       : *this;
 }
 
 // Removes any redundant separators that might be in the pathname.
 // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
 // redundancies that might be in a pathname involving "." or "..".
-// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share).
 void FilePath::Normalize() {
-  if (pathname_.c_str() == NULL) {
+  if (pathname_.c_str() == nullptr) {
     pathname_ = "";
     return;
   }
-  const char* src = pathname_.c_str();
-  char* const dest = new char[pathname_.length() + 1];
-  char* dest_ptr = dest;
+  const char *src = pathname_.c_str();
+  char *const dest = new char[pathname_.length() + 1];
+  char *dest_ptr = dest;
   memset(dest_ptr, 0, pathname_.length() + 1);
 
   while (*src != '\0') {
@@ -373,8 +364,7 @@
         *dest_ptr = kPathSeparator;
       }
 #endif
-      while (IsPathSeparator(*src))
-        src++;
+      while (IsPathSeparator(*src)) src++;
     }
     dest_ptr++;
   }
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-internal-inl.h b/libaom/third_party/googletest/src/googletest/src/gtest-internal-inl.h
index ed8a682..16d8cde 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest-internal-inl.h
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-internal-inl.h
@@ -27,49 +27,43 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-// Utility functions and classes used by the Google C++ testing framework.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
+// Utility functions and classes used by the Google C++ testing framework.//
 // This file contains purely Google Test's internal implementation.  Please
 // DO NOT #INCLUDE IT IN A USER PROGRAM.
 
 #ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
 #define GTEST_SRC_GTEST_INTERNAL_INL_H_
 
-// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
-// part of Google Test's implementation; otherwise it's undefined.
-#if !GTEST_IMPLEMENTATION_
-// If this file is included from the user's code, just say no.
-# error "gtest-internal-inl.h is part of Google Test's internal implementation."
-# error "It must not be included except by Google Test itself."
-#endif  // GTEST_IMPLEMENTATION_
-
 #ifndef _WIN32_WCE
-# include <errno.h>
+#include <errno.h>
 #endif  // !_WIN32_WCE
 #include <stddef.h>
 #include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
 #include <string.h>  // For memmove.
 
 #include <algorithm>
+#include <cstdint>
+#include <memory>
 #include <string>
 #include <vector>
 
 #include "gtest/internal/gtest-port.h"
 
 #if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
+#include <arpa/inet.h>  // NOLINT
+#include <netdb.h>      // NOLINT
 #endif
 
 #if GTEST_OS_WINDOWS
-# include <windows.h>  // NOLINT
-#endif  // GTEST_OS_WINDOWS
+#include <windows.h>  // NOLINT
+#endif                // GTEST_OS_WINDOWS
 
-#include "gtest/gtest.h"  // NOLINT
+#include "gtest/gtest.h"
 #include "gtest/gtest-spi.h"
 
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
 namespace testing {
 
 // Declares the flags.
@@ -94,6 +88,7 @@
 const char kListTestsFlag[] = "list_tests";
 const char kOutputFlag[] = "output";
 const char kPrintTimeFlag[] = "print_time";
+const char kPrintUTF8Flag[] = "print_utf8";
 const char kRandomSeedFlag[] = "random_seed";
 const char kRepeatFlag[] = "repeat";
 const char kShuffleFlag[] = "shuffle";
@@ -105,14 +100,14 @@
 // A valid random seed must be in [1, kMaxRandomSeed].
 const int kMaxRandomSeed = 99999;
 
-// g_help_flag is true iff the --help flag or an equivalent form is
-// specified on the command line.
+// g_help_flag is true if and only if the --help flag or an equivalent form
+// is specified on the command line.
 GTEST_API_ extern bool g_help_flag;
 
 // Returns the current time in milliseconds.
 GTEST_API_ TimeInMillis GetTimeInMillis();
 
-// Returns true iff Google Test should use colors in the output.
+// Returns true if and only if Google Test should use colors in the output.
 GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
 
 // Formats the given time in milliseconds as seconds.
@@ -128,21 +123,22 @@
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(
-    const char* str, const char* flag, Int32* value);
+GTEST_API_ bool ParseInt32Flag(const char *str, const char *flag,
+                               int32_t *value);
 
 // Returns a random seed in range [1, kMaxRandomSeed] based on the
 // given --gtest_random_seed flag value.
-inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
-  const unsigned int raw_seed = (random_seed_flag == 0) ?
-      static_cast<unsigned int>(GetTimeInMillis()) :
-      static_cast<unsigned int>(random_seed_flag);
+inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
+  const unsigned int raw_seed =
+      (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
+                              : static_cast<unsigned int>(random_seed_flag);
 
   // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
   // it's easy to type.
   const int normalized_seed =
       static_cast<int>((raw_seed - 1U) %
-                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+                       static_cast<unsigned int>(kMaxRandomSeed)) +
+      1;
   return normalized_seed;
 }
 
@@ -174,6 +170,7 @@
     list_tests_ = GTEST_FLAG(list_tests);
     output_ = GTEST_FLAG(output);
     print_time_ = GTEST_FLAG(print_time);
+    print_utf8_ = GTEST_FLAG(print_utf8);
     random_seed_ = GTEST_FLAG(random_seed);
     repeat_ = GTEST_FLAG(repeat);
     shuffle_ = GTEST_FLAG(shuffle);
@@ -195,6 +192,7 @@
     GTEST_FLAG(list_tests) = list_tests_;
     GTEST_FLAG(output) = output_;
     GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(print_utf8) = print_utf8_;
     GTEST_FLAG(random_seed) = random_seed_;
     GTEST_FLAG(repeat) = repeat_;
     GTEST_FLAG(shuffle) = shuffle_;
@@ -216,10 +214,11 @@
   bool list_tests_;
   std::string output_;
   bool print_time_;
-  internal::Int32 random_seed_;
-  internal::Int32 repeat_;
+  bool print_utf8_;
+  int32_t random_seed_;
+  int32_t repeat_;
   bool shuffle_;
-  internal::Int32 stack_trace_depth_;
+  int32_t stack_trace_depth_;
   std::string stream_result_to_;
   bool throw_on_failure_;
 } GTEST_ATTRIBUTE_UNUSED_;
@@ -230,11 +229,11 @@
 // If the code_point is not a valid Unicode code point
 // (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
 // to "(Invalid Unicode 0xXXXXXXXX)".
-GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
+GTEST_API_ std::string CodePointToUtf8(uint32_t code_point);
 
 // Converts a wide string to a narrow string in UTF-8 encoding.
 // The wide string is assumed to have the following encoding:
-//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
 //   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
 // Parameter str points to a null-terminated wide string.
 // Parameter num_chars may additionally limit the number
@@ -245,7 +244,7 @@
 // as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
 // and contains invalid UTF-16 surrogate pairs, values in those pairs
 // will be encoded as individual Unicode characters from Basic Normal Plane.
-GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+GTEST_API_ std::string WideStringToUtf8(const wchar_t *str, int num_chars);
 
 // Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
 // if the variable is present. If a file already exists at this location, this
@@ -259,49 +258,49 @@
 // an error and exits. If in_subprocess_for_death_test, sharding is
 // disabled because it must only be applied to the original test
 // process. Otherwise, we could filter out death tests we intended to execute.
-GTEST_API_ bool ShouldShard(const char* total_shards_str,
-                            const char* shard_index_str,
+GTEST_API_ bool ShouldShard(const char *total_shards_str,
+                            const char *shard_index_str,
                             bool in_subprocess_for_death_test);
 
-// Parses the environment variable var as an Int32. If it is unset,
-// returns default_val. If it is not an Int32, prints an error and
+// Parses the environment variable var as a 32-bit integer. If it is unset,
+// returns default_val. If it is not a 32-bit integer, prints an error and
 // and aborts.
-GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+GTEST_API_ int32_t Int32FromEnvOrDie(const char *env_var, int32_t default_val);
 
 // Given the total number of shards, the shard index, and the test id,
-// returns true iff the test should be run on this shard. The test id is
-// some arbitrary but unique non-negative integer assigned to each test
+// returns true if and only if the test should be run on this shard. The test id
+// is some arbitrary but unique non-negative integer assigned to each test
 // method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(
-    int total_shards, int shard_index, int test_id);
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+                                     int test_id);
 
 // STL container utilities.
 
 // Returns the number of elements in the given container that satisfy
 // the given predicate.
 template <class Container, typename Predicate>
-inline int CountIf(const Container& c, Predicate predicate) {
+inline int CountIf(const Container &c, Predicate predicate) {
   // Implemented as an explicit loop since std::count_if() in libCstd on
   // Solaris has a non-standard signature.
   int count = 0;
   for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
-    if (predicate(*it))
-      ++count;
+    if (predicate(*it)) ++count;
   }
   return count;
 }
 
 // Applies a function/functor to each element in the container.
 template <class Container, typename Functor>
-void ForEach(const Container& c, Functor functor) {
+void ForEach(const Container &c, Functor functor) {
   std::for_each(c.begin(), c.end(), functor);
 }
 
 // Returns the i-th element of the vector, or default_value if i is not
 // in range [0, v.size()).
 template <typename E>
-inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
-  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+inline E GetElementOr(const std::vector<E> &v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value
+                                                    : v[static_cast<size_t>(i)];
 }
 
 // Performs an in-place shuffle of a range of the vector's elements.
@@ -309,8 +308,8 @@
 // i.e. [begin, end) are shuffled, where 'end' == size() means to
 // shuffle to the end of the vector.
 template <typename E>
-void ShuffleRange(internal::Random* random, int begin, int end,
-                  std::vector<E>* v) {
+void ShuffleRange(internal::Random *random, int begin, int end,
+                  std::vector<E> *v) {
   const int size = static_cast<int>(v->size());
   GTEST_CHECK_(0 <= begin && begin <= size)
       << "Invalid shuffle range start " << begin << ": must be in range [0, "
@@ -323,21 +322,24 @@
   // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
   for (int range_width = end - begin; range_width >= 2; range_width--) {
     const int last_in_range = begin + range_width - 1;
-    const int selected = begin + random->Generate(range_width);
-    std::swap((*v)[selected], (*v)[last_in_range]);
+    const int selected =
+        begin +
+        static_cast<int>(random->Generate(static_cast<uint32_t>(range_width)));
+    std::swap((*v)[static_cast<size_t>(selected)],
+              (*v)[static_cast<size_t>(last_in_range)]);
   }
 }
 
 // Performs an in-place shuffle of the vector's elements.
 template <typename E>
-inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+inline void Shuffle(internal::Random *random, std::vector<E> *v) {
   ShuffleRange(random, 0, static_cast<int>(v->size()), v);
 }
 
 // A function for deleting an object.  Handy for being used as a
 // functor.
 template <typename T>
-static void Delete(T* x) {
+static void Delete(T *x) {
   delete x;
 }
 
@@ -349,10 +351,10 @@
   // Constructor.
   //
   // TestPropertyKeyIs has NO default constructor.
-  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+  explicit TestPropertyKeyIs(const std::string &key) : key_(key) {}
 
-  // Returns true iff the test name of test property matches on key_.
-  bool operator()(const TestProperty& test_property) const {
+  // Returns true if and only if the test name of test property matches on key_.
+  bool operator()(const TestProperty &test_property) const {
     return test_property.key() == key_;
   }
 
@@ -384,16 +386,16 @@
 
   // Functions for processing the gtest_filter flag.
 
-  // Returns true iff the wildcard pattern matches the string.  The
-  // first ':' or '\0' character in pattern marks the end of it.
+  // Returns true if and only if the wildcard pattern matches the string.
+  // The first ':' or '\0' character in pattern marks the end of it.
   //
   // This recursive algorithm isn't very efficient, but is clear and
   // works well enough for matching test names, which are short.
   static bool PatternMatchesString(const char *pattern, const char *str);
 
-  // Returns true iff the user-specified filter matches the test case
-  // name and the test name.
-  static bool FilterMatchesTest(const std::string &test_case_name,
+  // Returns true if and only if the user-specified filter matches the test
+  // suite name and the test name.
+  static bool FilterMatchesTest(const std::string &test_suite_name,
                                 const std::string &test_name);
 
 #if GTEST_OS_WINDOWS
@@ -407,7 +409,7 @@
 
   // Returns true if "name" matches the ':' separated list of glob-style
   // filters in "filter".
-  static bool MatchesFilter(const std::string& name, const char* filter);
+  static bool MatchesFilter(const std::string &name, const char *filter);
 };
 
 // Returns the current application's name, removing directory path if that
@@ -426,7 +428,7 @@
   //                in the trace.
   //   skip_count - the number of top frames to be skipped; doesn't count
   //                against max_depth.
-  virtual string CurrentStackTrace(int max_depth, int skip_count) = 0;
+  virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0;
 
   // UponLeavingGTest() should be called immediately before Google Test calls
   // user code. It saves some information about the current stack that
@@ -435,7 +437,7 @@
 
   // This string is inserted in place of stack frames that are part of
   // Google Test's implementation.
-  static const char* const kElidedFramesMarker;
+  static const char *const kElidedFramesMarker;
 
  private:
   GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
@@ -446,16 +448,26 @@
  public:
   OsStackTraceGetter() {}
 
-  virtual string CurrentStackTrace(int max_depth, int skip_count);
-  virtual void UponLeavingGTest();
+  std::string CurrentStackTrace(int max_depth, int skip_count) override;
+  void UponLeavingGTest() override;
 
  private:
+#if GTEST_HAS_ABSL
+  Mutex mutex_;  // Protects all internal state.
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to the stack trace code from within the user code.
+  void *caller_frame_ = nullptr;
+#endif  // GTEST_HAS_ABSL
+
   GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
 };
 
 // Information about a Google Test trace point.
 struct TraceInfo {
-  const char* file;
+  const char *file;
   int line;
   std::string message;
 };
@@ -463,15 +475,15 @@
 // This is the default global test part result reporter used in UnitTestImpl.
 // This class should only be used by UnitTestImpl.
 class DefaultGlobalTestPartResultReporter
-  : public TestPartResultReporterInterface {
+    : public TestPartResultReporterInterface {
  public:
-  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl *unit_test);
   // Implements the TestPartResultReporterInterface. Reports the test part
   // result in the current test.
-  virtual void ReportTestPartResult(const TestPartResult& result);
+  void ReportTestPartResult(const TestPartResult &result) override;
 
  private:
-  UnitTestImpl* const unit_test_;
+  UnitTestImpl *const unit_test_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
 };
@@ -481,13 +493,13 @@
 class DefaultPerThreadTestPartResultReporter
     : public TestPartResultReporterInterface {
  public:
-  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl *unit_test);
   // Implements the TestPartResultReporterInterface. The implementation just
   // delegates to the current global test part result reporter of *unit_test_.
-  virtual void ReportTestPartResult(const TestPartResult& result);
+  void ReportTestPartResult(const TestPartResult &result) override;
 
  private:
-  UnitTestImpl* const unit_test_;
+  UnitTestImpl *const unit_test_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
 };
@@ -498,7 +510,7 @@
 // proper locking.
 class GTEST_API_ UnitTestImpl {
  public:
-  explicit UnitTestImpl(UnitTest* parent);
+  explicit UnitTestImpl(UnitTest *parent);
   virtual ~UnitTestImpl();
 
   // There are two different ways to register your own TestPartResultReporter.
@@ -509,35 +521,38 @@
   // test part result for the currently running test.
 
   // Returns the global test part result reporter.
-  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+  TestPartResultReporterInterface *GetGlobalTestPartResultReporter();
 
   // Sets the global test part result reporter.
   void SetGlobalTestPartResultReporter(
-      TestPartResultReporterInterface* reporter);
+      TestPartResultReporterInterface *reporter);
 
   // Returns the test part result reporter for the current thread.
-  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+  TestPartResultReporterInterface *GetTestPartResultReporterForCurrentThread();
 
   // Sets the test part result reporter for the current thread.
   void SetTestPartResultReporterForCurrentThread(
-      TestPartResultReporterInterface* reporter);
+      TestPartResultReporterInterface *reporter);
 
-  // Gets the number of successful test cases.
-  int successful_test_case_count() const;
+  // Gets the number of successful test suites.
+  int successful_test_suite_count() const;
 
-  // Gets the number of failed test cases.
-  int failed_test_case_count() const;
+  // Gets the number of failed test suites.
+  int failed_test_suite_count() const;
 
-  // Gets the number of all test cases.
-  int total_test_case_count() const;
+  // Gets the number of all test suites.
+  int total_test_suite_count() const;
 
-  // Gets the number of all test cases that contain at least one test
+  // Gets the number of all test suites that contain at least one test
   // that should run.
-  int test_case_to_run_count() const;
+  int test_suite_to_run_count() const;
 
   // Gets the number of successful tests.
   int successful_test_count() const;
 
+  // Gets the number of skipped tests.
+  int skipped_test_count() const;
+
   // Gets the number of failed tests.
   int failed_test_count() const;
 
@@ -563,50 +578,56 @@
   // Gets the elapsed time, in milliseconds.
   TimeInMillis elapsed_time() const { return elapsed_time_; }
 
-  // Returns true iff the unit test passed (i.e. all test cases passed).
+  // Returns true if and only if the unit test passed (i.e. all test suites
+  // passed).
   bool Passed() const { return !Failed(); }
 
-  // Returns true iff the unit test failed (i.e. some test case failed
-  // or something outside of all tests failed).
+  // Returns true if and only if the unit test failed (i.e. some test suite
+  // failed or something outside of all tests failed).
   bool Failed() const {
-    return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed();
+    return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed();
   }
 
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  const TestCase* GetTestCase(int i) const {
-    const int index = GetElementOr(test_case_indices_, i, -1);
-    return index < 0 ? NULL : test_cases_[i];
+  // Gets the i-th test suite among all the test suites. i can range from 0 to
+  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+  const TestSuite *GetTestSuite(int i) const {
+    const int index = GetElementOr(test_suite_indices_, i, -1);
+    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(i)];
   }
 
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  TestCase* GetMutableTestCase(int i) {
-    const int index = GetElementOr(test_case_indices_, i, -1);
-    return index < 0 ? NULL : test_cases_[index];
+  //  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *GetTestCase(int i) const { return GetTestSuite(i); }
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Gets the i-th test suite among all the test suites. i can range from 0 to
+  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+  TestSuite *GetMutableSuiteCase(int i) {
+    const int index = GetElementOr(test_suite_indices_, i, -1);
+    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
   }
 
   // Provides access to the event listener list.
-  TestEventListeners* listeners() { return &listeners_; }
+  TestEventListeners *listeners() { return &listeners_; }
 
   // Returns the TestResult for the test that's currently running, or
   // the TestResult for the ad hoc test if no test is running.
-  TestResult* current_test_result();
+  TestResult *current_test_result();
 
   // Returns the TestResult for the ad hoc test.
-  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+  const TestResult *ad_hoc_test_result() const { return &ad_hoc_test_result_; }
 
   // Sets the OS stack trace getter.
   //
   // Does nothing if the input and the current OS stack trace getter
   // are the same; otherwise, deletes the old getter and makes the
   // input the current getter.
-  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface *getter);
 
   // Returns the current OS stack trace getter if it is not NULL;
   // otherwise, creates an OsStackTraceGetter, makes it the current
   // getter, and returns it.
-  OsStackTraceGetterInterface* os_stack_trace_getter();
+  OsStackTraceGetterInterface *os_stack_trace_getter();
 
   // Returns the current OS stack trace as an std::string.
   //
@@ -620,31 +641,39 @@
   // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
   std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
 
-  // Finds and returns a TestCase with the given name.  If one doesn't
+  // Finds and returns a TestSuite with the given name.  If one doesn't
   // exist, creates one and returns it.
   //
   // Arguments:
   //
-  //   test_case_name: name of the test case
+  //   test_suite_name: name of the test suite
   //   type_param:     the name of the test's type parameter, or NULL if
   //                   this is not a typed or a type-parameterized test.
-  //   set_up_tc:      pointer to the function that sets up the test case
-  //   tear_down_tc:   pointer to the function that tears down the test case
-  TestCase* GetTestCase(const char* test_case_name,
-                        const char* type_param,
-                        Test::SetUpTestCaseFunc set_up_tc,
-                        Test::TearDownTestCaseFunc tear_down_tc);
+  //   set_up_tc:      pointer to the function that sets up the test suite
+  //   tear_down_tc:   pointer to the function that tears down the test suite
+  TestSuite *GetTestSuite(const char *test_suite_name, const char *type_param,
+                          internal::SetUpTestSuiteFunc set_up_tc,
+                          internal::TearDownTestSuiteFunc tear_down_tc);
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  TestCase *GetTestCase(const char *test_case_name, const char *type_param,
+                        internal::SetUpTestSuiteFunc set_up_tc,
+                        internal::TearDownTestSuiteFunc tear_down_tc) {
+    return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc);
+  }
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Adds a TestInfo to the unit test.
   //
   // Arguments:
   //
-  //   set_up_tc:    pointer to the function that sets up the test case
-  //   tear_down_tc: pointer to the function that tears down the test case
+  //   set_up_tc:    pointer to the function that sets up the test suite
+  //   tear_down_tc: pointer to the function that tears down the test suite
   //   test_info:    the TestInfo object
-  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
-                   Test::TearDownTestCaseFunc tear_down_tc,
-                   TestInfo* test_info) {
+  void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
+                   internal::TearDownTestSuiteFunc tear_down_tc,
+                   TestInfo *test_info) {
     // In order to support thread-safe death tests, we need to
     // remember the original working directory when the test program
     // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
@@ -658,34 +687,42 @@
           << "Failed to get the current working directory.";
     }
 
-    GetTestCase(test_info->test_case_name(),
-                test_info->type_param(),
-                set_up_tc,
-                tear_down_tc)->AddTestInfo(test_info);
+    GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
+                 set_up_tc, tear_down_tc)
+        ->AddTestInfo(test_info);
   }
 
-#if GTEST_HAS_PARAM_TEST
-  // Returns ParameterizedTestCaseRegistry object used to keep track of
+  // Returns ParameterizedTestSuiteRegistry object used to keep track of
   // value-parameterized tests and instantiate and register them.
-  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+  internal::ParameterizedTestSuiteRegistry &parameterized_test_registry() {
     return parameterized_test_registry_;
   }
-#endif  // GTEST_HAS_PARAM_TEST
 
-  // Sets the TestCase object for the test that's currently running.
-  void set_current_test_case(TestCase* a_current_test_case) {
-    current_test_case_ = a_current_test_case;
+  std::set<std::string> *ignored_parameterized_test_suites() {
+    return &ignored_parameterized_test_suites_;
+  }
+
+  // Returns TypeParameterizedTestSuiteRegistry object used to keep track of
+  // type-parameterized tests and instantiations of them.
+  internal::TypeParameterizedTestSuiteRegistry &
+  type_parameterized_test_registry() {
+    return type_parameterized_test_registry_;
+  }
+
+  // Sets the TestSuite object for the test that's currently running.
+  void set_current_test_suite(TestSuite *a_current_test_suite) {
+    current_test_suite_ = a_current_test_suite;
   }
 
   // Sets the TestInfo object for the test that's currently running.  If
   // current_test_info is NULL, the assertion results will be stored in
   // ad_hoc_test_result_.
-  void set_current_test_info(TestInfo* a_current_test_info) {
+  void set_current_test_info(TestInfo *a_current_test_info) {
     current_test_info_ = a_current_test_info;
   }
 
   // Registers all parameterized tests defined using TEST_P and
-  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+  // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter
   // combination. This method can be called more then once; it has guards
   // protecting from registering the tests more then once.  If
   // value-parameterized tests are disabled, RegisterParameterizedTests is
@@ -700,28 +737,23 @@
 
   // Clears the results of all tests, except the ad hoc tests.
   void ClearNonAdHocTestResult() {
-    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+    ForEach(test_suites_, TestSuite::ClearTestSuiteResult);
   }
 
   // Clears the results of ad-hoc test assertions.
-  void ClearAdHocTestResult() {
-    ad_hoc_test_result_.Clear();
-  }
+  void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
 
   // Adds a TestProperty to the current TestResult object when invoked in a
-  // context of a test or a test case, or to the global property set. If the
+  // context of a test or a test suite, or to the global property set. If the
   // result already contains a property with the same key, the value will be
   // updated.
-  void RecordProperty(const TestProperty& test_property);
+  void RecordProperty(const TestProperty &test_property);
 
-  enum ReactionToSharding {
-    HONOR_SHARDING_PROTOCOL,
-    IGNORE_SHARDING_PROTOCOL
-  };
+  enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
 
   // Matches the full name of each test against the user-specified
   // filter to decide whether the test should run, then records the
-  // result in each TestCase and TestInfo object.
+  // result in each TestSuite and TestInfo object.
   // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
   // based on sharding variables in the environment.
   // Returns the number of tests that should run.
@@ -730,19 +762,19 @@
   // Prints the names of the tests matching the user-specified filter flag.
   void ListTestsMatchingFilter();
 
-  const TestCase* current_test_case() const { return current_test_case_; }
-  TestInfo* current_test_info() { return current_test_info_; }
-  const TestInfo* current_test_info() const { return current_test_info_; }
+  const TestSuite *current_test_suite() const { return current_test_suite_; }
+  TestInfo *current_test_info() { return current_test_info_; }
+  const TestInfo *current_test_info() const { return current_test_info_; }
 
   // Returns the vector of environments that need to be set-up/torn-down
   // before/after the tests are run.
-  std::vector<Environment*>& environments() { return environments_; }
+  std::vector<Environment *> &environments() { return environments_; }
 
   // Getters for the per-thread Google Test trace stack.
-  std::vector<TraceInfo>& gtest_trace_stack() {
+  std::vector<TraceInfo> &gtest_trace_stack() {
     return *(gtest_trace_stack_.pointer());
   }
-  const std::vector<TraceInfo>& gtest_trace_stack() const {
+  const std::vector<TraceInfo> &gtest_trace_stack() const {
     return gtest_trace_stack_.get();
   }
 
@@ -754,12 +786,12 @@
   // flag, or NULL if that flag was not specified.
   // This information is useful only in a death test child process.
   // Must not be called before a call to InitGoogleTest.
-  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+  const InternalRunDeathTestFlag *internal_run_death_test_flag() const {
     return internal_run_death_test_flag_.get();
   }
 
   // Returns a pointer to the current death test factory.
-  internal::DeathTestFactory* death_test_factory() {
+  internal::DeathTestFactory *death_test_factory() {
     return death_test_factory_.get();
   }
 
@@ -789,13 +821,13 @@
   int random_seed() const { return random_seed_; }
 
   // Gets the random number generator.
-  internal::Random* random() { return &random_; }
+  internal::Random *random() { return &random_; }
 
-  // Shuffles all test cases, and the tests within each test case,
+  // Shuffles all test suites, and the tests within each test suite,
   // making sure that death tests are still run first.
   void ShuffleTests();
 
-  // Restores the test cases and tests to their order before the first shuffle.
+  // Restores the test suites and tests to their order before the first shuffle.
   void UnshuffleTests();
 
   // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
@@ -810,7 +842,7 @@
   void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
 
   // The UnitTest object that owns this implementation object.
-  UnitTest* const parent_;
+  UnitTest *const parent_;
 
   // The working directory when the first TEST() or TEST_F() was
   // executed.
@@ -822,52 +854,56 @@
       default_per_thread_test_part_result_reporter_;
 
   // Points to (but doesn't own) the global test part result reporter.
-  TestPartResultReporterInterface* global_test_part_result_repoter_;
+  TestPartResultReporterInterface *global_test_part_result_repoter_;
 
   // Protects read and write access to global_test_part_result_reporter_.
   internal::Mutex global_test_part_result_reporter_mutex_;
 
   // Points to (but doesn't own) the per-thread test part result reporter.
-  internal::ThreadLocal<TestPartResultReporterInterface*>
+  internal::ThreadLocal<TestPartResultReporterInterface *>
       per_thread_test_part_result_reporter_;
 
   // The vector of environments that need to be set-up/torn-down
   // before/after the tests are run.
-  std::vector<Environment*> environments_;
+  std::vector<Environment *> environments_;
 
-  // The vector of TestCases in their original order.  It owns the
+  // The vector of TestSuites in their original order.  It owns the
   // elements in the vector.
-  std::vector<TestCase*> test_cases_;
+  std::vector<TestSuite *> test_suites_;
 
-  // Provides a level of indirection for the test case list to allow
-  // easy shuffling and restoring the test case order.  The i-th
-  // element of this vector is the index of the i-th test case in the
+  // Provides a level of indirection for the test suite list to allow
+  // easy shuffling and restoring the test suite order.  The i-th
+  // element of this vector is the index of the i-th test suite in the
   // shuffled order.
-  std::vector<int> test_case_indices_;
+  std::vector<int> test_suite_indices_;
 
-#if GTEST_HAS_PARAM_TEST
   // ParameterizedTestRegistry object used to register value-parameterized
   // tests.
-  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+  internal::ParameterizedTestSuiteRegistry parameterized_test_registry_;
+  internal::TypeParameterizedTestSuiteRegistry
+      type_parameterized_test_registry_;
+
+  // The set holding the name of parameterized
+  // test suites that may go uninstantiated.
+  std::set<std::string> ignored_parameterized_test_suites_;
 
   // Indicates whether RegisterParameterizedTests() has been called already.
   bool parameterized_tests_registered_;
-#endif  // GTEST_HAS_PARAM_TEST
 
-  // Index of the last death test case registered.  Initially -1.
-  int last_death_test_case_;
+  // Index of the last death test suite registered.  Initially -1.
+  int last_death_test_suite_;
 
-  // This points to the TestCase for the currently running test.  It
-  // changes as Google Test goes through one test case after another.
+  // This points to the TestSuite for the currently running test.  It
+  // changes as Google Test goes through one test suite after another.
   // When no test is running, this is set to NULL and Google Test
   // stores assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestCase* current_test_case_;
+  TestSuite *current_test_suite_;
 
   // This points to the TestInfo for the currently running test.  It
   // changes as Google Test goes through one test after another.  When
   // no test is running, this is set to NULL and Google Test stores
   // assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestInfo* current_test_info_;
+  TestInfo *current_test_info_;
 
   // Normally, a user only writes assertions inside a TEST or TEST_F,
   // or inside a function called by a TEST or TEST_F.  Since Google
@@ -887,9 +923,9 @@
   // object is destructed.  By default, an OsStackTraceGetter is used,
   // but the user can set this field to use a custom getter if that is
   // desired.
-  OsStackTraceGetterInterface* os_stack_trace_getter_;
+  OsStackTraceGetterInterface *os_stack_trace_getter_;
 
-  // True iff PostFlagParsingInit() has been called.
+  // True if and only if PostFlagParsingInit() has been called.
   bool post_flag_parse_init_performed_;
 
   // The random number seed used at the beginning of the test run.
@@ -908,8 +944,8 @@
 #if GTEST_HAS_DEATH_TEST
   // The decomposed components of the gtest_internal_run_death_test flag,
   // parsed when RUN_ALL_TESTS is called.
-  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
-  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+  std::unique_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  std::unique_ptr<internal::DeathTestFactory> death_test_factory_;
 #endif  // GTEST_HAS_DEATH_TEST
 
   // A per-thread stack of traces created by the SCOPED_TRACE() macro.
@@ -924,7 +960,7 @@
 
 // Convenience function for accessing the global UnitTest
 // implementation object.
-inline UnitTestImpl* GetUnitTestImpl() {
+inline UnitTestImpl *GetUnitTestImpl() {
   return UnitTest::GetInstance()->impl();
 }
 
@@ -932,7 +968,7 @@
 
 // Internal helper functions for implementing the simple regular
 // expression matcher.
-GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsInSet(char ch, const char *str);
 GTEST_API_ bool IsAsciiDigit(char ch);
 GTEST_API_ bool IsAsciiPunct(char ch);
 GTEST_API_ bool IsRepeat(char ch);
@@ -940,18 +976,19 @@
 GTEST_API_ bool IsAsciiWordChar(char ch);
 GTEST_API_ bool IsValidEscape(char ch);
 GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
-GTEST_API_ bool ValidateRegex(const char* regex);
-GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char ch, char repeat, const char* regex, const char* str);
-GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+GTEST_API_ bool ValidateRegex(const char *regex);
+GTEST_API_ bool MatchRegexAtHead(const char *regex, const char *str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+                                              char repeat, const char *regex,
+                                              const char *str);
+GTEST_API_ bool MatchRegexAnywhere(const char *regex, const char *str);
 
 #endif  // GTEST_USES_SIMPLE_RE
 
 // Parses the command line for Google Test flags, without initializing
 // other parts of Google Test.
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int *argc, char **argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv);
 
 #if GTEST_HAS_DEATH_TEST
 
@@ -964,7 +1001,7 @@
 // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
 // it here.
 template <typename Integer>
-bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+bool ParseNaturalNumber(const ::std::string &str, Integer *number) {
   // Fail fast if the given string does not begin with a digit;
   // this bypasses strtoXXX's "optional leading whitespace and plus
   // or minus sign" semantics, which are undesirable here.
@@ -973,27 +1010,14 @@
   }
   errno = 0;
 
-  char* end;
+  char *end;
   // BiggestConvertible is the largest integer type that system-provided
   // string-to-number conversion routines can return.
+  using BiggestConvertible = unsigned long long;  // NOLINT
 
-# if GTEST_OS_WINDOWS && !defined(__GNUC__)
-
-  // MSVC and C++ Builder define __int64 instead of the standard long long.
-  typedef unsigned __int64 BiggestConvertible;
-  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
-
-# else
-
-  typedef unsigned long long BiggestConvertible;  // NOLINT
-  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
-
-# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
-
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);  // NOLINT
   const bool parse_success = *end == '\0' && errno == 0;
 
-  // TODO(vladl@google.com): Convert this to compile time assertion when it is
-  // available.
   GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
 
   const Integer result = static_cast<Integer>(parsed);
@@ -1013,18 +1037,18 @@
 // constructs. Do not use it in user tests, either directly or indirectly.
 class TestResultAccessor {
  public:
-  static void RecordProperty(TestResult* test_result,
-                             const std::string& xml_element,
-                             const TestProperty& property) {
+  static void RecordProperty(TestResult *test_result,
+                             const std::string &xml_element,
+                             const TestProperty &property) {
     test_result->RecordProperty(xml_element, property);
   }
 
-  static void ClearTestPartResults(TestResult* test_result) {
+  static void ClearTestPartResults(TestResult *test_result) {
     test_result->ClearTestPartResults();
   }
 
-  static const std::vector<testing::TestPartResult>& test_part_results(
-      const TestResult& test_result) {
+  static const std::vector<testing::TestPartResult> &test_part_results(
+      const TestResult &test_result) {
     return test_result.test_part_results();
   }
 };
@@ -1032,7 +1056,7 @@
 #if GTEST_CAN_STREAM_RESULTS_
 
 // Streams test results to the given port on the given host machine.
-class GTEST_API_ StreamingListener : public EmptyTestEventListener {
+class StreamingListener : public EmptyTestEventListener {
  public:
   // Abstract base class for writing strings to a socket.
   class AbstractSocketWriter {
@@ -1040,40 +1064,36 @@
     virtual ~AbstractSocketWriter() {}
 
     // Sends a string to the socket.
-    virtual void Send(const string& message) = 0;
+    virtual void Send(const std::string &message) = 0;
 
     // Closes the socket.
     virtual void CloseConnection() {}
 
     // Sends a string and a newline to the socket.
-    void SendLn(const string& message) {
-      Send(message + "\n");
-    }
+    void SendLn(const std::string &message) { Send(message + "\n"); }
   };
 
   // Concrete class for actually writing strings to a socket.
   class SocketWriter : public AbstractSocketWriter {
    public:
-    SocketWriter(const string& host, const string& port)
+    SocketWriter(const std::string &host, const std::string &port)
         : sockfd_(-1), host_name_(host), port_num_(port) {
       MakeConnection();
     }
 
-    virtual ~SocketWriter() {
-      if (sockfd_ != -1)
-        CloseConnection();
+    ~SocketWriter() override {
+      if (sockfd_ != -1) CloseConnection();
     }
 
     // Sends a string to the socket.
-    virtual void Send(const string& message) {
+    void Send(const std::string &message) override {
       GTEST_CHECK_(sockfd_ != -1)
           << "Send() can be called only when there is a connection.";
 
-      const int len = static_cast<int>(message.length());
-      if (write(sockfd_, message.c_str(), len) != len) {
-        GTEST_LOG_(WARNING)
-            << "stream_result_to: failed to stream to "
-            << host_name_ << ":" << port_num_;
+      const auto len = static_cast<size_t>(message.length());
+      if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
+        GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+                            << host_name_ << ":" << port_num_;
       }
     }
 
@@ -1082,7 +1102,7 @@
     void MakeConnection();
 
     // Closes the socket.
-    void CloseConnection() {
+    void CloseConnection() override {
       GTEST_CHECK_(sockfd_ != -1)
           << "CloseConnection() can be called only when there is a connection.";
 
@@ -1091,26 +1111,30 @@
     }
 
     int sockfd_;  // socket file descriptor
-    const string host_name_;
-    const string port_num_;
+    const std::string host_name_;
+    const std::string port_num_;
 
     GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
   };  // class SocketWriter
 
   // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
-  static string UrlEncode(const char* str);
+  static std::string UrlEncode(const char *str);
 
-  StreamingListener(const string& host, const string& port)
-      : socket_writer_(new SocketWriter(host, port)) { Start(); }
+  StreamingListener(const std::string &host, const std::string &port)
+      : socket_writer_(new SocketWriter(host, port)) {
+    Start();
+  }
 
-  explicit StreamingListener(AbstractSocketWriter* socket_writer)
-      : socket_writer_(socket_writer) { Start(); }
+  explicit StreamingListener(AbstractSocketWriter *socket_writer)
+      : socket_writer_(socket_writer) {
+    Start();
+  }
 
-  void OnTestProgramStart(const UnitTest& /* unit_test */) {
+  void OnTestProgramStart(const UnitTest & /* unit_test */) override {
     SendLn("event=TestProgramStart");
   }
 
-  void OnTestProgramEnd(const UnitTest& unit_test) {
+  void OnTestProgramEnd(const UnitTest &unit_test) override {
     // Note that Google Test current only report elapsed time for each
     // test iteration, not for the entire test program.
     SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
@@ -1119,42 +1143,46 @@
     socket_writer_->CloseConnection();
   }
 
-  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+  void OnTestIterationStart(const UnitTest & /* unit_test */,
+                            int iteration) override {
     SendLn("event=TestIterationStart&iteration=" +
            StreamableToString(iteration));
   }
 
-  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
-    SendLn("event=TestIterationEnd&passed=" +
-           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
-           StreamableToString(unit_test.elapsed_time()) + "ms");
+  void OnTestIterationEnd(const UnitTest &unit_test,
+                          int /* iteration */) override {
+    SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+           "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+           "ms");
   }
 
-  void OnTestCaseStart(const TestCase& test_case) {
+  // Note that "event=TestCaseStart" is a wire format and has to remain
+  // "case" for compatibility
+  void OnTestCaseStart(const TestCase &test_case) override {
     SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
   }
 
-  void OnTestCaseEnd(const TestCase& test_case) {
-    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
-           + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
-           + "ms");
+  // Note that "event=TestCaseEnd" is a wire format and has to remain
+  // "case" for compatibility
+  void OnTestCaseEnd(const TestCase &test_case) override {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
+           "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
+           "ms");
   }
 
-  void OnTestStart(const TestInfo& test_info) {
+  void OnTestStart(const TestInfo &test_info) override {
     SendLn(std::string("event=TestStart&name=") + test_info.name());
   }
 
-  void OnTestEnd(const TestInfo& test_info) {
+  void OnTestEnd(const TestInfo &test_info) override {
     SendLn("event=TestEnd&passed=" +
-           FormatBool((test_info.result())->Passed()) +
-           "&elapsed_time=" +
+           FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
            StreamableToString((test_info.result())->elapsed_time()) + "ms");
   }
 
-  void OnTestPartResult(const TestPartResult& test_part_result) {
-    const char* file_name = test_part_result.file_name();
-    if (file_name == NULL)
-      file_name = "";
+  void OnTestPartResult(const TestPartResult &test_part_result) override {
+    const char *file_name = test_part_result.file_name();
+    if (file_name == nullptr) file_name = "";
     SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
            "&line=" + StreamableToString(test_part_result.line_number()) +
            "&message=" + UrlEncode(test_part_result.message()));
@@ -1162,15 +1190,15 @@
 
  private:
   // Sends the given message and a newline to the socket.
-  void SendLn(const string& message) { socket_writer_->SendLn(message); }
+  void SendLn(const std::string &message) { socket_writer_->SendLn(message); }
 
   // Called at the start of streaming to notify the receiver what
   // protocol we are using.
   void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
 
-  string FormatBool(bool value) { return value ? "1" : "0"; }
+  std::string FormatBool(bool value) { return value ? "1" : "0"; }
 
-  const scoped_ptr<AbstractSocketWriter> socket_writer_;
+  const std::unique_ptr<AbstractSocketWriter> socket_writer_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
 };  // class StreamingListener
@@ -1180,4 +1208,6 @@
 }  // namespace internal
 }  // namespace testing
 
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
 #endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-matchers.cc b/libaom/third_party/googletest/src/googletest/src/gtest-matchers.cc
new file mode 100644
index 0000000..27aaa2b
--- /dev/null
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-matchers.cc
@@ -0,0 +1,97 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/gtest-matchers.h"
+
+#include <string>
+
+namespace testing {
+
+// Constructs a matcher that matches a const std::string& whose value is
+// equal to s.
+Matcher<const std::string &>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a const std::string& whose value is
+// equal to s.
+Matcher<const std::string &>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a std::string whose value is equal to
+// s.
+Matcher<std::string>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a std::string whose value is equal to
+// s.
+Matcher<std::string>::Matcher(const char *s) { *this = Eq(std::string(s)); }
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(const std::string &s) {
+  *this = Eq(s);
+}
+
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(internal::StringView s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(internal::StringView s) {
+  *this = Eq(std::string(s));
+}
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+}  // namespace testing
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-port.cc b/libaom/third_party/googletest/src/googletest/src/gtest-port.cc
index e5bf3dd..adfdbef 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest-port.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-port.cc
@@ -26,56 +26,64 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
 
 #include "gtest/internal/gtest-port.h"
 
 #include <limits.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
+#include <cstdint>
 #include <fstream>
+#include <memory>
 
 #if GTEST_OS_WINDOWS
-# include <windows.h>
-# include <io.h>
-# include <sys/stat.h>
-# include <map>  // Used in ThreadLocal.
+#include <windows.h>
+#include <io.h>
+#include <sys/stat.h>
+#include <map>  // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif  // _MSC_VER
 #else
-# include <unistd.h>
+#include <unistd.h>
 #endif  // GTEST_OS_WINDOWS
 
 #if GTEST_OS_MAC
-# include <mach/mach_init.h>
-# include <mach/task.h>
-# include <mach/vm_map.h>
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
 #endif  // GTEST_OS_MAC
 
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+    GTEST_OS_NETBSD || GTEST_OS_OPENBSD
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
+#endif
+
 #if GTEST_OS_QNX
-# include <devctl.h>
-# include <fcntl.h>
-# include <sys/procfs.h>
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
 #endif  // GTEST_OS_QNX
 
 #if GTEST_OS_AIX
-# include <procinfo.h>
-# include <sys/types.h>
+#include <procinfo.h>
+#include <sys/types.h>
 #endif  // GTEST_OS_AIX
 
+#if GTEST_OS_FUCHSIA
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
+#endif  // GTEST_OS_FUCHSIA
+
 #include "gtest/gtest-spi.h"
 #include "gtest/gtest-message.h"
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-string.h"
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick exists to
-// prevent the accidental inclusion of gtest-internal-inl.h in the
-// user's code.
-#define GTEST_IMPLEMENTATION_ 1
 #include "src/gtest-internal-inl.h"
-#undef GTEST_IMPLEMENTATION_
 
 namespace testing {
 namespace internal {
@@ -93,7 +101,7 @@
 
 namespace {
 template <typename T>
-T ReadProcFileField(const string& filename, int field) {
+T ReadProcFileField(const std::string &filename, int field) {
   std::string dummy;
   std::ifstream file(filename.c_str());
   while (field-- > 0) {
@@ -107,9 +115,9 @@
 
 // Returns the number of active threads, or 0 when there is an error.
 size_t GetThreadCount() {
-  const string filename =
+  const std::string filename =
       (Message() << "/proc/" << getpid() << "/stat").GetString();
-  return ReadProcFileField<int>(filename, 19);
+  return ReadProcFileField<size_t>(filename, 19);
 }
 
 #elif GTEST_OS_MAC
@@ -122,8 +130,7 @@
   if (status == KERN_SUCCESS) {
     // task_threads allocates resources in thread_list and we need to free them
     // to avoid leaks.
-    vm_deallocate(task,
-                  reinterpret_cast<vm_address_t>(thread_list),
+    vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
                   sizeof(thread_t) * thread_count);
     return static_cast<size_t>(thread_count);
   } else {
@@ -131,6 +138,80 @@
   }
 }
 
+#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+    GTEST_OS_NETBSD
+
+#if GTEST_OS_NETBSD
+#undef KERN_PROC
+#define KERN_PROC KERN_PROC2
+#define kinfo_proc kinfo_proc2
+#endif
+
+#if GTEST_OS_DRAGONFLY
+#define KP_NLWP(kp) (kp.kp_nthreads)
+#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#define KP_NLWP(kp) (kp.ki_numthreads)
+#elif GTEST_OS_NETBSD
+#define KP_NLWP(kp) (kp.p_nlwps)
+#endif
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  int mib[] = {
+    CTL_KERN,
+    KERN_PROC,
+    KERN_PROC_PID,
+    getpid(),
+#if GTEST_OS_NETBSD
+    sizeof(struct kinfo_proc),
+    1,
+#endif
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+  struct kinfo_proc info;
+  size_t size = sizeof(info);
+  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
+    return 0;
+  }
+  return static_cast<size_t>(KP_NLWP(info));
+}
+#elif GTEST_OS_OPENBSD
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  int mib[] = {
+    CTL_KERN,
+    KERN_PROC,
+    KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+    getpid(),
+    sizeof(struct kinfo_proc),
+    0,
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+
+  // get number of structs
+  size_t size;
+  if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
+    return 0;
+  }
+  mib[5] = size / mib[4];
+
+  // populate array of structs
+  struct kinfo_proc info[mib[5]];
+  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
+    return 0;
+  }
+
+  // exclude empty members
+  int nthreads = 0;
+  for (int i = 0; i < size / mib[4]; i++) {
+    if (info[i].p_tid != -1) nthreads++;
+  }
+  return nthreads;
+}
+
 #elif GTEST_OS_QNX
 
 // Returns the number of threads running in the process, or 0 to indicate that
@@ -142,7 +223,7 @@
   }
   procfs_info process_info;
   const int status =
-      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL);
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr);
   close(fd);
   if (status == EOK) {
     return static_cast<size_t>(process_info.num_threads);
@@ -156,7 +237,7 @@
 size_t GetThreadCount() {
   struct procentry64 entry;
   pid_t pid = getpid();
-  int status = getprocs64(&entry, sizeof(entry), NULL, 0, &pid, 1);
+  int status = getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1);
   if (status == 1) {
     return entry.pi_thcount;
   } else {
@@ -164,6 +245,21 @@
   }
 }
 
+#elif GTEST_OS_FUCHSIA
+
+size_t GetThreadCount() {
+  int dummy_buffer;
+  size_t avail;
+  zx_status_t status =
+      zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+                         &dummy_buffer, 0, nullptr, &avail);
+  if (status == ZX_OK) {
+    return avail;
+  } else {
+    return 0;
+  }
+}
+
 #else
 
 size_t GetThreadCount() {
@@ -176,27 +272,17 @@
 
 #if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
 
-void SleepMilliseconds(int n) {
-  ::Sleep(n);
-}
+void SleepMilliseconds(int n) { ::Sleep(static_cast<DWORD>(n)); }
 
-AutoHandle::AutoHandle()
-    : handle_(INVALID_HANDLE_VALUE) {}
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
 
-AutoHandle::AutoHandle(Handle handle)
-    : handle_(handle) {}
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
 
-AutoHandle::~AutoHandle() {
-  Reset();
-}
+AutoHandle::~AutoHandle() { Reset(); }
 
-AutoHandle::Handle AutoHandle::Get() const {
-  return handle_;
-}
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
 
-void AutoHandle::Reset() {
-  Reset(INVALID_HANDLE_VALUE);
-}
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
 
 void AutoHandle::Reset(HANDLE handle) {
   // Resetting with the same handle we already own is invalid.
@@ -208,37 +294,32 @@
   } else {
     GTEST_CHECK_(!IsCloseable())
         << "Resetting a valid handle to itself is likely a programmer error "
-            "and thus not allowed.";
+           "and thus not allowed.";
   }
 }
 
 bool AutoHandle::IsCloseable() const {
   // Different Windows APIs may use either of these values to represent an
   // invalid handle.
-  return handle_ != NULL && handle_ != INVALID_HANDLE_VALUE;
+  return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
 }
 
 Notification::Notification()
-    : event_(::CreateEvent(NULL,   // Default security attributes.
-                           TRUE,   // Do not reset automatically.
-                           FALSE,  // Initially unset.
-                           NULL)) {  // Anonymous event.
-  GTEST_CHECK_(event_.Get() != NULL);
+    : event_(::CreateEvent(nullptr,     // Default security attributes.
+                           TRUE,        // Do not reset automatically.
+                           FALSE,       // Initially unset.
+                           nullptr)) {  // Anonymous event.
+  GTEST_CHECK_(event_.Get() != nullptr);
 }
 
-void Notification::Notify() {
-  GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
-}
+void Notification::Notify() { GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); }
 
 void Notification::WaitForNotification() {
-  GTEST_CHECK_(
-      ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
+  GTEST_CHECK_(::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
 }
 
 Mutex::Mutex()
-    : owner_thread_id_(0),
-      type_(kDynamic),
-      critical_section_init_phase_(0),
+    : owner_thread_id_(0), type_(kDynamic), critical_section_init_phase_(0),
       critical_section_(new CRITICAL_SECTION) {
   ::InitializeCriticalSection(critical_section_);
 }
@@ -246,13 +327,10 @@
 Mutex::~Mutex() {
   // Static mutexes are leaked intentionally. It is not thread-safe to try
   // to clean them up.
-  // TODO(yukawa): Switch to Slim Reader/Writer (SRW) Locks, which requires
-  // nothing to clean it up but is available only on Vista and later.
-  // http://msdn.microsoft.com/en-us/library/windows/desktop/aa904937.aspx
   if (type_ == kDynamic) {
     ::DeleteCriticalSection(critical_section_);
     delete critical_section_;
-    critical_section_ = NULL;
+    critical_section_ = nullptr;
   }
 }
 
@@ -279,6 +357,40 @@
       << "The current thread is not holding the mutex @" << this;
 }
 
+namespace {
+
+#ifdef _MSC_VER
+// Use the RAII idiom to flag mem allocs that are intentionally never
+// deallocated. The motivation is to silence the false positive mem leaks
+// that are reported by the debug version of MS's CRT which can only detect
+// if an alloc is missing a matching deallocation.
+// Example:
+//    MemoryIsNotDeallocated memory_is_not_deallocated;
+//    critical_section_ = new CRITICAL_SECTION;
+//
+class MemoryIsNotDeallocated {
+ public:
+  MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
+    old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
+    // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
+    // doesn't report mem leak if there's no matching deallocation.
+    _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
+  }
+
+  ~MemoryIsNotDeallocated() {
+    // Restore the original _CRTDBG_ALLOC_MEM_DF flag
+    _CrtSetDbgFlag(old_crtdbg_flag_);
+  }
+
+ private:
+  int old_crtdbg_flag_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated);
+};
+#endif  // _MSC_VER
+
+}  // namespace
+
 // Initializes owner_thread_id_ and critical_section_ in static mutexes.
 void Mutex::ThreadSafeLazyInit() {
   // Dynamic mutexes are initialized in the constructor.
@@ -289,19 +401,23 @@
         // If critical_section_init_phase_ was 0 before the exchange, we
         // are the first to test it and need to perform the initialization.
         owner_thread_id_ = 0;
-        critical_section_ = new CRITICAL_SECTION;
+        {
+          // Use RAII to flag that following mem alloc is never deallocated.
+#ifdef _MSC_VER
+          MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+          critical_section_ = new CRITICAL_SECTION;
+        }
         ::InitializeCriticalSection(critical_section_);
         // Updates the critical_section_init_phase_ to 2 to signal
         // initialization complete.
-        GTEST_CHECK_(::InterlockedCompareExchange(
-                          &critical_section_init_phase_, 2L, 1L) ==
-                      1L);
+        GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_,
+                                                  2L, 1L) == 1L);
         break;
       case 1:
         // Somebody else is already initializing the mutex; spin until they
         // are done.
-        while (::InterlockedCompareExchange(&critical_section_init_phase_,
-                                            2L,
+        while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L,
                                             2L) != 2L) {
           // Possibly yields the rest of the thread's time slice to other
           // threads.
@@ -309,8 +425,7 @@
         }
         break;
 
-      case 2:
-        break;  // The mutex is already initialized and ready for use.
+      case 2: break;  // The mutex is already initialized and ready for use.
 
       default:
         GTEST_CHECK_(false)
@@ -324,21 +439,20 @@
 
 class ThreadWithParamSupport : public ThreadWithParamBase {
  public:
-  static HANDLE CreateThread(Runnable* runnable,
-                             Notification* thread_can_start) {
-    ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start);
+  static HANDLE CreateThread(Runnable *runnable,
+                             Notification *thread_can_start) {
+    ThreadMainParam *param = new ThreadMainParam(runnable, thread_can_start);
     DWORD thread_id;
-    // TODO(yukawa): Consider to use _beginthreadex instead.
     HANDLE thread_handle = ::CreateThread(
-        NULL,    // Default security.
-        0,       // Default stack size.
+        nullptr,  // Default security.
+        0,        // Default stack size.
         &ThreadWithParamSupport::ThreadMain,
-        param,   // Parameter to ThreadMainStatic
-        0x0,     // Default creation flags.
+        param,        // Parameter to ThreadMainStatic
+        0x0,          // Default creation flags.
         &thread_id);  // Need a valid pointer for the call to work under Win98.
-    GTEST_CHECK_(thread_handle != NULL) << "CreateThread failed with error "
-                                        << ::GetLastError() << ".";
-    if (thread_handle == NULL) {
+    GTEST_CHECK_(thread_handle != nullptr)
+        << "CreateThread failed with error " << ::GetLastError() << ".";
+    if (thread_handle == nullptr) {
       delete param;
     }
     return thread_handle;
@@ -346,19 +460,17 @@
 
  private:
   struct ThreadMainParam {
-    ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
-        : runnable_(runnable),
-          thread_can_start_(thread_can_start) {
-    }
-    scoped_ptr<Runnable> runnable_;
+    ThreadMainParam(Runnable *runnable, Notification *thread_can_start)
+        : runnable_(runnable), thread_can_start_(thread_can_start) {}
+    std::unique_ptr<Runnable> runnable_;
     // Does not own.
-    Notification* thread_can_start_;
+    Notification *thread_can_start_;
   };
 
-  static DWORD WINAPI ThreadMain(void* ptr) {
+  static DWORD WINAPI ThreadMain(void *ptr) {
     // Transfers ownership.
-    scoped_ptr<ThreadMainParam> param(static_cast<ThreadMainParam*>(ptr));
-    if (param->thread_can_start_ != NULL)
+    std::unique_ptr<ThreadMainParam> param(static_cast<ThreadMainParam *>(ptr));
+    if (param->thread_can_start_ != nullptr)
       param->thread_can_start_->WaitForNotification();
     param->runnable_->Run();
     return 0;
@@ -373,14 +485,11 @@
 }  // namespace
 
 ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
-                                         Notification* thread_can_start)
-      : thread_(ThreadWithParamSupport::CreateThread(runnable,
-                                                     thread_can_start)) {
-}
+                                         Notification *thread_can_start)
+    : thread_(
+          ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
 
-ThreadWithParamBase::~ThreadWithParamBase() {
-  Join();
-}
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
 
 void ThreadWithParamBase::Join() {
   GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
@@ -395,20 +504,25 @@
  public:
   // Registers thread_local_instance as having value on the current thread.
   // Returns a value that can be used to identify the thread from other threads.
-  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
-      const ThreadLocalBase* thread_local_instance) {
+  static ThreadLocalValueHolderBase *GetValueOnCurrentThread(
+      const ThreadLocalBase *thread_local_instance) {
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
     DWORD current_thread = ::GetCurrentThreadId();
     MutexLock lock(&mutex_);
-    ThreadIdToThreadLocals* const thread_to_thread_locals =
+    ThreadIdToThreadLocals *const thread_to_thread_locals =
         GetThreadLocalsMapLocked();
     ThreadIdToThreadLocals::iterator thread_local_pos =
         thread_to_thread_locals->find(current_thread);
     if (thread_local_pos == thread_to_thread_locals->end()) {
-      thread_local_pos = thread_to_thread_locals->insert(
-          std::make_pair(current_thread, ThreadLocalValues())).first;
+      thread_local_pos =
+          thread_to_thread_locals
+              ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+              .first;
       StartWatcherThreadFor(current_thread);
     }
-    ThreadLocalValues& thread_local_values = thread_local_pos->second;
+    ThreadLocalValues &thread_local_values = thread_local_pos->second;
     ThreadLocalValues::iterator value_pos =
         thread_local_values.find(thread_local_instance);
     if (value_pos == thread_local_values.end()) {
@@ -416,7 +530,7 @@
           thread_local_values
               .insert(std::make_pair(
                   thread_local_instance,
-                  linked_ptr<ThreadLocalValueHolderBase>(
+                  std::shared_ptr<ThreadLocalValueHolderBase>(
                       thread_local_instance->NewValueForCurrentThread())))
               .first;
     }
@@ -424,19 +538,18 @@
   }
 
   static void OnThreadLocalDestroyed(
-      const ThreadLocalBase* thread_local_instance) {
-    std::vector<linked_ptr<ThreadLocalValueHolderBase> > value_holders;
+      const ThreadLocalBase *thread_local_instance) {
+    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
     // Clean up the ThreadLocalValues data structure while holding the lock, but
     // defer the destruction of the ThreadLocalValueHolderBases.
     {
       MutexLock lock(&mutex_);
-      ThreadIdToThreadLocals* const thread_to_thread_locals =
+      ThreadIdToThreadLocals *const thread_to_thread_locals =
           GetThreadLocalsMapLocked();
       for (ThreadIdToThreadLocals::iterator it =
-          thread_to_thread_locals->begin();
-          it != thread_to_thread_locals->end();
-          ++it) {
-        ThreadLocalValues& thread_local_values = it->second;
+               thread_to_thread_locals->begin();
+           it != thread_to_thread_locals->end(); ++it) {
+        ThreadLocalValues &thread_local_values = it->second;
         ThreadLocalValues::iterator value_pos =
             thread_local_values.find(thread_local_instance);
         if (value_pos != thread_local_values.end()) {
@@ -453,21 +566,20 @@
 
   static void OnThreadExit(DWORD thread_id) {
     GTEST_CHECK_(thread_id != 0) << ::GetLastError();
-    std::vector<linked_ptr<ThreadLocalValueHolderBase> > value_holders;
+    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
     // Clean up the ThreadIdToThreadLocals data structure while holding the
     // lock, but defer the destruction of the ThreadLocalValueHolderBases.
     {
       MutexLock lock(&mutex_);
-      ThreadIdToThreadLocals* const thread_to_thread_locals =
+      ThreadIdToThreadLocals *const thread_to_thread_locals =
           GetThreadLocalsMapLocked();
       ThreadIdToThreadLocals::iterator thread_local_pos =
           thread_to_thread_locals->find(thread_id);
       if (thread_local_pos != thread_to_thread_locals->end()) {
-        ThreadLocalValues& thread_local_values = thread_local_pos->second;
+        ThreadLocalValues &thread_local_values = thread_local_pos->second;
         for (ThreadLocalValues::iterator value_pos =
-            thread_local_values.begin();
-            value_pos != thread_local_values.end();
-            ++value_pos) {
+                 thread_local_values.begin();
+             value_pos != thread_local_values.end(); ++value_pos) {
           value_holders.push_back(value_pos->second);
         }
         thread_to_thread_locals->erase(thread_local_pos);
@@ -479,8 +591,9 @@
 
  private:
   // In a particular thread, maps a ThreadLocal object to its value.
-  typedef std::map<const ThreadLocalBase*,
-                   linked_ptr<ThreadLocalValueHolderBase> > ThreadLocalValues;
+  typedef std::map<const ThreadLocalBase *,
+                   std::shared_ptr<ThreadLocalValueHolderBase> >
+      ThreadLocalValues;
   // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
   // thread's ID.
   typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
@@ -492,21 +605,19 @@
   static void StartWatcherThreadFor(DWORD thread_id) {
     // The returned handle will be kept in thread_map and closed by
     // watcher_thread in WatcherThreadFunc.
-    HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
-                                 FALSE,
-                                 thread_id);
-    GTEST_CHECK_(thread != NULL);
-    // We need to to pass a valid thread ID pointer into CreateThread for it
+    HANDLE thread =
+        ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
+    GTEST_CHECK_(thread != nullptr);
+    // We need to pass a valid thread ID pointer into CreateThread for it
     // to work correctly under Win98.
     DWORD watcher_thread_id;
     HANDLE watcher_thread = ::CreateThread(
-        NULL,   // Default security.
-        0,      // Default stack size
+        nullptr,  // Default security.
+        0,        // Default stack size
         &ThreadLocalRegistryImpl::WatcherThreadFunc,
         reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
-        CREATE_SUSPENDED,
-        &watcher_thread_id);
-    GTEST_CHECK_(watcher_thread != NULL);
+        CREATE_SUSPENDED, &watcher_thread_id);
+    GTEST_CHECK_(watcher_thread != nullptr);
     // Give the watcher thread the same priority as ours to avoid being
     // blocked by it.
     ::SetThreadPriority(watcher_thread,
@@ -518,10 +629,9 @@
   // Monitors exit from a given thread and notifies those
   // ThreadIdToThreadLocals about thread termination.
   static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
-    const ThreadIdAndHandle* tah =
-        reinterpret_cast<const ThreadIdAndHandle*>(param);
-    GTEST_CHECK_(
-        ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+    const ThreadIdAndHandle *tah =
+        reinterpret_cast<const ThreadIdAndHandle *>(param);
+    GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
     OnThreadExit(tah->first);
     ::CloseHandle(tah->second);
     delete tah;
@@ -529,9 +639,12 @@
   }
 
   // Returns map of thread local instances.
-  static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() {
+  static ThreadIdToThreadLocals *GetThreadLocalsMapLocked() {
     mutex_.AssertHeld();
-    static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals;
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+    static ThreadIdToThreadLocals *map = new ThreadIdToThreadLocals();
     return map;
   }
 
@@ -544,14 +657,14 @@
 Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
 Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
 
-ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
-      const ThreadLocalBase* thread_local_instance) {
+ThreadLocalValueHolderBase *ThreadLocalRegistry::GetValueOnCurrentThread(
+    const ThreadLocalBase *thread_local_instance) {
   return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
       thread_local_instance);
 }
 
 void ThreadLocalRegistry::OnThreadLocalDestroyed(
-      const ThreadLocalBase* thread_local_instance) {
+    const ThreadLocalBase *thread_local_instance) {
   ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
 }
 
@@ -570,20 +683,20 @@
     regfree(&partial_regex_);
     regfree(&full_regex_);
   }
-  free(const_cast<char*>(pattern_));
+  free(const_cast<char *>(pattern_));
 }
 
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
+// Returns true if and only if regular expression re matches the entire str.
+bool RE::FullMatch(const char *str, const RE &re) {
   if (!re.is_valid_) return false;
 
   regmatch_t match;
   return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
 }
 
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself).
+bool RE::PartialMatch(const char *str, const RE &re) {
   if (!re.is_valid_) return false;
 
   regmatch_t match;
@@ -591,13 +704,13 @@
 }
 
 // Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
+void RE::Init(const char *regex) {
   pattern_ = posix::StrDup(regex);
 
   // Reserves enough bytes to hold the regular expression used for a
   // full match.
   const size_t full_regex_len = strlen(regex) + 10;
-  char* const full_pattern = new char[full_regex_len];
+  char *const full_pattern = new char[full_regex_len];
 
   snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
   is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
@@ -610,7 +723,7 @@
   // versions of Cygwin) doesn't accept the empty string as a valid
   // regex.  We change it to an equivalent form "()" to be safe.
   if (is_valid_) {
-    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    const char *const partial_regex = (*regex == '\0') ? "()" : regex;
     is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
   }
   EXPECT_TRUE(is_valid_)
@@ -622,14 +735,14 @@
 
 #elif GTEST_USES_SIMPLE_RE
 
-// Returns true iff ch appears anywhere in str (excluding the
+// Returns true if and only if ch appears anywhere in str (excluding the
 // terminating '\0' character).
-bool IsInSet(char ch, const char* str) {
-  return ch != '\0' && strchr(str, ch) != NULL;
+bool IsInSet(char ch, const char *str) {
+  return ch != '\0' && strchr(str, ch) != nullptr;
 }
 
-// Returns true iff ch belongs to the given classification.  Unlike
-// similar functions in <ctype.h>, these aren't affected by the
+// Returns true if and only if ch belongs to the given classification.
+// Unlike similar functions in <ctype.h>, these aren't affected by the
 // current locale.
 bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
 bool IsAsciiPunct(char ch) {
@@ -639,16 +752,16 @@
 bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
 bool IsAsciiWordChar(char ch) {
   return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
-      ('0' <= ch && ch <= '9') || ch == '_';
+         ('0' <= ch && ch <= '9') || ch == '_';
 }
 
-// Returns true iff "\\c" is a supported escape sequence.
+// Returns true if and only if "\\c" is a supported escape sequence.
 bool IsValidEscape(char c) {
   return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
 }
 
-// Returns true iff the given atom (specified by escaped and pattern)
-// matches ch.  The result is undefined if the atom is invalid.
+// Returns true if and only if the given atom (specified by escaped and
+// pattern) matches ch.  The result is undefined if the atom is invalid.
 bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
   if (escaped) {  // "\\p" where p is pattern_char.
     switch (pattern_char) {
@@ -671,25 +784,23 @@
 }
 
 // Helper function used by ValidateRegex() to format error messages.
-std::string FormatRegexSyntaxError(const char* regex, int index) {
+static std::string FormatRegexSyntaxError(const char *regex, int index) {
   return (Message() << "Syntax error at index " << index
-          << " in simple regular expression \"" << regex << "\": ").GetString();
+                    << " in simple regular expression \"" << regex << "\": ")
+      .GetString();
 }
 
 // Generates non-fatal failures and returns false if regex is invalid;
 // otherwise returns true.
-bool ValidateRegex(const char* regex) {
-  if (regex == NULL) {
-    // TODO(wan@google.com): fix the source file location in the
-    // assertion failures to match where the regex is used in user
-    // code.
+bool ValidateRegex(const char *regex) {
+  if (regex == nullptr) {
     ADD_FAILURE() << "NULL is not a valid simple regular expression.";
     return false;
   }
 
   bool is_valid = true;
 
-  // True iff ?, *, or + can follow the previous atom.
+  // True if and only if ?, *, or + can follow the previous atom.
   bool prev_repeatable = false;
   for (int i = 0; regex[i]; i++) {
     if (regex[i] == '\\') {  // An escape sequence
@@ -718,12 +829,12 @@
                       << "'$' can only appear at the end.";
         is_valid = false;
       } else if (IsInSet(ch, "()[]{}|")) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' is unsupported.";
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' is unsupported.";
         is_valid = false;
       } else if (IsRepeat(ch) && !prev_repeatable) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' can only follow a repeatable token.";
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' can only follow a repeatable token.";
         is_valid = false;
       }
 
@@ -741,12 +852,10 @@
 // characters to be indexable by size_t, in which case the test will
 // probably time out anyway.  We are fine with this limitation as
 // std::string has it too.
-bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char c, char repeat, const char* regex,
-    const char* str) {
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+                                   const char *regex, const char *str) {
   const size_t min_count = (repeat == '+') ? 1 : 0;
-  const size_t max_count = (repeat == '?') ? 1 :
-      static_cast<size_t>(-1) - 1;
+  const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
   // We cannot call numeric_limits::max() as it conflicts with the
   // max() macro on Windows.
 
@@ -759,62 +868,56 @@
       // greedy match.
       return true;
     }
-    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
-      return false;
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
   }
   return false;
 }
 
-// Returns true iff regex matches a prefix of str.  regex must be a
-// valid simple regular expression and not start with "^", or the
+// Returns true if and only if regex matches a prefix of str. regex must
+// be a valid simple regular expression and not start with "^", or the
 // result is undefined.
-bool MatchRegexAtHead(const char* regex, const char* str) {
+bool MatchRegexAtHead(const char *regex, const char *str) {
   if (*regex == '\0')  // An empty regex matches a prefix of anything.
     return true;
 
   // "$" only matches the end of a string.  Note that regex being
   // valid guarantees that there's nothing after "$" in it.
-  if (*regex == '$')
-    return *str == '\0';
+  if (*regex == '$') return *str == '\0';
 
   // Is the first thing in regex an escape sequence?
   const bool escaped = *regex == '\\';
-  if (escaped)
-    ++regex;
+  if (escaped) ++regex;
   if (IsRepeat(regex[1])) {
     // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
     // here's an indirect recursion.  It terminates as the regex gets
     // shorter in each recursion.
-    return MatchRepetitionAndRegexAtHead(
-        escaped, regex[0], regex[1], regex + 2, str);
+    return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
+                                         str);
   } else {
     // regex isn't empty, isn't "$", and doesn't start with a
     // repetition.  We match the first atom of regex with the first
     // character of str and recurse.
     return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
-        MatchRegexAtHead(regex + 1, str + 1);
+           MatchRegexAtHead(regex + 1, str + 1);
   }
 }
 
-// Returns true iff regex matches any substring of str.  regex must be
-// a valid simple regular expression, or the result is undefined.
+// Returns true if and only if regex matches any substring of str.  regex must
+// be a valid simple regular expression, or the result is undefined.
 //
 // The algorithm is recursive, but the recursion depth doesn't exceed
 // the regex length, so we won't need to worry about running out of
 // stack space normally.  In rare cases the time complexity can be
 // exponential with respect to the regex length + the string length,
 // but usually it's must faster (often close to linear).
-bool MatchRegexAnywhere(const char* regex, const char* str) {
-  if (regex == NULL || str == NULL)
-    return false;
+bool MatchRegexAnywhere(const char *regex, const char *str) {
+  if (regex == nullptr || str == nullptr) return false;
 
-  if (*regex == '^')
-    return MatchRegexAtHead(regex + 1, str);
+  if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
 
   // A successful match can be anywhere in str.
   do {
-    if (MatchRegexAtHead(regex, str))
-      return true;
+    if (MatchRegexAtHead(regex, str)) return true;
   } while (*str++ != '\0');
   return false;
 }
@@ -822,25 +925,25 @@
 // Implements the RE class.
 
 RE::~RE() {
-  free(const_cast<char*>(pattern_));
-  free(const_cast<char*>(full_pattern_));
+  free(const_cast<char *>(pattern_));
+  free(const_cast<char *>(full_pattern_));
 }
 
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
+// Returns true if and only if regular expression re matches the entire str.
+bool RE::FullMatch(const char *str, const RE &re) {
   return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
 }
 
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself).
+bool RE::PartialMatch(const char *str, const RE &re) {
   return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
 }
 
 // Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = full_pattern_ = NULL;
-  if (regex != NULL) {
+void RE::Init(const char *regex) {
+  pattern_ = full_pattern_ = nullptr;
+  if (regex != nullptr) {
     pattern_ = posix::StrDup(regex);
   }
 
@@ -854,7 +957,7 @@
   // Reserves enough bytes to hold the regular expression used for a
   // full match: we need space to prepend a '^', append a '$', and
   // terminate the string with '\0'.
-  char* buffer = static_cast<char*>(malloc(len + 3));
+  char *buffer = static_cast<char *>(malloc(len + 3));
   full_pattern_ = buffer;
 
   if (*regex != '^')
@@ -877,8 +980,8 @@
 
 // Formats a source file path and a line number as they would appear
 // in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
-  const std::string file_name(file == NULL ? kUnknownFile : file);
+GTEST_API_ ::std::string FormatFileLocation(const char *file, int line) {
+  const std::string file_name(file == nullptr ? kUnknownFile : file);
 
   if (line < 0) {
     return file_name + ":";
@@ -895,9 +998,9 @@
 // FormatFileLocation in order to contrast the two functions.
 // Note that FormatCompilerIndependentFileLocation() does NOT append colon
 // to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
-    const char* file, int line) {
-  const std::string file_name(file == NULL ? kUnknownFile : file);
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
+                                                               int line) {
+  const std::string file_name(file == nullptr ? kUnknownFile : file);
 
   if (line < 0)
     return file_name;
@@ -905,14 +1008,17 @@
     return file_name + ":" + StreamableToString(line);
 }
 
-GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+GTestLog::GTestLog(GTestLogSeverity severity, const char *file, int line)
     : severity_(severity) {
-  const char* const marker =
-      severity == GTEST_INFO ?    "[  INFO ]" :
-      severity == GTEST_WARNING ? "[WARNING]" :
-      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
-  GetStream() << ::std::endl << marker << " "
-              << FormatFileLocation(file, line).c_str() << ": ";
+  const char *const marker =
+      severity == GTEST_INFO
+          ? "[  INFO ]"
+          : severity == GTEST_WARNING
+                ? "[WARNING]"
+                : severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]";
+  GetStream() << ::std::endl
+              << marker << " " << FormatFileLocation(file, line).c_str()
+              << ": ";
 }
 
 // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
@@ -923,9 +1029,10 @@
     posix::Abort();
   }
 }
+
 // Disable Microsoft deprecation warnings for POSIX functions called from
 // this class (creat, dup, dup2, and close)
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
 
 #if GTEST_HAS_STREAM_REDIRECTION
 
@@ -934,27 +1041,26 @@
  public:
   // The ctor redirects the stream to a temporary file.
   explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-# if GTEST_OS_WINDOWS
-    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+#if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };   // NOLINT
     char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
 
     ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
-    const UINT success = ::GetTempFileNameA(temp_dir_path,
-                                            "gtest_redir",
+    const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
                                             0,  // Generate unique file name.
                                             temp_file_path);
     GTEST_CHECK_(success != 0)
         << "Unable to create a temporary file in " << temp_dir_path;
     const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
-    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
-                                    << temp_file_path;
+    GTEST_CHECK_(captured_fd != -1)
+        << "Unable to open temporary file " << temp_file_path;
     filename_ = temp_file_path;
-# else
+#else
     // There's no guarantee that a test has write access to the current
     // directory, so we create the temporary file in the /tmp directory
     // instead. We use /tmp on most systems, and /sdcard on Android.
     // That's because Android doesn't have /tmp.
-#  if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
     // Note: Android applications are expected to call the framework's
     // Context.getExternalStorageDirectory() method through JNI to get
     // the location of the world-writable SD Card directory. However,
@@ -963,38 +1069,42 @@
     // code as part of a regular standalone executable, which doesn't
     // run in a Dalvik process (e.g. when running it through 'adb shell').
     //
-    // The location /sdcard is directly accessible from native code
-    // and is the only location (unofficially) supported by the Android
-    // team. It's generally a symlink to the real SD Card mount point
-    // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or
-    // other OEM-customized locations. Never rely on these, and always
-    // use /sdcard.
-    char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
-#  else
+    // The location /data/local/tmp is directly accessible from native code.
+    // '/sdcard' and other variants cannot be relied on, as they are not
+    // guaranteed to be mounted, or may have a delay in mounting.
+    char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX";
+#else
     char name_template[] = "/tmp/captured_stream.XXXXXX";
-#  endif  // GTEST_OS_LINUX_ANDROID
+#endif  // GTEST_OS_LINUX_ANDROID
     const int captured_fd = mkstemp(name_template);
+    if (captured_fd == -1) {
+      GTEST_LOG_(WARNING)
+          << "Failed to create tmp file " << name_template
+          << " for test; does the test have access to the /tmp directory?";
+    }
     filename_ = name_template;
-# endif  // GTEST_OS_WINDOWS
-    fflush(NULL);
+#endif  // GTEST_OS_WINDOWS
+    fflush(nullptr);
     dup2(captured_fd, fd_);
     close(captured_fd);
   }
 
-  ~CapturedStream() {
-    remove(filename_.c_str());
-  }
+  ~CapturedStream() { remove(filename_.c_str()); }
 
   std::string GetCapturedString() {
     if (uncaptured_fd_ != -1) {
       // Restores the original stream.
-      fflush(NULL);
+      fflush(nullptr);
       dup2(uncaptured_fd_, fd_);
       close(uncaptured_fd_);
       uncaptured_fd_ = -1;
     }
 
-    FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    FILE *const file = posix::FOpen(filename_.c_str(), "r");
+    if (file == nullptr) {
+      GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_
+                        << " for capturing stream.";
+    }
     const std::string content = ReadEntireFile(file);
     posix::FClose(file);
     return content;
@@ -1009,14 +1119,15 @@
   GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
 };
 
-GTEST_DISABLE_MSC_WARNINGS_POP_()
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
 
-static CapturedStream* g_captured_stderr = NULL;
-static CapturedStream* g_captured_stdout = NULL;
+static CapturedStream *g_captured_stderr = nullptr;
+static CapturedStream *g_captured_stdout = nullptr;
 
 // Starts capturing an output stream (stdout/stderr).
-void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
-  if (*stream != NULL) {
+static void CaptureStream(int fd, const char *stream_name,
+                          CapturedStream **stream) {
+  if (*stream != nullptr) {
     GTEST_LOG_(FATAL) << "Only one " << stream_name
                       << " capturer can exist at a time.";
   }
@@ -1024,11 +1135,11 @@
 }
 
 // Stops capturing the output stream and returns the captured string.
-std::string GetCapturedStream(CapturedStream** captured_stream) {
+static std::string GetCapturedStream(CapturedStream **captured_stream) {
   const std::string content = (*captured_stream)->GetCapturedString();
 
   delete *captured_stream;
-  *captured_stream = NULL;
+  *captured_stream = nullptr;
 
   return content;
 }
@@ -1055,32 +1166,14 @@
 
 #endif  // GTEST_HAS_STREAM_REDIRECTION
 
-std::string TempDir() {
-#if GTEST_OS_WINDOWS_MOBILE
-  return "\\temp\\";
-#elif GTEST_OS_WINDOWS
-  const char* temp_dir = posix::GetEnv("TEMP");
-  if (temp_dir == NULL || temp_dir[0] == '\0')
-    return "\\temp\\";
-  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
-    return temp_dir;
-  else
-    return std::string(temp_dir) + "\\";
-#elif GTEST_OS_LINUX_ANDROID
-  return "/sdcard/";
-#else
-  return "/tmp/";
-#endif  // GTEST_OS_WINDOWS_MOBILE
-}
-
-size_t GetFileSize(FILE* file) {
+size_t GetFileSize(FILE *file) {
   fseek(file, 0, SEEK_END);
   return static_cast<size_t>(ftell(file));
 }
 
-std::string ReadEntireFile(FILE* file) {
+std::string ReadEntireFile(FILE *file) {
   const size_t file_size = GetFileSize(file);
-  char* const buffer = new char[file_size];
+  char *const buffer = new char[file_size];
 
   size_t bytes_last_read = 0;  // # of bytes read in the last fread()
   size_t bytes_read = 0;       // # of bytes read so far
@@ -1090,7 +1183,8 @@
   // Keeps reading the file until we cannot read further or the
   // pre-determined file size is reached.
   do {
-    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+    bytes_last_read =
+        fread(buffer + bytes_read, 1, file_size - bytes_read, file);
     bytes_read += bytes_last_read;
   } while (bytes_last_read > 0 && bytes_read < file_size);
 
@@ -1101,22 +1195,30 @@
 }
 
 #if GTEST_HAS_DEATH_TEST
+static const std::vector<std::string> *g_injected_test_argvs =
+    nullptr;  // Owned.
 
-static const ::std::vector<testing::internal::string>* g_injected_test_argvs =
-                                        NULL;  // Owned.
-
-void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) {
-  if (g_injected_test_argvs != argvs)
-    delete g_injected_test_argvs;
-  g_injected_test_argvs = argvs;
-}
-
-const ::std::vector<testing::internal::string>& GetInjectableArgvs() {
-  if (g_injected_test_argvs != NULL) {
+std::vector<std::string> GetInjectableArgvs() {
+  if (g_injected_test_argvs != nullptr) {
     return *g_injected_test_argvs;
   }
   return GetArgvs();
 }
+
+void SetInjectableArgvs(const std::vector<std::string> *new_argvs) {
+  if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs;
+  g_injected_test_argvs = new_argvs;
+}
+
+void SetInjectableArgvs(const std::vector<std::string> &new_argvs) {
+  SetInjectableArgvs(
+      new std::vector<std::string>(new_argvs.begin(), new_argvs.end()));
+}
+
+void ClearInjectableArgvs() {
+  delete g_injected_test_argvs;
+  g_injected_test_argvs = nullptr;
+}
 #endif  // GTEST_HAS_DEATH_TEST
 
 #if GTEST_OS_WINDOWS_MOBILE
@@ -1131,7 +1233,7 @@
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "GTEST_FOO" in the open-source version.
-static std::string FlagToEnvVar(const char* flag) {
+static std::string FlagToEnvVar(const char *flag) {
   const std::string full_flag =
       (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
 
@@ -1146,9 +1248,9 @@
 // Parses 'str' for a 32-bit signed integer.  If successful, writes
 // the result to *value and returns true; otherwise leaves *value
 // unchanged and returns false.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+bool ParseInt32(const Message &src_text, const char *str, int32_t *value) {
   // Parses the environment variable as a decimal integer.
-  char* end = NULL;
+  char *end = nullptr;
   const long long_value = strtol(str, &end, 10);  // NOLINT
 
   // Has strtol() consumed all characters in the string?
@@ -1163,14 +1265,14 @@
     return false;
   }
 
-  // Is the parsed value in the range of an Int32?
-  const Int32 result = static_cast<Int32>(long_value);
+  // Is the parsed value in the range of an int32_t?
+  const auto result = static_cast<int32_t>(long_value);
   if (long_value == LONG_MAX || long_value == LONG_MIN ||
       // The parsed value overflows as a long.  (strtol() returns
       // LONG_MAX or LONG_MIN when the input overflows.)
       result != long_value
-      // The parsed value overflows as an Int32.
-      ) {
+      // The parsed value overflows as an int32_t.
+  ) {
     Message msg;
     msg << "WARNING: " << src_text
         << " is expected to be a 32-bit integer, but actually"
@@ -1187,34 +1289,35 @@
 // Reads and returns the Boolean environment variable corresponding to
 // the given flag; if it's not set, returns default_value.
 //
-// The value is considered true iff it's not "0".
-bool BoolFromGTestEnv(const char* flag, bool default_value) {
+// The value is considered true if and only if it's not "0".
+bool BoolFromGTestEnv(const char *flag, bool default_value) {
 #if defined(GTEST_GET_BOOL_FROM_ENV_)
   return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
-#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
+#else
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = posix::GetEnv(env_var.c_str());
-  return string_value == NULL ?
-      default_value : strcmp(string_value, "0") != 0;
+  const char *const string_value = posix::GetEnv(env_var.c_str());
+  return string_value == nullptr ? default_value
+                                 : strcmp(string_value, "0") != 0;
+#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
 }
 
 // Reads and returns a 32-bit integer stored in the environment
 // variable corresponding to the given flag; if it isn't set or
 // doesn't represent a valid 32-bit integer, returns default_value.
-Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+int32_t Int32FromGTestEnv(const char *flag, int32_t default_value) {
 #if defined(GTEST_GET_INT32_FROM_ENV_)
   return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
-#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
+#else
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = posix::GetEnv(env_var.c_str());
-  if (string_value == NULL) {
+  const char *const string_value = posix::GetEnv(env_var.c_str());
+  if (string_value == nullptr) {
     // The environment variable is not set.
     return default_value;
   }
 
-  Int32 result = default_value;
-  if (!ParseInt32(Message() << "Environment variable " << env_var,
-                  string_value, &result)) {
+  int32_t result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
+                  &result)) {
     printf("The default value %s is used.\n",
            (Message() << default_value).GetString().c_str());
     fflush(stdout);
@@ -1222,37 +1325,36 @@
   }
 
   return result;
+#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
+}
+
+// As a special case for the 'output' flag, if GTEST_OUTPUT is not
+// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
+// system.  The value of XML_OUTPUT_FILE is a filename without the
+// "xml:" prefix of GTEST_OUTPUT.
+// Note that this is meant to be called at the call site so it does
+// not check that the flag is 'output'
+// In essence this checks an env variable called XML_OUTPUT_FILE
+// and if it is set we prepend "xml:" to its value, if it not set we return ""
+std::string OutputFlagAlsoCheckEnvVar() {
+  std::string default_value_for_output_flag = "";
+  const char *xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
+  if (nullptr != xml_output_file_env) {
+    default_value_for_output_flag = std::string("xml:") + xml_output_file_env;
+  }
+  return default_value_for_output_flag;
 }
 
 // Reads and returns the string environment variable corresponding to
 // the given flag; if it's not set, returns default_value.
-std::string StringFromGTestEnv(const char* flag, const char* default_value) {
+const char *StringFromGTestEnv(const char *flag, const char *default_value) {
 #if defined(GTEST_GET_STRING_FROM_ENV_)
   return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
-#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
+#else
   const std::string env_var = FlagToEnvVar(flag);
-  const char* value = posix::GetEnv(env_var.c_str());
-  if (value != NULL) {
-    return value;
-  }
-
-  // As a special case for the 'output' flag, if GTEST_OUTPUT is not
-  // set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
-  // system.  The value of XML_OUTPUT_FILE is a filename without the
-  // "xml:" prefix of GTEST_OUTPUT.
-  //
-  // The net priority order after flag processing is thus:
-  //   --gtest_output command line flag
-  //   GTEST_OUTPUT environment variable
-  //   XML_OUTPUT_FILE environment variable
-  //   'default_value'
-  if (strcmp(flag, "output") == 0) {
-    value = posix::GetEnv("XML_OUTPUT_FILE");
-    if (value != NULL) {
-      return std::string("xml:") + value;
-    }
-  }
-  return default_value;
+  const char *const value = posix::GetEnv(env_var.c_str());
+  return value == nullptr ? default_value : value;
+#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
 }
 
 }  // namespace internal
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-printers.cc b/libaom/third_party/googletest/src/googletest/src/gtest-printers.cc
index a2df412..8399386 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest-printers.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-printers.cc
@@ -26,10 +26,8 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
 
-// Google Test - The Google C++ Testing Framework
+// Google Test - The Google C++ Testing and Mocking Framework
 //
 // This file implements a universal value printer that can print a
 // value of any type T:
@@ -43,12 +41,13 @@
 // defines Foo.
 
 #include "gtest/gtest-printers.h"
-#include <ctype.h>
 #include <stdio.h>
+#include <cctype>
 #include <cwchar>
 #include <ostream>  // NOLINT
 #include <string>
 #include "gtest/internal/gtest-port.h"
+#include "src/gtest-internal-inl.h"
 
 namespace testing {
 
@@ -59,9 +58,10 @@
 // Prints a segment of bytes in the given object.
 GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
 GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
-                                size_t count, ostream* os) {
+void PrintByteSegmentInObjectTo(const unsigned char *obj_bytes, size_t start,
+                                size_t count, ostream *os) {
   char text[5] = "";
   for (size_t i = 0; i != count; i++) {
     const size_t j = start + i;
@@ -79,8 +79,8 @@
 }
 
 // Prints the bytes in the given value to the given ostream.
-void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
-                              ostream* os) {
+void PrintBytesInObjectToImpl(const unsigned char *obj_bytes, size_t count,
+                              ostream *os) {
   // Tells the user how big the object is.
   *os << count << "-byte object <";
 
@@ -89,14 +89,13 @@
   // If the object size is bigger than kThreshold, we'll have to omit
   // some details by printing only the first and the last kChunkSize
   // bytes.
-  // TODO(wan): let the user control the threshold using a flag.
   if (count < kThreshold) {
     PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
   } else {
     PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
     *os << " ... ";
     // Rounds up to 2-byte boundary.
-    const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+    const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
     PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
   }
   *os << ">";
@@ -111,8 +110,8 @@
 // uses the << operator and thus is easier done outside of the
 // ::testing::internal namespace, which contains a << operator that
 // sometimes conflicts with the one in STL.
-void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
-                          ostream* os) {
+void PrintBytesInObjectTo(const unsigned char *obj_bytes, size_t count,
+                          ostream *os) {
   PrintBytesInObjectToImpl(obj_bytes, count, os);
 }
 
@@ -123,64 +122,42 @@
 // Depending on the value of a char (or wchar_t), we print it in one
 // of three formats:
 //   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
-//   - as a hexidecimal escape sequence (e.g. '\x7F'), or
+//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
 //   - as a special escape sequence (e.g. '\r', '\n').
-enum CharFormat {
-  kAsIs,
-  kHexEscape,
-  kSpecialEscape
-};
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
 
 // Returns true if c is a printable ASCII character.  We test the
 // value of c directly instead of calling isprint(), which is buggy on
 // Windows Mobile.
-inline bool IsPrintableAscii(wchar_t c) {
-  return 0x20 <= c && c <= 0x7E;
-}
+inline bool IsPrintableAscii(wchar_t c) { return 0x20 <= c && c <= 0x7E; }
 
 // Prints a wide or narrow char c as a character literal without the
 // quotes, escaping it when necessary; returns how c was formatted.
 // The template argument UnsignedChar is the unsigned version of Char,
 // which is the type of c.
 template <typename UnsignedChar, typename Char>
-static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
-  switch (static_cast<wchar_t>(c)) {
-    case L'\0':
-      *os << "\\0";
-      break;
-    case L'\'':
-      *os << "\\'";
-      break;
-    case L'\\':
-      *os << "\\\\";
-      break;
-    case L'\a':
-      *os << "\\a";
-      break;
-    case L'\b':
-      *os << "\\b";
-      break;
-    case L'\f':
-      *os << "\\f";
-      break;
-    case L'\n':
-      *os << "\\n";
-      break;
-    case L'\r':
-      *os << "\\r";
-      break;
-    case L'\t':
-      *os << "\\t";
-      break;
-    case L'\v':
-      *os << "\\v";
-      break;
+static CharFormat PrintAsCharLiteralTo(Char c, ostream *os) {
+  wchar_t w_c = static_cast<wchar_t>(c);
+  switch (w_c) {
+    case L'\0': *os << "\\0"; break;
+    case L'\'': *os << "\\'"; break;
+    case L'\\': *os << "\\\\"; break;
+    case L'\a': *os << "\\a"; break;
+    case L'\b': *os << "\\b"; break;
+    case L'\f': *os << "\\f"; break;
+    case L'\n': *os << "\\n"; break;
+    case L'\r': *os << "\\r"; break;
+    case L'\t': *os << "\\t"; break;
+    case L'\v': *os << "\\v"; break;
     default:
-      if (IsPrintableAscii(c)) {
+      if (IsPrintableAscii(w_c)) {
         *os << static_cast<char>(c);
         return kAsIs;
       } else {
-        *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
+        ostream::fmtflags flags = os->flags();
+        *os << "\\x" << std::hex << std::uppercase
+            << static_cast<int>(static_cast<UnsignedChar>(c));
+        os->flags(flags);
         return kHexEscape;
       }
   }
@@ -189,22 +166,17 @@
 
 // Prints a wchar_t c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream *os) {
   switch (c) {
-    case L'\'':
-      *os << "'";
-      return kAsIs;
-    case L'"':
-      *os << "\\\"";
-      return kSpecialEscape;
-    default:
-      return PrintAsCharLiteralTo<wchar_t>(c, os);
+    case L'\'': *os << "'"; return kAsIs;
+    case L'"': *os << "\\\""; return kSpecialEscape;
+    default: return PrintAsCharLiteralTo<wchar_t>(c, os);
   }
 }
 
 // Prints a char c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+static CharFormat PrintAsStringLiteralTo(char c, ostream *os) {
   return PrintAsStringLiteralTo(
       static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
 }
@@ -214,7 +186,7 @@
 // using the standard C++ escape sequence.  The template argument
 // UnsignedChar is the unsigned version of Char, which is the type of c.
 template <typename UnsignedChar, typename Char>
-void PrintCharAndCodeTo(Char c, ostream* os) {
+void PrintCharAndCodeTo(Char c, ostream *os) {
   // First, print c as a literal in the most readable form we can find.
   *os << ((sizeof(c) > 1) ? "L'" : "'");
   const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
@@ -223,47 +195,44 @@
   // To aid user debugging, we also print c's code in decimal, unless
   // it's 0 (in which case c was printed as '\\0', making the code
   // obvious).
-  if (c == 0)
-    return;
+  if (c == 0) return;
   *os << " (" << static_cast<int>(c);
 
-  // For more convenience, we print c's code again in hexidecimal,
+  // For more convenience, we print c's code again in hexadecimal,
   // unless c was already printed in the form '\x##' or the code is in
   // [1, 9].
   if (format == kHexEscape || (1 <= c && c <= 9)) {
     // Do nothing.
   } else {
-    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+    *os << ", 0x" << String::FormatHexInt(static_cast<int>(c));
   }
   *os << ")";
 }
 
-void PrintTo(unsigned char c, ::std::ostream* os) {
+void PrintTo(unsigned char c, ::std::ostream *os) {
   PrintCharAndCodeTo<unsigned char>(c, os);
 }
-void PrintTo(signed char c, ::std::ostream* os) {
+void PrintTo(signed char c, ::std::ostream *os) {
   PrintCharAndCodeTo<unsigned char>(c, os);
 }
 
 // Prints a wchar_t as a symbol if it is printable or as its internal
 // code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
-void PrintTo(wchar_t wc, ostream* os) {
-  PrintCharAndCodeTo<wchar_t>(wc, os);
-}
+void PrintTo(wchar_t wc, ostream *os) { PrintCharAndCodeTo<wchar_t>(wc, os); }
 
 // Prints the given array of characters to the ostream.  CharType must be either
 // char or wchar_t.
 // The array starts at begin, the length is len, it may include '\0' characters
 // and may not be NUL-terminated.
 template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static void PrintCharsAsStringTo(
-    const CharType* begin, size_t len, ostream* os) {
-  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+    GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+        GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat
+        PrintCharsAsStringTo(const CharType *begin, size_t len, ostream *os) {
+  const char *const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
   *os << kQuoteBegin;
   bool is_previous_hex = false;
+  CharFormat print_format = kAsIs;
   for (size_t index = 0; index < len; ++index) {
     const CharType cur = begin[index];
     if (is_previous_hex && IsXDigit(cur)) {
@@ -273,18 +242,23 @@
       *os << "\" " << kQuoteBegin;
     }
     is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
+    // Remember if any characters required hex escaping.
+    if (is_previous_hex) {
+      print_format = kHexEscape;
+    }
   }
   *os << "\"";
+  return print_format;
 }
 
 // Prints a (const) char/wchar_t array of 'len' elements, starting at address
 // 'begin'.  CharType must be either char or wchar_t.
 template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static void UniversalPrintCharArray(
-    const CharType* begin, size_t len, ostream* os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+    GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+        GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void
+        UniversalPrintCharArray(const CharType *begin, size_t len,
+                                ostream *os) {
   // The code
   //   const char kFoo[] = "foo";
   // generates an array of 4, not 3, elements, with the last one being '\0'.
@@ -306,22 +280,22 @@
 }
 
 // Prints a (const) char array of 'len' elements, starting at address 'begin'.
-void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+void UniversalPrintArray(const char *begin, size_t len, ostream *os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
 // Prints a (const) wchar_t array of 'len' elements, starting at address
 // 'begin'.
-void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+void UniversalPrintArray(const wchar_t *begin, size_t len, ostream *os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
 // Prints the given C string to the ostream.
-void PrintTo(const char* s, ostream* os) {
-  if (s == NULL) {
+void PrintTo(const char *s, ostream *os) {
+  if (s == nullptr) {
     *os << "NULL";
   } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    *os << ImplicitCast_<const void *>(s) << " pointing to ";
     PrintCharsAsStringTo(s, strlen(s), os);
   }
 }
@@ -334,36 +308,89 @@
 // wchar_t is implemented as a native type.
 #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
 // Prints the given wide C string to the ostream.
-void PrintTo(const wchar_t* s, ostream* os) {
-  if (s == NULL) {
+void PrintTo(const wchar_t *s, ostream *os) {
+  if (s == nullptr) {
     *os << "NULL";
   } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, std::wcslen(s), os);
+    *os << ImplicitCast_<const void *>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, wcslen(s), os);
   }
 }
 #endif  // wchar_t is native
 
-// Prints a ::string object.
-#if GTEST_HAS_GLOBAL_STRING
-void PrintStringTo(const ::string& s, ostream* os) {
-  PrintCharsAsStringTo(s.data(), s.size(), os);
-}
-#endif  // GTEST_HAS_GLOBAL_STRING
+namespace {
 
-void PrintStringTo(const ::std::string& s, ostream* os) {
-  PrintCharsAsStringTo(s.data(), s.size(), os);
+bool ContainsUnprintableControlCodes(const char *str, size_t length) {
+  const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+
+  for (size_t i = 0; i < length; i++) {
+    unsigned char ch = *s++;
+    if (std::iscntrl(ch)) {
+      switch (ch) {
+        case '\t':
+        case '\n':
+        case '\r': break;
+        default: return true;
+      }
+    }
+  }
+  return false;
 }
 
-// Prints a ::wstring object.
-#if GTEST_HAS_GLOBAL_WSTRING
-void PrintWideStringTo(const ::wstring& s, ostream* os) {
-  PrintCharsAsStringTo(s.data(), s.size(), os);
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; }
+
+bool IsValidUTF8(const char *str, size_t length) {
+  const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+
+  for (size_t i = 0; i < length;) {
+    unsigned char lead = s[i++];
+
+    if (lead <= 0x7f) {
+      continue;  // single-byte character (ASCII) 0..7F
+    }
+    if (lead < 0xc2) {
+      return false;  // trail byte or non-shortest form
+    } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
+      ++i;  // 2-byte character
+    } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
+               IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
+               // check for non-shortest form and surrogate
+               (lead != 0xe0 || s[i] >= 0xa0) &&
+               (lead != 0xed || s[i] < 0xa0)) {
+      i += 2;  // 3-byte character
+    } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
+               IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
+               IsUTF8TrailByte(s[i + 2]) &&
+               // check for non-shortest form
+               (lead != 0xf0 || s[i] >= 0x90) &&
+               (lead != 0xf4 || s[i] < 0x90)) {
+      i += 3;  // 4-byte character
+    } else {
+      return false;
+    }
+  }
+  return true;
 }
-#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+void ConditionalPrintAsText(const char *str, size_t length, ostream *os) {
+  if (!ContainsUnprintableControlCodes(str, length) &&
+      IsValidUTF8(str, length)) {
+    *os << "\n    As Text: \"" << str << "\"";
+  }
+}
+
+}  // anonymous namespace
+
+void PrintStringTo(const ::std::string &s, ostream *os) {
+  if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
+    if (GTEST_FLAG(print_utf8)) {
+      ConditionalPrintAsText(s.data(), s.size(), os);
+    }
+  }
+}
 
 #if GTEST_HAS_STD_WSTRING
-void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+void PrintWideStringTo(const ::std::wstring &s, ostream *os) {
   PrintCharsAsStringTo(s.data(), s.size(), os);
 }
 #endif  // GTEST_HAS_STD_WSTRING
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-test-part.cc b/libaom/third_party/googletest/src/googletest/src/gtest-test-part.cc
index fb0e354..44b0e2b 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest-test-part.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-test-part.cc
@@ -26,21 +26,14 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 //
-// Author: mheule@google.com (Markus Heule)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 
 #include "gtest/gtest-test-part.h"
 
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick exists to
-// prevent the accidental inclusion of gtest-internal-inl.h in the
-// user's code.
-#define GTEST_IMPLEMENTATION_ 1
+#include "gtest/internal/gtest-port.h"
 #include "src/gtest-internal-inl.h"
-#undef GTEST_IMPLEMENTATION_
 
 namespace testing {
 
@@ -48,35 +41,40 @@
 
 // Gets the summary of the failure message by omitting the stack trace
 // in it.
-std::string TestPartResult::ExtractSummary(const char* message) {
-  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
-  return stack_trace == NULL ? message :
-      std::string(message, stack_trace);
+std::string TestPartResult::ExtractSummary(const char *message) {
+  const char *const stack_trace = strstr(message, internal::kStackTraceMarker);
+  return stack_trace == nullptr ? message : std::string(message, stack_trace);
 }
 
 // Prints a TestPartResult object.
-std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
-  return os
-      << result.file_name() << ":" << result.line_number() << ": "
-      << (result.type() == TestPartResult::kSuccess ? "Success" :
-          result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
-          "Non-fatal failure") << ":\n"
-      << result.message() << std::endl;
+std::ostream &operator<<(std::ostream &os, const TestPartResult &result) {
+  return os << internal::FormatFileLocation(result.file_name(),
+                                            result.line_number())
+            << " "
+            << (result.type() == TestPartResult::kSuccess
+                    ? "Success"
+                    : result.type() == TestPartResult::kSkip
+                          ? "Skipped"
+                          : result.type() == TestPartResult::kFatalFailure
+                                ? "Fatal failure"
+                                : "Non-fatal failure")
+            << ":\n"
+            << result.message() << std::endl;
 }
 
 // Appends a TestPartResult to the array.
-void TestPartResultArray::Append(const TestPartResult& result) {
+void TestPartResultArray::Append(const TestPartResult &result) {
   array_.push_back(result);
 }
 
 // Returns the TestPartResult at the given index (0-based).
-const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+const TestPartResult &TestPartResultArray::GetTestPartResult(int index) const {
   if (index < 0 || index >= size()) {
     printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
     internal::posix::Abort();
   }
 
-  return array_[index];
+  return array_[static_cast<size_t>(index)];
 }
 
 // Returns the number of TestPartResult objects in the array.
@@ -88,8 +86,8 @@
 
 HasNewFatalFailureHelper::HasNewFatalFailureHelper()
     : has_new_fatal_failure_(false),
-      original_reporter_(GetUnitTestImpl()->
-                         GetTestPartResultReporterForCurrentThread()) {
+      original_reporter_(
+          GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) {
   GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
 }
 
@@ -99,9 +97,8 @@
 }
 
 void HasNewFatalFailureHelper::ReportTestPartResult(
-    const TestPartResult& result) {
-  if (result.fatally_failed())
-    has_new_fatal_failure_ = true;
+    const TestPartResult &result) {
+  if (result.fatally_failed()) has_new_fatal_failure_ = true;
   original_reporter_->ReportTestPartResult(result);
 }
 
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest-typed-test.cc b/libaom/third_party/googletest/src/googletest/src/gtest-typed-test.cc
index df1eef4..04effad 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest-typed-test.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest-typed-test.cc
@@ -26,10 +26,9 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
 
 #include "gtest/gtest-typed-test.h"
+
 #include "gtest/gtest.h"
 
 namespace testing {
@@ -39,16 +38,15 @@
 
 // Skips to the first non-space char in str. Returns an empty string if str
 // contains only whitespace characters.
-static const char* SkipSpaces(const char* str) {
-  while (IsSpace(*str))
-    str++;
+static const char *SkipSpaces(const char *str) {
+  while (IsSpace(*str)) str++;
   return str;
 }
 
-static std::vector<std::string> SplitIntoTestNames(const char* src) {
+static std::vector<std::string> SplitIntoTestNames(const char *src) {
   std::vector<std::string> name_vec;
   src = SkipSpaces(src);
-  for (; src != NULL; src = SkipComma(src)) {
+  for (; src != nullptr; src = SkipComma(src)) {
     name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src)));
   }
   return name_vec;
@@ -57,8 +55,11 @@
 // Verifies that registered_tests match the test names in
 // registered_tests_; returns registered_tests if successful, or
 // aborts the program otherwise.
-const char* TypedTestCasePState::VerifyRegisteredTestNames(
-    const char* file, int line, const char* registered_tests) {
+const char *TypedTestSuitePState::VerifyRegisteredTestNames(
+    const char *test_suite_name, const char *file, int line,
+    const char *registered_tests) {
+  RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line));
+
   typedef RegisteredTestsMap::const_iterator RegisteredTestIter;
   registered_ = true;
 
@@ -69,7 +70,7 @@
   std::set<std::string> tests;
   for (std::vector<std::string>::const_iterator name_it = name_vec.begin();
        name_it != name_vec.end(); ++name_it) {
-    const std::string& name = *name_it;
+    const std::string &name = *name_it;
     if (tests.count(name) != 0) {
       errors << "Test " << name << " is listed more than once.\n";
       continue;
@@ -77,8 +78,7 @@
 
     bool found = false;
     for (RegisteredTestIter it = registered_tests_.begin();
-         it != registered_tests_.end();
-         ++it) {
+         it != registered_tests_.end(); ++it) {
       if (name == it->first) {
         found = true;
         break;
@@ -89,19 +89,18 @@
       tests.insert(name);
     } else {
       errors << "No test named " << name
-             << " can be found in this test case.\n";
+             << " can be found in this test suite.\n";
     }
   }
 
   for (RegisteredTestIter it = registered_tests_.begin();
-       it != registered_tests_.end();
-       ++it) {
+       it != registered_tests_.end(); ++it) {
     if (tests.count(it->first) == 0) {
       errors << "You forgot to list test " << it->first << ".\n";
     }
   }
 
-  const std::string& errors_str = errors.GetString();
+  const std::string &errors_str = errors.GetString();
   if (errors_str != "") {
     fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
             errors_str.c_str());
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest.cc b/libaom/third_party/googletest/src/googletest/src/gtest.cc
index 5a8932c..5b4037f 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest.cc
@@ -26,10 +26,9 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 //
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
+// The Google C++ Testing and Mocking Framework (Google Test)
 
 #include "gtest/gtest.h"
 #include "gtest/internal/custom/gtest.h"
@@ -45,6 +44,7 @@
 #include <wctype.h>
 
 #include <algorithm>
+#include <cstdint>
 #include <iomanip>
 #include <limits>
 #include <list>
@@ -55,97 +55,93 @@
 
 #if GTEST_OS_LINUX
 
-// TODO(kenton@google.com): Use autoconf to detect availability of
-// gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
+#define GTEST_HAS_GETTIMEOFDAY_ 1
 
-# include <fcntl.h>  // NOLINT
-# include <limits.h>  // NOLINT
-# include <sched.h>  // NOLINT
+#include <fcntl.h>   // NOLINT
+#include <limits.h>  // NOLINT
+#include <sched.h>   // NOLINT
 // Declares vsnprintf().  This header is not available on Windows.
-# include <strings.h>  // NOLINT
-# include <sys/mman.h>  // NOLINT
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-# include <string>
-
-#elif GTEST_OS_SYMBIAN
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-# include <sys/time.h>  // NOLINT
+#include <strings.h>   // NOLINT
+#include <sys/mman.h>  // NOLINT
+#include <sys/time.h>  // NOLINT
+#include <unistd.h>    // NOLINT
+#include <string>
 
 #elif GTEST_OS_ZOS
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-# include <sys/time.h>  // NOLINT
+#define GTEST_HAS_GETTIMEOFDAY_ 1
+#include <sys/time.h>  // NOLINT
 
 // On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h>  // NOLINT
+#include <strings.h>   // NOLINT
 
 #elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
 
-# include <windows.h>  // NOLINT
-# undef min
+#include <windows.h>  // NOLINT
+#undef min
 
 #elif GTEST_OS_WINDOWS  // We are on Windows proper.
 
-# include <io.h>  // NOLINT
-# include <sys/timeb.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
-# include <sys/stat.h>  // NOLINT
+#include <windows.h>  // NOLINT
+#undef min
 
-# if GTEST_OS_WINDOWS_MINGW
+#ifdef _MSC_VER
+#include <crtdbg.h>    // NOLINT
+#include <debugapi.h>  // NOLINT
+#endif
+
+#include <io.h>         // NOLINT
+#include <sys/timeb.h>  // NOLINT
+#include <sys/types.h>  // NOLINT
+#include <sys/stat.h>   // NOLINT
+
+#if GTEST_OS_WINDOWS_MINGW
 // MinGW has gettimeofday() but not _ftime64().
-// TODO(kenton@google.com): Use autoconf to detect availability of
-//   gettimeofday().
-// TODO(kenton@google.com): There are other ways to get the time on
-//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
-//   supports these.  consider using them instead.
-#  define GTEST_HAS_GETTIMEOFDAY_ 1
-#  include <sys/time.h>  // NOLINT
-# endif  // GTEST_OS_WINDOWS_MINGW
-
-// cpplint thinks that the header is already included, so we want to
-// silence it.
-# include <windows.h>  // NOLINT
-# undef min
+#define GTEST_HAS_GETTIMEOFDAY_ 1
+#include <sys/time.h>  // NOLINT
+#endif                 // GTEST_OS_WINDOWS_MINGW
 
 #else
 
 // Assume other platforms have gettimeofday().
-// TODO(kenton@google.com): Use autoconf to detect availability of
-//   gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
+#define GTEST_HAS_GETTIMEOFDAY_ 1
 
 // cpplint thinks that the header is already included, so we want to
 // silence it.
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
+#include <sys/time.h>  // NOLINT
+#include <unistd.h>    // NOLINT
 
 #endif  // GTEST_OS_LINUX
 
 #if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
 #endif
 
 #if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
-# include <sys/socket.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
+#include <arpa/inet.h>   // NOLINT
+#include <netdb.h>       // NOLINT
+#include <sys/socket.h>  // NOLINT
+#include <sys/types.h>   // NOLINT
 #endif
 
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
 #include "src/gtest-internal-inl.h"
-#undef GTEST_IMPLEMENTATION_
 
 #if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
+#define vsnprintf _vsnprintf
 #endif  // GTEST_OS_WINDOWS
 
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+#include <crt_externs.h>
+#endif
+#endif
+
+#if GTEST_HAS_ABSL
+#include "absl/debugging/failure_signal_handler.h"
+#include "absl/debugging/stacktrace.h"
+#include "absl/debugging/symbolize.h"
+#include "absl/strings/str_cat.h"
+#endif  // GTEST_HAS_ABSL
+
 namespace testing {
 
 using internal::CountIf;
@@ -155,20 +151,22 @@
 
 // Constants.
 
-// A test whose test case name or test name matches this filter is
+// A test whose test suite name or test name matches this filter is
 // disabled and not run.
 static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
 
-// A test case whose name matches this filter is considered a death
-// test case and will be run before test cases whose name doesn't
+// A test suite whose name matches this filter is considered a death
+// test suite and will be run before test suites whose name doesn't
 // match this filter.
-static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*";
 
 // A test filter that matches everything.
 static const char kUniversalFilter[] = "*";
 
-// The default output file for XML output.
-static const char kDefaultOutputFile[] = "test_detail.xml";
+// The default output format.
+static const char kDefaultOutputFormat[] = "xml";
+// The default output file.
+static const char kDefaultOutputFile[] = "test_detail";
 
 // The environment variable name for the test shard index.
 static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
@@ -183,19 +181,35 @@
 // stack trace.
 const char kStackTraceMarker[] = "\nStack trace:\n";
 
-// g_help_flag is true iff the --help flag or an equivalent form is
-// specified on the command line.
+// g_help_flag is true if and only if the --help flag or an equivalent form
+// is specified on the command line.
 bool g_help_flag = false;
 
+// Utilty function to Open File for Writing
+static FILE *OpenFileForWriting(const std::string &output_file) {
+  FILE *fileout = nullptr;
+  FilePath output_file_path(output_file);
+  FilePath output_dir(output_file_path.RemoveFileName());
+
+  if (output_dir.CreateDirectoriesRecursively()) {
+    fileout = posix::FOpen(output_file.c_str(), "w");
+  }
+  if (fileout == nullptr) {
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\"";
+  }
+  return fileout;
+}
+
 }  // namespace internal
 
-static const char* GetDefaultFilter() {
-#ifdef GTEST_TEST_FILTER_ENV_VAR_
-  const char* const testbridge_test_only = getenv(GTEST_TEST_FILTER_ENV_VAR_);
-  if (testbridge_test_only != NULL) {
+// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY
+// environment variable.
+static const char *GetDefaultFilter() {
+  const char *const testbridge_test_only =
+      internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY");
+  if (testbridge_test_only != nullptr) {
     return testbridge_test_only;
   }
-#endif  // GTEST_TEST_FILTER_ENV_VAR_
   return kUniversalFilter;
 }
 
@@ -205,76 +219,84 @@
     "Run disabled tests too, in addition to the tests normally being run.");
 
 GTEST_DEFINE_bool_(
-    break_on_failure,
-    internal::BoolFromGTestEnv("break_on_failure", false),
-    "True iff a failed assertion should be a debugger break-point.");
+    break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false),
+    "True if and only if a failed assertion should be a debugger "
+    "break-point.");
 
-GTEST_DEFINE_bool_(
-    catch_exceptions,
-    internal::BoolFromGTestEnv("catch_exceptions", true),
-    "True iff " GTEST_NAME_
-    " should catch exceptions and treat them as test failures.");
+GTEST_DEFINE_bool_(catch_exceptions,
+                   internal::BoolFromGTestEnv("catch_exceptions", true),
+                   "True if and only if " GTEST_NAME_
+                   " should catch exceptions and treat them as test failures.");
 
 GTEST_DEFINE_string_(
-    color,
-    internal::StringFromGTestEnv("color", "auto"),
+    color, internal::StringFromGTestEnv("color", "auto"),
     "Whether to use colors in the output.  Valid values: yes, no, "
     "and auto.  'auto' means to use colors if the output is "
     "being sent to a terminal and the TERM environment variable "
     "is set to a terminal type that supports colors.");
 
 GTEST_DEFINE_string_(
-    filter,
-    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    filter, internal::StringFromGTestEnv("filter", GetDefaultFilter()),
     "A colon-separated list of glob (not regex) patterns "
     "for filtering the tests to run, optionally followed by a "
     "'-' and a : separated list of negative patterns (tests to "
     "exclude).  A test is run if it matches one of the positive "
     "patterns and does not match any of the negative patterns.");
 
-GTEST_DEFINE_bool_(list_tests, false,
-                   "List all tests without running them.");
+GTEST_DEFINE_bool_(
+    install_failure_signal_handler,
+    internal::BoolFromGTestEnv("install_failure_signal_handler", false),
+    "If true and supported on the current platform, " GTEST_NAME_
+    " should "
+    "install a signal handler that dumps debugging information when fatal "
+    "signals are raised.");
 
+GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
+
+// The net priority order after flag processing is thus:
+//   --gtest_output command line flag
+//   GTEST_OUTPUT environment variable
+//   XML_OUTPUT_FILE environment variable
+//   ''
 GTEST_DEFINE_string_(
     output,
-    internal::StringFromGTestEnv("output", ""),
-    "A format (currently must be \"xml\"), optionally followed "
-    "by a colon and an output file name or directory. A directory "
-    "is indicated by a trailing pathname separator. "
+    internal::StringFromGTestEnv("output",
+                                 internal::OutputFlagAlsoCheckEnvVar().c_str()),
+    "A format (defaults to \"xml\" but can be specified to be \"json\"), "
+    "optionally followed by a colon and an output file name or directory. "
+    "A directory is indicated by a trailing pathname separator. "
     "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
     "If a directory is specified, output files will be created "
     "within that directory, with file-names based on the test "
     "executable's name and, if necessary, made unique by adding "
     "digits.");
 
-GTEST_DEFINE_bool_(
-    print_time,
-    internal::BoolFromGTestEnv("print_time", true),
-    "True iff " GTEST_NAME_
-    " should display elapsed time in text output.");
+GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true),
+                   "True if and only if " GTEST_NAME_
+                   " should display elapsed time in text output.");
+
+GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true),
+                   "True if and only if " GTEST_NAME_
+                   " prints UTF8 characters as text.");
 
 GTEST_DEFINE_int32_(
-    random_seed,
-    internal::Int32FromGTestEnv("random_seed", 0),
+    random_seed, internal::Int32FromGTestEnv("random_seed", 0),
     "Random number seed to use when shuffling test orders.  Must be in range "
     "[1, 99999], or 0 to use a seed based on the current time.");
 
 GTEST_DEFINE_int32_(
-    repeat,
-    internal::Int32FromGTestEnv("repeat", 1),
+    repeat, internal::Int32FromGTestEnv("repeat", 1),
     "How many times to repeat each test.  Specify a negative number "
     "for repeating forever.  Useful for shaking out flaky tests.");
 
-GTEST_DEFINE_bool_(
-    show_internal_stack_frames, false,
-    "True iff " GTEST_NAME_ " should include internal stack frames when "
-    "printing test failure stack traces.");
+GTEST_DEFINE_bool_(show_internal_stack_frames, false,
+                   "True if and only if " GTEST_NAME_
+                   " should include internal stack frames when "
+                   "printing test failure stack traces.");
 
-GTEST_DEFINE_bool_(
-    shuffle,
-    internal::BoolFromGTestEnv("shuffle", false),
-    "True iff " GTEST_NAME_
-    " should randomize tests' order on every run.");
+GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false),
+                   "True if and only if " GTEST_NAME_
+                   " should randomize tests' order on every run.");
 
 GTEST_DEFINE_int32_(
     stack_trace_depth,
@@ -283,23 +305,20 @@
     "assertion fails.  The valid range is 0 through 100, inclusive.");
 
 GTEST_DEFINE_string_(
-    stream_result_to,
-    internal::StringFromGTestEnv("stream_result_to", ""),
+    stream_result_to, internal::StringFromGTestEnv("stream_result_to", ""),
     "This flag specifies the host name and the port number on which to stream "
     "test results. Example: \"localhost:555\". The flag is effective only on "
     "Linux.");
 
 GTEST_DEFINE_bool_(
-    throw_on_failure,
-    internal::BoolFromGTestEnv("throw_on_failure", false),
+    throw_on_failure, internal::BoolFromGTestEnv("throw_on_failure", false),
     "When this flag is specified, a failed assertion will throw an exception "
     "if exceptions are enabled or exit the program with a non-zero code "
-    "otherwise.");
+    "otherwise. For use with an external test framework.");
 
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
 GTEST_DEFINE_string_(
-    flagfile,
-    internal::StringFromGTestEnv("flagfile", ""),
+    flagfile, internal::StringFromGTestEnv("flagfile", ""),
     "This flag specifies the flagfile to read command-line flags from.");
 #endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
 
@@ -308,13 +327,12 @@
 // Generates a random number from [0, range), using a Linear
 // Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
 // than kMaxRange.
-GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
-UInt32 Random::Generate(UInt32 range) {
+uint32_t Random::Generate(uint32_t range) {
   // These constants are the same as are used in glibc's rand(3).
-  state_ = (1103515245U*state_ + 12345U) % kMaxRange;
+  // Use wider types than necessary to prevent unsigned overflow diagnostics.
+  state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;
 
-  GTEST_CHECK_(range > 0)
-      << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
   GTEST_CHECK_(range <= kMaxRange)
       << "Generation of a number in [0, " << range << ") was requested, "
       << "but this can only generate numbers in [0, " << kMaxRange << ").";
@@ -325,16 +343,16 @@
   return state_ % range;
 }
 
-// GTestIsInitialized() returns true iff the user has initialized
+// GTestIsInitialized() returns true if and only if the user has initialized
 // Google Test.  Useful for catching the user mistake of not initializing
 // Google Test before calling RUN_ALL_TESTS().
 static bool GTestIsInitialized() { return GetArgvs().size() > 0; }
 
-// Iterates over a vector of TestCases, keeping a running sum of the
+// Iterates over a vector of TestSuites, keeping a running sum of the
 // results of calling a given int-returning method on each.
 // Returns the sum.
-static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
-                               int (TestCase::*method)() const) {
+static int SumOverTestSuiteList(const std::vector<TestSuite *> &case_list,
+                                int (TestSuite::*method)() const) {
   int sum = 0;
   for (size_t i = 0; i < case_list.size(); i++) {
     sum += (case_list[i]->*method)();
@@ -342,55 +360,203 @@
   return sum;
 }
 
-// Returns true iff the test case passed.
-static bool TestCasePassed(const TestCase* test_case) {
-  return test_case->should_run() && test_case->Passed();
+// Returns true if and only if the test suite passed.
+static bool TestSuitePassed(const TestSuite *test_suite) {
+  return test_suite->should_run() && test_suite->Passed();
 }
 
-// Returns true iff the test case failed.
-static bool TestCaseFailed(const TestCase* test_case) {
-  return test_case->should_run() && test_case->Failed();
+// Returns true if and only if the test suite failed.
+static bool TestSuiteFailed(const TestSuite *test_suite) {
+  return test_suite->should_run() && test_suite->Failed();
 }
 
-// Returns true iff test_case contains at least one test that should
-// run.
-static bool ShouldRunTestCase(const TestCase* test_case) {
-  return test_case->should_run();
+// Returns true if and only if test_suite contains at least one test that
+// should run.
+static bool ShouldRunTestSuite(const TestSuite *test_suite) {
+  return test_suite->should_run();
 }
 
 // AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type,
-                           const char* file,
-                           int line,
-                           const char* message)
-    : data_(new AssertHelperData(type, file, line, message)) {
-}
+AssertHelper::AssertHelper(TestPartResult::Type type, const char *file,
+                           int line, const char *message)
+    : data_(new AssertHelperData(type, file, line, message)) {}
 
-AssertHelper::~AssertHelper() {
-  delete data_;
-}
+AssertHelper::~AssertHelper() { delete data_; }
 
 // Message assignment, for assertion streaming support.
-void AssertHelper::operator=(const Message& message) const {
-  UnitTest::GetInstance()->
-    AddTestPartResult(data_->type, data_->file, data_->line,
-                      AppendUserMessage(data_->message, message),
-                      UnitTest::GetInstance()->impl()
-                      ->CurrentOsStackTraceExceptTop(1)
-                      // Skips the stack frame for this function itself.
-                      );  // NOLINT
+void AssertHelper::operator=(const Message &message) const {
+  UnitTest::GetInstance()->AddTestPartResult(
+      data_->type, data_->file, data_->line,
+      AppendUserMessage(data_->message, message),
+      UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
+      // Skips the stack frame for this function itself.
+  );  // NOLINT
 }
 
-// Mutex for linked pointers.
-GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+namespace {
+
+// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
+// to creates test cases for it, a syntetic test case is
+// inserted to report ether an error or a log message.
+//
+// This configuration bit will likely be removed at some point.
+constexpr bool kErrorOnUninstantiatedParameterizedTest = false;
+constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = false;
+
+// A test that fails at a given file/line location with a given message.
+class FailureTest : public Test {
+ public:
+  explicit FailureTest(const CodeLocation &loc, std::string error_message,
+                       bool as_error)
+      : loc_(loc), error_message_(std::move(error_message)),
+        as_error_(as_error) {}
+
+  void TestBody() override {
+    if (as_error_) {
+      AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(),
+                   loc_.line, "") = Message() << error_message_;
+    } else {
+      std::cout << error_message_ << std::endl;
+    }
+  }
+
+ private:
+  const CodeLocation loc_;
+  const std::string error_message_;
+  const bool as_error_;
+};
+
+}  // namespace
+
+std::set<std::string> *GetIgnoredParameterizedTestSuites() {
+  return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites();
+}
+
+// Add a given test_suit to the list of them allow to go un-instantiated.
+MarkAsIgnored::MarkAsIgnored(const char *test_suite) {
+  GetIgnoredParameterizedTestSuites()->insert(test_suite);
+}
+
+// If this parameterized test suite has no instantiations (and that
+// has not been marked as okay), emit a test case reporting that.
+void InsertSyntheticTestCase(const std::string &name, CodeLocation location,
+                             bool has_test_p) {
+  const auto &ignored = *GetIgnoredParameterizedTestSuites();
+  if (ignored.find(name) != ignored.end()) return;
+
+  const char kMissingInstantiation[] =  //
+      " is defined via TEST_P, but never instantiated. None of the test cases "
+      "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only "
+      "ones provided expand to nothing."
+      "\n\n"
+      "Ideally, TEST_P definitions should only ever be included as part of "
+      "binaries that intend to use them. (As opposed to, for example, being "
+      "placed in a library that may be linked in to get other utilities.)";
+
+  const char kMissingTestCase[] =  //
+      " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are "
+      "defined via TEST_P . No test cases will run."
+      "\n\n"
+      "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from "
+      "code that always depend on code that provides TEST_P. Failing to do "
+      "so is often an indication of dead code, e.g. the last TEST_P was "
+      "removed but the rest got left behind.";
+
+  std::string message =
+      "Paramaterized test suite " + name +
+      (has_test_p ? kMissingInstantiation : kMissingTestCase) +
+      "\n\n"
+      "To suppress this error for this test suite, insert the following line "
+      "(in a non-header) in the namespace it is defined in:"
+      "\n\n"
+      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+      name + ");";
+
+  std::string full_name = "UninstantiatedParamaterizedTestSuite<" + name + ">";
+  RegisterTest(  //
+      "GoogleTestVerification", full_name.c_str(),
+      nullptr,  // No type parameter.
+      nullptr,  // No value parameter.
+      location.file.c_str(), location.line, [message, location] {
+        return new FailureTest(location, message,
+                               kErrorOnUninstantiatedParameterizedTest);
+      });
+}
+
+void RegisterTypeParameterizedTestSuite(const char *test_suite_name,
+                                        CodeLocation code_location) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite(
+      test_suite_name, code_location);
+}
+
+void RegisterTypeParameterizedTestSuiteInstantiation(const char *case_name) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
+      case_name);
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
+    const char *test_suite_name, CodeLocation code_location) {
+  suites_.emplace(std::string(test_suite_name),
+                  TypeParameterizedTestSuiteInfo(code_location));
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
+    const char *test_suite_name) {
+  auto it = suites_.find(std::string(test_suite_name));
+  if (it != suites_.end()) {
+    it->second.instantiated = true;
+  } else {
+    GTEST_LOG_(ERROR) << "Unknown type parameterized test suit '"
+                      << test_suite_name << "'";
+  }
+}
+
+void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
+  const auto &ignored = *GetIgnoredParameterizedTestSuites();
+  for (const auto &testcase : suites_) {
+    if (testcase.second.instantiated) continue;
+    if (ignored.find(testcase.first) != ignored.end()) continue;
+
+    std::string message =
+        "Type paramaterized test suite " + testcase.first +
+        " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
+        "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
+        "\n\n"
+        "Ideally, TYPED_TEST_P definitions should only ever be included as "
+        "part of binaries that intend to use them. (As opposed to, for "
+        "example, being placed in a library that may be linked in to get other "
+        "utilities.)"
+        "\n\n"
+        "To suppress this error for this test suite, insert the following line "
+        "(in a non-header) in the namespace it is definedin in:"
+        "\n\n"
+        "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+        testcase.first + ");";
+
+    std::string full_name =
+        "UninstantiatedTypeParamaterizedTestSuite<" + testcase.first + ">";
+    RegisterTest(  //
+        "GoogleTestVerification", full_name.c_str(),
+        nullptr,  // No type parameter.
+        nullptr,  // No value parameter.
+        testcase.second.code_location.file.c_str(),
+        testcase.second.code_location.line, [message, testcase] {
+          return new FailureTest(testcase.second.code_location, message,
+                                 kErrorOnUninstantiatedTypeParameterizedTest);
+        });
+  }
+}
 
 // A copy of all command line arguments.  Set by InitGoogleTest().
-::std::vector<testing::internal::string> g_argvs;
+static ::std::vector<std::string> g_argvs;
 
-const ::std::vector<testing::internal::string>& GetArgvs() {
+::std::vector<std::string> GetArgvs() {
 #if defined(GTEST_CUSTOM_GET_ARGVS_)
-  return GTEST_CUSTOM_GET_ARGVS_();
-#else  // defined(GTEST_CUSTOM_GET_ARGVS_)
+  // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
+  // ::string. This code converts it to the appropriate type.
+  const auto &custom = GTEST_CUSTOM_GET_ARGVS_();
+  return ::std::vector<std::string>(custom.begin(), custom.end());
+#else   // defined(GTEST_CUSTOM_GET_ARGVS_)
   return g_argvs;
 #endif  // defined(GTEST_CUSTOM_GET_ARGVS_)
 }
@@ -400,7 +566,7 @@
 FilePath GetCurrentExecutableName() {
   FilePath result;
 
-#if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS || GTEST_OS_OS2
   result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe"));
 #else
   result.Set(FilePath(GetArgvs()[0]));
@@ -413,41 +579,37 @@
 
 // Returns the output format, or "" for normal printed output.
 std::string UnitTestOptions::GetOutputFormat() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
-  if (gtest_output_flag == NULL) return std::string("");
-
-  const char* const colon = strchr(gtest_output_flag, ':');
-  return (colon == NULL) ?
-      std::string(gtest_output_flag) :
-      std::string(gtest_output_flag, colon - gtest_output_flag);
+  const char *const gtest_output_flag = GTEST_FLAG(output).c_str();
+  const char *const colon = strchr(gtest_output_flag, ':');
+  return (colon == nullptr)
+             ? std::string(gtest_output_flag)
+             : std::string(gtest_output_flag,
+                           static_cast<size_t>(colon - gtest_output_flag));
 }
 
 // Returns the name of the requested output file, or the default if none
 // was explicitly specified.
 std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
-  if (gtest_output_flag == NULL)
-    return "";
+  const char *const gtest_output_flag = GTEST_FLAG(output).c_str();
 
-  const char* const colon = strchr(gtest_output_flag, ':');
-  if (colon == NULL)
-    return internal::FilePath::ConcatPaths(
-        internal::FilePath(
-            UnitTest::GetInstance()->original_working_dir()),
-        internal::FilePath(kDefaultOutputFile)).string();
+  std::string format = GetOutputFormat();
+  if (format.empty()) format = std::string(kDefaultOutputFormat);
+
+  const char *const colon = strchr(gtest_output_flag, ':');
+  if (colon == nullptr)
+    return internal::FilePath::MakeFileName(
+               internal::FilePath(
+                   UnitTest::GetInstance()->original_working_dir()),
+               internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+        .string();
 
   internal::FilePath output_name(colon + 1);
   if (!output_name.IsAbsolutePath())
-    // TODO(wan@google.com): on Windows \some\path is not an absolute
-    // path (as its meaning depends on the current drive), yet the
-    // following logic for turning it into an absolute path is wrong.
-    // Fix it.
     output_name = internal::FilePath::ConcatPaths(
         internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
         internal::FilePath(colon + 1));
 
-  if (!output_name.IsDirectory())
-    return output_name.string();
+  if (!output_name.IsDirectory()) return output_name.string();
 
   internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
       output_name, internal::GetCurrentExecutableName(),
@@ -455,8 +617,8 @@
   return result.string();
 }
 
-// Returns true iff the wildcard pattern matches the string.  The
-// first ':' or '\0' character in pattern marks the end of it.
+// Returns true if and only if the wildcard pattern matches the string.
+// The first ':' or '\0' character in pattern marks the end of it.
 //
 // This recursive algorithm isn't very efficient, but is clear and
 // works well enough for matching test names, which are short.
@@ -470,15 +632,14 @@
       return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
     case '*':  // Matches any string (possibly empty) of characters.
       return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
-          PatternMatchesString(pattern + 1, str);
+             PatternMatchesString(pattern + 1, str);
     default:  // Non-special character.  Matches itself.
-      return *pattern == *str &&
-          PatternMatchesString(pattern + 1, str + 1);
+      return *pattern == *str && PatternMatchesString(pattern + 1, str + 1);
   }
 }
 
-bool UnitTestOptions::MatchesFilter(
-    const std::string& name, const char* filter) {
+bool UnitTestOptions::MatchesFilter(const std::string &name,
+                                    const char *filter) {
   const char *cur_pattern = filter;
   for (;;) {
     if (PatternMatchesString(cur_pattern, name.c_str())) {
@@ -489,7 +650,7 @@
     cur_pattern = strchr(cur_pattern, ':');
 
     // Returns if no more pattern can be found.
-    if (cur_pattern == NULL) {
+    if (cur_pattern == nullptr) {
       return false;
     }
 
@@ -498,19 +659,19 @@
   }
 }
 
-// Returns true iff the user-specified filter matches the test case
-// name and the test name.
-bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
+// Returns true if and only if the user-specified filter matches the test
+// suite name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_suite_name,
                                         const std::string &test_name) {
-  const std::string& full_name = test_case_name + "." + test_name.c_str();
+  const std::string &full_name = test_suite_name + "." + test_name.c_str();
 
   // Split --gtest_filter at '-', if there is one, to separate into
   // positive filter and negative filter portions
-  const char* const p = GTEST_FLAG(filter).c_str();
-  const char* const dash = strchr(p, '-');
+  const char *const p = GTEST_FLAG(filter).c_str();
+  const char *const dash = strchr(p, '-');
   std::string positive;
   std::string negative;
-  if (dash == NULL) {
+  if (dash == nullptr) {
     positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
     negative = "";
   } else {
@@ -562,9 +723,8 @@
 // Google Test.  The 'result' parameter specifies where to report the
 // results. Intercepts only failures from the current thread.
 ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    TestPartResultArray* result)
-    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
-      result_(result) {
+    TestPartResultArray *result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
   Init();
 }
 
@@ -572,14 +732,13 @@
 // Google Test.  The 'result' parameter specifies where to report the
 // results.
 ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    InterceptMode intercept_mode, TestPartResultArray* result)
-    : intercept_mode_(intercept_mode),
-      result_(result) {
+    InterceptMode intercept_mode, TestPartResultArray *result)
+    : intercept_mode_(intercept_mode), result_(result) {
   Init();
 }
 
 void ScopedFakeTestPartResultReporter::Init() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
   if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
     old_reporter_ = impl->GetGlobalTestPartResultReporter();
     impl->SetGlobalTestPartResultReporter(this);
@@ -592,7 +751,7 @@
 // The d'tor restores the test part result reporter used by Google Test
 // before.
 ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
   if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
     impl->SetGlobalTestPartResultReporter(old_reporter_);
   } else {
@@ -603,7 +762,7 @@
 // Increments the test part result count and remembers the result.
 // This method is from the TestPartResultReporterInterface interface.
 void ScopedFakeTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
+    const TestPartResult &result) {
   result_->Append(result);
 }
 
@@ -618,9 +777,7 @@
 // from user test code.  GetTestTypeId() is guaranteed to always
 // return the same value, as it always calls GetTypeId<>() from the
 // gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() {
-  return GetTypeId<Test>();
-}
+TypeId GetTestTypeId() { return GetTypeId<Test>(); }
 
 // The value of GetTestTypeId() as seen from within the Google Test
 // library.  This is solely for testing GetTestTypeId().
@@ -629,15 +786,15 @@
 // This predicate-formatter checks that 'results' contains a test part
 // failure of the given type and that the failure message contains the
 // given substring.
-AssertionResult HasOneFailure(const char* /* results_expr */,
-                              const char* /* type_expr */,
-                              const char* /* substr_expr */,
-                              const TestPartResultArray& results,
-                              TestPartResult::Type type,
-                              const string& substr) {
-  const std::string expected(type == TestPartResult::kFatalFailure ?
-                        "1 fatal failure" :
-                        "1 non-fatal failure");
+static AssertionResult HasOneFailure(const char * /* results_expr */,
+                                     const char * /* type_expr */,
+                                     const char * /* substr_expr */,
+                                     const TestPartResultArray &results,
+                                     TestPartResult::Type type,
+                                     const std::string &substr) {
+  const std::string expected(type == TestPartResult::kFatalFailure
+                                 ? "1 fatal failure"
+                                 : "1 non-fatal failure");
   Message msg;
   if (results.size() != 1) {
     msg << "Expected: " << expected << "\n"
@@ -648,18 +805,18 @@
     return AssertionFailure() << msg;
   }
 
-  const TestPartResult& r = results.GetTestPartResult(0);
+  const TestPartResult &r = results.GetTestPartResult(0);
   if (r.type() != type) {
     return AssertionFailure() << "Expected: " << expected << "\n"
                               << "  Actual:\n"
                               << r;
   }
 
-  if (strstr(r.message(), substr.c_str()) == NULL) {
-    return AssertionFailure() << "Expected: " << expected << " containing \""
-                              << substr << "\"\n"
-                              << "  Actual:\n"
-                              << r;
+  if (strstr(r.message(), substr.c_str()) == nullptr) {
+    return AssertionFailure()
+           << "Expected: " << expected << " containing \"" << substr << "\"\n"
+           << "  Actual:\n"
+           << r;
   }
 
   return AssertionSuccess();
@@ -668,13 +825,10 @@
 // The constructor of SingleFailureChecker remembers where to look up
 // test part results, what type of failure we expect, and what
 // substring the failure message should contain.
-SingleFailureChecker:: SingleFailureChecker(
-    const TestPartResultArray* results,
-    TestPartResult::Type type,
-    const string& substr)
-    : results_(results),
-      type_(type),
-      substr_(substr) {}
+SingleFailureChecker::SingleFailureChecker(const TestPartResultArray *results,
+                                           TestPartResult::Type type,
+                                           const std::string &substr)
+    : results_(results), type_(type), substr_(substr) {}
 
 // The destructor of SingleFailureChecker verifies that the given
 // TestPartResultArray contains exactly one failure that has the given
@@ -685,24 +839,26 @@
 }
 
 DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+    UnitTestImpl *unit_test)
+    : unit_test_(unit_test) {}
 
 void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
+    const TestPartResult &result) {
   unit_test_->current_test_result()->AddTestPartResult(result);
   unit_test_->listeners()->repeater()->OnTestPartResult(result);
 }
 
 DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+    UnitTestImpl *unit_test)
+    : unit_test_(unit_test) {}
 
 void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
+    const TestPartResult &result) {
   unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
 }
 
 // Returns the global test part result reporter.
-TestPartResultReporterInterface*
+TestPartResultReporterInterface *
 UnitTestImpl::GetGlobalTestPartResultReporter() {
   internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
   return global_test_part_result_repoter_;
@@ -710,78 +866,83 @@
 
 // Sets the global test part result reporter.
 void UnitTestImpl::SetGlobalTestPartResultReporter(
-    TestPartResultReporterInterface* reporter) {
+    TestPartResultReporterInterface *reporter) {
   internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
   global_test_part_result_repoter_ = reporter;
 }
 
 // Returns the test part result reporter for the current thread.
-TestPartResultReporterInterface*
+TestPartResultReporterInterface *
 UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
   return per_thread_test_part_result_reporter_.get();
 }
 
 // Sets the test part result reporter for the current thread.
 void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
-    TestPartResultReporterInterface* reporter) {
+    TestPartResultReporterInterface *reporter) {
   per_thread_test_part_result_reporter_.set(reporter);
 }
 
-// Gets the number of successful test cases.
-int UnitTestImpl::successful_test_case_count() const {
-  return CountIf(test_cases_, TestCasePassed);
+// Gets the number of successful test suites.
+int UnitTestImpl::successful_test_suite_count() const {
+  return CountIf(test_suites_, TestSuitePassed);
 }
 
-// Gets the number of failed test cases.
-int UnitTestImpl::failed_test_case_count() const {
-  return CountIf(test_cases_, TestCaseFailed);
+// Gets the number of failed test suites.
+int UnitTestImpl::failed_test_suite_count() const {
+  return CountIf(test_suites_, TestSuiteFailed);
 }
 
-// Gets the number of all test cases.
-int UnitTestImpl::total_test_case_count() const {
-  return static_cast<int>(test_cases_.size());
+// Gets the number of all test suites.
+int UnitTestImpl::total_test_suite_count() const {
+  return static_cast<int>(test_suites_.size());
 }
 
-// Gets the number of all test cases that contain at least one test
+// Gets the number of all test suites that contain at least one test
 // that should run.
-int UnitTestImpl::test_case_to_run_count() const {
-  return CountIf(test_cases_, ShouldRunTestCase);
+int UnitTestImpl::test_suite_to_run_count() const {
+  return CountIf(test_suites_, ShouldRunTestSuite);
 }
 
 // Gets the number of successful tests.
 int UnitTestImpl::successful_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+  return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count);
+}
+
+// Gets the number of skipped tests.
+int UnitTestImpl::skipped_test_count() const {
+  return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count);
 }
 
 // Gets the number of failed tests.
 int UnitTestImpl::failed_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+  return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count);
 }
 
 // Gets the number of disabled tests that will be reported in the XML report.
 int UnitTestImpl::reportable_disabled_test_count() const {
-  return SumOverTestCaseList(test_cases_,
-                             &TestCase::reportable_disabled_test_count);
+  return SumOverTestSuiteList(test_suites_,
+                              &TestSuite::reportable_disabled_test_count);
 }
 
 // Gets the number of disabled tests.
 int UnitTestImpl::disabled_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+  return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count);
 }
 
 // Gets the number of tests to be printed in the XML report.
 int UnitTestImpl::reportable_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
+  return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count);
 }
 
 // Gets the number of all tests.
 int UnitTestImpl::total_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+  return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count);
 }
 
 // Gets the number of tests that should run.
 int UnitTestImpl::test_to_run_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+  return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count);
 }
 
 // Returns the current OS stack trace as an std::string.
@@ -796,11 +957,10 @@
 // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
 std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
   return os_stack_trace_getter()->CurrentStackTrace(
-      static_cast<int>(GTEST_FLAG(stack_trace_depth)),
-      skip_count + 1
+      static_cast<int>(GTEST_FLAG(stack_trace_depth)), skip_count + 1
       // Skips the user-specified number of frames plus this function
       // itself.
-      );  // NOLINT
+  );  // NOLINT
 }
 
 // Returns the current time in milliseconds.
@@ -809,20 +969,18 @@
   // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
   // http://analogous.blogspot.com/2005/04/epoch.html
   const TimeInMillis kJavaEpochToWinFileTimeDelta =
-    static_cast<TimeInMillis>(116444736UL) * 100000UL;
+      static_cast<TimeInMillis>(116444736UL) * 100000UL;
   const DWORD kTenthMicrosInMilliSecond = 10000;
 
   SYSTEMTIME now_systime;
   FILETIME now_filetime;
   ULARGE_INTEGER now_int64;
-  // TODO(kenton@google.com): Shouldn't this just use
-  //   GetSystemTimeAsFileTime()?
   GetSystemTime(&now_systime);
   if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
     now_int64.LowPart = now_filetime.dwLowDateTime;
     now_int64.HighPart = now_filetime.dwHighDateTime;
     now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
-      kJavaEpochToWinFileTimeDelta;
+                         kJavaEpochToWinFileTimeDelta;
     return now_int64.QuadPart;
   }
   return 0;
@@ -831,19 +989,17 @@
 
   // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
   // (deprecated function) there.
-  // TODO(kenton@google.com): Use GetTickCount()?  Or use
-  //   SystemTimeToFileTime()
-  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+  GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
   _ftime64(&now);
-  GTEST_DISABLE_MSC_WARNINGS_POP_()
+  GTEST_DISABLE_MSC_DEPRECATED_POP_()
 
   return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
 #elif GTEST_HAS_GETTIMEOFDAY_
   struct timeval now;
-  gettimeofday(&now, NULL);
+  gettimeofday(&now, nullptr);
   return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
 #else
-# error "Don't know how to get the current time on your system."
+#error "Don't know how to get the current time on your system."
 #endif
 }
 
@@ -856,15 +1012,13 @@
 // memory using new. The caller is responsible for deleting the return
 // value using delete[]. Returns the wide string, or NULL if the
 // input is NULL.
-LPCWSTR String::AnsiToUtf16(const char* ansi) {
-  if (!ansi) return NULL;
+LPCWSTR String::AnsiToUtf16(const char *ansi) {
+  if (!ansi) return nullptr;
   const int length = strlen(ansi);
   const int unicode_length =
-      MultiByteToWideChar(CP_ACP, 0, ansi, length,
-                          NULL, 0);
-  WCHAR* unicode = new WCHAR[unicode_length + 1];
-  MultiByteToWideChar(CP_ACP, 0, ansi, length,
-                      unicode, unicode_length);
+      MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
+  WCHAR *unicode = new WCHAR[unicode_length + 1];
+  MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
   unicode[unicode_length] = 0;
   return unicode;
 }
@@ -873,44 +1027,43 @@
 // memory using new. The caller is responsible for deleting the return
 // value using delete[]. Returns the ANSI string, or NULL if the
 // input is NULL.
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
-  if (!utf16_str) return NULL;
-  const int ansi_length =
-      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
-                          NULL, 0, NULL, NULL);
-  char* ansi = new char[ansi_length + 1];
-  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
-                      ansi, ansi_length, NULL, NULL);
+const char *String::Utf16ToAnsi(LPCWSTR utf16_str) {
+  if (!utf16_str) return nullptr;
+  const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
+                                              0, nullptr, nullptr);
+  char *ansi = new char[ansi_length + 1];
+  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr,
+                      nullptr);
   ansi[ansi_length] = 0;
   return ansi;
 }
 
 #endif  // GTEST_OS_WINDOWS_MOBILE
 
-// Compares two C strings.  Returns true iff they have the same content.
+// Compares two C strings.  Returns true if and only if they have the same
+// content.
 //
 // Unlike strcmp(), this function can handle NULL argument(s).  A NULL
 // C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::CStringEquals(const char * lhs, const char * rhs) {
-  if ( lhs == NULL ) return rhs == NULL;
+bool String::CStringEquals(const char *lhs, const char *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
 
-  if ( rhs == NULL ) return false;
+  if (rhs == nullptr) return false;
 
   return strcmp(lhs, rhs) == 0;
 }
 
-#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+#if GTEST_HAS_STD_WSTRING
 
 // Converts an array of wide chars to a narrow string using the UTF-8
 // encoding, and streams the result to the given Message object.
-static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
-                                     Message* msg) {
-  for (size_t i = 0; i != length; ) {  // NOLINT
+static void StreamWideCharsToMessage(const wchar_t *wstr, size_t length,
+                                     Message *msg) {
+  for (size_t i = 0; i != length;) {  // NOLINT
     if (wstr[i] != L'\0') {
       *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
-      while (i != length && wstr[i] != L'\0')
-        i++;
+      while (i != length && wstr[i] != L'\0') i++;
     } else {
       *msg << '\0';
       i++;
@@ -918,10 +1071,10 @@
   }
 }
 
-#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+#endif  // GTEST_HAS_STD_WSTRING
 
-void SplitString(const ::std::string& str, char delimiter,
-                 ::std::vector< ::std::string>* dest) {
+void SplitString(const ::std::string &str, char delimiter,
+                 ::std::vector< ::std::string> *dest) {
   ::std::vector< ::std::string> parsed;
   ::std::string::size_type pos = 0;
   while (::testing::internal::AlwaysTrue()) {
@@ -952,31 +1105,22 @@
 
 // These two overloads allow streaming a wide C string to a Message
 // using the UTF-8 encoding.
-Message& Message::operator <<(const wchar_t* wide_c_str) {
+Message &Message::operator<<(const wchar_t *wide_c_str) {
   return *this << internal::String::ShowWideCString(wide_c_str);
 }
-Message& Message::operator <<(wchar_t* wide_c_str) {
+Message &Message::operator<<(wchar_t *wide_c_str) {
   return *this << internal::String::ShowWideCString(wide_c_str);
 }
 
 #if GTEST_HAS_STD_WSTRING
 // Converts the given wide string to a narrow string using the UTF-8
 // encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::std::wstring& wstr) {
+Message &Message::operator<<(const ::std::wstring &wstr) {
   internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
   return *this;
 }
 #endif  // GTEST_HAS_STD_WSTRING
 
-#if GTEST_HAS_GLOBAL_WSTRING
-// Converts the given wide string to a narrow string using the UTF-8
-// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::wstring& wstr) {
-  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
-  return *this;
-}
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
 // Gets the text streamed to this object so far as an std::string.
 // Each '\0' character in the buffer is replaced with "\\0".
 std::string Message::GetString() const {
@@ -985,15 +1129,14 @@
 
 // AssertionResult constructors.
 // Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult& other)
+AssertionResult::AssertionResult(const AssertionResult &other)
     : success_(other.success_),
-      message_(other.message_.get() != NULL ?
-               new ::std::string(*other.message_) :
-               static_cast< ::std::string*>(NULL)) {
-}
+      message_(other.message_.get() != nullptr
+                   ? new ::std::string(*other.message_)
+                   : static_cast< ::std::string *>(nullptr)) {}
 
 // Swaps two AssertionResults.
-void AssertionResult::swap(AssertionResult& other) {
+void AssertionResult::swap(AssertionResult &other) {
   using std::swap;
   swap(success_, other.success_);
   swap(message_, other.message_);
@@ -1002,32 +1145,27 @@
 // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
 AssertionResult AssertionResult::operator!() const {
   AssertionResult negation(!success_);
-  if (message_.get() != NULL)
-    negation << *message_;
+  if (message_.get() != nullptr) negation << *message_;
   return negation;
 }
 
 // Makes a successful assertion result.
-AssertionResult AssertionSuccess() {
-  return AssertionResult(true);
-}
+AssertionResult AssertionSuccess() { return AssertionResult(true); }
 
 // Makes a failed assertion result.
-AssertionResult AssertionFailure() {
-  return AssertionResult(false);
-}
+AssertionResult AssertionFailure() { return AssertionResult(false); }
 
 // Makes a failed assertion result with the given failure message.
 // Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message& message) {
+AssertionResult AssertionFailure(const Message &message) {
   return AssertionFailure() << message;
 }
 
 namespace internal {
 
 namespace edit_distance {
-std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
-                                            const std::vector<size_t>& right) {
+std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t> &left,
+                                            const std::vector<size_t> &right) {
   std::vector<std::vector<double> > costs(
       left.size() + 1, std::vector<double>(right.size() + 1));
   std::vector<std::vector<EditType> > best_move(
@@ -1088,7 +1226,7 @@
 // Helper class to convert string into ids with deduplication.
 class InternalStrings {
  public:
-  size_t GetId(const std::string& str) {
+  size_t GetId(const std::string &str) {
     IdMap::iterator it = ids_.find(str);
     if (it != ids_.end()) return it->second;
     size_t id = ids_.size();
@@ -1103,8 +1241,8 @@
 }  // namespace
 
 std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<std::string>& left,
-    const std::vector<std::string>& right) {
+    const std::vector<std::string> &left,
+    const std::vector<std::string> &right) {
   std::vector<size_t> left_ids, right_ids;
   {
     InternalStrings intern_table;
@@ -1127,13 +1265,10 @@
 class Hunk {
  public:
   Hunk(size_t left_start, size_t right_start)
-      : left_start_(left_start),
-        right_start_(right_start),
-        adds_(),
-        removes_(),
+      : left_start_(left_start), right_start_(right_start), adds_(), removes_(),
         common_() {}
 
-  void PushLine(char edit, const char* line) {
+  void PushLine(char edit, const char *line) {
     switch (edit) {
       case ' ':
         ++common_;
@@ -1151,10 +1286,10 @@
     }
   }
 
-  void PrintTo(std::ostream* os) {
+  void PrintTo(std::ostream *os) {
     PrintHeader(os);
     FlushEdits();
-    for (std::list<std::pair<char, const char*> >::const_iterator it =
+    for (std::list<std::pair<char, const char *> >::const_iterator it =
              hunk_.begin();
          it != hunk_.end(); ++it) {
       *os << it->first << it->second << "\n";
@@ -1172,8 +1307,8 @@
   // Print a unified diff header for one hunk.
   // The format is
   //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
-  // where the left/right parts are ommitted if unnecessary.
-  void PrintHeader(std::ostream* ss) const {
+  // where the left/right parts are omitted if unnecessary.
+  void PrintHeader(std::ostream *ss) const {
     *ss << "@@ ";
     if (removes_) {
       *ss << "-" << left_start_ << "," << (removes_ + common_);
@@ -1189,7 +1324,7 @@
 
   size_t left_start_, right_start_;
   size_t adds_, removes_, common_;
-  std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
+  std::list<std::pair<char, const char *> > hunk_, hunk_adds_, hunk_removes_;
 };
 
 }  // namespace
@@ -1201,8 +1336,8 @@
 // 'context' represents the desired unchanged prefix/suffix around the diff.
 // If two hunks are close enough that their contexts overlap, then they are
 // joined into one hunk.
-std::string CreateUnifiedDiff(const std::vector<std::string>& left,
-                              const std::vector<std::string>& right,
+std::string CreateUnifiedDiff(const std::vector<std::string> &left,
+                              const std::vector<std::string> &right,
                               size_t context) {
   const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
 
@@ -1229,9 +1364,10 @@
     for (; edit_i < edits.size(); ++edit_i) {
       if (n_suffix >= context) {
         // Continue only if the next hunk is very close.
-        std::vector<EditType>::const_iterator it = edits.begin() + edit_i;
+        auto it = edits.begin() + static_cast<int>(edit_i);
         while (it != edits.end() && *it == kMatch) ++it;
-        if (it == edits.end() || (it - edits.begin()) - edit_i >= context) {
+        if (it == edits.end() ||
+            static_cast<size_t>(it - edits.begin()) - edit_i >= context) {
           // There is no next edit or it is too far away.
           break;
         }
@@ -1270,7 +1406,7 @@
 // The string representation of the values received in EqFailure() are already
 // escaped. Split them on escaped '\n' boundaries. Leave all other escaped
 // characters the same.
-std::vector<std::string> SplitEscapedString(const std::string& str) {
+std::vector<std::string> SplitEscapedString(const std::string &str) {
   std::vector<std::string> lines;
   size_t start = 0, end = str.size();
   if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
@@ -1307,22 +1443,22 @@
 //   lhs_value:      "5"
 //   rhs_value:      "6"
 //
-// The ignoring_case parameter is true iff the assertion is a
+// The ignoring_case parameter is true if and only if the assertion is a
 // *_STRCASEEQ*.  When it's true, the string "Ignoring case" will
 // be inserted into the message.
-AssertionResult EqFailure(const char* lhs_expression,
-                          const char* rhs_expression,
-                          const std::string& lhs_value,
-                          const std::string& rhs_value,
-                          bool ignoring_case) {
+AssertionResult EqFailure(const char *lhs_expression,
+                          const char *rhs_expression,
+                          const std::string &lhs_value,
+                          const std::string &rhs_value, bool ignoring_case) {
   Message msg;
-  msg << "      Expected: " << lhs_expression;
+  msg << "Expected equality of these values:";
+  msg << "\n  " << lhs_expression;
   if (lhs_value != lhs_expression) {
-    msg << "\n      Which is: " << lhs_value;
+    msg << "\n    Which is: " << lhs_value;
   }
-  msg << "\nTo be equal to: " << rhs_expression;
+  msg << "\n  " << rhs_expression;
   if (rhs_value != rhs_expression) {
-    msg << "\n      Which is: " << rhs_value;
+    msg << "\n    Which is: " << rhs_value;
   }
 
   if (ignoring_case) {
@@ -1330,10 +1466,8 @@
   }
 
   if (!lhs_value.empty() && !rhs_value.empty()) {
-    const std::vector<std::string> lhs_lines =
-        SplitEscapedString(lhs_value);
-    const std::vector<std::string> rhs_lines =
-        SplitEscapedString(rhs_value);
+    const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
     if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
       msg << "\nWith diff:\n"
           << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
@@ -1345,47 +1479,36 @@
 
 // Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
 std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value) {
-  const char* actual_message = assertion_result.message();
+    const AssertionResult &assertion_result, const char *expression_text,
+    const char *actual_predicate_value, const char *expected_predicate_value) {
+  const char *actual_message = assertion_result.message();
   Message msg;
   msg << "Value of: " << expression_text
       << "\n  Actual: " << actual_predicate_value;
-  if (actual_message[0] != '\0')
-    msg << " (" << actual_message << ")";
+  if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
   msg << "\nExpected: " << expected_predicate_value;
   return msg.GetString();
 }
 
 // Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
-                                     const char* expr2,
-                                     const char* abs_error_expr,
-                                     double val1,
-                                     double val2,
-                                     double abs_error) {
+AssertionResult DoubleNearPredFormat(const char *expr1, const char *expr2,
+                                     const char *abs_error_expr, double val1,
+                                     double val2, double abs_error) {
   const double diff = fabs(val1 - val2);
   if (diff <= abs_error) return AssertionSuccess();
 
-  // TODO(wan): do not print the value of an expression if it's
-  // already a literal.
   return AssertionFailure()
-      << "The difference between " << expr1 << " and " << expr2
-      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
-      << expr1 << " evaluates to " << val1 << ",\n"
-      << expr2 << " evaluates to " << val2 << ", and\n"
-      << abs_error_expr << " evaluates to " << abs_error << ".";
+         << "The difference between " << expr1 << " and " << expr2 << " is "
+         << diff << ", which exceeds " << abs_error_expr << ", where\n"
+         << expr1 << " evaluates to " << val1 << ",\n"
+         << expr2 << " evaluates to " << val2 << ", and\n"
+         << abs_error_expr << " evaluates to " << abs_error << ".";
 }
 
-
 // Helper template for implementing FloatLE() and DoubleLE().
 template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
-                                const char* expr2,
-                                RawType val1,
-                                RawType val2) {
+AssertionResult FloatingPointLE(const char *expr1, const char *expr2,
+                                RawType val1, RawType val2) {
   // Returns success if val1 is less than val2,
   if (val1 < val2) {
     return AssertionSuccess();
@@ -1410,24 +1533,24 @@
           << val2;
 
   return AssertionFailure()
-      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
-      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
-      << StringStreamToString(&val2_ss);
+         << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+         << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+         << StringStreamToString(&val2_ss);
 }
 
 }  // namespace internal
 
 // Asserts that val1 is less than, or almost equal to, val2.  Fails
 // otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
-                        float val1, float val2) {
+AssertionResult FloatLE(const char *expr1, const char *expr2, float val1,
+                        float val2) {
   return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
 }
 
 // Asserts that val1 is less than, or almost equal to, val2.  Fails
 // otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
-                         double val1, double val2) {
+AssertionResult DoubleLE(const char *expr1, const char *expr2, double val1,
+                         double val2) {
   return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
 }
 
@@ -1435,36 +1558,33 @@
 
 // The helper function for {ASSERT|EXPECT}_EQ with int or enum
 // arguments.
-AssertionResult CmpHelperEQ(const char* lhs_expression,
-                            const char* rhs_expression,
-                            BiggestInt lhs,
+AssertionResult CmpHelperEQ(const char *lhs_expression,
+                            const char *rhs_expression, BiggestInt lhs,
                             BiggestInt rhs) {
   if (lhs == rhs) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
+  return EqFailure(lhs_expression, rhs_expression,
                    FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs),
-                   false);
+                   FormatForComparisonFailureMessage(rhs, lhs), false);
 }
 
 // A macro for implementing the helper functions needed to implement
 // ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
 // just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   BiggestInt val1, BiggestInt val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                    \
+  AssertionResult CmpHelper##op_name(const char *expr1, const char *expr2,     \
+                                     BiggestInt val1, BiggestInt val2) {       \
+    if (val1 op val2) {                                                        \
+      return AssertionSuccess();                                               \
+    } else {                                                                   \
+      return AssertionFailure()                                                \
+             << "Expected: (" << expr1 << ") " #op " (" << expr2               \
+             << "), actual: " << FormatForComparisonFailureMessage(val1, val2) \
+             << " vs " << FormatForComparisonFailureMessage(val2, val1);       \
+    }                                                                          \
+  }
 
 // Implements the helper function for {ASSERT|EXPECT}_NE with int or
 // enum arguments.
@@ -1474,74 +1594,63 @@
 GTEST_IMPL_CMP_HELPER_(LE, <=)
 // Implements the helper function for {ASSERT|EXPECT}_LT with int or
 // enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, < )
+GTEST_IMPL_CMP_HELPER_(LT, <)
 // Implements the helper function for {ASSERT|EXPECT}_GE with int or
 // enum arguments.
 GTEST_IMPL_CMP_HELPER_(GE, >=)
 // Implements the helper function for {ASSERT|EXPECT}_GT with int or
 // enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, > )
+GTEST_IMPL_CMP_HELPER_(GT, >)
 
 #undef GTEST_IMPL_CMP_HELPER_
 
 // The helper function for {ASSERT|EXPECT}_STREQ.
-AssertionResult CmpHelperSTREQ(const char* lhs_expression,
-                               const char* rhs_expression,
-                               const char* lhs,
-                               const char* rhs) {
+AssertionResult CmpHelperSTREQ(const char *lhs_expression,
+                               const char *rhs_expression, const char *lhs,
+                               const char *rhs) {
   if (String::CStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   false);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
 }
 
 // The helper function for {ASSERT|EXPECT}_STRCASEEQ.
-AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
-                                   const char* rhs_expression,
-                                   const char* lhs,
-                                   const char* rhs) {
+AssertionResult CmpHelperSTRCASEEQ(const char *lhs_expression,
+                                   const char *rhs_expression, const char *lhs,
+                                   const char *rhs) {
   if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   true);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), true);
 }
 
 // The helper function for {ASSERT|EXPECT}_STRNE.
-AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const char* s1,
-                               const char* s2) {
+AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                               const char *s2_expression, const char *s1,
+                               const char *s2) {
   if (!String::CStringEquals(s1, s2)) {
     return AssertionSuccess();
   } else {
-    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                              << s2_expression << "), actual: \""
-                              << s1 << "\" vs \"" << s2 << "\"";
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
   }
 }
 
 // The helper function for {ASSERT|EXPECT}_STRCASENE.
-AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
-                                   const char* s2_expression,
-                                   const char* s1,
-                                   const char* s2) {
+AssertionResult CmpHelperSTRCASENE(const char *s1_expression,
+                                   const char *s2_expression, const char *s1,
+                                   const char *s2) {
   if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
     return AssertionSuccess();
   } else {
     return AssertionFailure()
-        << "Expected: (" << s1_expression << ") != ("
-        << s2_expression << ") (ignoring case), actual: \""
-        << s1 << "\" vs \"" << s2 << "\"";
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
   }
 }
 
@@ -1551,28 +1660,25 @@
 
 // Helper functions for implementing IsSubString() and IsNotSubstring().
 
-// This group of overloaded functions return true iff needle is a
-// substring of haystack.  NULL is considered a substring of itself
-// only.
+// This group of overloaded functions return true if and only if needle
+// is a substring of haystack.  NULL is considered a substring of
+// itself only.
 
-bool IsSubstringPred(const char* needle, const char* haystack) {
-  if (needle == NULL || haystack == NULL)
-    return needle == haystack;
+bool IsSubstringPred(const char *needle, const char *haystack) {
+  if (needle == nullptr || haystack == nullptr) return needle == haystack;
 
-  return strstr(haystack, needle) != NULL;
+  return strstr(haystack, needle) != nullptr;
 }
 
-bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
-  if (needle == NULL || haystack == NULL)
-    return needle == haystack;
+bool IsSubstringPred(const wchar_t *needle, const wchar_t *haystack) {
+  if (needle == nullptr || haystack == nullptr) return needle == haystack;
 
-  return wcsstr(haystack, needle) != NULL;
+  return wcsstr(haystack, needle) != nullptr;
 }
 
 // StringType here can be either ::std::string or ::std::wstring.
 template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
-                     const StringType& haystack) {
+bool IsSubstringPred(const StringType &needle, const StringType &haystack) {
   return haystack.find(needle) != StringType::npos;
 }
 
@@ -1581,21 +1687,22 @@
 // StringType here can be const char*, const wchar_t*, ::std::string,
 // or ::std::wstring.
 template <typename StringType>
-AssertionResult IsSubstringImpl(
-    bool expected_to_be_substring,
-    const char* needle_expr, const char* haystack_expr,
-    const StringType& needle, const StringType& haystack) {
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+                                const char *needle_expr,
+                                const char *haystack_expr,
+                                const StringType &needle,
+                                const StringType &haystack) {
   if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
     return AssertionSuccess();
 
   const bool is_wide_string = sizeof(needle[0]) > 1;
-  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  const char *const begin_string_quote = is_wide_string ? "L\"" : "\"";
   return AssertionFailure()
-      << "Value of: " << needle_expr << "\n"
-      << "  Actual: " << begin_string_quote << needle << "\"\n"
-      << "Expected: " << (expected_to_be_substring ? "" : "not ")
-      << "a substring of " << haystack_expr << "\n"
-      << "Which is: " << begin_string_quote << haystack << "\"";
+         << "Value of: " << needle_expr << "\n"
+         << "  Actual: " << begin_string_quote << needle << "\"\n"
+         << "Expected: " << (expected_to_be_substring ? "" : "not ")
+         << "a substring of " << haystack_expr << "\n"
+         << "Which is: " << begin_string_quote << haystack << "\"";
 }
 
 }  // namespace
@@ -1604,52 +1711,52 @@
 // substring of haystack (NULL is considered a substring of itself
 // only), and return an appropriate error message when they fail.
 
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const char *needle, const char *haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const wchar_t *needle, const wchar_t *haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr, const char *needle,
+                               const char *haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr, const wchar_t *needle,
+                               const wchar_t *haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const ::std::string &needle,
+                            const ::std::string &haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr,
+                               const ::std::string &needle,
+                               const ::std::string &haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
 #if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const ::std::wstring &needle,
+                            const ::std::wstring &haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr,
+                               const ::std::wstring &needle,
+                               const ::std::wstring &haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 #endif  // GTEST_HAS_STD_WSTRING
@@ -1661,55 +1768,54 @@
 namespace {
 
 // Helper function for IsHRESULT{SuccessFailure} predicates
-AssertionResult HRESULTFailureHelper(const char* expr,
-                                     const char* expected,
+AssertionResult HRESULTFailureHelper(const char *expr, const char *expected,
                                      long hr) {  // NOLINT
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
 
   // Windows CE doesn't support FormatMessage.
   const char error_text[] = "";
 
-# else
+#else
 
   // Looks up the human-readable system message for the HRESULT code
   // and since we're not passing any params to FormatMessage, we don't
   // want inserts expanded.
-  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
-                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kFlags =
+      FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
   const DWORD kBufSize = 4096;
   // Gets the system's human readable message string for this HRESULT.
   char error_text[kBufSize] = { '\0' };
   DWORD message_length = ::FormatMessageA(kFlags,
                                           0,  // no source, we're asking system
-                                          hr,  // the error
+                                          static_cast<DWORD>(hr),  // the error
                                           0,  // no line width restrictions
                                           error_text,  // output buffer
-                                          kBufSize,  // buf size
-                                          NULL);  // no arguments for inserts
+                                          kBufSize,    // buf size
+                                          nullptr);  // no arguments for inserts
   // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
   for (; message_length && IsSpace(error_text[message_length - 1]);
-          --message_length) {
+       --message_length) {
     error_text[message_length - 1] = '\0';
   }
 
-# endif  // GTEST_OS_WINDOWS_MOBILE
+#endif  // GTEST_OS_WINDOWS_MOBILE
 
   const std::string error_hex("0x" + String::FormatHexInt(hr));
   return ::testing::AssertionFailure()
-      << "Expected: " << expr << " " << expected << ".\n"
-      << "  Actual: " << error_hex << " " << error_text << "\n";
+         << "Expected: " << expr << " " << expected << ".\n"
+         << "  Actual: " << error_hex << " " << error_text << "\n";
 }
 
 }  // namespace
 
-AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+AssertionResult IsHRESULTSuccess(const char *expr, long hr) {  // NOLINT
   if (SUCCEEDED(hr)) {
     return AssertionSuccess();
   }
   return HRESULTFailureHelper(expr, "succeeds", hr);
 }
 
-AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+AssertionResult IsHRESULTFailure(const char *expr, long hr) {  // NOLINT
   if (FAILED(hr)) {
     return AssertionSuccess();
   }
@@ -1721,7 +1827,7 @@
 // Utility functions for encoding Unicode text (wide strings) in
 // UTF-8.
 
-// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
 // like this:
 //
 // Code-point length   Encoding
@@ -1731,41 +1837,43 @@
 //  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 
 // The maximum code-point a one-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
 
 // The maximum code-point a two-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
 
 // The maximum code-point a three-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+constexpr uint32_t kMaxCodePoint3 =
+    (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
 
 // The maximum code-point a four-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+constexpr uint32_t kMaxCodePoint4 =
+    (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
 
 // Chops off the n lowest bits from a bit pattern.  Returns the n
 // lowest bits.  As a side effect, the original bit pattern will be
 // shifted to the right by n bits.
-inline UInt32 ChopLowBits(UInt32* bits, int n) {
-  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
+inline uint32_t ChopLowBits(uint32_t *bits, int n) {
+  const uint32_t low_bits = *bits & ((static_cast<uint32_t>(1) << n) - 1);
   *bits >>= n;
   return low_bits;
 }
 
 // Converts a Unicode code point to a narrow string in UTF-8 encoding.
-// code_point parameter is of type UInt32 because wchar_t may not be
+// code_point parameter is of type uint32_t because wchar_t may not be
 // wide enough to contain a code point.
 // If the code_point is not a valid Unicode code point
 // (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
 // to "(Invalid Unicode 0xXXXXXXXX)".
-std::string CodePointToUtf8(UInt32 code_point) {
+std::string CodePointToUtf8(uint32_t code_point) {
   if (code_point > kMaxCodePoint4) {
-    return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")";
+    return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")";
   }
 
   char str[5];  // Big enough for the largest valid code point.
   if (code_point <= kMaxCodePoint1) {
     str[1] = '\0';
-    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+    str[0] = static_cast<char>(code_point);  // 0xxxxxxx
   } else if (code_point <= kMaxCodePoint2) {
     str[2] = '\0';
     str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
@@ -1785,32 +1893,35 @@
   return str;
 }
 
-// The following two functions only make sense if the the system
+// The following two functions only make sense if the system
 // uses UTF-16 for wide string encoding. All supported systems
-// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16.
 
 // Determines if the arguments constitute UTF-16 surrogate pair
 // and thus should be combined into a single Unicode code point
 // using CreateCodePointFromUtf16SurrogatePair.
 inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
-  return sizeof(wchar_t) == 2 &&
-      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+  return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+         (second & 0xFC00) == 0xDC00;
 }
 
 // Creates a Unicode code point from UTF16 surrogate pair.
-inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
-                                                    wchar_t second) {
-  const UInt32 mask = (1 << 10) - 1;
-  return (sizeof(wchar_t) == 2) ?
-      (((first & mask) << 10) | (second & mask)) + 0x10000 :
-      // This function should not be called when the condition is
-      // false, but we provide a sensible default in case it is.
-      static_cast<UInt32>(first);
+inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                      wchar_t second) {
+  const auto first_u = static_cast<uint32_t>(first);
+  const auto second_u = static_cast<uint32_t>(second);
+  const uint32_t mask = (1 << 10) - 1;
+  return (sizeof(wchar_t) == 2)
+             ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000
+             :
+             // This function should not be called when the condition is
+             // false, but we provide a sensible default in case it is.
+             first_u;
 }
 
 // Converts a wide string to a narrow string in UTF-8 encoding.
 // The wide string is assumed to have the following encoding:
-//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
 //   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
 // Parameter str points to a null-terminated wide string.
 // Parameter num_chars may additionally limit the number
@@ -1821,22 +1932,21 @@
 // as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
 // and contains invalid UTF-16 surrogate pairs, values in those pairs
 // will be encoded as individual Unicode characters from Basic Normal Plane.
-std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
-  if (num_chars == -1)
-    num_chars = static_cast<int>(wcslen(str));
+std::string WideStringToUtf8(const wchar_t *str, int num_chars) {
+  if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
 
   ::std::stringstream stream;
   for (int i = 0; i < num_chars; ++i) {
-    UInt32 unicode_code_point;
+    uint32_t unicode_code_point;
 
     if (str[i] == L'\0') {
       break;
     } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
-      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
-                                                                 str[i + 1]);
+      unicode_code_point =
+          CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
       i++;
     } else {
-      unicode_code_point = static_cast<UInt32>(str[i]);
+      unicode_code_point = static_cast<uint32_t>(str[i]);
     }
 
     stream << CodePointToUtf8(unicode_code_point);
@@ -1846,88 +1956,80 @@
 
 // Converts a wide C string to an std::string using the UTF-8 encoding.
 // NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t * wide_c_str) {
-  if (wide_c_str == NULL)  return "(null)";
+std::string String::ShowWideCString(const wchar_t *wide_c_str) {
+  if (wide_c_str == nullptr) return "(null)";
 
   return internal::WideStringToUtf8(wide_c_str, -1);
 }
 
-// Compares two wide C strings.  Returns true iff they have the same
-// content.
+// Compares two wide C strings.  Returns true if and only if they have the
+// same content.
 //
 // Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
 // C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
-  if (lhs == NULL) return rhs == NULL;
+bool String::WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
 
-  if (rhs == NULL) return false;
+  if (rhs == nullptr) return false;
 
   return wcscmp(lhs, rhs) == 0;
 }
 
 // Helper function for *_STREQ on wide strings.
-AssertionResult CmpHelperSTREQ(const char* lhs_expression,
-                               const char* rhs_expression,
-                               const wchar_t* lhs,
-                               const wchar_t* rhs) {
+AssertionResult CmpHelperSTREQ(const char *lhs_expression,
+                               const char *rhs_expression, const wchar_t *lhs,
+                               const wchar_t *rhs) {
   if (String::WideCStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   false);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
 }
 
 // Helper function for *_STRNE on wide strings.
-AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const wchar_t* s1,
-                               const wchar_t* s2) {
+AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                               const char *s2_expression, const wchar_t *s1,
+                               const wchar_t *s2) {
   if (!String::WideCStringEquals(s1, s2)) {
     return AssertionSuccess();
   }
 
-  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                            << s2_expression << "), actual: "
-                            << PrintToString(s1)
-                            << " vs " << PrintToString(s2);
+  return AssertionFailure()
+         << "Expected: (" << s1_expression << ") != (" << s2_expression
+         << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
 }
 
-// Compares two C strings, ignoring case.  Returns true iff they have
+// Compares two C strings, ignoring case.  Returns true if and only if they have
 // the same content.
 //
 // Unlike strcasecmp(), this function can handle NULL argument(s).  A
 // NULL C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
-  if (lhs == NULL)
-    return rhs == NULL;
-  if (rhs == NULL)
-    return false;
+bool String::CaseInsensitiveCStringEquals(const char *lhs, const char *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+  if (rhs == nullptr) return false;
   return posix::StrCaseCmp(lhs, rhs) == 0;
 }
 
-  // Compares two wide C strings, ignoring case.  Returns true iff they
-  // have the same content.
-  //
-  // Unlike wcscasecmp(), this function can handle NULL argument(s).
-  // A NULL C string is considered different to any non-NULL wide C string,
-  // including the empty string.
-  // NB: The implementations on different platforms slightly differ.
-  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
-  // environment variable. On GNU platform this method uses wcscasecmp
-  // which compares according to LC_CTYPE category of the current locale.
-  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
-  // current locale.
-bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
-                                              const wchar_t* rhs) {
-  if (lhs == NULL) return rhs == NULL;
+// Compares two wide C strings, ignoring case.  Returns true if and only if they
+// have the same content.
+//
+// Unlike wcscasecmp(), this function can handle NULL argument(s).
+// A NULL C string is considered different to any non-NULL wide C string,
+// including the empty string.
+// NB: The implementations on different platforms slightly differ.
+// On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+// environment variable. On GNU platform this method uses wcscasecmp
+// which compares according to LC_CTYPE category of the current locale.
+// On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+// current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t *lhs,
+                                              const wchar_t *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
 
-  if (rhs == NULL) return false;
+  if (rhs == nullptr) return false;
 
 #if GTEST_OS_WINDOWS
   return _wcsicmp(lhs, rhs) == 0;
@@ -1938,17 +2040,17 @@
   // Other unknown OSes may not define it either.
   wint_t left, right;
   do {
-    left = towlower(*lhs++);
-    right = towlower(*rhs++);
+    left = towlower(static_cast<wint_t>(*lhs++));
+    right = towlower(static_cast<wint_t>(*rhs++));
   } while (left && left == right);
   return left == right;
 #endif  // OS selector
 }
 
-// Returns true iff str ends with the given suffix, ignoring case.
+// Returns true if and only if str ends with the given suffix, ignoring case.
 // Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(
-    const std::string& str, const std::string& suffix) {
+bool String::EndsWithCaseInsensitive(const std::string &str,
+                                     const std::string &suffix) {
   const size_t str_len = str.length();
   const size_t suffix_len = suffix.length();
   return (str_len >= suffix_len) &&
@@ -1964,12 +2066,17 @@
 }
 
 // Formats an int value as "%X".
-std::string String::FormatHexInt(int value) {
+std::string String::FormatHexUInt32(uint32_t value) {
   std::stringstream ss;
   ss << std::hex << std::uppercase << value;
   return ss.str();
 }
 
+// Formats an int value as "%X".
+std::string String::FormatHexInt(int value) {
+  return FormatHexUInt32(static_cast<uint32_t>(value));
+}
+
 // Formats a byte as "%02X".
 std::string String::FormatByte(unsigned char value) {
   std::stringstream ss;
@@ -1980,14 +2087,14 @@
 
 // Converts the buffer in a stringstream to an std::string, converting NUL
 // bytes to "\\0" along the way.
-std::string StringStreamToString(::std::stringstream* ss) {
-  const ::std::string& str = ss->str();
-  const char* const start = str.c_str();
-  const char* const end = start + str.length();
+std::string StringStreamToString(::std::stringstream *ss) {
+  const ::std::string &str = ss->str();
+  const char *const start = str.c_str();
+  const char *const end = start + str.length();
 
   std::string result;
-  result.reserve(2 * (end - start));
-  for (const char* ch = start; ch != end; ++ch) {
+  result.reserve(static_cast<size_t>(2 * (end - start)));
+  for (const char *ch = start; ch != end; ++ch) {
     if (*ch == '\0') {
       result += "\\0";  // Replaces NUL with "\\0";
     } else {
@@ -1999,8 +2106,8 @@
 }
 
 // Appends the user-supplied message to the Google-Test-generated message.
-std::string AppendUserMessage(const std::string& gtest_msg,
-                              const Message& user_msg) {
+std::string AppendUserMessage(const std::string &gtest_msg,
+                              const Message &user_msg) {
   // Appends the user message if it's non-empty.
   const std::string user_msg_string = user_msg.GetString();
   if (user_msg_string.empty()) {
@@ -2016,47 +2123,40 @@
 
 // Creates an empty TestResult.
 TestResult::TestResult()
-    : death_test_count_(0),
-      elapsed_time_(0) {
-}
+    : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
 
 // D'tor.
-TestResult::~TestResult() {
-}
+TestResult::~TestResult() {}
 
 // Returns the i-th test part result among all the results. i can
 // range from 0 to total_part_count() - 1. If i is not in that range,
 // aborts the program.
-const TestPartResult& TestResult::GetTestPartResult(int i) const {
-  if (i < 0 || i >= total_part_count())
-    internal::posix::Abort();
-  return test_part_results_.at(i);
+const TestPartResult &TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count()) internal::posix::Abort();
+  return test_part_results_.at(static_cast<size_t>(i));
 }
 
 // Returns the i-th test property. i can range from 0 to
 // test_property_count() - 1. If i is not in that range, aborts the
 // program.
-const TestProperty& TestResult::GetTestProperty(int i) const {
-  if (i < 0 || i >= test_property_count())
-    internal::posix::Abort();
-  return test_properties_.at(i);
+const TestProperty &TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count()) internal::posix::Abort();
+  return test_properties_.at(static_cast<size_t>(i));
 }
 
 // Clears the test part results.
-void TestResult::ClearTestPartResults() {
-  test_part_results_.clear();
-}
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
 
 // Adds a test part result to the list.
-void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+void TestResult::AddTestPartResult(const TestPartResult &test_part_result) {
   test_part_results_.push_back(test_part_result);
 }
 
 // Adds a test property to the list. If a property with the same key as the
 // supplied property is already represented, the value of this test_property
 // replaces the old value for that key.
-void TestResult::RecordProperty(const std::string& xml_element,
-                                const TestProperty& test_property) {
+void TestResult::RecordProperty(const std::string &xml_element,
+                                const TestProperty &test_property) {
   if (!ValidateTestProperty(xml_element, test_property)) {
     return;
   }
@@ -2073,45 +2173,37 @@
 
 // The list of reserved attributes used in the <testsuites> element of XML
 // output.
-static const char* const kReservedTestSuitesAttributes[] = {
-  "disabled",
-  "errors",
-  "failures",
-  "name",
-  "random_seed",
-  "tests",
-  "time",
-  "timestamp"
+static const char *const kReservedTestSuitesAttributes[] = {
+  "disabled",    "errors", "failures", "name",
+  "random_seed", "tests",  "time",     "timestamp"
 };
 
 // The list of reserved attributes used in the <testsuite> element of XML
 // output.
-static const char* const kReservedTestSuiteAttributes[] = {
-  "disabled",
-  "errors",
-  "failures",
-  "name",
-  "tests",
-  "time"
+static const char *const kReservedTestSuiteAttributes[] = {
+  "disabled", "errors", "failures", "name", "tests", "time", "timestamp"
 };
 
 // The list of reserved attributes used in the <testcase> element of XML output.
-static const char* const kReservedTestCaseAttributes[] = {
-  "classname",
-  "name",
-  "status",
-  "time",
-  "type_param",
-  "value_param"
+static const char *const kReservedTestCaseAttributes[] = {
+  "classname",  "name",        "status", "time",
+  "type_param", "value_param", "file",   "line"
+};
+
+// Use a slightly different set for allowed output to ensure existing tests can
+// still RecordProperty("result") or RecordProperty("timestamp")
+static const char *const kReservedOutputTestCaseAttributes[] = {
+  "classname",   "name", "status", "time",   "type_param",
+  "value_param", "file", "line",   "result", "timestamp"
 };
 
 template <int kSize>
-std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+std::vector<std::string> ArrayAsVector(const char *const (&array)[kSize]) {
   return std::vector<std::string>(array, array + kSize);
 }
 
 static std::vector<std::string> GetReservedAttributesForElement(
-    const std::string& xml_element) {
+    const std::string &xml_element) {
   if (xml_element == "testsuites") {
     return ArrayAsVector(kReservedTestSuitesAttributes);
   } else if (xml_element == "testsuite") {
@@ -2125,7 +2217,23 @@
   return std::vector<std::string>();
 }
 
-static std::string FormatWordList(const std::vector<std::string>& words) {
+// TODO(jdesprez): Merge the two getReserved attributes once skip is improved
+static std::vector<std::string> GetReservedOutputAttributesForElement(
+    const std::string &xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedOutputTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realize that.
+  return std::vector<std::string>();
+}
+
+static std::string FormatWordList(const std::vector<std::string> &words) {
   Message word_list;
   for (size_t i = 0; i < words.size(); ++i) {
     if (i > 0 && words.size() > 2) {
@@ -2139,10 +2247,11 @@
   return word_list.GetString();
 }
 
-bool ValidateTestPropertyName(const std::string& property_name,
-                              const std::vector<std::string>& reserved_names) {
+static bool ValidateTestPropertyName(
+    const std::string &property_name,
+    const std::vector<std::string> &reserved_names) {
   if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
-          reserved_names.end()) {
+      reserved_names.end()) {
     ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
                   << " (" << FormatWordList(reserved_names)
                   << " are reserved by " << GTEST_NAME_ << ")";
@@ -2153,8 +2262,8 @@
 
 // Adds a failure if the key is a reserved attribute of the element named
 // xml_element.  Returns true if the property is valid.
-bool TestResult::ValidateTestProperty(const std::string& xml_element,
-                                      const TestProperty& test_property) {
+bool TestResult::ValidateTestProperty(const std::string &xml_element,
+                                      const TestProperty &test_property) {
   return ValidateTestPropertyName(test_property.key(),
                                   GetReservedAttributesForElement(xml_element));
 }
@@ -2167,31 +2276,40 @@
   elapsed_time_ = 0;
 }
 
-// Returns true iff the test failed.
+// Returns true if and only if the test part was skipped.
+static bool TestPartSkipped(const TestPartResult &result) {
+  return result.skipped();
+}
+
+// Returns true if and only if the test was skipped.
+bool TestResult::Skipped() const {
+  return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0;
+}
+
+// Returns true if and only if the test failed.
 bool TestResult::Failed() const {
   for (int i = 0; i < total_part_count(); ++i) {
-    if (GetTestPartResult(i).failed())
-      return true;
+    if (GetTestPartResult(i).failed()) return true;
   }
   return false;
 }
 
-// Returns true iff the test part fatally failed.
-static bool TestPartFatallyFailed(const TestPartResult& result) {
+// Returns true if and only if the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult &result) {
   return result.fatally_failed();
 }
 
-// Returns true iff the test fatally failed.
+// Returns true if and only if the test fatally failed.
 bool TestResult::HasFatalFailure() const {
   return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
 }
 
-// Returns true iff the test part non-fatally failed.
-static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+// Returns true if and only if the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult &result) {
   return result.nonfatally_failed();
 }
 
-// Returns true iff the test has a non-fatal failure.
+// Returns true if and only if the test has a non-fatal failure.
 bool TestResult::HasNonfatalFailure() const {
   return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
 }
@@ -2212,35 +2330,30 @@
 // Creates a Test object.
 
 // The c'tor saves the states of all flags.
-Test::Test()
-    : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
-}
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
 
 // The d'tor restores the states of all flags.  The actual work is
 // done by the d'tor of the gtest_flag_saver_ field, and thus not
 // visible here.
-Test::~Test() {
-}
+Test::~Test() {}
 
 // Sets up the test fixture.
 //
 // A sub-class may override this.
-void Test::SetUp() {
-}
+void Test::SetUp() {}
 
 // Tears down the test fixture.
 //
 // A sub-class may override this.
-void Test::TearDown() {
-}
+void Test::TearDown() {}
 
 // Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const std::string& key, const std::string& value) {
+void Test::RecordProperty(const std::string &key, const std::string &value) {
   UnitTest::GetInstance()->RecordProperty(key, value);
 }
 
 // Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const std::string& key, int value) {
+void Test::RecordProperty(const std::string &key, int value) {
   Message value_message;
   value_message << value;
   RecordProperty(key, value_message.GetString().c_str());
@@ -2249,37 +2362,37 @@
 namespace internal {
 
 void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
-                                    const std::string& message) {
+                                    const std::string &message) {
   // This function is a friend of UnitTest and as such has access to
   // AddTestPartResult.
   UnitTest::GetInstance()->AddTestPartResult(
       result_type,
-      NULL,  // No info about the source file where the exception occurred.
-      -1,    // We have no info on which line caused the exception.
+      nullptr,  // No info about the source file where the exception occurred.
+      -1,       // We have no info on which line caused the exception.
       message,
-      "");   // No stack trace, either.
+      "");  // No stack trace, either.
 }
 
 }  // namespace internal
 
-// Google Test requires all tests in the same test case to use the same test
+// Google Test requires all tests in the same test suite to use the same test
 // fixture class.  This function checks if the current test has the
-// same fixture class as the first test in the current test case.  If
+// same fixture class as the first test in the current test suite.  If
 // yes, it returns true; otherwise it generates a Google Test failure and
 // returns false.
 bool Test::HasSameFixtureClass() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  const TestCase* const test_case = impl->current_test_case();
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  const TestSuite *const test_suite = impl->current_test_suite();
 
-  // Info about the first test in the current test case.
-  const TestInfo* const first_test_info = test_case->test_info_list()[0];
+  // Info about the first test in the current test suite.
+  const TestInfo *const first_test_info = test_suite->test_info_list()[0];
   const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
-  const char* const first_test_name = first_test_info->name();
+  const char *const first_test_name = first_test_info->name();
 
   // Info about the current test.
-  const TestInfo* const this_test_info = impl->current_test_info();
+  const TestInfo *const this_test_info = impl->current_test_info();
   const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
-  const char* const this_test_name = this_test_info->name();
+  const char *const this_test_name = this_test_info->name();
 
   if (this_fixture_id != first_fixture_id) {
     // Is the first test defined using TEST?
@@ -2288,21 +2401,21 @@
     const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
 
     if (first_is_TEST || this_is_TEST) {
-      // Both TEST and TEST_F appear in same test case, which is incorrect.
+      // Both TEST and TEST_F appear in same test suite, which is incorrect.
       // Tell the user how to fix this.
 
       // Gets the name of the TEST and the name of the TEST_F.  Note
       // that first_is_TEST and this_is_TEST cannot both be true, as
       // the fixture IDs are different for the two tests.
-      const char* const TEST_name =
+      const char *const TEST_name =
           first_is_TEST ? first_test_name : this_test_name;
-      const char* const TEST_F_name =
+      const char *const TEST_F_name =
           first_is_TEST ? this_test_name : first_test_name;
 
       ADD_FAILURE()
-          << "All tests in the same test case must use the same test fixture\n"
-          << "class, so mixing TEST_F and TEST in the same test case is\n"
-          << "illegal.  In test case " << this_test_info->test_case_name()
+          << "All tests in the same test suite must use the same test fixture\n"
+          << "class, so mixing TEST_F and TEST in the same test suite is\n"
+          << "illegal.  In test suite " << this_test_info->test_suite_name()
           << ",\n"
           << "test " << TEST_F_name << " is defined using TEST_F but\n"
           << "test " << TEST_name << " is defined using TEST.  You probably\n"
@@ -2312,15 +2425,15 @@
       // Two fixture classes with the same name appear in two different
       // namespaces, which is not allowed. Tell the user how to fix this.
       ADD_FAILURE()
-          << "All tests in the same test case must use the same test fixture\n"
-          << "class.  However, in test case "
-          << this_test_info->test_case_name() << ",\n"
-          << "you defined test " << first_test_name
-          << " and test " << this_test_name << "\n"
+          << "All tests in the same test suite must use the same test fixture\n"
+          << "class.  However, in test suite "
+          << this_test_info->test_suite_name() << ",\n"
+          << "you defined test " << first_test_name << " and test "
+          << this_test_name << "\n"
           << "using two different test fixture classes.  This can happen if\n"
           << "the two classes are from different namespaces or translation\n"
           << "units and have the same name.  You should probably rename one\n"
-          << "of the classes to put the tests into different test cases.";
+          << "of the classes to put the tests into different test suites.";
     }
     return false;
   }
@@ -2334,11 +2447,11 @@
 // function returns its result via an output parameter pointer because VC++
 // prohibits creation of objects with destructors on stack in functions
 // using __try (see error C2712).
-static std::string* FormatSehExceptionMessage(DWORD exception_code,
-                                              const char* location) {
+static std::string *FormatSehExceptionMessage(DWORD exception_code,
+                                              const char *location) {
   Message message;
-  message << "SEH exception with code 0x" << std::setbase(16) <<
-    exception_code << std::setbase(10) << " thrown in " << location << ".";
+  message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+          << std::setbase(10) << " thrown in " << location << ".";
 
   return new std::string(message.GetString());
 }
@@ -2350,10 +2463,10 @@
 #if GTEST_HAS_EXCEPTIONS
 
 // Adds an "exception thrown" fatal failure to the current test.
-static std::string FormatCxxExceptionMessage(const char* description,
-                                             const char* location) {
+static std::string FormatCxxExceptionMessage(const char *description,
+                                             const char *location) {
   Message message;
-  if (description != NULL) {
+  if (description != nullptr) {
     message << "C++ exception with description \"" << description << "\"";
   } else {
     message << "Unknown C++ exception";
@@ -2364,10 +2477,10 @@
 }
 
 static std::string PrintTestPartResultToString(
-    const TestPartResult& test_part_result);
+    const TestPartResult &test_part_result);
 
 GoogleTestFailureException::GoogleTestFailureException(
-    const TestPartResult& failure)
+    const TestPartResult &failure)
     : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
 
 #endif  // GTEST_HAS_EXCEPTIONS
@@ -2381,8 +2494,8 @@
 // exceptions in the same function.  Therefore, we provide a separate
 // wrapper function for handling SEH exceptions.)
 template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
+Result HandleSehExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
+                                              const char *location) {
 #if GTEST_HAS_SEH
   __try {
     return (object->*method)();
@@ -2391,8 +2504,8 @@
     // We create the exception message on the heap because VC++ prohibits
     // creation of objects with destructors on stack in functions using __try
     // (see error C2712).
-    std::string* exception_message = FormatSehExceptionMessage(
-        GetExceptionCode(), location);
+    std::string *exception_message =
+        FormatSehExceptionMessage(GetExceptionCode(), location);
     internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
                                              *exception_message);
     delete exception_message;
@@ -2408,8 +2521,8 @@
 // exceptions, if they are supported; returns the 0-value for type
 // Result in case of an SEH exception.
 template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
+Result HandleExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
+                                           const char *location) {
   // NOTE: The user code can affect the way in which Google Test handles
   // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
   // RUN_ALL_TESTS() starts. It is technically possible to check the flag
@@ -2437,19 +2550,21 @@
 #if GTEST_HAS_EXCEPTIONS
     try {
       return HandleSehExceptionsInMethodIfSupported(object, method, location);
-    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
+    } catch (const AssertionException &) {  // NOLINT
+      // This failure was reported already.
+    } catch (const internal::GoogleTestFailureException &) {  // NOLINT
       // This exception type can only be thrown by a failed Google
       // Test assertion with the intention of letting another testing
       // framework catch it.  Therefore we just re-throw it.
       throw;
-    } catch (const std::exception& e) {  // NOLINT
+    } catch (const std::exception &e) {  // NOLINT
       internal::ReportFailureInUnknownLocation(
           TestPartResult::kFatalFailure,
           FormatCxxExceptionMessage(e.what(), location));
     } catch (...) {  // NOLINT
       internal::ReportFailureInUnknownLocation(
           TestPartResult::kFatalFailure,
-          FormatCxxExceptionMessage(NULL, location));
+          FormatCxxExceptionMessage(nullptr, location));
     }
     return static_cast<Result>(0);
 #else
@@ -2466,57 +2581,58 @@
 void Test::Run() {
   if (!HasSameFixtureClass()) return;
 
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
-  // We will run the test only if SetUp() was successful.
-  if (!HasFatalFailure()) {
+  // We will run the test only if SetUp() was successful and didn't call
+  // GTEST_SKIP().
+  if (!HasFatalFailure() && !IsSkipped()) {
     impl->os_stack_trace_getter()->UponLeavingGTest();
-    internal::HandleExceptionsInMethodIfSupported(
-        this, &Test::TestBody, "the test body");
+    internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+                                                  "the test body");
   }
 
   // However, we want to clean up as much as possible.  Hence we will
   // always call TearDown(), even if SetUp() or the test body has
   // failed.
   impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &Test::TearDown, "TearDown()");
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+                                                "TearDown()");
 }
 
-// Returns true iff the current test has a fatal failure.
+// Returns true if and only if the current test has a fatal failure.
 bool Test::HasFatalFailure() {
   return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
 }
 
-// Returns true iff the current test has a non-fatal failure.
+// Returns true if and only if the current test has a non-fatal failure.
 bool Test::HasNonfatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->
-      HasNonfatalFailure();
+  return internal::GetUnitTestImpl()
+      ->current_test_result()
+      ->HasNonfatalFailure();
+}
+
+// Returns true if and only if the current test was skipped.
+bool Test::IsSkipped() {
+  return internal::GetUnitTestImpl()->current_test_result()->Skipped();
 }
 
 // class TestInfo
 
 // Constructs a TestInfo object. It assumes ownership of the test factory
 // object.
-TestInfo::TestInfo(const std::string& a_test_case_name,
-                   const std::string& a_name,
-                   const char* a_type_param,
-                   const char* a_value_param,
+TestInfo::TestInfo(const std::string &a_test_suite_name,
+                   const std::string &a_name, const char *a_type_param,
+                   const char *a_value_param,
                    internal::CodeLocation a_code_location,
                    internal::TypeId fixture_class_id,
-                   internal::TestFactoryBase* factory)
-    : test_case_name_(a_test_case_name),
-      name_(a_name),
-      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
-      value_param_(a_value_param ? new std::string(a_value_param) : NULL),
-      location_(a_code_location),
-      fixture_class_id_(fixture_class_id),
-      should_run_(false),
-      is_disabled_(false),
-      matches_filter_(false),
-      factory_(factory),
-      result_() {}
+                   internal::TestFactoryBase *factory)
+    : test_suite_name_(a_test_suite_name), name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+      value_param_(a_value_param ? new std::string(a_value_param) : nullptr),
+      location_(a_code_location), fixture_class_id_(fixture_class_id),
+      should_run_(false), is_disabled_(false), matches_filter_(false),
+      factory_(factory), result_() {}
 
 // Destructs a TestInfo object.
 TestInfo::~TestInfo() { delete factory_; }
@@ -2528,7 +2644,7 @@
 //
 // Arguments:
 //
-//   test_case_name:   name of the test case
+//   test_suite_name:   name of the test suite
 //   name:             name of the test
 //   type_param:       the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
@@ -2536,49 +2652,40 @@
 //                     or NULL if this is not a value-parameterized test.
 //   code_location:    code location where the test is defined
 //   fixture_class_id: ID of the test fixture class
-//   set_up_tc:        pointer to the function that sets up the test case
-//   tear_down_tc:     pointer to the function that tears down the test case
+//   set_up_tc:        pointer to the function that sets up the test suite
+//   tear_down_tc:     pointer to the function that tears down the test suite
 //   factory:          pointer to the factory that creates a test object.
 //                     The newly created TestInfo instance will assume
 //                     ownership of the factory object.
-TestInfo* MakeAndRegisterTestInfo(
-    const char* test_case_name,
-    const char* name,
-    const char* type_param,
-    const char* value_param,
-    CodeLocation code_location,
-    TypeId fixture_class_id,
-    SetUpTestCaseFunc set_up_tc,
-    TearDownTestCaseFunc tear_down_tc,
-    TestFactoryBase* factory) {
-  TestInfo* const test_info =
-      new TestInfo(test_case_name, name, type_param, value_param,
+TestInfo *MakeAndRegisterTestInfo(
+    const char *test_suite_name, const char *name, const char *type_param,
+    const char *value_param, CodeLocation code_location,
+    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase *factory) {
+  TestInfo *const test_info =
+      new TestInfo(test_suite_name, name, type_param, value_param,
                    code_location, fixture_class_id, factory);
   GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
   return test_info;
 }
 
-#if GTEST_HAS_PARAM_TEST
-void ReportInvalidTestCaseType(const char* test_case_name,
-                               CodeLocation code_location) {
+void ReportInvalidTestSuiteType(const char *test_suite_name,
+                                CodeLocation code_location) {
   Message errors;
   errors
-      << "Attempted redefinition of test case " << test_case_name << ".\n"
-      << "All tests in the same test case must use the same test fixture\n"
-      << "class.  However, in test case " << test_case_name << ", you tried\n"
+      << "Attempted redefinition of test suite " << test_suite_name << ".\n"
+      << "All tests in the same test suite must use the same test fixture\n"
+      << "class.  However, in test suite " << test_suite_name << ", you tried\n"
       << "to define a test using a fixture class different from the one\n"
       << "used earlier. This can happen if the two fixture classes are\n"
       << "from different namespaces and have the same name. You should\n"
       << "probably rename one of the classes to put the tests into different\n"
-      << "test cases.";
+      << "test suites.";
 
-  fprintf(stderr, "%s %s",
-          FormatFileLocation(code_location.file.c_str(),
-                             code_location.line).c_str(),
-          errors.GetString().c_str());
+  GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(),
+                                          code_location.line)
+                    << " " << errors.GetString();
 }
-#endif  // GTEST_HAS_PARAM_TEST
-
 }  // namespace internal
 
 namespace {
@@ -2586,7 +2693,7 @@
 // A predicate that checks the test name of a TestInfo against a known
 // value.
 //
-// This is used for implementation of the TestCase class only.  We put
+// This is used for implementation of the TestSuite class only.  We put
 // it in the anonymous namespace to prevent polluting the outer
 // namespace.
 //
@@ -2596,11 +2703,10 @@
   // Constructor.
   //
   // TestNameIs has NO default constructor.
-  explicit TestNameIs(const char* name)
-      : name_(name) {}
+  explicit TestNameIs(const char *name) : name_(name) {}
 
-  // Returns true iff the test name of test_info matches name_.
-  bool operator()(const TestInfo * test_info) const {
+  // Returns true if and only if the test name of test_info matches name_.
+  bool operator()(const TestInfo *test_info) const {
     return test_info && test_info->name() == name_;
   }
 
@@ -2613,15 +2719,14 @@
 namespace internal {
 
 // This method expands all parameterized tests registered with macros TEST_P
-// and INSTANTIATE_TEST_CASE_P into regular tests and registers those.
+// and INSTANTIATE_TEST_SUITE_P into regular tests and registers those.
 // This will be done just once during the program runtime.
 void UnitTestImpl::RegisterParameterizedTests() {
-#if GTEST_HAS_PARAM_TEST
   if (!parameterized_tests_registered_) {
     parameterized_test_registry_.RegisterTests();
+    type_parameterized_test_registry_.CheckForInstantiations();
     parameterized_tests_registered_ = true;
   }
-#endif
 }
 
 }  // namespace internal
@@ -2632,10 +2737,10 @@
   if (!should_run_) return;
 
   // Tells UnitTest where to store test result.
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
   impl->set_current_test_info(this);
 
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+  TestEventListener *repeater = UnitTest::GetInstance()->listeners().repeater();
 
   // Notifies the unit test event listeners that a test is about to start.
   repeater->OnTestStart(*this);
@@ -2645,23 +2750,27 @@
   impl->os_stack_trace_getter()->UponLeavingGTest();
 
   // Creates the test object.
-  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+  Test *const test = internal::HandleExceptionsInMethodIfSupported(
       factory_, &internal::TestFactoryBase::CreateTest,
       "the test fixture's constructor");
 
-  // Runs the test only if the test object was created and its
-  // constructor didn't generate a fatal failure.
-  if ((test != NULL) && !Test::HasFatalFailure()) {
+  // Runs the test if the constructor didn't generate a fatal failure or invoke
+  // GTEST_SKIP().
+  // Note that the object will not be null
+  if (!Test::HasFatalFailure() && !Test::IsSkipped()) {
     // This doesn't throw as all user code that can throw are wrapped into
     // exception handling code.
     test->Run();
   }
 
-  // Deletes the test object.
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      test, &Test::DeleteSelf_, "the test fixture's destructor");
+  if (test != nullptr) {
+    // Deletes the test object.
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        test, &Test::DeleteSelf_, "the test fixture's destructor");
+  }
 
+  result_.set_start_timestamp(start);
   result_.set_elapsed_time(internal::GetTimeInMillis() - start);
 
   // Notifies the unit test event listener that a test has just finished.
@@ -2669,134 +2778,148 @@
 
   // Tells UnitTest to stop associating assertion results to this
   // test.
-  impl->set_current_test_info(NULL);
+  impl->set_current_test_info(nullptr);
 }
 
-// class TestCase
+// class TestSuite
 
-// Gets the number of successful tests in this test case.
-int TestCase::successful_test_count() const {
+// Gets the number of successful tests in this test suite.
+int TestSuite::successful_test_count() const {
   return CountIf(test_info_list_, TestPassed);
 }
 
-// Gets the number of failed tests in this test case.
-int TestCase::failed_test_count() const {
+// Gets the number of successful tests in this test suite.
+int TestSuite::skipped_test_count() const {
+  return CountIf(test_info_list_, TestSkipped);
+}
+
+// Gets the number of failed tests in this test suite.
+int TestSuite::failed_test_count() const {
   return CountIf(test_info_list_, TestFailed);
 }
 
 // Gets the number of disabled tests that will be reported in the XML report.
-int TestCase::reportable_disabled_test_count() const {
+int TestSuite::reportable_disabled_test_count() const {
   return CountIf(test_info_list_, TestReportableDisabled);
 }
 
-// Gets the number of disabled tests in this test case.
-int TestCase::disabled_test_count() const {
+// Gets the number of disabled tests in this test suite.
+int TestSuite::disabled_test_count() const {
   return CountIf(test_info_list_, TestDisabled);
 }
 
 // Gets the number of tests to be printed in the XML report.
-int TestCase::reportable_test_count() const {
+int TestSuite::reportable_test_count() const {
   return CountIf(test_info_list_, TestReportable);
 }
 
-// Get the number of tests in this test case that should run.
-int TestCase::test_to_run_count() const {
+// Get the number of tests in this test suite that should run.
+int TestSuite::test_to_run_count() const {
   return CountIf(test_info_list_, ShouldRunTest);
 }
 
 // Gets the number of all tests.
-int TestCase::total_test_count() const {
+int TestSuite::total_test_count() const {
   return static_cast<int>(test_info_list_.size());
 }
 
-// Creates a TestCase with the given name.
+// Creates a TestSuite with the given name.
 //
 // Arguments:
 //
-//   name:         name of the test case
-//   a_type_param: the name of the test case's type parameter, or NULL if
-//                 this is not a typed or a type-parameterized test case.
-//   set_up_tc:    pointer to the function that sets up the test case
-//   tear_down_tc: pointer to the function that tears down the test case
-TestCase::TestCase(const char* a_name, const char* a_type_param,
-                   Test::SetUpTestCaseFunc set_up_tc,
-                   Test::TearDownTestCaseFunc tear_down_tc)
+//   name:         name of the test suite
+//   a_type_param: the name of the test suite's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test suite.
+//   set_up_tc:    pointer to the function that sets up the test suite
+//   tear_down_tc: pointer to the function that tears down the test suite
+TestSuite::TestSuite(const char *a_name, const char *a_type_param,
+                     internal::SetUpTestSuiteFunc set_up_tc,
+                     internal::TearDownTestSuiteFunc tear_down_tc)
     : name_(a_name),
-      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
-      set_up_tc_(set_up_tc),
-      tear_down_tc_(tear_down_tc),
-      should_run_(false),
-      elapsed_time_(0) {
-}
+      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+      set_up_tc_(set_up_tc), tear_down_tc_(tear_down_tc), should_run_(false),
+      start_timestamp_(0), elapsed_time_(0) {}
 
-// Destructor of TestCase.
-TestCase::~TestCase() {
+// Destructor of TestSuite.
+TestSuite::~TestSuite() {
   // Deletes every Test in the collection.
   ForEach(test_info_list_, internal::Delete<TestInfo>);
 }
 
 // Returns the i-th test among all the tests. i can range from 0 to
 // total_test_count() - 1. If i is not in that range, returns NULL.
-const TestInfo* TestCase::GetTestInfo(int i) const {
+const TestInfo *TestSuite::GetTestInfo(int i) const {
   const int index = GetElementOr(test_indices_, i, -1);
-  return index < 0 ? NULL : test_info_list_[index];
+  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
 }
 
 // Returns the i-th test among all the tests. i can range from 0 to
 // total_test_count() - 1. If i is not in that range, returns NULL.
-TestInfo* TestCase::GetMutableTestInfo(int i) {
+TestInfo *TestSuite::GetMutableTestInfo(int i) {
   const int index = GetElementOr(test_indices_, i, -1);
-  return index < 0 ? NULL : test_info_list_[index];
+  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
 }
 
-// Adds a test to this test case.  Will delete the test upon
-// destruction of the TestCase object.
-void TestCase::AddTestInfo(TestInfo * test_info) {
+// Adds a test to this test suite.  Will delete the test upon
+// destruction of the TestSuite object.
+void TestSuite::AddTestInfo(TestInfo *test_info) {
   test_info_list_.push_back(test_info);
   test_indices_.push_back(static_cast<int>(test_indices_.size()));
 }
 
-// Runs every test in this TestCase.
-void TestCase::Run() {
+// Runs every test in this TestSuite.
+void TestSuite::Run() {
   if (!should_run_) return;
 
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->set_current_test_case(this);
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_suite(this);
 
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+  TestEventListener *repeater = UnitTest::GetInstance()->listeners().repeater();
 
+  // Call both legacy and the new API
+  repeater->OnTestSuiteStart(*this);
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
   repeater->OnTestCaseStart(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
+
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(
-      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
+      this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
 
-  const internal::TimeInMillis start = internal::GetTimeInMillis();
+  start_timestamp_ = internal::GetTimeInMillis();
   for (int i = 0; i < total_test_count(); i++) {
     GetMutableTestInfo(i)->Run();
   }
-  elapsed_time_ = internal::GetTimeInMillis() - start;
+  elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_;
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(
-      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
+      this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()");
 
+  // Call both legacy and the new API
+  repeater->OnTestSuiteEnd(*this);
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
   repeater->OnTestCaseEnd(*this);
-  impl->set_current_test_case(NULL);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
+
+  impl->set_current_test_suite(nullptr);
 }
 
-// Clears the results of all tests in this test case.
-void TestCase::ClearResult() {
+// Clears the results of all tests in this test suite.
+void TestSuite::ClearResult() {
   ad_hoc_test_result_.Clear();
   ForEach(test_info_list_, TestInfo::ClearTestResult);
 }
 
-// Shuffles the tests in this test case.
-void TestCase::ShuffleTests(internal::Random* random) {
+// Shuffles the tests in this test suite.
+void TestSuite::ShuffleTests(internal::Random *random) {
   Shuffle(random, &test_indices_);
 }
 
 // Restores the test order to before the first shuffle.
-void TestCase::UnshuffleTests() {
+void TestSuite::UnshuffleTests() {
   for (size_t i = 0; i < test_indices_.size(); i++) {
     test_indices_[i] = static_cast<int>(i);
   }
@@ -2807,11 +2930,10 @@
 //
 // FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
 // FormatCountableNoun(5, "book", "books") returns "5 books".
-static std::string FormatCountableNoun(int count,
-                                       const char * singular_form,
-                                       const char * plural_form) {
+static std::string FormatCountableNoun(int count, const char *singular_form,
+                                       const char *plural_form) {
   return internal::StreamableToString(count) + " " +
-      (count == 1 ? singular_form : plural_form);
+         (count == 1 ? singular_form : plural_form);
 }
 
 // Formats the count of tests.
@@ -2819,19 +2941,19 @@
   return FormatCountableNoun(test_count, "test", "tests");
 }
 
-// Formats the count of test cases.
-static std::string FormatTestCaseCount(int test_case_count) {
-  return FormatCountableNoun(test_case_count, "test case", "test cases");
+// Formats the count of test suites.
+static std::string FormatTestSuiteCount(int test_suite_count) {
+  return FormatCountableNoun(test_suite_count, "test suite", "test suites");
 }
 
 // Converts a TestPartResult::Type enum to human-friendly string
 // representation.  Both kNonFatalFailure and kFatalFailure are translated
 // to "Failure", as the user usually doesn't care about the difference
 // between the two when viewing the test result.
-static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+static const char *TestPartResultTypeToString(TestPartResult::Type type) {
   switch (type) {
-    case TestPartResult::kSuccess:
-      return "Success";
+    case TestPartResult::kSkip: return "Skipped";
+    case TestPartResult::kSuccess: return "Success";
 
     case TestPartResult::kNonFatalFailure:
     case TestPartResult::kFatalFailure:
@@ -2840,8 +2962,7 @@
 #else
       return "Failure\n";
 #endif
-    default:
-      return "Unknown result type";
+    default: return "Unknown result type";
   }
 }
 
@@ -2849,18 +2970,19 @@
 
 // Prints a TestPartResult to an std::string.
 static std::string PrintTestPartResultToString(
-    const TestPartResult& test_part_result) {
-  return (Message()
-          << internal::FormatFileLocation(test_part_result.file_name(),
-                                          test_part_result.line_number())
-          << " " << TestPartResultTypeToString(test_part_result.type())
-          << test_part_result.message()).GetString();
+    const TestPartResult &test_part_result) {
+  return (Message() << internal::FormatFileLocation(
+                           test_part_result.file_name(),
+                           test_part_result.line_number())
+                    << " "
+                    << TestPartResultTypeToString(test_part_result.type())
+                    << test_part_result.message())
+      .GetString();
 }
 
 // Prints a TestPartResult.
-static void PrintTestPartResult(const TestPartResult& test_part_result) {
-  const std::string& result =
-      PrintTestPartResultToString(test_part_result);
+static void PrintTestPartResult(const TestPartResult &test_part_result) {
+  const std::string &result = PrintTestPartResultToString(test_part_result);
   printf("%s\n", result.c_str());
   fflush(stdout);
   // If the test program runs in Visual Studio or a debugger, the
@@ -2877,54 +2999,77 @@
 }
 
 // class PrettyUnitTestResultPrinter
-
-enum GTestColor {
-  COLOR_DEFAULT,
-  COLOR_RED,
-  COLOR_GREEN,
-  COLOR_YELLOW
-};
-
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
-    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
 
 // Returns the character attribute for the given color.
-WORD GetColorAttribute(GTestColor color) {
+static WORD GetColorAttribute(GTestColor color) {
   switch (color) {
-    case COLOR_RED:    return FOREGROUND_RED;
-    case COLOR_GREEN:  return FOREGROUND_GREEN;
+    case COLOR_RED: return FOREGROUND_RED;
+    case COLOR_GREEN: return FOREGROUND_GREEN;
     case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
-    default:           return 0;
+    default: return 0;
   }
 }
 
+static int GetBitOffset(WORD color_mask) {
+  if (color_mask == 0) return 0;
+
+  int bitOffset = 0;
+  while ((color_mask & 1) == 0) {
+    color_mask >>= 1;
+    ++bitOffset;
+  }
+  return bitOffset;
+}
+
+static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
+  // Let's reuse the BG
+  static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN |
+                                      BACKGROUND_RED | BACKGROUND_INTENSITY;
+  static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN |
+                                      FOREGROUND_RED | FOREGROUND_INTENSITY;
+  const WORD existing_bg = old_color_attrs & background_mask;
+
+  WORD new_color =
+      GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY;
+  static const int bg_bitOffset = GetBitOffset(background_mask);
+  static const int fg_bitOffset = GetBitOffset(foreground_mask);
+
+  if (((new_color & background_mask) >> bg_bitOffset) ==
+      ((new_color & foreground_mask) >> fg_bitOffset)) {
+    new_color ^= FOREGROUND_INTENSITY;  // invert intensity
+  }
+  return new_color;
+}
+
 #else
 
 // Returns the ANSI color code for the given color.  COLOR_DEFAULT is
 // an invalid input.
-const char* GetAnsiColorCode(GTestColor color) {
+static const char *GetAnsiColorCode(GTestColor color) {
   switch (color) {
-    case COLOR_RED:     return "1";
-    case COLOR_GREEN:   return "2";
-    case COLOR_YELLOW:  return "3";
-    default:            return NULL;
-  };
+    case COLOR_RED: return "1";
+    case COLOR_GREEN: return "2";
+    case COLOR_YELLOW: return "3";
+    default: return nullptr;
+  }
 }
 
 #endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
 
-// Returns true iff Google Test should use colors in the output.
+// Returns true if and only if Google Test should use colors in the output.
 bool ShouldUseColor(bool stdout_is_tty) {
-  const char* const gtest_color = GTEST_FLAG(color).c_str();
+  const char *const gtest_color = GTEST_FLAG(color).c_str();
 
   if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
-#if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
     // On Windows the TERM variable is usually not set, but the
     // console there does support colors.
     return stdout_is_tty;
 #else
     // On non-Windows platforms, we rely on the TERM variable.
-    const char* const term = posix::GetEnv("TERM");
+    const char *const term = posix::GetEnv("TERM");
     const bool term_supports_color =
         String::CStringEquals(term, "xterm") ||
         String::CStringEquals(term, "xterm-color") ||
@@ -2942,9 +3087,9 @@
   }
 
   return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
-      String::CStringEquals(gtest_color, "1");
+         String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+         String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+         String::CStringEquals(gtest_color, "1");
   // We take "yes", "true", "t", and "1" as meaning "yes".  If the
   // value is neither one of these nor "auto", we treat it as "no" to
   // be conservative.
@@ -2954,19 +3099,18 @@
 // cannot simply emit special characters and have the terminal change colors.
 // This routine must actually emit the characters rather than return a string
 // that would be colored when printed, as can be done on Linux.
-void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+void ColoredPrintf(GTestColor color, const char *fmt, ...) {
   va_list args;
   va_start(args, fmt);
 
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || \
-    GTEST_OS_IOS || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
   const bool use_color = AlwaysFalse();
 #else
   static const bool in_color_mode =
       ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
   const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
-#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
-  // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
 
   if (!use_color) {
     vprintf(fmt, args);
@@ -2974,21 +3118,22 @@
     return;
   }
 
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
-    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
   const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
 
   // Gets the current text color.
   CONSOLE_SCREEN_BUFFER_INFO buffer_info;
   GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
   const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD new_color = GetNewColor(color, old_color_attrs);
 
   // We need to flush the stream buffers into the console before each
   // SetConsoleTextAttribute call lest it affect the text that is already
   // printed but has not yet reached the console.
   fflush(stdout);
-  SetConsoleTextAttribute(stdout_handle,
-                          GetColorAttribute(color) | FOREGROUND_INTENSITY);
+  SetConsoleTextAttribute(stdout_handle, new_color);
+
   vprintf(fmt, args);
 
   fflush(stdout);
@@ -3002,23 +3147,22 @@
   va_end(args);
 }
 
-// Text printed in Google Test's text output and --gunit_list_tests
+// Text printed in Google Test's text output and --gtest_list_tests
 // output to label the type parameter and value parameter for a test.
 static const char kTypeParamLabel[] = "TypeParam";
 static const char kValueParamLabel[] = "GetParam()";
 
-void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
-  const char* const type_param = test_info.type_param();
-  const char* const value_param = test_info.value_param();
+static void PrintFullTestCommentIfPresent(const TestInfo &test_info) {
+  const char *const type_param = test_info.type_param();
+  const char *const value_param = test_info.value_param();
 
-  if (type_param != NULL || value_param != NULL) {
+  if (type_param != nullptr || value_param != nullptr) {
     printf(", where ");
-    if (type_param != NULL) {
+    if (type_param != nullptr) {
       printf("%s = %s", kTypeParamLabel, type_param);
-      if (value_param != NULL)
-        printf(" and ");
+      if (value_param != nullptr) printf(" and ");
     }
-    if (value_param != NULL) {
+    if (value_param != nullptr) {
       printf("%s = %s", kValueParamLabel, value_param);
     }
   }
@@ -3030,48 +3174,59 @@
 class PrettyUnitTestResultPrinter : public TestEventListener {
  public:
   PrettyUnitTestResultPrinter() {}
-  static void PrintTestName(const char * test_case, const char * test) {
-    printf("%s.%s", test_case, test);
+  static void PrintTestName(const char *test_suite, const char *test) {
+    printf("%s.%s", test_suite, test);
   }
 
   // The following methods override what's in the TestEventListener class.
-  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestCaseStart(const TestCase& test_case);
-  virtual void OnTestStart(const TestInfo& test_info);
-  virtual void OnTestPartResult(const TestPartResult& result);
-  virtual void OnTestEnd(const TestInfo& test_info);
-  virtual void OnTestCaseEnd(const TestCase& test_case);
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
-  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+  void OnTestProgramStart(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest &unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest & /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase &test_case) override;
+#else
+  void OnTestSuiteStart(const TestSuite &test_suite) override;
+#endif  // OnTestCaseStart
+
+  void OnTestStart(const TestInfo &test_info) override;
+
+  void OnTestPartResult(const TestPartResult &result) override;
+  void OnTestEnd(const TestInfo &test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase &test_case) override;
+#else
+  void OnTestSuiteEnd(const TestSuite &test_suite) override;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest & /*unit_test*/) override {}
 
  private:
-  static void PrintFailedTests(const UnitTest& unit_test);
+  static void PrintFailedTests(const UnitTest &unit_test);
+  static void PrintFailedTestSuites(const UnitTest &unit_test);
+  static void PrintSkippedTests(const UnitTest &unit_test);
 };
 
-  // Fired before each iteration of tests starts.
+// Fired before each iteration of tests starts.
 void PrettyUnitTestResultPrinter::OnTestIterationStart(
-    const UnitTest& unit_test, int iteration) {
+    const UnitTest &unit_test, int iteration) {
   if (GTEST_FLAG(repeat) != 1)
     printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
 
-  const char* const filter = GTEST_FLAG(filter).c_str();
+  const char *const filter = GTEST_FLAG(filter).c_str();
 
   // Prints the filter if it's not *.  This reminds the user that some
   // tests may be skipped.
   if (!String::CStringEquals(filter, kUniversalFilter)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
+    ColoredPrintf(COLOR_YELLOW, "Note: %s filter = %s\n", GTEST_NAME_, filter);
   }
 
   if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
-    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: This is test shard %d of %s.\n",
+    const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(COLOR_YELLOW, "Note: This is test shard %d of %s.\n",
                   static_cast<int>(shard_index) + 1,
                   internal::posix::GetEnv(kTestTotalShards));
   }
@@ -3082,147 +3237,226 @@
                   unit_test.random_seed());
   }
 
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  ColoredPrintf(COLOR_GREEN, "[==========] ");
   printf("Running %s from %s.\n",
          FormatTestCount(unit_test.test_to_run_count()).c_str(),
-         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
   fflush(stdout);
 }
 
 void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
-    const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+    const UnitTest & /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
   printf("Global test environment set-up.\n");
   fflush(stdout);
 }
 
-void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase &test_case) {
   const std::string counts =
       FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
   ColoredPrintf(COLOR_GREEN, "[----------] ");
   printf("%s from %s", counts.c_str(), test_case.name());
-  if (test_case.type_param() == NULL) {
+  if (test_case.type_param() == nullptr) {
     printf("\n");
   } else {
     printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
   }
   fflush(stdout);
 }
+#else
+void PrettyUnitTestResultPrinter::OnTestSuiteStart(
+    const TestSuite &test_suite) {
+  const std::string counts =
+      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_suite.name());
+  if (test_suite.type_param() == nullptr) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param());
+  }
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
-  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
-  PrintTestName(test_info.test_case_name(), test_info.name());
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo &test_info) {
+  ColoredPrintf(COLOR_GREEN, "[ RUN      ] ");
+  PrintTestName(test_info.test_suite_name(), test_info.name());
   printf("\n");
   fflush(stdout);
 }
 
 // Called after an assertion failure.
 void PrettyUnitTestResultPrinter::OnTestPartResult(
-    const TestPartResult& result) {
-  // If the test part succeeded, we don't need to do anything.
-  if (result.type() == TestPartResult::kSuccess)
-    return;
-
-  // Print failure message from the assertion (e.g. expected this and got that).
-  PrintTestPartResult(result);
-  fflush(stdout);
+    const TestPartResult &result) {
+  switch (result.type()) {
+    // If the test part succeeded, we don't need to do anything.
+    case TestPartResult::kSuccess: return;
+    default:
+      // Print failure message from the assertion
+      // (e.g. expected this and got that).
+      PrintTestPartResult(result);
+      fflush(stdout);
+  }
 }
 
-void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo &test_info) {
   if (test_info.result()->Passed()) {
     ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  } else if (test_info.result()->Skipped()) {
+    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
   } else {
     ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
   }
-  PrintTestName(test_info.test_case_name(), test_info.name());
-  if (test_info.result()->Failed())
-    PrintFullTestCommentIfPresent(test_info);
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
 
   if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms)\n", internal::StreamableToString(
-           test_info.result()->elapsed_time()).c_str());
+    printf(" (%s ms)\n",
+           internal::StreamableToString(test_info.result()->elapsed_time())
+               .c_str());
   } else {
     printf("\n");
   }
   fflush(stdout);
 }
 
-void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase &test_case) {
   if (!GTEST_FLAG(print_time)) return;
 
   const std::string counts =
       FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
   ColoredPrintf(COLOR_GREEN, "[----------] ");
-  printf("%s from %s (%s ms total)\n\n",
-         counts.c_str(), test_case.name(),
+  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(),
          internal::StreamableToString(test_case.elapsed_time()).c_str());
   fflush(stdout);
 }
+#else
+void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite &test_suite) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
+         internal::StreamableToString(test_suite.elapsed_time()).c_str());
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
-    const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+    const UnitTest & /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
   printf("Global test environment tear-down\n");
   fflush(stdout);
 }
 
 // Internal helper for printing the list of failed tests.
-void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest &unit_test) {
   const int failed_test_count = unit_test.failed_test_count();
-  if (failed_test_count == 0) {
-    return;
-  }
+  ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
 
-  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
-    const TestCase& test_case = *unit_test.GetTestCase(i);
-    if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) {
       continue;
     }
-    for (int j = 0; j < test_case.total_test_count(); ++j) {
-      const TestInfo& test_info = *test_case.GetTestInfo(j);
-      if (!test_info.should_run() || test_info.result()->Passed()) {
+    for (int j = 0; j < test_suite.total_test_count(); ++j) {
+      const TestInfo &test_info = *test_suite.GetTestInfo(j);
+      if (!test_info.should_run() || !test_info.result()->Failed()) {
         continue;
       }
       ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
-      printf("%s.%s", test_case.name(), test_info.name());
+      printf("%s.%s", test_suite.name(), test_info.name());
       PrintFullTestCommentIfPresent(test_info);
       printf("\n");
     }
   }
+  printf("\n%2d FAILED %s\n", failed_test_count,
+         failed_test_count == 1 ? "TEST" : "TESTS");
+}
+
+// Internal helper for printing the list of test suite failures not covered by
+// PrintFailedTests.
+void PrettyUnitTestResultPrinter::PrintFailedTestSuites(
+    const UnitTest &unit_test) {
+  int suite_failure_count = 0;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run()) {
+      continue;
+    }
+    if (test_suite.ad_hoc_test_result().Failed()) {
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name());
+      ++suite_failure_count;
+    }
+  }
+  if (suite_failure_count > 0) {
+    printf("\n%2d FAILED TEST %s\n", suite_failure_count,
+           suite_failure_count == 1 ? "SUITE" : "SUITES");
+  }
+}
+
+// Internal helper for printing the list of skipped tests.
+void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest &unit_test) {
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count == 0) {
+    return;
+  }
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_suite.total_test_count(); ++j) {
+      const TestInfo &test_info = *test_suite.GetTestInfo(j);
+      if (!test_info.should_run() || !test_info.result()->Skipped()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+      printf("%s.%s", test_suite.name(), test_info.name());
+      printf("\n");
+    }
+  }
 }
 
-void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
                                                      int /*iteration*/) {
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  ColoredPrintf(COLOR_GREEN, "[==========] ");
   printf("%s from %s ran.",
          FormatTestCount(unit_test.test_to_run_count()).c_str(),
-         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
   if (GTEST_FLAG(print_time)) {
     printf(" (%s ms total)",
            internal::StreamableToString(unit_test.elapsed_time()).c_str());
   }
   printf("\n");
-  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  ColoredPrintf(COLOR_GREEN, "[  PASSED  ] ");
   printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
 
-  int num_failures = unit_test.failed_test_count();
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
+    PrintSkippedTests(unit_test);
+  }
+
   if (!unit_test.Passed()) {
-    const int failed_test_count = unit_test.failed_test_count();
-    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
-    printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
     PrintFailedTests(unit_test);
-    printf("\n%2d FAILED %s\n", num_failures,
-                        num_failures == 1 ? "TEST" : "TESTS");
+    PrintFailedTestSuites(unit_test);
   }
 
   int num_disabled = unit_test.reportable_disabled_test_count();
   if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
-    if (!num_failures) {
+    if (unit_test.Passed()) {
       printf("\n");  // Add a spacer if no FAILURE banner is displayed.
     }
-    ColoredPrintf(COLOR_YELLOW,
-                  "  YOU HAVE %d DISABLED %s\n\n",
-                  num_disabled,
+    ColoredPrintf(COLOR_YELLOW, "  YOU HAVE %d DISABLED %s\n\n", num_disabled,
                   num_disabled == 1 ? "TEST" : "TESTS");
   }
   // Ensure that Google Test output is printed before, e.g., heapchecker output.
@@ -3237,35 +3471,43 @@
 class TestEventRepeater : public TestEventListener {
  public:
   TestEventRepeater() : forwarding_enabled_(true) {}
-  virtual ~TestEventRepeater();
+  ~TestEventRepeater() override;
   void Append(TestEventListener *listener);
-  TestEventListener* Release(TestEventListener* listener);
+  TestEventListener *Release(TestEventListener *listener);
 
   // Controls whether events will be forwarded to listeners_. Set to false
   // in death test child processes.
   bool forwarding_enabled() const { return forwarding_enabled_; }
   void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
 
-  virtual void OnTestProgramStart(const UnitTest& unit_test);
-  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
-  virtual void OnTestCaseStart(const TestCase& test_case);
-  virtual void OnTestStart(const TestInfo& test_info);
-  virtual void OnTestPartResult(const TestPartResult& result);
-  virtual void OnTestEnd(const TestInfo& test_info);
-  virtual void OnTestCaseEnd(const TestCase& test_case);
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
-  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+  void OnTestProgramStart(const UnitTest &unit_test) override;
+  void OnTestIterationStart(const UnitTest &unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest &unit_test) override;
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestSuite &parameter) override;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestSuiteStart(const TestSuite &parameter) override;
+  void OnTestStart(const TestInfo &test_info) override;
+  void OnTestPartResult(const TestPartResult &result) override;
+  void OnTestEnd(const TestInfo &test_info) override;
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase &parameter) override;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestSuiteEnd(const TestSuite &parameter) override;
+  void OnEnvironmentsTearDownStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest &unit_test) override;
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest &unit_test) override;
 
  private:
   // Controls whether events will be forwarded to listeners_. Set to false
   // in death test child processes.
   bool forwarding_enabled_;
   // The list of listeners that receive events.
-  std::vector<TestEventListener*> listeners_;
+  std::vector<TestEventListener *> listeners_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
 };
@@ -3278,55 +3520,62 @@
   listeners_.push_back(listener);
 }
 
-// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+TestEventListener *TestEventRepeater::Release(TestEventListener *listener) {
   for (size_t i = 0; i < listeners_.size(); ++i) {
     if (listeners_[i] == listener) {
-      listeners_.erase(listeners_.begin() + i);
+      listeners_.erase(listeners_.begin() + static_cast<int>(i));
       return listener;
     }
   }
 
-  return NULL;
+  return nullptr;
 }
 
 // Since most methods are very similar, use macros to reduce boilerplate.
 // This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (size_t i = 0; i < listeners_.size(); i++) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
+#define GTEST_REPEATER_METHOD_(Name, Type)              \
+  void TestEventRepeater::Name(const Type &parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = 0; i < listeners_.size(); i++) {  \
+        listeners_[i]->Name(parameter);                 \
+      }                                                 \
+    }                                                   \
+  }
 // This defines a member that forwards the call to all listeners in reverse
 // order.
-#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type)      \
+  void TestEventRepeater::Name(const Type &parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = listeners_.size(); i != 0; i--) { \
+        listeners_[i - 1]->Name(parameter);             \
+      }                                                 \
+    }                                                   \
+  }
 
 GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
 GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
-GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
 GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
 GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
 GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
 GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
 GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
 GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite)
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite)
 GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
 
 #undef GTEST_REPEATER_METHOD_
 #undef GTEST_REVERSE_REPEATER_METHOD_
 
-void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+void TestEventRepeater::OnTestIterationStart(const UnitTest &unit_test,
                                              int iteration) {
   if (forwarding_enabled_) {
     for (size_t i = 0; i < listeners_.size(); i++) {
@@ -3335,11 +3584,11 @@
   }
 }
 
-void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+void TestEventRepeater::OnTestIterationEnd(const UnitTest &unit_test,
                                            int iteration) {
   if (forwarding_enabled_) {
-    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
-      listeners_[i]->OnTestIterationEnd(unit_test, iteration);
+    for (size_t i = listeners_.size(); i > 0; i--) {
+      listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration);
     }
   }
 }
@@ -3349,9 +3598,14 @@
 // This class generates an XML output file.
 class XmlUnitTestResultPrinter : public EmptyTestEventListener {
  public:
-  explicit XmlUnitTestResultPrinter(const char* output_file);
+  explicit XmlUnitTestResultPrinter(const char *output_file);
 
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void ListTestsMatchingFilter(const std::vector<TestSuite *> &test_suites);
+
+  // Prints an XML summary of all unit tests.
+  static void PrintXmlTestsList(std::ostream *stream,
+                                const std::vector<TestSuite *> &test_suites);
 
  private:
   // Is c a whitespace character that is normalized to a space character
@@ -3369,49 +3623,54 @@
   // is_attribute is true, the text is meant to appear as an attribute
   // value, and normalizable whitespace is preserved by replacing it
   // with character references.
-  static std::string EscapeXml(const std::string& str, bool is_attribute);
+  static std::string EscapeXml(const std::string &str, bool is_attribute);
 
   // Returns the given string with all characters invalid in XML removed.
-  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+  static std::string RemoveInvalidXmlCharacters(const std::string &str);
 
   // Convenience wrapper around EscapeXml when str is an attribute value.
-  static std::string EscapeXmlAttribute(const std::string& str) {
+  static std::string EscapeXmlAttribute(const std::string &str) {
     return EscapeXml(str, true);
   }
 
   // Convenience wrapper around EscapeXml when str is not an attribute value.
-  static std::string EscapeXmlText(const char* str) {
+  static std::string EscapeXmlText(const char *str) {
     return EscapeXml(str, false);
   }
 
   // Verifies that the given attribute belongs to the given element and
   // streams the attribute as XML.
-  static void OutputXmlAttribute(std::ostream* stream,
-                                 const std::string& element_name,
-                                 const std::string& name,
-                                 const std::string& value);
+  static void OutputXmlAttribute(std::ostream *stream,
+                                 const std::string &element_name,
+                                 const std::string &name,
+                                 const std::string &value);
 
   // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+  static void OutputXmlCDataSection(::std::ostream *stream, const char *data);
 
   // Streams an XML representation of a TestInfo object.
-  static void OutputXmlTestInfo(::std::ostream* stream,
-                                const char* test_case_name,
-                                const TestInfo& test_info);
+  static void OutputXmlTestInfo(::std::ostream *stream,
+                                const char *test_suite_name,
+                                const TestInfo &test_info);
 
-  // Prints an XML representation of a TestCase object
-  static void PrintXmlTestCase(::std::ostream* stream,
-                               const TestCase& test_case);
+  // Prints an XML representation of a TestSuite object
+  static void PrintXmlTestSuite(::std::ostream *stream,
+                                const TestSuite &test_suite);
 
   // Prints an XML summary of unit_test to output stream out.
-  static void PrintXmlUnitTest(::std::ostream* stream,
-                               const UnitTest& unit_test);
+  static void PrintXmlUnitTest(::std::ostream *stream,
+                               const UnitTest &unit_test);
 
   // Produces a string representing the test properties in a result as space
   // delimited XML attributes based on the property key="value" pairs.
   // When the std::string is not empty, it includes a space at the beginning,
   // to delimit this attribute from prior attributes.
-  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+  static std::string TestPropertiesAsXmlAttributes(const TestResult &result);
+
+  // Streams an XML representation of the test properties of a TestResult
+  // object.
+  static void OutputXmlTestProperties(std::ostream *stream,
+                                      const TestResult &result);
 
   // The output file.
   const std::string output_file_;
@@ -3420,48 +3679,32 @@
 };
 
 // Creates a new XmlUnitTestResultPrinter.
-XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char *output_file)
     : output_file_(output_file) {
-  if (output_file_.c_str() == NULL || output_file_.empty()) {
-    fprintf(stderr, "XML output file may not be null\n");
-    fflush(stderr);
-    exit(EXIT_FAILURE);
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "XML output file may not be null";
   }
 }
 
 // Called after the unit test ends.
-void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
                                                   int /*iteration*/) {
-  FILE* xmlout = NULL;
-  FilePath output_file(output_file_);
-  FilePath output_dir(output_file.RemoveFileName());
-
-  if (output_dir.CreateDirectoriesRecursively()) {
-    xmlout = posix::FOpen(output_file_.c_str(), "w");
-  }
-  if (xmlout == NULL) {
-    // TODO(wan): report the reason of the failure.
-    //
-    // We don't do it for now as:
-    //
-    //   1. There is no urgent need for it.
-    //   2. It's a bit involved to make the errno variable thread-safe on
-    //      all three operating systems (Linux, Windows, and Mac OS).
-    //   3. To interpret the meaning of errno in a thread-safe way,
-    //      we need the strerror_r() function, which is not available on
-    //      Windows.
-    fprintf(stderr,
-            "Unable to open file \"%s\"\n",
-            output_file_.c_str());
-    fflush(stderr);
-    exit(EXIT_FAILURE);
-  }
+  FILE *xmlout = OpenFileForWriting(output_file_);
   std::stringstream stream;
   PrintXmlUnitTest(&stream, unit_test);
   fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
   fclose(xmlout);
 }
 
+void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
+    const std::vector<TestSuite *> &test_suites) {
+  FILE *xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlTestsList(&stream, test_suites);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
 // Returns an XML-escaped copy of the input string str.  If is_attribute
 // is true, the text is meant to appear as an attribute value, and
 // normalizable whitespace is preserved by replacing it with character
@@ -3472,24 +3715,16 @@
 // module will consist of ordinary English text.
 // If this module is ever modified to produce version 1.1 XML output,
 // most invalid characters can be retained using character references.
-// TODO(wan): It might be nice to have a minimally invasive, human-readable
-// escaping scheme for invalid characters, rather than dropping them.
-std::string XmlUnitTestResultPrinter::EscapeXml(
-    const std::string& str, bool is_attribute) {
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string &str,
+                                                bool is_attribute) {
   Message m;
 
   for (size_t i = 0; i < str.size(); ++i) {
     const char ch = str[i];
     switch (ch) {
-      case '<':
-        m << "&lt;";
-        break;
-      case '>':
-        m << "&gt;";
-        break;
-      case '&':
-        m << "&amp;";
-        break;
+      case '<': m << "&lt;"; break;
+      case '>': m << "&gt;"; break;
+      case '&': m << "&amp;"; break;
       case '\'':
         if (is_attribute)
           m << "&apos;";
@@ -3521,23 +3756,23 @@
 // Currently invalid characters are dropped from the string. An
 // alternative is to replace them with certain characters such as . or ?.
 std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
-    const std::string& str) {
+    const std::string &str) {
   std::string output;
   output.reserve(str.size());
   for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it))
-      output.push_back(*it);
+    if (IsValidXmlCharacter(*it)) output.push_back(*it);
 
   return output;
 }
 
 // The following routines generate an XML representation of a UnitTest
 // object.
+// GOOGLETEST_CM0009 DO NOT DELETE
 //
 // This is how Google Test concepts map to the DTD:
 //
 // <testsuites name="AllTests">        <-- corresponds to a UnitTest object
-//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestSuite object
 //     <testcase name="test-name">     <-- corresponds to a TestInfo object
 //       <failure message="...">...</failure>
 //       <failure message="...">...</failure>
@@ -3554,19 +3789,18 @@
   return ss.str();
 }
 
-static bool PortableLocaltime(time_t seconds, struct tm* out) {
+static bool PortableLocaltime(time_t seconds, struct tm *out) {
 #if defined(_MSC_VER)
   return localtime_s(out, &seconds) == 0;
 #elif defined(__MINGW32__) || defined(__MINGW64__)
   // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
   // Windows' localtime(), which has a thread-local tm buffer.
-  struct tm* tm_ptr = localtime(&seconds);  // NOLINT
-  if (tm_ptr == NULL)
-    return false;
+  struct tm *tm_ptr = localtime(&seconds);  // NOLINT
+  if (tm_ptr == nullptr) return false;
   *out = *tm_ptr;
   return true;
 #else
-  return localtime_r(&seconds, out) != NULL;
+  return localtime_r(&seconds, out) != nullptr;
 #endif
 }
 
@@ -3578,23 +3812,23 @@
     return "";
   // YYYY-MM-DDThh:mm:ss
   return StreamableToString(time_struct.tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct.tm_min) + ":" +
-      String::FormatIntWidth2(time_struct.tm_sec);
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec);
 }
 
 // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
-                                                     const char* data) {
-  const char* segment = data;
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream *stream,
+                                                     const char *data) {
+  const char *segment = data;
   *stream << "<![CDATA[";
   for (;;) {
-    const char* const next_segment = strstr(segment, "]]>");
-    if (next_segment != NULL) {
-      stream->write(
-          segment, static_cast<std::streamsize>(next_segment - segment));
+    const char *const next_segment = strstr(segment, "]]>");
+    if (next_segment != nullptr) {
+      stream->write(segment,
+                    static_cast<std::streamsize>(next_segment - segment));
       *stream << "]]>]]&gt;<![CDATA[";
       segment = next_segment + strlen("]]>");
     } else {
@@ -3606,15 +3840,13 @@
 }
 
 void XmlUnitTestResultPrinter::OutputXmlAttribute(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value) {
-  const std::vector<std::string>& allowed_names =
-      GetReservedAttributesForElement(element_name);
+    std::ostream *stream, const std::string &element_name,
+    const std::string &name, const std::string &value) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
 
   GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
+               allowed_names.end())
       << "Attribute " << name << " is not allowed for element <" << element_name
       << ">.";
 
@@ -3622,85 +3854,111 @@
 }
 
 // Prints an XML representation of a TestInfo object.
-// TODO(wan): There is also value in printing properties with the plain printer.
-void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
-                                                 const char* test_case_name,
-                                                 const TestInfo& test_info) {
-  const TestResult& result = *test_info.result();
-  const std::string kTestcase = "testcase";
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream *stream,
+                                                 const char *test_suite_name,
+                                                 const TestInfo &test_info) {
+  const TestResult &result = *test_info.result();
+  const std::string kTestsuite = "testcase";
+
+  if (test_info.is_in_another_shard()) {
+    return;
+  }
 
   *stream << "    <testcase";
-  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+  OutputXmlAttribute(stream, kTestsuite, "name", test_info.name());
 
-  if (test_info.value_param() != NULL) {
-    OutputXmlAttribute(stream, kTestcase, "value_param",
+  if (test_info.value_param() != nullptr) {
+    OutputXmlAttribute(stream, kTestsuite, "value_param",
                        test_info.value_param());
   }
-  if (test_info.type_param() != NULL) {
-    OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
+  if (test_info.type_param() != nullptr) {
+    OutputXmlAttribute(stream, kTestsuite, "type_param",
+                       test_info.type_param());
+  }
+  if (GTEST_FLAG(list_tests)) {
+    OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+    OutputXmlAttribute(stream, kTestsuite, "line",
+                       StreamableToString(test_info.line()));
+    *stream << " />\n";
+    return;
   }
 
-  OutputXmlAttribute(stream, kTestcase, "status",
+  OutputXmlAttribute(stream, kTestsuite, "status",
                      test_info.should_run() ? "run" : "notrun");
-  OutputXmlAttribute(stream, kTestcase, "time",
+  OutputXmlAttribute(stream, kTestsuite, "result",
+                     test_info.should_run()
+                         ? (result.Skipped() ? "skipped" : "completed")
+                         : "suppressed");
+  OutputXmlAttribute(stream, kTestsuite, "time",
                      FormatTimeInMillisAsSeconds(result.elapsed_time()));
-  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
-  *stream << TestPropertiesAsXmlAttributes(result);
+  OutputXmlAttribute(
+      stream, kTestsuite, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name);
 
   int failures = 0;
   for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult& part = result.GetTestPartResult(i);
+    const TestPartResult &part = result.GetTestPartResult(i);
     if (part.failed()) {
       if (++failures == 1) {
         *stream << ">\n";
       }
-      const string location = internal::FormatCompilerIndependentFileLocation(
-          part.file_name(), part.line_number());
-      const string summary = location + "\n" + part.summary();
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
       *stream << "      <failure message=\""
-              << EscapeXmlAttribute(summary.c_str())
-              << "\" type=\"\">";
-      const string detail = location + "\n" + part.message();
+              << EscapeXmlAttribute(summary.c_str()) << "\" type=\"\">";
+      const std::string detail = location + "\n" + part.message();
       OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
       *stream << "</failure>\n";
     }
   }
 
-  if (failures == 0)
+  if (failures == 0 && result.test_property_count() == 0) {
     *stream << " />\n";
-  else
+  } else {
+    if (failures == 0) {
+      *stream << ">\n";
+    }
+    OutputXmlTestProperties(stream, result);
     *stream << "    </testcase>\n";
+  }
 }
 
-// Prints an XML representation of a TestCase object
-void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
-                                                const TestCase& test_case) {
+// Prints an XML representation of a TestSuite object
+void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream *stream,
+                                                 const TestSuite &test_suite) {
   const std::string kTestsuite = "testsuite";
   *stream << "  <" << kTestsuite;
-  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+  OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
   OutputXmlAttribute(stream, kTestsuite, "tests",
-                     StreamableToString(test_case.reportable_test_count()));
-  OutputXmlAttribute(stream, kTestsuite, "failures",
-                     StreamableToString(test_case.failed_test_count()));
-  OutputXmlAttribute(
-      stream, kTestsuite, "disabled",
-      StreamableToString(test_case.reportable_disabled_test_count()));
-  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
-  OutputXmlAttribute(stream, kTestsuite, "time",
-                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
-  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
-          << ">\n";
-
-  for (int i = 0; i < test_case.total_test_count(); ++i) {
-    if (test_case.GetTestInfo(i)->is_reportable())
-      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+                     StreamableToString(test_suite.reportable_test_count()));
+  if (!GTEST_FLAG(list_tests)) {
+    OutputXmlAttribute(stream, kTestsuite, "failures",
+                       StreamableToString(test_suite.failed_test_count()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "disabled",
+        StreamableToString(test_suite.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+    OutputXmlAttribute(stream, kTestsuite, "time",
+                       FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsIso8601(test_suite.start_timestamp()));
+    *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result());
+  }
+  *stream << ">\n";
+  for (int i = 0; i < test_suite.total_test_count(); ++i) {
+    if (test_suite.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
   }
   *stream << "  </" << kTestsuite << ">\n";
 }
 
 // Prints an XML summary of unit_test to output stream out.
-void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
-                                                const UnitTest& unit_test) {
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream *stream,
+                                                const UnitTest &unit_test) {
   const std::string kTestsuites = "testsuites";
 
   *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
@@ -3714,25 +3972,46 @@
       stream, kTestsuites, "disabled",
       StreamableToString(unit_test.reportable_disabled_test_count()));
   OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
   OutputXmlAttribute(
       stream, kTestsuites, "timestamp",
       FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
-  OutputXmlAttribute(stream, kTestsuites, "time",
-                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
 
   if (GTEST_FLAG(shuffle)) {
     OutputXmlAttribute(stream, kTestsuites, "random_seed",
                        StreamableToString(unit_test.random_seed()));
   }
-
   *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
 
   OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
   *stream << ">\n";
 
-  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
-    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
-      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0)
+      PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+void XmlUnitTestResultPrinter::PrintXmlTestsList(
+    std::ostream *stream, const std::vector<TestSuite *> &test_suites) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  int total_tests = 0;
+  for (auto test_suite : test_suites) {
+    total_tests += test_suite->total_test_count();
+  }
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(total_tests));
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (auto test_suite : test_suites) {
+    PrintXmlTestSuite(stream, *test_suite);
   }
   *stream << "</" << kTestsuites << ">\n";
 }
@@ -3740,18 +4019,394 @@
 // Produces a string representing the test properties in a result as space
 // delimited XML attributes based on the property key="value" pairs.
 std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
-    const TestResult& result) {
+    const TestResult &result) {
   Message attributes;
   for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty& property = result.GetTestProperty(i);
+    const TestProperty &property = result.GetTestProperty(i);
     attributes << " " << property.key() << "="
-        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+               << "\"" << EscapeXmlAttribute(property.value()) << "\"";
   }
   return attributes.GetString();
 }
 
+void XmlUnitTestResultPrinter::OutputXmlTestProperties(
+    std::ostream *stream, const TestResult &result) {
+  const std::string kProperties = "properties";
+  const std::string kProperty = "property";
+
+  if (result.test_property_count() <= 0) {
+    return;
+  }
+
+  *stream << "<" << kProperties << ">\n";
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty &property = result.GetTestProperty(i);
+    *stream << "<" << kProperty;
+    *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
+    *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
+    *stream << "/>\n";
+  }
+  *stream << "</" << kProperties << ">\n";
+}
+
 // End XmlUnitTestResultPrinter
 
+// This class generates an JSON output file.
+class JsonUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit JsonUnitTestResultPrinter(const char *output_file);
+
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+
+  // Prints an JSON summary of all unit tests.
+  static void PrintJsonTestList(::std::ostream *stream,
+                                const std::vector<TestSuite *> &test_suites);
+
+ private:
+  // Returns an JSON-escaped copy of the input string str.
+  static std::string EscapeJson(const std::string &str);
+
+  //// Verifies that the given attribute belongs to the given element and
+  //// streams the attribute as JSON.
+  static void OutputJsonKey(std::ostream *stream,
+                            const std::string &element_name,
+                            const std::string &name, const std::string &value,
+                            const std::string &indent, bool comma = true);
+  static void OutputJsonKey(std::ostream *stream,
+                            const std::string &element_name,
+                            const std::string &name, int value,
+                            const std::string &indent, bool comma = true);
+
+  // Streams a JSON representation of a TestInfo object.
+  static void OutputJsonTestInfo(::std::ostream *stream,
+                                 const char *test_suite_name,
+                                 const TestInfo &test_info);
+
+  // Prints a JSON representation of a TestSuite object
+  static void PrintJsonTestSuite(::std::ostream *stream,
+                                 const TestSuite &test_suite);
+
+  // Prints a JSON summary of unit_test to output stream out.
+  static void PrintJsonUnitTest(::std::ostream *stream,
+                                const UnitTest &unit_test);
+
+  // Produces a string representing the test properties in a result as
+  // a JSON dictionary.
+  static std::string TestPropertiesAsJson(const TestResult &result,
+                                          const std::string &indent);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+};
+
+// Creates a new JsonUnitTestResultPrinter.
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char *output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "JSON output file may not be null";
+  }
+}
+
+void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+                                                   int /*iteration*/) {
+  FILE *jsonout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintJsonUnitTest(&stream, unit_test);
+  fprintf(jsonout, "%s", StringStreamToString(&stream).c_str());
+  fclose(jsonout);
+}
+
+// Returns an JSON-escaped copy of the input string str.
+std::string JsonUnitTestResultPrinter::EscapeJson(const std::string &str) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '\\':
+      case '"':
+      case '/': m << '\\' << ch; break;
+      case '\b': m << "\\b"; break;
+      case '\t': m << "\\t"; break;
+      case '\n': m << "\\n"; break;
+      case '\f': m << "\\f"; break;
+      case '\r': m << "\\r"; break;
+      default:
+        if (ch < ' ') {
+          m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch));
+        } else {
+          m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// The following routines generate an JSON representation of a UnitTest
+// object.
+
+// Formats the given time in milliseconds as seconds.
+static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3) << "s";
+  return ss.str();
+}
+
+// Converts the given epoch time in milliseconds to a date string in the
+// RFC3339 format, without the timezone information.
+static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+}
+
+static inline std::string Indent(size_t width) {
+  return std::string(width, ' ');
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream *stream,
+                                              const std::string &element_name,
+                                              const std::string &name,
+                                              const std::string &value,
+                                              const std::string &indent,
+                                              bool comma) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
+  if (comma) *stream << ",\n";
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(
+    std::ostream *stream, const std::string &element_name,
+    const std::string &name, int value, const std::string &indent, bool comma) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": " << StreamableToString(value);
+  if (comma) *stream << ",\n";
+}
+
+// Prints a JSON representation of a TestInfo object.
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream *stream,
+                                                   const char *test_suite_name,
+                                                   const TestInfo &test_info) {
+  const TestResult &result = *test_info.result();
+  const std::string kTestsuite = "testcase";
+  const std::string kIndent = Indent(10);
+
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_info.name(), kIndent);
+
+  if (test_info.value_param() != nullptr) {
+    OutputJsonKey(stream, kTestsuite, "value_param", test_info.value_param(),
+                  kIndent);
+  }
+  if (test_info.type_param() != nullptr) {
+    OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
+                  kIndent);
+  }
+  if (GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+    *stream << "\n" << Indent(8) << "}";
+    return;
+  }
+
+  OutputJsonKey(stream, kTestsuite, "status",
+                test_info.should_run() ? "RUN" : "NOTRUN", kIndent);
+  OutputJsonKey(stream, kTestsuite, "result",
+                test_info.should_run()
+                    ? (result.Skipped() ? "SKIPPED" : "COMPLETED")
+                    : "SUPPRESSED",
+                kIndent);
+  OutputJsonKey(stream, kTestsuite, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuite, "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent);
+  OutputJsonKey(stream, kTestsuite, "classname", test_suite_name, kIndent,
+                false);
+  *stream << TestPropertiesAsJson(result, kIndent);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult &part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      *stream << ",\n";
+      if (++failures == 1) {
+        *stream << kIndent << "\""
+                << "failures"
+                << "\": [\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string message = EscapeJson(location + "\n" + part.message());
+      *stream << kIndent << "  {\n"
+              << kIndent << "    \"failure\": \"" << message << "\",\n"
+              << kIndent << "    \"type\": \"\"\n"
+              << kIndent << "  }";
+    }
+  }
+
+  if (failures > 0) *stream << "\n" << kIndent << "]";
+  *stream << "\n" << Indent(8) << "}";
+}
+
+// Prints an JSON representation of a TestSuite object
+void JsonUnitTestResultPrinter::PrintJsonTestSuite(
+    std::ostream *stream, const TestSuite &test_suite) {
+  const std::string kTestsuite = "testsuite";
+  const std::string kIndent = Indent(6);
+
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
+  OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
+                kIndent);
+  if (!GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "failures",
+                  test_suite.failed_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "disabled",
+                  test_suite.reportable_disabled_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent);
+    OutputJsonKey(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsRFC3339(test_suite.start_timestamp()),
+        kIndent);
+    OutputJsonKey(stream, kTestsuite, "time",
+                  FormatTimeInMillisAsDuration(test_suite.elapsed_time()),
+                  kIndent, false);
+    *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(), kIndent)
+            << ",\n";
+  }
+
+  *stream << kIndent << "\"" << kTestsuite << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < test_suite.total_test_count(); ++i) {
+    if (test_suite.GetTestInfo(i)->is_reportable()) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      OutputJsonTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
+    }
+  }
+  *stream << "\n" << kIndent << "]\n" << Indent(4) << "}";
+}
+
+// Prints a JSON summary of unit_test to output stream out.
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream *stream,
+                                                  const UnitTest &unit_test) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+
+  OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "disabled",
+                unit_test.reportable_disabled_test_count(), kIndent);
+  OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
+  if (GTEST_FLAG(shuffle)) {
+    OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
+                  kIndent);
+  }
+  OutputJsonKey(stream, kTestsuites, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "time",
+                FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent,
+                false);
+
+  *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent)
+          << ",\n";
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i));
+    }
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+
+void JsonUnitTestResultPrinter::PrintJsonTestList(
+    std::ostream *stream, const std::vector<TestSuite *> &test_suites) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+  int total_tests = 0;
+  for (auto test_suite : test_suites) {
+    total_tests += test_suite->total_test_count();
+  }
+  OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent);
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  for (size_t i = 0; i < test_suites.size(); ++i) {
+    if (i != 0) {
+      *stream << ",\n";
+    }
+    PrintJsonTestSuite(stream, *test_suites[i]);
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+// Produces a string representing the test properties in a result as
+// a JSON dictionary.
+std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
+    const TestResult &result, const std::string &indent) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty &property = result.GetTestProperty(i);
+    attributes << ",\n"
+               << indent << "\"" << property.key() << "\": "
+               << "\"" << EscapeJson(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End JsonUnitTestResultPrinter
+
 #if GTEST_CAN_STREAM_RESULTS_
 
 // Checks if str contains '=', '&', '%' or '\n' characters. If yes,
@@ -3759,8 +4414,8 @@
 // example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
 // in both time and space -- important as the input str may contain an
 // arbitrarily long test failure message and stack trace.
-string StreamingListener::UrlEncode(const char* str) {
-  string result;
+std::string StreamingListener::UrlEncode(const char *str) {
+  std::string result;
   result.reserve(strlen(str) + 1);
   for (char ch = *str; ch != '\0'; ch = *++str) {
     switch (ch) {
@@ -3770,9 +4425,7 @@
       case '\n':
         result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
         break;
-      default:
-        result.push_back(ch);
-        break;
+      default: result.push_back(ch); break;
     }
   }
   return result;
@@ -3784,24 +4437,24 @@
 
   addrinfo hints;
   memset(&hints, 0, sizeof(hints));
-  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_family = AF_UNSPEC;  // To allow both IPv4 and IPv6 addresses.
   hints.ai_socktype = SOCK_STREAM;
-  addrinfo* servinfo = NULL;
+  addrinfo *servinfo = nullptr;
 
   // Use the getaddrinfo() to get a linked list of IP addresses for
   // the given host name.
-  const int error_num = getaddrinfo(
-      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  const int error_num =
+      getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
   if (error_num != 0) {
     GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
                         << gai_strerror(error_num);
   }
 
   // Loop through all the results and connect to the first we can.
-  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
+  for (addrinfo *cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
        cur_addr = cur_addr->ai_next) {
-    sockfd_ = socket(
-        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+    sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+                     cur_addr->ai_protocol);
     if (sockfd_ != -1) {
       // Connect the client socket to the server socket.
       if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
@@ -3822,64 +4475,106 @@
 // End of class Streaming Listener
 #endif  // GTEST_CAN_STREAM_RESULTS__
 
-// Class ScopedTrace
-
-// Pushes the given source file location and message onto a per-thread
-// trace stack maintained by Google Test.
-ScopedTrace::ScopedTrace(const char* file, int line, const Message& message)
-    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
-  TraceInfo trace;
-  trace.file = file;
-  trace.line = line;
-  trace.message = message.GetString();
-
-  UnitTest::GetInstance()->PushGTestTrace(trace);
-}
-
-// Pops the info pushed by the c'tor.
-ScopedTrace::~ScopedTrace()
-    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
-  UnitTest::GetInstance()->PopGTestTrace();
-}
-
-
 // class OsStackTraceGetter
 
-const char* const OsStackTraceGetterInterface::kElidedFramesMarker =
+const char *const OsStackTraceGetterInterface::kElidedFramesMarker =
     "... " GTEST_NAME_ " internal frames ...";
 
-string OsStackTraceGetter::CurrentStackTrace(int /*max_depth*/,
-                                             int /*skip_count*/) {
+std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  std::string result;
+
+  if (max_depth <= 0) {
+    return result;
+  }
+
+  max_depth = std::min(max_depth, kMaxStackTraceDepth);
+
+  std::vector<void *> raw_stack(max_depth);
+  // Skips the frames requested by the caller, plus this function.
+  const int raw_stack_size =
+      absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1);
+
+  void *caller_frame = nullptr;
+  {
+    MutexLock lock(&mutex_);
+    caller_frame = caller_frame_;
+  }
+
+  for (int i = 0; i < raw_stack_size; ++i) {
+    if (raw_stack[i] == caller_frame &&
+        !GTEST_FLAG(show_internal_stack_frames)) {
+      // Add a marker to the trace and stop adding frames.
+      absl::StrAppend(&result, kElidedFramesMarker, "\n");
+      break;
+    }
+
+    char tmp[1024];
+    const char *symbol = "(unknown)";
+    if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) {
+      symbol = tmp;
+    }
+
+    char line[1024];
+    snprintf(line, sizeof(line), "  %p: %s\n", raw_stack[i], symbol);
+    result += line;
+  }
+
+  return result;
+
+#else   // !GTEST_HAS_ABSL
+  static_cast<void>(max_depth);
+  static_cast<void>(skip_count);
   return "";
+#endif  // GTEST_HAS_ABSL
 }
 
-void OsStackTraceGetter::UponLeavingGTest() {}
+void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  void *caller_frame = nullptr;
+  if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
+    caller_frame = nullptr;
+  }
+
+  MutexLock lock(&mutex_);
+  caller_frame_ = caller_frame;
+#endif  // GTEST_HAS_ABSL
+}
 
 // A helper class that creates the premature-exit file in its
 // constructor and deletes the file in its destructor.
 class ScopedPrematureExitFile {
  public:
-  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
-      : premature_exit_filepath_(premature_exit_filepath) {
+  explicit ScopedPrematureExitFile(const char *premature_exit_filepath)
+      : premature_exit_filepath_(
+            premature_exit_filepath ? premature_exit_filepath : "") {
     // If a path to the premature-exit file is specified...
-    if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') {
+    if (!premature_exit_filepath_.empty()) {
       // create the file with a single "0" character in it.  I/O
       // errors are ignored as there's nothing better we can do and we
       // don't want to fail the test because of this.
-      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      FILE *pfile = posix::FOpen(premature_exit_filepath, "w");
       fwrite("0", 1, 1, pfile);
       fclose(pfile);
     }
   }
 
   ~ScopedPrematureExitFile() {
-    if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') {
-      remove(premature_exit_filepath_);
+#if !defined GTEST_OS_ESP8266
+    if (!premature_exit_filepath_.empty()) {
+      int retval = remove(premature_exit_filepath_.c_str());
+      if (retval) {
+        GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \""
+                          << premature_exit_filepath_ << "\" with error "
+                          << retval;
+      }
     }
+#endif
   }
 
  private:
-  const char* const premature_exit_filepath_;
+  const std::string premature_exit_filepath_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
 };
@@ -3890,9 +4585,7 @@
 
 TestEventListeners::TestEventListeners()
     : repeater_(new internal::TestEventRepeater()),
-      default_result_printer_(NULL),
-      default_xml_generator_(NULL) {
-}
+      default_result_printer_(nullptr), default_xml_generator_(nullptr) {}
 
 TestEventListeners::~TestEventListeners() { delete repeater_; }
 
@@ -3900,38 +4593,37 @@
 // output.  Can be removed from the listeners list to shut down default
 // console output.  Note that removing this object from the listener list
 // with Release transfers its ownership to the user.
-void TestEventListeners::Append(TestEventListener* listener) {
+void TestEventListeners::Append(TestEventListener *listener) {
   repeater_->Append(listener);
 }
 
 // Removes the given event listener from the list and returns it.  It then
 // becomes the caller's responsibility to delete the listener. Returns
 // NULL if the listener is not found in the list.
-TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+TestEventListener *TestEventListeners::Release(TestEventListener *listener) {
   if (listener == default_result_printer_)
-    default_result_printer_ = NULL;
+    default_result_printer_ = nullptr;
   else if (listener == default_xml_generator_)
-    default_xml_generator_ = NULL;
+    default_xml_generator_ = nullptr;
   return repeater_->Release(listener);
 }
 
 // Returns repeater that broadcasts the TestEventListener events to all
 // subscribers.
-TestEventListener* TestEventListeners::repeater() { return repeater_; }
+TestEventListener *TestEventListeners::repeater() { return repeater_; }
 
 // Sets the default_result_printer attribute to the provided listener.
 // The listener is also added to the listener list and previous
 // default_result_printer is removed from it and deleted. The listener can
 // also be NULL in which case it will not be added to the list. Does
 // nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener *listener) {
   if (default_result_printer_ != listener) {
     // It is an error to pass this method a listener that is already in the
     // list.
     delete Release(default_result_printer_);
     default_result_printer_ = listener;
-    if (listener != NULL)
-      Append(listener);
+    if (listener != nullptr) Append(listener);
   }
 }
 
@@ -3940,14 +4632,13 @@
 // default_xml_generator is removed from it and deleted. The listener can
 // also be NULL in which case it will not be added to the list. Does
 // nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener *listener) {
   if (default_xml_generator_ != listener) {
     // It is an error to pass this method a listener that is already in the
     // list.
     delete Release(default_xml_generator_);
     default_xml_generator_ = listener;
-    if (listener != NULL)
-      Append(listener);
+    if (listener != nullptr) Append(listener);
   }
 }
 
@@ -3970,53 +4661,67 @@
 // We don't protect this under mutex_ as a user is not supposed to
 // call this before main() starts, from which point on the return
 // value will never change.
-UnitTest* UnitTest::GetInstance() {
-  // When compiled with MSVC 7.1 in optimized mode, destroying the
-  // UnitTest object upon exiting the program messes up the exit code,
-  // causing successful tests to appear failed.  We have to use a
-  // different implementation in this case to bypass the compiler bug.
-  // This implementation makes the compiler happy, at the cost of
-  // leaking the UnitTest object.
-
+UnitTest *UnitTest::GetInstance() {
   // CodeGear C++Builder insists on a public destructor for the
   // default implementation.  Use this implementation to keep good OO
   // design with private destructor.
 
-#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
-  static UnitTest* const instance = new UnitTest;
+#if defined(__BORLANDC__)
+  static UnitTest *const instance = new UnitTest;
   return instance;
 #else
   static UnitTest instance;
   return &instance;
-#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+#endif  // defined(__BORLANDC__)
 }
 
-// Gets the number of successful test cases.
-int UnitTest::successful_test_case_count() const {
-  return impl()->successful_test_case_count();
+// Gets the number of successful test suites.
+int UnitTest::successful_test_suite_count() const {
+  return impl()->successful_test_suite_count();
 }
 
-// Gets the number of failed test cases.
-int UnitTest::failed_test_case_count() const {
-  return impl()->failed_test_case_count();
+// Gets the number of failed test suites.
+int UnitTest::failed_test_suite_count() const {
+  return impl()->failed_test_suite_count();
 }
 
-// Gets the number of all test cases.
-int UnitTest::total_test_case_count() const {
-  return impl()->total_test_case_count();
+// Gets the number of all test suites.
+int UnitTest::total_test_suite_count() const {
+  return impl()->total_test_suite_count();
 }
 
-// Gets the number of all test cases that contain at least one test
+// Gets the number of all test suites that contain at least one test
 // that should run.
-int UnitTest::test_case_to_run_count() const {
-  return impl()->test_case_to_run_count();
+int UnitTest::test_suite_to_run_count() const {
+  return impl()->test_suite_to_run_count();
 }
 
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_suite_count();
+}
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_suite_count();
+}
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_suite_count();
+}
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_suite_to_run_count();
+}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
 // Gets the number of successful tests.
 int UnitTest::successful_test_count() const {
   return impl()->successful_test_count();
 }
 
+// Gets the number of skipped tests.
+int UnitTest::skipped_test_count() const {
+  return impl()->skipped_test_count();
+}
+
 // Gets the number of failed tests.
 int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
 
@@ -4044,7 +4749,7 @@
 // Gets the time of the test program start, in ms from the start of the
 // UNIX epoch.
 internal::TimeInMillis UnitTest::start_timestamp() const {
-    return impl()->start_timestamp();
+  return impl()->start_timestamp();
 }
 
 // Gets the elapsed time, in milliseconds.
@@ -4052,36 +4757,42 @@
   return impl()->elapsed_time();
 }
 
-// Returns true iff the unit test passed (i.e. all test cases passed).
+// Returns true if and only if the unit test passed (i.e. all test suites
+// passed).
 bool UnitTest::Passed() const { return impl()->Passed(); }
 
-// Returns true iff the unit test failed (i.e. some test case failed
-// or something outside of all tests failed).
+// Returns true if and only if the unit test failed (i.e. some test suite
+// failed or something outside of all tests failed).
 bool UnitTest::Failed() const { return impl()->Failed(); }
 
-// Gets the i-th test case among all the test cases. i can range from 0 to
-// total_test_case_count() - 1. If i is not in that range, returns NULL.
-const TestCase* UnitTest::GetTestCase(int i) const {
-  return impl()->GetTestCase(i);
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+const TestSuite *UnitTest::GetTestSuite(int i) const {
+  return impl()->GetTestSuite(i);
 }
 
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase *UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
 // Returns the TestResult containing information on test failures and
-// properties logged outside of individual test cases.
-const TestResult& UnitTest::ad_hoc_test_result() const {
+// properties logged outside of individual test suites.
+const TestResult &UnitTest::ad_hoc_test_result() const {
   return *impl()->ad_hoc_test_result();
 }
 
-// Gets the i-th test case among all the test cases. i can range from 0 to
-// total_test_case_count() - 1. If i is not in that range, returns NULL.
-TestCase* UnitTest::GetMutableTestCase(int i) {
-  return impl()->GetMutableTestCase(i);
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+TestSuite *UnitTest::GetMutableTestSuite(int i) {
+  return impl()->GetMutableSuiteCase(i);
 }
 
 // Returns the list of event listeners that can be used to track events
 // inside Google Test.
-TestEventListeners& UnitTest::listeners() {
-  return *impl()->listeners();
-}
+TestEventListeners &UnitTest::listeners() { return *impl()->listeners(); }
 
 // Registers and returns a global test environment.  When a test
 // program is run, all global test environments will be set-up in the
@@ -4093,9 +4804,9 @@
 //
 // We don't protect this under mutex_, as we only support calling it
 // from the main thread.
-Environment* UnitTest::AddEnvironment(Environment* env) {
-  if (env == NULL) {
-    return NULL;
+Environment *UnitTest::AddEnvironment(Environment *env) {
+  if (env == nullptr) {
+    return nullptr;
   }
 
   impl_->environments().push_back(env);
@@ -4106,12 +4817,11 @@
 // assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
 // this to report their results.  The user code should use the
 // assertion macros instead of calling this directly.
-void UnitTest::AddTestPartResult(
-    TestPartResult::Type result_type,
-    const char* file_name,
-    int line_number,
-    const std::string& message,
-    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+                                 const char *file_name, int line_number,
+                                 const std::string &message,
+                                 const std::string &os_stack_trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
   Message msg;
   msg << message;
 
@@ -4119,25 +4829,25 @@
   if (impl_->gtest_trace_stack().size() > 0) {
     msg << "\n" << GTEST_NAME_ << " trace:";
 
-    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
-         i > 0; --i) {
-      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
-      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
-          << " " << trace.message;
+    for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
+      const internal::TraceInfo &trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n"
+          << internal::FormatFileLocation(trace.file, trace.line) << " "
+          << trace.message;
     }
   }
 
-  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+  if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) {
     msg << internal::kStackTraceMarker << os_stack_trace;
   }
 
-  const TestPartResult result =
-    TestPartResult(result_type, file_name, line_number,
-                   msg.GetString().c_str());
-  impl_->GetTestPartResultReporterForCurrentThread()->
-      ReportTestPartResult(result);
+  const TestPartResult result = TestPartResult(
+      result_type, file_name, line_number, msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      result);
 
-  if (result_type != TestPartResult::kSuccess) {
+  if (result_type != TestPartResult::kSuccess &&
+      result_type != TestPartResult::kSkip) {
     // gtest_break_on_failure takes precedence over
     // gtest_throw_on_failure.  This allows a user to set the latter
     // in the code (perhaps in order to use Google Test assertions
@@ -4149,12 +4859,16 @@
       // when a failure happens and both the --gtest_break_on_failure and
       // the --gtest_catch_exceptions flags are specified.
       DebugBreak();
+#elif (!defined(__native_client__)) &&            \
+    ((defined(__clang__) || defined(__GNUC__)) && \
+     (defined(__x86_64__) || defined(__i386__)))
+      // with clang/gcc we can achieve the same effect on x86 by invoking int3
+      asm("int3");
 #else
-      // Dereference NULL through a volatile pointer to prevent the compiler
+      // Dereference nullptr through a volatile pointer to prevent the compiler
       // from removing. We use this rather than abort() or __builtin_trap() for
-      // portability: Symbian doesn't implement abort() well, and some debuggers
-      // don't correctly trap abort().
-      *static_cast<volatile int*>(NULL) = 1;
+      // portability: some debuggers don't correctly trap abort().
+      *static_cast<volatile int *>(nullptr) = 1;
 #endif  // GTEST_OS_WINDOWS
     } else if (GTEST_FLAG(throw_on_failure)) {
 #if GTEST_HAS_EXCEPTIONS
@@ -4169,12 +4883,12 @@
 }
 
 // Adds a TestProperty to the current TestResult object when invoked from
-// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
-// from SetUpTestCase or TearDownTestCase, or to the global property set
+// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
+// from SetUpTestSuite or TearDownTestSuite, or to the global property set
 // when invoked elsewhere.  If the result already contains a property with
 // the same key, the value will be updated.
-void UnitTest::RecordProperty(const std::string& key,
-                              const std::string& value) {
+void UnitTest::RecordProperty(const std::string &key,
+                              const std::string &value) {
   impl_->RecordProperty(TestProperty(key, value));
 }
 
@@ -4209,75 +4923,90 @@
   // that understands the premature-exit-file protocol to report the
   // test as having failed.
   const internal::ScopedPrematureExitFile premature_exit_file(
-      in_death_test_child_process ?
-      NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+      in_death_test_child_process
+          ? nullptr
+          : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
 
   // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
   // used for the duration of the program.
   impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
 
-#if GTEST_HAS_SEH
+#if GTEST_OS_WINDOWS
   // Either the user wants Google Test to catch exceptions thrown by the
   // tests or this is executing in the context of death test child
   // process. In either case the user does not want to see pop-up dialogs
   // about crashes - they are expected.
   if (impl()->catch_exceptions() || in_death_test_child_process) {
-# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
     // SetErrorMode doesn't exist on CE.
     SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
                  SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-# endif  // !GTEST_OS_WINDOWS_MOBILE
+#endif  // !GTEST_OS_WINDOWS_MOBILE
 
-# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
     // Death test children can be terminated with _abort().  On Windows,
     // _abort() can show a dialog with a warning message.  This forces the
     // abort message to go to stderr instead.
     _set_error_mode(_OUT_TO_STDERR);
-# endif
+#endif
 
-# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
     // In the debug version, Visual Studio pops up a separate dialog
     // offering a choice to debug the aborted program. We need to suppress
     // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
     // executed. Google Test will notify the user of any unexpected
     // failure via stderr.
-    //
-    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
-    // Users of prior VC versions shall suffer the agony and pain of
-    // clicking through the countless debug dialogs.
-    // TODO(vladl@google.com): find a way to suppress the abort dialog() in the
-    // debug mode when compiled with VC 7.1 or lower.
     if (!GTEST_FLAG(break_on_failure))
       _set_abort_behavior(
           0x0,                                    // Clear the following flags:
           _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
-# endif
+
+    // In debug mode, the Windows CRT can crash with an assertion over invalid
+    // input (e.g. passing an invalid file descriptor).  The default handling
+    // for these assertions is to pop up a dialog and wait for user input.
+    // Instead ask the CRT to dump such assertions to stderr non-interactively.
+    if (!IsDebuggerPresent()) {
+      (void)_CrtSetReportMode(_CRT_ASSERT,
+                              _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
+      (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
+    }
+#endif
   }
-#endif  // GTEST_HAS_SEH
+#endif  // GTEST_OS_WINDOWS
 
   return internal::HandleExceptionsInMethodIfSupported(
-      impl(),
-      &internal::UnitTestImpl::RunAllTests,
-      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+             impl(), &internal::UnitTestImpl::RunAllTests,
+             "auxiliary test code (environments or event listeners)")
+             ? 0
+             : 1;
 }
 
 // Returns the working directory when the first TEST() or TEST_F() was
 // executed.
-const char* UnitTest::original_working_dir() const {
+const char *UnitTest::original_working_dir() const {
   return impl_->original_working_dir_.c_str();
 }
 
-// Returns the TestCase object for the test that's currently running,
+// Returns the TestSuite object for the test that's currently running,
 // or NULL if no test is running.
-const TestCase* UnitTest::current_test_case() const
+const TestSuite *UnitTest::current_test_suite() const
     GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
-  return impl_->current_test_case();
+  return impl_->current_test_suite();
 }
 
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase *UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_suite();
+}
+#endif
+
 // Returns the TestInfo object for the test that's currently running,
 // or NULL if no test is running.
-const TestInfo* UnitTest::current_test_info() const
+const TestInfo *UnitTest::current_test_info() const
     GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   return impl_->current_test_info();
@@ -4286,67 +5015,51 @@
 // Returns the random seed used at the start of the current test run.
 int UnitTest::random_seed() const { return impl_->random_seed(); }
 
-#if GTEST_HAS_PARAM_TEST
-// Returns ParameterizedTestCaseRegistry object used to keep track of
+// Returns ParameterizedTestSuiteRegistry object used to keep track of
 // value-parameterized tests and instantiate and register them.
-internal::ParameterizedTestCaseRegistry&
-    UnitTest::parameterized_test_registry()
-        GTEST_LOCK_EXCLUDED_(mutex_) {
+internal::ParameterizedTestSuiteRegistry &
+UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
   return impl_->parameterized_test_registry();
 }
-#endif  // GTEST_HAS_PARAM_TEST
 
 // Creates an empty UnitTest.
-UnitTest::UnitTest() {
-  impl_ = new internal::UnitTestImpl(this);
-}
+UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
 
 // Destructor of UnitTest.
-UnitTest::~UnitTest() {
-  delete impl_;
-}
+UnitTest::~UnitTest() { delete impl_; }
 
 // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
 // Google Test trace stack.
-void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+void UnitTest::PushGTestTrace(const internal::TraceInfo &trace)
     GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   impl_->gtest_trace_stack().push_back(trace);
 }
 
 // Pops a trace from the per-thread Google Test trace stack.
-void UnitTest::PopGTestTrace()
-    GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   impl_->gtest_trace_stack().pop_back();
 }
 
 namespace internal {
 
-UnitTestImpl::UnitTestImpl(UnitTest* parent)
+UnitTestImpl::UnitTestImpl(UnitTest *parent)
     : parent_(parent),
       GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
-      default_global_test_part_result_reporter_(this),
+          default_global_test_part_result_reporter_(this),
       default_per_thread_test_part_result_reporter_(this),
-      GTEST_DISABLE_MSC_WARNINGS_POP_()
-      global_test_part_result_repoter_(
+      GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_(
           &default_global_test_part_result_reporter_),
       per_thread_test_part_result_reporter_(
           &default_per_thread_test_part_result_reporter_),
-#if GTEST_HAS_PARAM_TEST
-      parameterized_test_registry_(),
-      parameterized_tests_registered_(false),
-#endif  // GTEST_HAS_PARAM_TEST
-      last_death_test_case_(-1),
-      current_test_case_(NULL),
-      current_test_info_(NULL),
-      ad_hoc_test_result_(),
-      os_stack_trace_getter_(NULL),
-      post_flag_parse_init_performed_(false),
+      parameterized_test_registry_(), parameterized_tests_registered_(false),
+      last_death_test_suite_(-1), current_test_suite_(nullptr),
+      current_test_info_(nullptr), ad_hoc_test_result_(),
+      os_stack_trace_getter_(nullptr), post_flag_parse_init_performed_(false),
       random_seed_(0),  // Will be overridden by the flag before first use.
-      random_(0),  // Will be reseeded before first use.
-      start_timestamp_(0),
-      elapsed_time_(0),
+      random_(0),       // Will be reseeded before first use.
+      start_timestamp_(0), elapsed_time_(0),
 #if GTEST_HAS_DEATH_TEST
       death_test_factory_(new DefaultDeathTestFactory),
 #endif
@@ -4356,8 +5069,8 @@
 }
 
 UnitTestImpl::~UnitTestImpl() {
-  // Deletes every TestCase.
-  ForEach(test_cases_, internal::Delete<TestCase>);
+  // Deletes every TestSuite.
+  ForEach(test_suites_, internal::Delete<TestSuite>);
 
   // Deletes every Environment.
   ForEach(environments_, internal::Delete<Environment>);
@@ -4366,20 +5079,20 @@
 }
 
 // Adds a TestProperty to the current TestResult object when invoked in a
-// context of a test, to current test case's ad_hoc_test_result when invoke
-// from SetUpTestCase/TearDownTestCase, or to the global property set
+// context of a test, to current test suite's ad_hoc_test_result when invoke
+// from SetUpTestSuite/TearDownTestSuite, or to the global property set
 // otherwise.  If the result already contains a property with the same key,
 // the value will be updated.
-void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+void UnitTestImpl::RecordProperty(const TestProperty &test_property) {
   std::string xml_element;
-  TestResult* test_result;  // TestResult appropriate for property recording.
+  TestResult *test_result;  // TestResult appropriate for property recording.
 
-  if (current_test_info_ != NULL) {
+  if (current_test_info_ != nullptr) {
     xml_element = "testcase";
     test_result = &(current_test_info_->result_);
-  } else if (current_test_case_ != NULL) {
+  } else if (current_test_suite_ != nullptr) {
     xml_element = "testsuite";
-    test_result = &(current_test_case_->ad_hoc_test_result_);
+    test_result = &(current_test_suite_->ad_hoc_test_result_);
   } else {
     xml_element = "testsuites";
     test_result = &ad_hoc_test_result_;
@@ -4391,7 +5104,7 @@
 // Disables event forwarding if the control is currently in a death test
 // subprocess. Must not be called before InitGoogleTest.
 void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
-  if (internal_run_death_test_flag_.get() != NULL)
+  if (internal_run_death_test_flag_.get() != nullptr)
     listeners()->SuppressEventForwarding();
 }
 #endif  // GTEST_HAS_DEATH_TEST
@@ -4399,14 +5112,16 @@
 // Initializes event listeners performing XML output as specified by
 // UnitTestOptions. Must not be called before InitGoogleTest.
 void UnitTestImpl::ConfigureXmlOutput() {
-  const std::string& output_format = UnitTestOptions::GetOutputFormat();
+  const std::string &output_format = UnitTestOptions::GetOutputFormat();
   if (output_format == "xml") {
     listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
         UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format == "json") {
+    listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
   } else if (output_format != "") {
-    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
-           output_format.c_str());
-    fflush(stdout);
+    GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
+                        << output_format << "\" ignored.";
   }
 }
 
@@ -4414,16 +5129,15 @@
 // Initializes event listeners for streaming test results in string form.
 // Must not be called before InitGoogleTest.
 void UnitTestImpl::ConfigureStreamingOutput() {
-  const std::string& target = GTEST_FLAG(stream_result_to);
+  const std::string &target = GTEST_FLAG(stream_result_to);
   if (!target.empty()) {
     const size_t pos = target.find(':');
     if (pos != std::string::npos) {
-      listeners()->Append(new StreamingListener(target.substr(0, pos),
-                                                target.substr(pos+1)));
+      listeners()->Append(
+          new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
     } else {
-      printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
-             target.c_str());
-      fflush(stdout);
+      GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
+                          << "\" ignored.";
     }
   }
 }
@@ -4462,83 +5176,89 @@
     // Configures listeners for streaming test results to the specified server.
     ConfigureStreamingOutput();
 #endif  // GTEST_CAN_STREAM_RESULTS_
+
+#if GTEST_HAS_ABSL
+    if (GTEST_FLAG(install_failure_signal_handler)) {
+      absl::FailureSignalHandlerOptions options;
+      absl::InstallFailureSignalHandler(options);
+    }
+#endif  // GTEST_HAS_ABSL
   }
 }
 
-// A predicate that checks the name of a TestCase against a known
+// A predicate that checks the name of a TestSuite against a known
 // value.
 //
 // This is used for implementation of the UnitTest class only.  We put
 // it in the anonymous namespace to prevent polluting the outer
 // namespace.
 //
-// TestCaseNameIs is copyable.
-class TestCaseNameIs {
+// TestSuiteNameIs is copyable.
+class TestSuiteNameIs {
  public:
   // Constructor.
-  explicit TestCaseNameIs(const std::string& name)
-      : name_(name) {}
+  explicit TestSuiteNameIs(const std::string &name) : name_(name) {}
 
-  // Returns true iff the name of test_case matches name_.
-  bool operator()(const TestCase* test_case) const {
-    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+  // Returns true if and only if the name of test_suite matches name_.
+  bool operator()(const TestSuite *test_suite) const {
+    return test_suite != nullptr &&
+           strcmp(test_suite->name(), name_.c_str()) == 0;
   }
 
  private:
   std::string name_;
 };
 
-// Finds and returns a TestCase with the given name.  If one doesn't
+// Finds and returns a TestSuite with the given name.  If one doesn't
 // exist, creates one and returns it.  It's the CALLER'S
 // RESPONSIBILITY to ensure that this function is only called WHEN THE
 // TESTS ARE NOT SHUFFLED.
 //
 // Arguments:
 //
-//   test_case_name: name of the test case
-//   type_param:     the name of the test case's type parameter, or NULL if
-//                   this is not a typed or a type-parameterized test case.
-//   set_up_tc:      pointer to the function that sets up the test case
-//   tear_down_tc:   pointer to the function that tears down the test case
-TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
-                                    const char* type_param,
-                                    Test::SetUpTestCaseFunc set_up_tc,
-                                    Test::TearDownTestCaseFunc tear_down_tc) {
-  // Can we find a TestCase with the given name?
-  const std::vector<TestCase*>::const_iterator test_case =
-      std::find_if(test_cases_.begin(), test_cases_.end(),
-                   TestCaseNameIs(test_case_name));
+//   test_suite_name: name of the test suite
+//   type_param:     the name of the test suite's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test suite.
+//   set_up_tc:      pointer to the function that sets up the test suite
+//   tear_down_tc:   pointer to the function that tears down the test suite
+TestSuite *UnitTestImpl::GetTestSuite(
+    const char *test_suite_name, const char *type_param,
+    internal::SetUpTestSuiteFunc set_up_tc,
+    internal::TearDownTestSuiteFunc tear_down_tc) {
+  // Can we find a TestSuite with the given name?
+  const auto test_suite =
+      std::find_if(test_suites_.rbegin(), test_suites_.rend(),
+                   TestSuiteNameIs(test_suite_name));
 
-  if (test_case != test_cases_.end())
-    return *test_case;
+  if (test_suite != test_suites_.rend()) return *test_suite;
 
   // No.  Let's create one.
-  TestCase* const new_test_case =
-      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+  auto *const new_test_suite =
+      new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
 
-  // Is this a death test case?
-  if (internal::UnitTestOptions::MatchesFilter(test_case_name,
-                                               kDeathTestCaseFilter)) {
-    // Yes.  Inserts the test case after the last death test case
-    // defined so far.  This only works when the test cases haven't
+  // Is this a death test suite?
+  if (internal::UnitTestOptions::MatchesFilter(test_suite_name,
+                                               kDeathTestSuiteFilter)) {
+    // Yes.  Inserts the test suite after the last death test suite
+    // defined so far.  This only works when the test suites haven't
     // been shuffled.  Otherwise we may end up running a death test
     // after a non-death test.
-    ++last_death_test_case_;
-    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
-                       new_test_case);
+    ++last_death_test_suite_;
+    test_suites_.insert(test_suites_.begin() + last_death_test_suite_,
+                        new_test_suite);
   } else {
     // No.  Appends to the end of the list.
-    test_cases_.push_back(new_test_case);
+    test_suites_.push_back(new_test_suite);
   }
 
-  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
-  return new_test_case;
+  test_suite_indices_.push_back(static_cast<int>(test_suite_indices_.size()));
+  return new_test_suite;
 }
 
 // Helpers for setting up / tearing down the given environment.  They
 // are for use in the ForEach() function.
-static void SetUpEnvironment(Environment* env) { env->SetUp(); }
-static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+static void SetUpEnvironment(Environment *env) { env->SetUp(); }
+static void TearDownEnvironment(Environment *env) { env->TearDown(); }
 
 // Runs all tests in this UnitTest object, prints the result, and
 // returns true if all tests are successful.  If any exception is
@@ -4550,17 +5270,12 @@
 // All other functions called from RunAllTests() may safely assume that
 // parameterized tests are ready to be counted and run.
 bool UnitTestImpl::RunAllTests() {
-  // Makes sure InitGoogleTest() was called.
-  if (!GTestIsInitialized()) {
-    printf("%s",
-           "\nThis test program did NOT call ::testing::InitGoogleTest "
-           "before calling RUN_ALL_TESTS().  Please fix it.\n");
-    return false;
-  }
+  // True if and only if Google Test is initialized before RUN_ALL_TESTS() is
+  // called.
+  const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
 
   // Do not run any test if the --help flag was specified.
-  if (g_help_flag)
-    return true;
+  if (g_help_flag) return true;
 
   // Repeats the call to the post-flag parsing initialization in case the
   // user didn't call InitGoogleTest.
@@ -4571,17 +5286,18 @@
   // protocol.
   internal::WriteToShardStatusFileIfNeeded();
 
-  // True iff we are in a subprocess for running a thread-safe-style
+  // True if and only if we are in a subprocess for running a thread-safe-style
   // death test.
   bool in_subprocess_for_death_test = false;
 
 #if GTEST_HAS_DEATH_TEST
-  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
-# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+  in_subprocess_for_death_test =
+      (internal_run_death_test_flag_.get() != nullptr);
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
   if (in_subprocess_for_death_test) {
     GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
   }
-# endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
 #endif  // GTEST_HAS_DEATH_TEST
 
   const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
@@ -4589,9 +5305,9 @@
 
   // Compares the full test names with the filter to decide which
   // tests to run.
-  const bool has_tests_to_run = FilterTests(should_shard
-                                              ? HONOR_SHARDING_PROTOCOL
-                                              : IGNORE_SHARDING_PROTOCOL) > 0;
+  const bool has_tests_to_run =
+      FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+                               : IGNORE_SHARDING_PROTOCOL) > 0;
 
   // Lists the tests and exits if the --gtest_list_tests flag was specified.
   if (GTEST_FLAG(list_tests)) {
@@ -4600,13 +5316,13 @@
     return true;
   }
 
-  random_seed_ = GTEST_FLAG(shuffle) ?
-      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+  random_seed_ =
+      GTEST_FLAG(shuffle) ? GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
 
-  // True iff at least one test has failed.
+  // True if and only if at least one test has failed.
   bool failed = false;
 
-  TestEventListener* repeater = listeners()->repeater();
+  TestEventListener *repeater = listeners()->repeater();
 
   start_timestamp_ = GetTimeInMillis();
   repeater->OnTestProgramStart(*parent_);
@@ -4615,17 +5331,17 @@
   // when we are inside the subprocess of a death test.
   const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
   // Repeats forever if the repeat count is negative.
-  const bool forever = repeat < 0;
-  for (int i = 0; forever || i != repeat; i++) {
+  const bool gtest_repeat_forever = repeat < 0;
+  for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
     // We want to preserve failures generated by ad-hoc test
     // assertions executed before RUN_ALL_TESTS().
     ClearNonAdHocTestResult();
 
     const TimeInMillis start = GetTimeInMillis();
 
-    // Shuffles test cases and tests if requested.
+    // Shuffles test suites and tests if requested.
     if (has_tests_to_run && GTEST_FLAG(shuffle)) {
-      random()->Reseed(random_seed_);
+      random()->Reseed(static_cast<uint32_t>(random_seed_));
       // This should be done before calling OnTestIterationStart(),
       // such that a test event listener can see the actual test order
       // in the event.
@@ -4635,19 +5351,33 @@
     // Tells the unit test event listeners that the tests are about to start.
     repeater->OnTestIterationStart(*parent_, i);
 
-    // Runs each test case if there is at least one test to run.
+    // Runs each test suite if there is at least one test to run.
     if (has_tests_to_run) {
       // Sets up all environments beforehand.
       repeater->OnEnvironmentsSetUpStart(*parent_);
       ForEach(environments_, SetUpEnvironment);
       repeater->OnEnvironmentsSetUpEnd(*parent_);
 
-      // Runs the tests only if there was no fatal failure during global
-      // set-up.
-      if (!Test::HasFatalFailure()) {
-        for (int test_index = 0; test_index < total_test_case_count();
+      // Runs the tests only if there was no fatal failure or skip triggered
+      // during global set-up.
+      if (Test::IsSkipped()) {
+        // Emit diagnostics when global set-up calls skip, as it will not be
+        // emitted by default.
+        TestResult &test_result =
+            *internal::GetUnitTestImpl()->current_test_result();
+        for (int j = 0; j < test_result.total_part_count(); ++j) {
+          const TestPartResult &test_part_result =
+              test_result.GetTestPartResult(j);
+          if (test_part_result.type() == TestPartResult::kSkip) {
+            const std::string &result = test_part_result.message();
+            printf("%s\n", result.c_str());
+          }
+        }
+        fflush(stdout);
+      } else if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_suite_count();
              test_index++) {
-          GetMutableTestCase(test_index)->Run();
+          GetMutableSuiteCase(test_index)->Run();
         }
       }
 
@@ -4684,6 +5414,20 @@
 
   repeater->OnTestProgramEnd(*parent_);
 
+  if (!gtest_is_initialized_before_run_all_tests) {
+    ColoredPrintf(
+        COLOR_RED,
+        "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
+        "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
+        "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
+        " will start to enforce the valid usage. "
+        "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
+#if GTEST_FOR_GOOGLE_
+    ColoredPrintf(COLOR_RED,
+                  "For more details, see http://wiki/Main/ValidGUnitMain.\n");
+#endif  // GTEST_FOR_GOOGLE_
+  }
+
   return !failed;
 }
 
@@ -4692,10 +5436,10 @@
 // function will write over it. If the variable is present, but the file cannot
 // be created, prints an error and exits.
 void WriteToShardStatusFileIfNeeded() {
-  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
-  if (test_shard_file != NULL) {
-    FILE* const file = posix::FOpen(test_shard_file, "w");
-    if (file == NULL) {
+  const char *const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  if (test_shard_file != nullptr) {
+    FILE *const file = posix::FOpen(test_shard_file, "w");
+    if (file == nullptr) {
       ColoredPrintf(COLOR_RED,
                     "Could not write to the test shard status file \"%s\" "
                     "specified by the %s environment variable.\n",
@@ -4713,41 +5457,40 @@
 // an error and exits. If in_subprocess_for_death_test, sharding is
 // disabled because it must only be applied to the original test
 // process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char* total_shards_env,
-                 const char* shard_index_env,
+bool ShouldShard(const char *total_shards_env, const char *shard_index_env,
                  bool in_subprocess_for_death_test) {
   if (in_subprocess_for_death_test) {
     return false;
   }
 
-  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
-  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+  const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1);
 
   if (total_shards == -1 && shard_index == -1) {
     return false;
   } else if (total_shards == -1 && shard_index != -1) {
-    const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestShardIndex << " = " << shard_index
-      << ", but have left " << kTestTotalShards << " unset.\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    const Message msg = Message() << "Invalid environment variables: you have "
+                                  << kTestShardIndex << " = " << shard_index
+                                  << ", but have left " << kTestTotalShards
+                                  << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   } else if (total_shards != -1 && shard_index == -1) {
     const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestTotalShards << " = " << total_shards
-      << ", but have left " << kTestShardIndex << " unset.\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+                        << "Invalid environment variables: you have "
+                        << kTestTotalShards << " = " << total_shards
+                        << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   } else if (shard_index < 0 || shard_index >= total_shards) {
-    const Message msg = Message()
-      << "Invalid environment variables: we require 0 <= "
-      << kTestShardIndex << " < " << kTestTotalShards
-      << ", but you have " << kTestShardIndex << "=" << shard_index
-      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    const Message msg =
+        Message() << "Invalid environment variables: we require 0 <= "
+                  << kTestShardIndex << " < " << kTestTotalShards
+                  << ", but you have " << kTestShardIndex << "=" << shard_index
+                  << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   }
@@ -4758,13 +5501,13 @@
 // Parses the environment variable var as an Int32. If it is unset,
 // returns default_val. If it is not an Int32, prints an error
 // and aborts.
-Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
-  const char* str_val = posix::GetEnv(var);
-  if (str_val == NULL) {
+int32_t Int32FromEnvOrDie(const char *var, int32_t default_val) {
+  const char *str_val = posix::GetEnv(var);
+  if (str_val == nullptr) {
     return default_val;
   }
 
-  Int32 result;
+  int32_t result;
   if (!ParseInt32(Message() << "The value of environment variable " << var,
                   str_val, &result)) {
     exit(EXIT_FAILURE);
@@ -4773,8 +5516,8 @@
 }
 
 // Given the total number of shards, the shard index, and the test id,
-// returns true iff the test should be run on this shard. The test id is
-// some arbitrary but unique non-negative integer assigned to each test
+// returns true if and only if the test should be run on this shard. The test id
+// is some arbitrary but unique non-negative integer assigned to each test
 // method. Assumes that 0 <= shard_index < total_shards.
 bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
   return (test_id % total_shards) == shard_index;
@@ -4782,16 +5525,18 @@
 
 // Compares the name of each test with the user-specified filter to
 // decide whether the test should be run, then records the result in
-// each TestCase and TestInfo object.
+// each TestSuite and TestInfo object.
 // If shard_tests == true, further filters tests based on sharding
 // variables in the environment - see
-// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
-// Returns the number of tests that should run.
+// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
+// . Returns the number of tests that should run.
 int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
-  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
-  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
+  const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
+                                   ? Int32FromEnvOrDie(kTestTotalShards, -1)
+                                   : -1;
+  const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
+                                  ? Int32FromEnvOrDie(kTestShardIndex, -1)
+                                  : -1;
 
   // num_runnable_tests are the number of tests that will
   // run across all shards (i.e., match filter and are not disabled).
@@ -4799,42 +5544,40 @@
   // this shard.
   int num_runnable_tests = 0;
   int num_selected_tests = 0;
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    TestCase* const test_case = test_cases_[i];
-    const std::string &test_case_name = test_case->name();
-    test_case->set_should_run(false);
+  for (auto *test_suite : test_suites_) {
+    const std::string &test_suite_name = test_suite->name();
+    test_suite->set_should_run(false);
 
-    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
-      TestInfo* const test_info = test_case->test_info_list()[j];
+    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
+      TestInfo *const test_info = test_suite->test_info_list()[j];
       const std::string test_name(test_info->name());
-      // A test is disabled if test case name or test name matches
+      // A test is disabled if test suite name or test name matches
       // kDisableTestFilter.
-      const bool is_disabled =
-          internal::UnitTestOptions::MatchesFilter(test_case_name,
-                                                   kDisableTestFilter) ||
-          internal::UnitTestOptions::MatchesFilter(test_name,
-                                                   kDisableTestFilter);
+      const bool is_disabled = internal::UnitTestOptions::MatchesFilter(
+                                   test_suite_name, kDisableTestFilter) ||
+                               internal::UnitTestOptions::MatchesFilter(
+                                   test_name, kDisableTestFilter);
       test_info->is_disabled_ = is_disabled;
 
-      const bool matches_filter =
-          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
-                                                       test_name);
+      const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest(
+          test_suite_name, test_name);
       test_info->matches_filter_ = matches_filter;
 
       const bool is_runnable =
           (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
           matches_filter;
 
-      const bool is_selected = is_runnable &&
-          (shard_tests == IGNORE_SHARDING_PROTOCOL ||
-           ShouldRunTestOnShard(total_shards, shard_index,
-                                num_runnable_tests));
+      const bool is_in_another_shard =
+          shard_tests != IGNORE_SHARDING_PROTOCOL &&
+          !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests);
+      test_info->is_in_another_shard_ = is_in_another_shard;
+      const bool is_selected = is_runnable && !is_in_another_shard;
 
       num_runnable_tests += is_runnable;
       num_selected_tests += is_selected;
 
       test_info->should_run_ = is_selected;
-      test_case->set_should_run(test_case->should_run() || is_selected);
+      test_suite->set_should_run(test_suite->should_run() || is_selected);
     }
   }
   return num_selected_tests;
@@ -4844,8 +5587,8 @@
 // characters with string "\\n".  If the output takes more than
 // max_length characters, only prints the first max_length characters
 // and "...".
-static void PrintOnOneLine(const char* str, int max_length) {
-  if (str != NULL) {
+static void PrintOnOneLine(const char *str, int max_length) {
+  if (str != nullptr) {
     for (int i = 0; *str != '\0'; ++str) {
       if (i >= max_length) {
         printf("...");
@@ -4867,27 +5610,25 @@
   // Print at most this many characters for each type/value parameter.
   const int kMaxParamLength = 250;
 
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    const TestCase* const test_case = test_cases_[i];
-    bool printed_test_case_name = false;
+  for (auto *test_suite : test_suites_) {
+    bool printed_test_suite_name = false;
 
-    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
-      const TestInfo* const test_info =
-          test_case->test_info_list()[j];
+    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
+      const TestInfo *const test_info = test_suite->test_info_list()[j];
       if (test_info->matches_filter_) {
-        if (!printed_test_case_name) {
-          printed_test_case_name = true;
-          printf("%s.", test_case->name());
-          if (test_case->type_param() != NULL) {
+        if (!printed_test_suite_name) {
+          printed_test_suite_name = true;
+          printf("%s.", test_suite->name());
+          if (test_suite->type_param() != nullptr) {
             printf("  # %s = ", kTypeParamLabel);
             // We print the type parameter on a single line to make
             // the output easy to parse by a program.
-            PrintOnOneLine(test_case->type_param(), kMaxParamLength);
+            PrintOnOneLine(test_suite->type_param(), kMaxParamLength);
           }
           printf("\n");
         }
         printf("  %s", test_info->name());
-        if (test_info->value_param() != NULL) {
+        if (test_info->value_param() != nullptr) {
           printf("  # %s = ", kValueParamLabel);
           // We print the value parameter on a single line to make the
           // output easy to parse by a program.
@@ -4898,6 +5639,23 @@
     }
   }
   fflush(stdout);
+  const std::string &output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml" || output_format == "json") {
+    FILE *fileout = OpenFileForWriting(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
+    std::stringstream stream;
+    if (output_format == "xml") {
+      XmlUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintXmlTestsList(&stream, test_suites_);
+    } else if (output_format == "json") {
+      JsonUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintJsonTestList(&stream, test_suites_);
+    }
+    fprintf(fileout, "%s", StringStreamToString(&stream).c_str());
+    fclose(fileout);
+  }
 }
 
 // Sets the OS stack trace getter.
@@ -4906,7 +5664,7 @@
 // the same; otherwise, deletes the old getter and makes the input the
 // current getter.
 void UnitTestImpl::set_os_stack_trace_getter(
-    OsStackTraceGetterInterface* getter) {
+    OsStackTraceGetterInterface *getter) {
   if (os_stack_trace_getter_ != getter) {
     delete os_stack_trace_getter_;
     os_stack_trace_getter_ = getter;
@@ -4916,8 +5674,8 @@
 // Returns the current OS stack trace getter if it is not NULL;
 // otherwise, creates an OsStackTraceGetter, makes it the current
 // getter, and returns it.
-OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
-  if (os_stack_trace_getter_ == NULL) {
+OsStackTraceGetterInterface *UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == nullptr) {
 #ifdef GTEST_OS_STACK_TRACE_GETTER_
     os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
 #else
@@ -4928,36 +5686,40 @@
   return os_stack_trace_getter_;
 }
 
-// Returns the TestResult for the test that's currently running, or
-// the TestResult for the ad hoc test if no test is running.
-TestResult* UnitTestImpl::current_test_result() {
-  return current_test_info_ ?
-      &(current_test_info_->result_) : &ad_hoc_test_result_;
+// Returns the most specific TestResult currently running.
+TestResult *UnitTestImpl::current_test_result() {
+  if (current_test_info_ != nullptr) {
+    return &current_test_info_->result_;
+  }
+  if (current_test_suite_ != nullptr) {
+    return &current_test_suite_->ad_hoc_test_result_;
+  }
+  return &ad_hoc_test_result_;
 }
 
-// Shuffles all test cases, and the tests within each test case,
+// Shuffles all test suites, and the tests within each test suite,
 // making sure that death tests are still run first.
 void UnitTestImpl::ShuffleTests() {
-  // Shuffles the death test cases.
-  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+  // Shuffles the death test suites.
+  ShuffleRange(random(), 0, last_death_test_suite_ + 1, &test_suite_indices_);
 
-  // Shuffles the non-death test cases.
-  ShuffleRange(random(), last_death_test_case_ + 1,
-               static_cast<int>(test_cases_.size()), &test_case_indices_);
+  // Shuffles the non-death test suites.
+  ShuffleRange(random(), last_death_test_suite_ + 1,
+               static_cast<int>(test_suites_.size()), &test_suite_indices_);
 
-  // Shuffles the tests inside each test case.
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    test_cases_[i]->ShuffleTests(random());
+  // Shuffles the tests inside each test suite.
+  for (auto &test_suite : test_suites_) {
+    test_suite->ShuffleTests(random());
   }
 }
 
-// Restores the test cases and tests to their order before the first shuffle.
+// Restores the test suites and tests to their order before the first shuffle.
 void UnitTestImpl::UnshuffleTests() {
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    // Unshuffles the tests in each test case.
-    test_cases_[i]->UnshuffleTests();
-    // Resets the index of each test case.
-    test_case_indices_[i] = static_cast<int>(i);
+  for (size_t i = 0; i < test_suites_.size(); i++) {
+    // Unshuffles the tests in each test suite.
+    test_suites_[i]->UnshuffleTests();
+    // Resets the index of each test suite.
+    test_suite_indices_[i] = static_cast<int>(i);
   }
 }
 
@@ -4971,7 +5733,7 @@
 // For example, if Foo() calls Bar(), which in turn calls
 // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
 // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+std::string GetCurrentOsStackTraceExceptTop(UnitTest * /*unit_test*/,
                                             int skip_count) {
   // We pass skip_count + 1 to skip this wrapper function in addition
   // to what the user really wants to skip.
@@ -4982,7 +5744,7 @@
 // suppress unreachable code warnings.
 namespace {
 class ClassUniqueToAlwaysTrue {};
-}
+}  // namespace
 
 bool IsTrue(bool condition) { return condition; }
 
@@ -4990,8 +5752,7 @@
 #if GTEST_HAS_EXCEPTIONS
   // This condition is always false so AlwaysTrue() never actually throws,
   // but it makes the compiler think that it may throw.
-  if (IsTrue(false))
-    throw ClassUniqueToAlwaysTrue();
+  if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
 #endif  // GTEST_HAS_EXCEPTIONS
   return true;
 }
@@ -4999,7 +5760,7 @@
 // If *pstr starts with the given prefix, modifies *pstr to be right
 // past the prefix and returns true; otherwise leaves *pstr unchanged
 // and returns false.  None of pstr, *pstr, and prefix can be NULL.
-bool SkipPrefix(const char* prefix, const char** pstr) {
+bool SkipPrefix(const char *prefix, const char **pstr) {
   const size_t prefix_len = strlen(prefix);
   if (strncmp(*pstr, prefix, prefix_len) == 0) {
     *pstr += prefix_len;
@@ -5013,19 +5774,18 @@
 // part can be omitted.
 //
 // Returns the value of the flag, or NULL if the parsing failed.
-const char* ParseFlagValue(const char* str,
-                           const char* flag,
-                           bool def_optional) {
+static const char *ParseFlagValue(const char *str, const char *flag,
+                                  bool def_optional) {
   // str and flag must not be NULL.
-  if (str == NULL || flag == NULL) return NULL;
+  if (str == nullptr || flag == nullptr) return nullptr;
 
   // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
   const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
   const size_t flag_len = flag_str.length();
-  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
 
   // Skips the flag name.
-  const char* flag_end = str + flag_len;
+  const char *flag_end = str + flag_len;
 
   // When def_optional is true, it's OK to not have a "=value" part.
   if (def_optional && (flag_end[0] == '\0')) {
@@ -5035,7 +5795,7 @@
   // If def_optional is true and there are more characters after the
   // flag name, or if def_optional is false, there must be a '=' after
   // the flag name.
-  if (flag_end[0] != '=') return NULL;
+  if (flag_end[0] != '=') return nullptr;
 
   // Returns the string after "=".
   return flag_end + 1;
@@ -5051,46 +5811,45 @@
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+static bool ParseBoolFlag(const char *str, const char *flag, bool *value) {
   // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, true);
+  const char *const value_str = ParseFlagValue(str, flag, true);
 
   // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
+  if (value_str == nullptr) return false;
 
   // Converts the string value to a bool.
   *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
   return true;
 }
 
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
+// Parses a string for an int32_t flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+bool ParseInt32Flag(const char *str, const char *flag, int32_t *value) {
   // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
+  const char *const value_str = ParseFlagValue(str, flag, false);
 
   // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
+  if (value_str == nullptr) return false;
 
   // Sets *value to the value of the flag.
-  return ParseInt32(Message() << "The value of flag --" << flag,
-                    value_str, value);
+  return ParseInt32(Message() << "The value of flag --" << flag, value_str,
+                    value);
 }
 
-// Parses a string for a string flag, in the form of
-// "--flag=value".
+// Parses a string for a string flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+template <typename String>
+static bool ParseStringFlag(const char *str, const char *flag, String *value) {
   // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
+  const char *const value_str = ParseFlagValue(str, flag, false);
 
   // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
+  if (value_str == nullptr) return false;
 
   // Sets *value to the value of the flag.
   *value = value_str;
@@ -5103,9 +5862,8 @@
 // recognized, it will print its help message. Flags starting with
 // GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
 // internal flags and do not trigger the help message.
-static bool HasGoogleTestFlagPrefix(const char* str) {
-  return (SkipPrefix("--", &str) ||
-          SkipPrefix("-", &str) ||
+static bool HasGoogleTestFlagPrefix(const char *str) {
+  return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
           SkipPrefix("/", &str)) &&
          !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
          (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
@@ -5121,9 +5879,7 @@
 //   @Y    changes the color to yellow.
 //   @D    changes to the default terminal text color.
 //
-// TODO(wan@google.com): Write tests for this once we add stdout
-// capturing to Google Test.
-static void PrintColorEncoded(const char* str) {
+static void PrintColorEncoded(const char *str) {
   GTestColor color = COLOR_DEFAULT;  // The current color.
 
   // Conceptually, we split the string into segments divided by escape
@@ -5131,8 +5887,8 @@
   // each iteration, the str pointer advances to the beginning of the
   // next segment.
   for (;;) {
-    const char* p = strchr(str, '@');
-    if (p == NULL) {
+    const char *p = strchr(str, '@');
+    if (p == nullptr) {
       ColoredPrintf(color, "%s", str);
       return;
     }
@@ -5158,117 +5914,137 @@
 }
 
 static const char kColorEncodedHelpMessage[] =
-"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
-"following command line flags to control its behavior:\n"
-"\n"
-"Test Selection:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
-"      List the names of all tests instead of running them. The name of\n"
-"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
-"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
+    "This program contains tests written using " GTEST_NAME_
+    ". You can use the\n"
+    "following command line flags to control its behavior:\n"
+    "\n"
+    "Test Selection:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D\n"
+    "      List the names of all tests instead of running them. The name of\n"
+    "      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "filter=@YPOSTIVE_PATTERNS"
     "[@G-@YNEGATIVE_PATTERNS]@D\n"
-"      Run only the tests whose name matches one of the positive patterns but\n"
-"      none of the negative patterns. '?' matches any single character; '*'\n"
-"      matches any substring; ':' separates two patterns.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
-"      Run all disabled tests too.\n"
-"\n"
-"Test Execution:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
-"      Run the tests repeatedly; use a negative count to repeat forever.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
-"      Randomize tests' orders on every iteration.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
-"      Random number seed to use for shuffling test orders (between 1 and\n"
-"      99999, or 0 to use a seed based on the current time).\n"
-"\n"
-"Test Output:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
-"      Enable/disable colored output. The default is @Gauto@D.\n"
-"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
-"      Don't print the elapsed time of each test.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
-    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
-"      Generate an XML report in the given directory or with the given file\n"
-"      name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+    "      Run only the tests whose name matches one of the positive patterns "
+    "but\n"
+    "      none of the negative patterns. '?' matches any single character; "
+    "'*'\n"
+    "      matches any substring; ':' separates two patterns.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "also_run_disabled_tests@D\n"
+    "      Run all disabled tests too.\n"
+    "\n"
+    "Test Execution:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "repeat=@Y[COUNT]@D\n"
+    "      Run the tests repeatedly; use a negative count to repeat forever.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "shuffle@D\n"
+    "      Randomize tests' orders on every iteration.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "random_seed=@Y[NUMBER]@D\n"
+    "      Random number seed to use for shuffling test orders (between 1 and\n"
+    "      99999, or 0 to use a seed based on the current time).\n"
+    "\n"
+    "Test Output:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+    "      Enable/disable colored output. The default is @Gauto@D.\n"
+    "  -@G-" GTEST_FLAG_PREFIX_
+    "print_time=0@D\n"
+    "      Don't print the elapsed time of each test.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_
+    "@Y|@G:@YFILE_PATH]@D\n"
+    "      Generate a JSON or XML report in the given directory or with the "
+    "given\n"
+    "      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
 #if GTEST_CAN_STREAM_RESULTS_
-"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
-"      Stream test results to the given server.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "stream_result_to=@YHOST@G:@YPORT@D\n"
+    "      Stream test results to the given server.\n"
 #endif  // GTEST_CAN_STREAM_RESULTS_
-"\n"
-"Assertion Behavior:\n"
+    "\n"
+    "Assertion Behavior:\n"
 #if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
-"      Set the default death test style.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+    "      Set the default death test style.\n"
 #endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
-"      Turn assertion failures into debugger break-points.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
-"      Turn assertion failures into C++ exceptions.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
-"      Do not report exceptions as test failures. Instead, allow them\n"
-"      to crash the program or throw a pop-up (on Windows).\n"
-"\n"
-"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "  @G--" GTEST_FLAG_PREFIX_
+    "break_on_failure@D\n"
+    "      Turn assertion failures into debugger break-points.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "throw_on_failure@D\n"
+    "      Turn assertion failures into C++ exceptions for use by an external\n"
+    "      test framework.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "catch_exceptions=0@D\n"
+    "      Do not report exceptions as test failures. Instead, allow them\n"
+    "      to crash the program or throw a pop-up (on Windows).\n"
+    "\n"
+    "Except for @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D, you can alternatively set "
     "the corresponding\n"
-"environment variable of a flag (all letters in upper-case). For example, to\n"
-"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "environment variable of a flag (all letters in upper-case). For example, "
+    "to\n"
+    "disable colored text output, you can either specify "
+    "@G--" GTEST_FLAG_PREFIX_
     "color=no@D or set\n"
-"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
-"\n"
-"For more information, please read the " GTEST_NAME_ " documentation at\n"
-"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
-"(not one in your own code or tests), please report it to\n"
-"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+    "the @G" GTEST_FLAG_PREFIX_UPPER_
+    "COLOR@D environment variable to @Gno@D.\n"
+    "\n"
+    "For more information, please read the " GTEST_NAME_
+    " documentation at\n"
+    "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_
+    "\n"
+    "(not one in your own code or tests), please report it to\n"
+    "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
 
-bool ParseGoogleTestFlag(const char* const arg) {
+static bool ParseGoogleTestFlag(const char *const arg) {
   return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
                        &GTEST_FLAG(also_run_disabled_tests)) ||
-      ParseBoolFlag(arg, kBreakOnFailureFlag,
-                    &GTEST_FLAG(break_on_failure)) ||
-      ParseBoolFlag(arg, kCatchExceptionsFlag,
-                    &GTEST_FLAG(catch_exceptions)) ||
-      ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
-      ParseStringFlag(arg, kDeathTestStyleFlag,
-                      &GTEST_FLAG(death_test_style)) ||
-      ParseBoolFlag(arg, kDeathTestUseFork,
-                    &GTEST_FLAG(death_test_use_fork)) ||
-      ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
-      ParseStringFlag(arg, kInternalRunDeathTestFlag,
-                      &GTEST_FLAG(internal_run_death_test)) ||
-      ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
-      ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
-      ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
-      ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
-      ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
-      ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
-      ParseInt32Flag(arg, kStackTraceDepthFlag,
-                     &GTEST_FLAG(stack_trace_depth)) ||
-      ParseStringFlag(arg, kStreamResultToFlag,
-                      &GTEST_FLAG(stream_result_to)) ||
-      ParseBoolFlag(arg, kThrowOnFailureFlag,
-                    &GTEST_FLAG(throw_on_failure));
+         ParseBoolFlag(arg, kBreakOnFailureFlag,
+                       &GTEST_FLAG(break_on_failure)) ||
+         ParseBoolFlag(arg, kCatchExceptionsFlag,
+                       &GTEST_FLAG(catch_exceptions)) ||
+         ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+         ParseStringFlag(arg, kDeathTestStyleFlag,
+                         &GTEST_FLAG(death_test_style)) ||
+         ParseBoolFlag(arg, kDeathTestUseFork,
+                       &GTEST_FLAG(death_test_use_fork)) ||
+         ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+         ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                         &GTEST_FLAG(internal_run_death_test)) ||
+         ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+         ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+         ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+         ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
+         ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+         ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+         ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+         ParseInt32Flag(arg, kStackTraceDepthFlag,
+                        &GTEST_FLAG(stack_trace_depth)) ||
+         ParseStringFlag(arg, kStreamResultToFlag,
+                         &GTEST_FLAG(stream_result_to)) ||
+         ParseBoolFlag(arg, kThrowOnFailureFlag, &GTEST_FLAG(throw_on_failure));
 }
 
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
-void LoadFlagsFromFile(const std::string& path) {
-  FILE* flagfile = posix::FOpen(path.c_str(), "r");
+static void LoadFlagsFromFile(const std::string &path) {
+  FILE *flagfile = posix::FOpen(path.c_str(), "r");
   if (!flagfile) {
-    fprintf(stderr,
-            "Unable to open file \"%s\"\n",
-            GTEST_FLAG(flagfile).c_str());
-    fflush(stderr);
-    exit(EXIT_FAILURE);
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+                      << "\"";
   }
   std::string contents(ReadEntireFile(flagfile));
   posix::FClose(flagfile);
   std::vector<std::string> lines;
   SplitString(contents, '\n', &lines);
   for (size_t i = 0; i < lines.size(); ++i) {
-    if (lines[i].empty())
-      continue;
-    if (!ParseGoogleTestFlag(lines[i].c_str()))
-      g_help_flag = true;
+    if (lines[i].empty()) continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
   }
 }
 #endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
@@ -5277,10 +6053,10 @@
 // other parts of Google Test.  The type parameter CharType can be
 // instantiated to either char or wchar_t.
 template <typename CharType>
-void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+void ParseGoogleTestFlagsOnlyImpl(int *argc, CharType **argv) {
   for (int i = 1; i < *argc; i++) {
     const std::string arg_string = StreamableToString(argv[i]);
-    const char* const arg = arg_string.c_str();
+    const char *const arg = arg_string.c_str();
 
     using internal::ParseBoolFlag;
     using internal::ParseInt32Flag;
@@ -5330,10 +6106,21 @@
 
 // Parses the command line for Google Test flags, without initializing
 // other parts of Google Test.
-void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+void ParseGoogleTestFlagsOnly(int *argc, char **argv) {
   ParseGoogleTestFlagsOnlyImpl(argc, argv);
+
+  // Fix the value of *_NSGetArgc() on macOS, but if and only if
+  // *_NSGetArgv() == argv
+  // Only applicable to char** version of argv
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+  if (*_NSGetArgv() == argv) {
+    *_NSGetArgc() = *argc;
+  }
+#endif
+#endif
 }
-void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv) {
   ParseGoogleTestFlagsOnlyImpl(argc, argv);
 }
 
@@ -5342,7 +6129,7 @@
 // The type parameter CharType can be instantiated to either char or
 // wchar_t.
 template <typename CharType>
-void InitGoogleTestImpl(int* argc, CharType** argv) {
+void InitGoogleTestImpl(int *argc, CharType **argv) {
   // We don't want to run the initialization code twice.
   if (GTestIsInitialized()) return;
 
@@ -5353,6 +6140,10 @@
     g_argvs.push_back(StreamableToString(argv[i]));
   }
 
+#if GTEST_HAS_ABSL
+  absl::InitializeSymbolizer(g_argvs[0].c_str());
+#endif  // GTEST_HAS_ABSL
+
   ParseGoogleTestFlagsOnly(argc, argv);
   GetUnitTestImpl()->PostFlagParsingInit();
 }
@@ -5368,22 +6159,82 @@
 // updated.
 //
 // Calling the function for the second time has no user-visible effect.
-void InitGoogleTest(int* argc, char** argv) {
+void InitGoogleTest(int *argc, char **argv) {
 #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   internal::InitGoogleTestImpl(argc, argv);
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
 
 // This overloaded version can be used in Windows programs compiled in
 // UNICODE mode.
-void InitGoogleTest(int* argc, wchar_t** argv) {
+void InitGoogleTest(int *argc, wchar_t **argv) {
 #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   internal::InitGoogleTestImpl(argc, argv);
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
 
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+void InitGoogleTest() {
+  // Since Arduino doesn't have a command line, fake out the argc/argv arguments
+  int argc = 1;
+  const auto arg0 = "dummy";
+  char *argv0 = const_cast<char *>(arg0);
+  char **argv = &argv0;
+
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(&argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+std::string TempDir() {
+#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+  return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  return "\\temp\\";
+#elif GTEST_OS_WINDOWS
+  const char *temp_dir = internal::posix::GetEnv("TEMP");
+  if (temp_dir == nullptr || temp_dir[0] == '\0')
+    return "\\temp\\";
+  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
+    return temp_dir;
+  else
+    return std::string(temp_dir) + "\\";
+#elif GTEST_OS_LINUX_ANDROID
+  const char *temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
+  if (temp_dir == nullptr || temp_dir[0] == '\0')
+    return "/data/local/tmp/";
+  else
+    return temp_dir;
+#else
+  return "/tmp/";
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+void ScopedTrace::PushTrace(const char *file, int line, std::string message) {
+  internal::TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message.swap(message);
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
 }  // namespace testing
diff --git a/libaom/third_party/googletest/src/googletest/src/gtest_main.cc b/libaom/third_party/googletest/src/googletest/src/gtest_main.cc
index f302822..77c90ce 100644
--- a/libaom/third_party/googletest/src/googletest/src/gtest_main.cc
+++ b/libaom/third_party/googletest/src/googletest/src/gtest_main.cc
@@ -27,12 +27,26 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include <stdio.h>
-
+#include <cstdio>
 #include "gtest/gtest.h"
 
+#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_ESP8266
+extern "C" {
+#endif
+void setup() { testing::InitGoogleTest(); }
+
+void loop() { RUN_ALL_TESTS(); }
+
+#if GTEST_OS_ESP8266
+}
+#endif
+
+#else
+
 GTEST_API_ int main(int argc, char **argv) {
-  printf("Running main() from gtest_main.cc\n");
+  printf("Running main() from %s\n", __FILE__);
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+#endif
diff --git a/libaom/third_party/libwebm/README.libaom b/libaom/third_party/libwebm/README.libaom
index 17b2f47..1e87afd 100644
--- a/libaom/third_party/libwebm/README.libaom
+++ b/libaom/third_party/libwebm/README.libaom
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 9f23fbc50e7a76c815b1d3f0309abe1066301331
+Version: 37d9b860ebbf40cb0f6dcb7a6fef452d798062da
 License: BSD
 License File: LICENSE.txt
 
diff --git a/libaom/third_party/libwebm/common/file_util.cc b/libaom/third_party/libwebm/common/file_util.cc
index e6109d5..6eb6428 100644
--- a/libaom/third_party/libwebm/common/file_util.cc
+++ b/libaom/third_party/libwebm/common/file_util.cc
@@ -24,8 +24,8 @@
 std::string GetTempFileName() {
 #if !defined _MSC_VER && !defined __MINGW32__
   std::string temp_file_name_template_str =
-      std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") :
-                                               ".") +
+      std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR")
+                                             : ".") +
       "/libwebm_temp.XXXXXX";
   char* temp_file_name_template =
       new char[temp_file_name_template_str.length() + 1];
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 3bff7cd..6436817 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -508,7 +508,7 @@
   if (WriteUInt(writer, length))
     return false;
 
-  if (writer->Write(value, static_cast<const uint32>(length)))
+  if (writer->Write(value, static_cast<uint32>(length)))
     return false;
 
   return true;
@@ -562,10 +562,10 @@
   if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
     return 0;
 
-  return frame->CanBeSimpleBlock() ?
-             WriteSimpleBlock(writer, frame, relative_timecode) :
-             WriteBlock(writer, frame, relative_timecode,
-                        cluster->timecode_scale());
+  return frame->CanBeSimpleBlock()
+             ? WriteSimpleBlock(writer, frame, relative_timecode)
+             : WriteBlock(writer, frame, relative_timecode,
+                          cluster->timecode_scale());
 }
 
 uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
index 132388d..3355428 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
@@ -31,6 +31,9 @@
 // Writes out |value| in Big Endian order. Returns 0 on success.
 int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
 
+// Writes out |f| in Big Endian order. Returns 0 on success.
+int32 SerializeFloat(IMkvWriter* writer, float f);
+
 // Returns the size in bytes of the element.
 int32 GetUIntSize(uint64 value);
 int32 GetIntSize(int64 value);
diff --git a/libaom/third_party/libwebm/mkvparser/mkvparser.cc b/libaom/third_party/libwebm/mkvparser/mkvparser.cc
index 9c78ead..ace65bd 100644
--- a/libaom/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/libaom/third_party/libwebm/mkvparser/mkvparser.cc
@@ -4230,6 +4230,7 @@
         new (std::nothrow) ContentEncryption*[encryption_count];
     if (!encryption_entries_) {
       delete[] compression_entries_;
+      compression_entries_ = NULL;
       return -1;
     }
     encryption_entries_end_ = encryption_entries_;
@@ -4261,6 +4262,7 @@
         delete compression;
         return status;
       }
+      assert(compression_count > 0);
       *compression_entries_end_++ = compression;
     } else if (id == libwebm::kMkvContentEncryption) {
       ContentEncryption* const encryption =
@@ -4273,6 +4275,7 @@
         delete encryption;
         return status;
       }
+      assert(encryption_count > 0);
       *encryption_entries_end_++ = encryption;
     }
 
@@ -4325,6 +4328,12 @@
         return status;
       }
 
+      // There should be only one settings element per content compression.
+      if (compression->settings != NULL) {
+        delete[] buf;
+        return E_FILE_FORMAT_INVALID;
+      }
+
       compression->settings = buf;
       compression->settings_len = buflen;
     }
@@ -5311,7 +5320,7 @@
 
   const long long stop = pos + s.size;
 
-  Colour* colour = NULL;
+  std::unique_ptr<Colour> colour_ptr;
   std::unique_ptr<Projection> projection_ptr;
 
   while (pos < stop) {
@@ -5361,8 +5370,12 @@
       if (rate <= 0)
         return E_FILE_FORMAT_INVALID;
     } else if (id == libwebm::kMkvColour) {
-      if (!Colour::Parse(pReader, pos, size, &colour))
+      Colour* colour = NULL;
+      if (!Colour::Parse(pReader, pos, size, &colour)) {
         return E_FILE_FORMAT_INVALID;
+      } else {
+        colour_ptr.reset(colour);
+      }
     } else if (id == libwebm::kMkvProjection) {
       Projection* projection = NULL;
       if (!Projection::Parse(pReader, pos, size, &projection)) {
@@ -5404,7 +5417,7 @@
   pTrack->m_display_unit = display_unit;
   pTrack->m_stereo_mode = stereo_mode;
   pTrack->m_rate = rate;
-  pTrack->m_colour = colour;
+  pTrack->m_colour = colour_ptr.release();
   pTrack->m_colour_space = colour_space;
   pTrack->m_projection = projection_ptr.release();
 
diff --git a/libaom/third_party/vector/README.libaom b/libaom/third_party/vector/README.libaom
index 2bb8b2d..729446d 100644
--- a/libaom/third_party/vector/README.libaom
+++ b/libaom/third_party/vector/README.libaom
@@ -10,5 +10,7 @@
 the entire C++ std::vector API, including iterators.
 
 Local Modifications:
-Renamed some functions to fit in with the AOMedia
+1. Renamed some functions to fit in with the AOMedia
 naming convention.
+2. Removed non-global functions from vector.h.
+3. Made all non-global functions in vector.c static.
diff --git a/libaom/third_party/vector/vector.c b/libaom/third_party/vector/vector.c
index fe46246..4b8b9c6 100644
--- a/libaom/third_party/vector/vector.c
+++ b/libaom/third_party/vector/vector.c
@@ -28,6 +28,134 @@
 
 #include "third_party/vector/vector.h"
 
+/***** PRIVATE *****/
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+static bool _vector_should_grow(Vector *vector) {
+  assert(vector->size <= vector->capacity);
+  return vector->size == vector->capacity;
+}
+
+static bool _vector_should_shrink(Vector *vector) {
+  assert(vector->size <= vector->capacity);
+  return vector->size == vector->capacity * VECTOR_SHRINK_THRESHOLD;
+}
+
+static void *_vector_offset(Vector *vector, size_t index) {
+  // return vector->data + (index * vector->element_size);
+  return (unsigned char *)vector->data + (index * vector->element_size);
+}
+
+static const void *_vector_const_offset(const Vector *vector, size_t index) {
+  // return vector->data + (index * vector->element_size);
+  return (unsigned char *)vector->data + (index * vector->element_size);
+}
+
+static void _vector_assign(Vector *vector, size_t index, void *element) {
+  /* Insert the element */
+  void *offset = _vector_offset(vector, index);
+  memcpy(offset, element, vector->element_size);
+}
+
+static int _vector_move_right(Vector *vector, size_t index) {
+  assert(vector->size < vector->capacity);
+
+  /* The location where to start to move from. */
+  void *offset = _vector_offset(vector, index);
+
+  /* How many to move to the right. */
+  size_t elements_in_bytes = (vector->size - index) * vector->element_size;
+
+#ifdef __STDC_LIB_EXT1__
+  size_t right_capacity_in_bytes =
+      (vector->capacity - (index + 1)) * vector->element_size;
+
+  /* clang-format off */
+    int return_code =  memmove_s(
+        offset + vector->element_size,
+        right_capacity_in_bytes,
+        offset,
+        elements_in_bytes);
+
+  /* clang-format on */
+
+  return return_code == 0 ? VECTOR_SUCCESS : VECTOR_ERROR;
+
+#else
+  // memmove(offset + vector->element_size, offset, elements_in_bytes);
+  memmove((unsigned char *)offset + vector->element_size, offset,
+          elements_in_bytes);
+  return VECTOR_SUCCESS;
+#endif
+}
+
+static void _vector_move_left(Vector *vector, size_t index) {
+  size_t right_elements_in_bytes;
+  void *offset;
+
+  /* The offset into the memory */
+  offset = _vector_offset(vector, index);
+
+  /* How many to move to the left */
+  right_elements_in_bytes = (vector->size - index - 1) * vector->element_size;
+
+  // memmove(offset, offset + vector->element_size, right_elements_in_bytes);
+  memmove(offset, (unsigned char *)offset + vector->element_size,
+          right_elements_in_bytes);
+}
+
+static int _vector_reallocate(Vector *vector, size_t new_capacity) {
+  size_t new_capacity_in_bytes;
+  void *old;
+  assert(vector != NULL);
+
+  if (new_capacity < VECTOR_MINIMUM_CAPACITY) {
+    if (vector->capacity > VECTOR_MINIMUM_CAPACITY) {
+      new_capacity = VECTOR_MINIMUM_CAPACITY;
+    } else {
+      /* NO-OP */
+      return VECTOR_SUCCESS;
+    }
+  }
+
+  new_capacity_in_bytes = new_capacity * vector->element_size;
+  old = vector->data;
+
+  if ((vector->data = malloc(new_capacity_in_bytes)) == NULL) {
+    return VECTOR_ERROR;
+  }
+
+#ifdef __STDC_LIB_EXT1__
+  /* clang-format off */
+    if (memcpy_s(vector->data,
+                             new_capacity_in_bytes,
+                             old,
+                             aom_vector_byte_size(vector)) != 0) {
+        return VECTOR_ERROR;
+    }
+/* clang-format on */
+#else
+  memcpy(vector->data, old, aom_vector_byte_size(vector));
+#endif
+
+  vector->capacity = new_capacity;
+
+  free(old);
+
+  return VECTOR_SUCCESS;
+}
+
+static int _vector_adjust_capacity(Vector *vector) {
+  return _vector_reallocate(vector,
+                            MAX(1, vector->size * VECTOR_GROWTH_FACTOR));
+}
+
+static void _vector_swap(size_t *first, size_t *second) {
+  size_t temp = *first;
+  *first = *second;
+  *second = temp;
+}
+
 int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size) {
   assert(vector != NULL);
 
@@ -346,10 +474,10 @@
   return iterator;
 }
 
-void *iterator_get(Iterator *iterator) { return iterator->pointer; }
+void *aom_iterator_get(Iterator *iterator) { return iterator->pointer; }
 
-int iterator_erase(Vector *vector, Iterator *iterator) {
-  size_t index = iterator_index(vector, iterator);
+int aom_iterator_erase(Vector *vector, Iterator *iterator) {
+  size_t index = aom_iterator_index(vector, iterator);
 
   if (aom_vector_erase(vector, index) == VECTOR_ERROR) {
     return VECTOR_ERROR;
@@ -360,184 +488,53 @@
   return VECTOR_SUCCESS;
 }
 
-void iterator_increment(Iterator *iterator) {
+void aom_iterator_increment(Iterator *iterator) {
   assert(iterator != NULL);
   // iterator->pointer += iterator->element_size;
   iterator->pointer =
       (unsigned char *)iterator->pointer + iterator->element_size;
 }
 
-void iterator_decrement(Iterator *iterator) {
+void aom_iterator_decrement(Iterator *iterator) {
   assert(iterator != NULL);
   // iterator->pointer -= iterator->element_size;
   iterator->pointer =
       (unsigned char *)iterator->pointer - iterator->element_size;
 }
 
-void *iterator_next(Iterator *iterator) {
+void *aom_iterator_next(Iterator *iterator) {
   void *current = iterator->pointer;
-  iterator_increment(iterator);
+  aom_iterator_increment(iterator);
 
   return current;
 }
 
-void *iterator_previous(Iterator *iterator) {
+void *aom_iterator_previous(Iterator *iterator) {
   void *current = iterator->pointer;
-  iterator_decrement(iterator);
+  aom_iterator_decrement(iterator);
 
   return current;
 }
 
-bool iterator_equals(Iterator *first, Iterator *second) {
+bool aom_iterator_equals(Iterator *first, Iterator *second) {
   assert(first->element_size == second->element_size);
   return first->pointer == second->pointer;
 }
 
-bool iterator_is_before(Iterator *first, Iterator *second) {
+bool aom_iterator_is_before(Iterator *first, Iterator *second) {
   assert(first->element_size == second->element_size);
   return first->pointer < second->pointer;
 }
 
-bool iterator_is_after(Iterator *first, Iterator *second) {
+bool aom_iterator_is_after(Iterator *first, Iterator *second) {
   assert(first->element_size == second->element_size);
   return first->pointer > second->pointer;
 }
 
-size_t iterator_index(Vector *vector, Iterator *iterator) {
+size_t aom_iterator_index(Vector *vector, Iterator *iterator) {
   assert(vector != NULL);
   assert(iterator != NULL);
   // return (iterator->pointer - vector->data) / vector->element_size;
   return ((unsigned char *)iterator->pointer - (unsigned char *)vector->data) /
          vector->element_size;
 }
-
-/***** PRIVATE *****/
-
-bool _vector_should_grow(Vector *vector) {
-  assert(vector->size <= vector->capacity);
-  return vector->size == vector->capacity;
-}
-
-bool _vector_should_shrink(Vector *vector) {
-  assert(vector->size <= vector->capacity);
-  return vector->size == vector->capacity * VECTOR_SHRINK_THRESHOLD;
-}
-
-size_t _vector_free_bytes(const Vector *vector) {
-  return aom_vector_free_space(vector) * vector->element_size;
-}
-
-void *_vector_offset(Vector *vector, size_t index) {
-  // return vector->data + (index * vector->element_size);
-  return (unsigned char *)vector->data + (index * vector->element_size);
-}
-
-const void *_vector_const_offset(const Vector *vector, size_t index) {
-  // return vector->data + (index * vector->element_size);
-  return (unsigned char *)vector->data + (index * vector->element_size);
-}
-
-void _vector_assign(Vector *vector, size_t index, void *element) {
-  /* Insert the element */
-  void *offset = _vector_offset(vector, index);
-  memcpy(offset, element, vector->element_size);
-}
-
-int _vector_move_right(Vector *vector, size_t index) {
-  assert(vector->size < vector->capacity);
-
-  /* The location where to start to move from. */
-  void *offset = _vector_offset(vector, index);
-
-  /* How many to move to the right. */
-  size_t elements_in_bytes = (vector->size - index) * vector->element_size;
-
-#ifdef __STDC_LIB_EXT1__
-  size_t right_capacity_in_bytes =
-      (vector->capacity - (index + 1)) * vector->element_size;
-
-  /* clang-format off */
-    int return_code =  memmove_s(
-        offset + vector->element_size,
-        right_capacity_in_bytes,
-        offset,
-        elements_in_bytes);
-
-  /* clang-format on */
-
-  return return_code == 0 ? VECTOR_SUCCESS : VECTOR_ERROR;
-
-#else
-  // memmove(offset + vector->element_size, offset, elements_in_bytes);
-  memmove((unsigned char *)offset + vector->element_size, offset,
-          elements_in_bytes);
-  return VECTOR_SUCCESS;
-#endif
-}
-
-void _vector_move_left(Vector *vector, size_t index) {
-  size_t right_elements_in_bytes;
-  void *offset;
-
-  /* The offset into the memory */
-  offset = _vector_offset(vector, index);
-
-  /* How many to move to the left */
-  right_elements_in_bytes = (vector->size - index - 1) * vector->element_size;
-
-  // memmove(offset, offset + vector->element_size, right_elements_in_bytes);
-  memmove(offset, (unsigned char *)offset + vector->element_size,
-          right_elements_in_bytes);
-}
-
-int _vector_adjust_capacity(Vector *vector) {
-  return _vector_reallocate(vector,
-                            MAX(1, vector->size * VECTOR_GROWTH_FACTOR));
-}
-
-int _vector_reallocate(Vector *vector, size_t new_capacity) {
-  size_t new_capacity_in_bytes;
-  void *old;
-  assert(vector != NULL);
-
-  if (new_capacity < VECTOR_MINIMUM_CAPACITY) {
-    if (vector->capacity > VECTOR_MINIMUM_CAPACITY) {
-      new_capacity = VECTOR_MINIMUM_CAPACITY;
-    } else {
-      /* NO-OP */
-      return VECTOR_SUCCESS;
-    }
-  }
-
-  new_capacity_in_bytes = new_capacity * vector->element_size;
-  old = vector->data;
-
-  if ((vector->data = malloc(new_capacity_in_bytes)) == NULL) {
-    return VECTOR_ERROR;
-  }
-
-#ifdef __STDC_LIB_EXT1__
-  /* clang-format off */
-    if (memcpy_s(vector->data,
-                             new_capacity_in_bytes,
-                             old,
-                             aom_vector_byte_size(vector)) != 0) {
-        return VECTOR_ERROR;
-    }
-/* clang-format on */
-#else
-  memcpy(vector->data, old, aom_vector_byte_size(vector));
-#endif
-
-  vector->capacity = new_capacity;
-
-  free(old);
-
-  return VECTOR_SUCCESS;
-}
-
-void _vector_swap(size_t *first, size_t *second) {
-  size_t temp = *first;
-  *first = *second;
-  *second = temp;
-}
diff --git a/libaom/third_party/vector/vector.h b/libaom/third_party/vector/vector.h
index 02743f5..d09eb64 100644
--- a/libaom/third_party/vector/vector.h
+++ b/libaom/third_party/vector/vector.h
@@ -112,48 +112,27 @@
 Iterator aom_vector_end(Vector *vector);
 Iterator aom_vector_iterator(Vector *vector, size_t index);
 
-void *iterator_get(Iterator *iterator);
-#define ITERATOR_GET_AS(type, iterator) *((type *)iterator_get((iterator)))
+void *aom_iterator_get(Iterator *iterator);
+#define ITERATOR_GET_AS(type, iterator) *((type *)aom_iterator_get((iterator)))
 
-int iterator_erase(Vector *vector, Iterator *iterator);
+int aom_iterator_erase(Vector *vector, Iterator *iterator);
 
-void iterator_increment(Iterator *iterator);
-void iterator_decrement(Iterator *iterator);
+void aom_iterator_increment(Iterator *iterator);
+void aom_iterator_decrement(Iterator *iterator);
 
-void *iterator_next(Iterator *iterator);
-void *iterator_previous(Iterator *iterator);
+void *aom_iterator_next(Iterator *iterator);
+void *aom_iterator_previous(Iterator *iterator);
 
-bool iterator_equals(Iterator *first, Iterator *second);
-bool iterator_is_before(Iterator *first, Iterator *second);
-bool iterator_is_after(Iterator *first, Iterator *second);
+bool aom_iterator_equals(Iterator *first, Iterator *second);
+bool aom_iterator_is_before(Iterator *first, Iterator *second);
+bool aom_iterator_is_after(Iterator *first, Iterator *second);
 
-size_t iterator_index(Vector *vector, Iterator *iterator);
+size_t aom_iterator_index(Vector *vector, Iterator *iterator);
 
-#define VECTOR_FOR_EACH(aom_vector_pointer, iterator_name)           \
+#define VECTOR_FOR_EACH(aom_vector_pointer, iterator_name)               \
   for (Iterator(iterator_name) = aom_vector_begin((aom_vector_pointer)), \
       end = aom_vector_end((aom_vector_pointer));                        \
-       !iterator_equals(&(iterator_name), &end);                 \
-       iterator_increment(&(iterator_name)))
-
-/***** PRIVATE *****/
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-bool _vector_should_grow(Vector *vector);
-bool _vector_should_shrink(Vector *vector);
-
-size_t _vector_free_bytes(const Vector *vector);
-void *_vector_offset(Vector *vector, size_t index);
-const void *_vector_const_offset(const Vector *vector, size_t index);
-
-void _vector_assign(Vector *vector, size_t index, void *element);
-
-int _vector_move_right(Vector *vector, size_t index);
-void _vector_move_left(Vector *vector, size_t index);
-
-int _vector_adjust_capacity(Vector *vector);
-int _vector_reallocate(Vector *vector, size_t new_capacity);
-
-void _vector_swap(size_t *first, size_t *second);
+       !aom_iterator_equals(&(iterator_name), &end);                     \
+       aom_iterator_increment(&(iterator_name)))
 
 #endif /* VECTOR_H */
diff --git a/libaom/tools/aom_entropy_optimizer.c b/libaom/tools/aom_entropy_optimizer.c
index 551adf4..9f529d9 100644
--- a/libaom/tools/aom_entropy_optimizer.c
+++ b/libaom/tools/aom_entropy_optimizer.c
@@ -271,7 +271,10 @@
 
   FRAME_COUNTS fc;
   const size_t bytes = fread(&fc, sizeof(FRAME_COUNTS), 1, statsfile);
-  if (!bytes) return 1;
+  if (!bytes) {
+    fclose(statsfile);
+    return 1;
+  }
 
   FILE *const probsfile = fopen("optimized_probs.c", "w");
   if (probsfile == NULL) {
@@ -323,9 +326,9 @@
   /* block partition */
   cts_each_dim[0] = PARTITION_CONTEXTS;
   cts_each_dim[1] = EXT_PARTITION_TYPES;
-  int part_types_each_ctx[PARTITION_CONTEXTS] = {
-    4, 4, 4, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8
-  };
+  int part_types_each_ctx[PARTITION_CONTEXTS] = { 4,  4,  4,  4,  10, 10, 10,
+                                                  10, 10, 10, 10, 10, 10, 10,
+                                                  10, 10, 8,  8,  8,  8 };
   optimize_cdf_table_var_modes_2d(
       &fc.partition[0][0], probsfile, 2, cts_each_dim, part_types_each_ctx,
       "static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS]"
diff --git a/libaom/usage.dox b/libaom/usage.dox
index 062d35a..4004f4a 100644
--- a/libaom/usage.dox
+++ b/libaom/usage.dox
@@ -59,7 +59,6 @@
 
     \if decoder
     Currently defined decoder features include:
-    - \ref usage_cb
     \endif
 
     \section usage_init Initialization
@@ -81,7 +80,6 @@
     The available initialization methods are:
     \if encoder
     \li #aom_codec_enc_init (calls aom_codec_enc_init_ver())
-    \li #aom_codec_enc_init_multi (calls aom_codec_enc_init_multi_ver())
     \endif
     \if decoder
     \li #aom_codec_dec_init (calls aom_codec_dec_init_ver())
diff --git a/libaom/usage_dx.dox b/libaom/usage_dx.dox
index eef7837..76dc213 100644
--- a/libaom/usage_dx.dox
+++ b/libaom/usage_dx.dox
@@ -10,31 +10,8 @@
     \ref samples
 
 
-    \section usage_cb Callback Based Decoding
-    There are two methods for the application to access decoded frame data. Some
-    codecs support asynchronous (callback-based) decoding \ref usage_features
-    that allow the application to register a callback to be invoked by the
-    decoder when decoded data becomes available. Decoders are not required to
-    support this feature, however. Like all \ref usage_features, support can be
-    determined by calling aom_codec_get_caps(). Callbacks are available in both
-    frame-based and slice-based variants. Frame based callbacks conform to the
-    signature of #aom_codec_put_frame_cb_fn_t and are invoked once the entire
-    frame has been decoded. Slice based callbacks conform to the signature of
-    #aom_codec_put_slice_cb_fn_t and are invoked after a subsection of the frame
-    is decoded. For example, a slice callback could be issued for each
-    macroblock row. However, the number and size of slices to return is
-    implementation specific. Also, the image data passed in a slice callback is
-    not necessarily in the same memory segment as the data will be when it is
-    assembled into a full frame. For this reason, the application \ref MUST
-    examine the rectangles that describe what data is valid to access and what
-    data has been updated in this call. For all their additional complexity,
-    slice based decoding callbacks provide substantial speed gains to the
-    overall application in some cases, due to improved cache behavior.
-
-
     \section usage_frame_iter Frame Iterator Based Decoding
-    If the codec does not support callback based decoding, or the application
-    chooses not to make use of that feature, decoded frames are made available
+    Decoded frames are made available to the application
     through the aom_codec_get_frame() iterator. The application initializes the
     iterator storage (of type #aom_codec_iter_t) to NULL, then calls
     aom_codec_get_frame repeatedly until it returns NULL, indicating that all
@@ -42,16 +19,4 @@
     frames that are ready for display, depending on the codec.
 
 
-    \section usage_postproc Postprocessing
-    Postprocessing is a process that is applied after a frame is decoded to
-    enhance the image's appearance by removing artifacts introduced in the
-    compression process. It is not required to properly decode the frame, and
-    is generally done only when there is enough spare CPU time to execute
-    the required filters. Codecs may support a number of different
-    postprocessing filters, and the available filters may differ from platform
-    to platform. Embedded devices often do not have enough CPU to implement
-    postprocessing in software. The filter selection is generally handled
-    automatically by the codec.
-
-
 */